## Colorado Elk Tag Application Data Collection

The raw data ingested by this notebook can be found [here](https://cpw.state.co.us/thingstodo/Pages/Statistics-Elk.aspx).

### Packages

In [10]:
import pandas as pd
import requests
import pdfplumber
import os
import shutil
from io import StringIO
from bs4 import BeautifulSoup
from re import search

In [12]:
#Webscraping URLs related to Elk Draw Recaps
#Scraping html
url = 'https://cpw.state.co.us/thingstodo/Pages/Statistics-Elk.aspx'
r = requests.get(url, timeout=60)
#Stop here on an HTTP error instead of parsing an error page for links
r.raise_for_status()
html_doc = r.text
#Name the parser explicitly so the result does not depend on which optional parsers are installed
soup = BeautifulSoup(html_doc, 'html.parser')

#Isolating the href of every anchor tag to a list (anchors without an href give None)
site_URLs = [link.get('href') for link in soup.find_all('a')]

#Converting to DataFrame and filtering for information on Draw Recap Statistics
elk_URL_DF_Raw = pd.DataFrame(site_URLs, columns=['URL'])

#Both masks are built from the same frame they are applied to; na=False drops the None hrefs
is_elk_statistic = elk_URL_DF_Raw['URL'].str.contains('/Documents/Hunting/BigGame/Statistics/ELK', case=False, na=False)
is_draw_recap = elk_URL_DF_Raw['URL'].str.contains('ElkDrawRecap.pdf', case=False, na=False)

#Copy so the URL rewrite below changes an independent frame rather than a slice of the raw one
elk_URL_DF_Filtered = elk_URL_DF_Raw.loc[is_elk_statistic & is_draw_recap].copy()

#The hrefs are site-relative, so prefix the host to get a downloadable URL
elk_URL_DF_Filtered['URL'] = 'https://cpw.state.co.us' + elk_URL_DF_Filtered['URL'].astype(str)
elk_URL_DF_Filtered.reset_index(inplace=True, drop=True)

print(elk_URL_DF_Filtered['URL'])

0    https://cpw.state.co.us/Documents/Hunting/BigG...
1    https://cpw.state.co.us/Documents/Hunting/BigG...
2    https://cpw.state.co.us/Documents/Hunting/BigG...
3    https://cpw.state.co.us/Documents/Hunting/BigG...
4    https://cpw.state.co.us/Documents/Hunting/BigG...
5    https://cpw.state.co.us/Documents/Hunting/BigG...
6    https://cpw.state.co.us/Documents/Hunting/BigG...
7    https://cpw.state.co.us/Documents/Hunting/BigG...
Name: URL, dtype: object


## Functions

In [13]:
def download_file(url, timeout=60):
    """
    Input: URL of the file to download and, optionally, the timeout in seconds
    handed to requests

    Output: Name of the local file, which is the last '/'-separated segment of
    the URL and is written relative to the current working directory

    Raises: requests.HTTPError when the server answers with an error status
    """
    local_filename = url.split('/')[-1]

    with requests.get(url, timeout=timeout) as r:
        # Check the status before opening the file so an error page is never
        # saved under the document's name
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            f.write(r.content)

    return local_filename

def check_space(string):
    """Function that returns the number of strings of the inputted string"""
    count = 0
    for i in string:
        if i == " ":
            count += 1
    return count

def find_hunt_code(df):
    """
    Input: Dataframe from the pdf page

    Output: Huntcode as a string, or None when the cell in the third row of the
    first column does not contain one
    """
    # Utilize a regular expression to find the Hunt Code:
    # 2 capital letters, 3 digits, 1 capital letter, 1 digit, 1 capital letter.
    # Raw string so the \d escapes reach the regex engine untouched.
    Hunt_Code_Search = search(r'[A-Z]{2}\d{3}[A-Z]{1}\d{1}[A-Z]{1}', df.iloc[2,0])

    if Hunt_Code_Search is None:
        return None

    return Hunt_Code_Search.group(0)

def strip_whitespace(df):
    """
    Input: Dataframe from the pdf page

    Output: The same dataframe, with leading and trailing whitespace removed
    from the values of its object-dtype columns
    """
    # Only object columns go through the .str accessor
    text_columns = df.select_dtypes(['object']).columns

    # Write the stripped values back into the frame that was passed in
    df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())

    return df

def find_preference_point_table(df, column_name='                  Colorado Parks and Wildlife   Draw Recap'):
    """
    Input: Dataframe from the pdf page and, optionally, the name of the text
    column to scan (defaults to the header the draw recap pages are read in with)

    Output: The rows below the first row containing 'Choice Preference'; when no
    row contains it, the rows below the first row containing 'Page'

    Raises: KeyError when the column is missing, IndexError when neither marker
    is present on the page
    """
    lines = df[column_name]

    # Markers are tried in priority order; both are matched case-insensitively
    for marker in ('Choice Preference', 'Page'):
        matches = lines.str.contains(marker, na=False, case=False).to_numpy()
        if matches.any():
            # argmax of a boolean array is the position of the first True.
            # Slicing by position (not by index label) keeps this correct even
            # when the frame does not carry a default 0..n-1 index.
            return df.iloc[int(matches.argmax()) + 1:]

    raise IndexError('No preference point table marker found on the page')

def choice_finder(df):
    """
    Input: Pre processing Preference Points Dataframe; it must have a
    "Preference Points Table Buffer" column of strings

    Output: Tuple of
        - the filtered dataframe, with 'Choice Finder' and 'Choice' columns added
          and the choice prefix cut off the buffer strings that carried one
        - the 'Choice' column of that dataframe
        - the index of the rows that carried a choice before any filling

    Note: the 'Choice Finder' column is assigned on the dataframe that was
    passed in, so the caller's dataframe gains that column as well.
    """
    #Check how many spaces are in the strings to see if its in the standardized format, or if an extra character is there
    df['Choice Finder'] = df["Preference Points Table Buffer"].apply(lambda x: check_space(x))

    #Keep only the rows whose space count is 13 (standardized format) or 15 (format with the choice included)
    df = df.loc[(df['Choice Finder'] == 15) | (df['Choice Finder'] == 13)]
    #Renumber from 0 so that the index labels equal the row positions
    df.reset_index(inplace=True, drop=True)

    #Isolate the choice made in the draw: the first character of a string with 15 spaces, None for every other row
    df['Choice'] = [x[:1] if y == 15 else None for x,y in zip(df['Preference Points Table Buffer'],df['Choice Finder']) ]
    #Remember which rows carried a choice before the gaps are filled below
    Choice_Index = df[df['Choice'].notnull()].index
    #Fill the missing values in every column, backward first and then forward.
    #Because the backward fill runs first, a row without a choice takes the value of the NEXT row that has one;
    #the forward fill only reaches rows after the last choice row.
    #NOTE(review): confirm this order is right for pages that list more than one choice
    df.bfill(axis='rows',inplace=True)
    df.ffill(axis='rows',inplace=True)

    #Column to merge the choices by index after the preference points transformation is complete
    Choice_Merge = df['Choice']

    #Restructuring rows that had the choice in with the preference points values: drop their first two characters
    #(presumably the choice character and the space after it - verify against the pdf text)
    df['Preference Points Table Buffer'] = [x[2:] if y == 15 else x for x,y in zip(df['Preference Points Table Buffer'],df['Choice Finder'])]

    return df, Choice_Merge, Choice_Index

def preference_points_clean_up(df,Choice_Index):
    """
    Input: Preference Points Buffer 2 (the space separated table rows are read
    from its first column) and the Choice Index (positions of the rows that
    carry a choice in the draw half of the table)

    Output: Cleaned up preference points table with one row per preference
    point level and the 13 named applicant ('A-') and draw ('D-') columns.
    Only 'Preference Points' is converted to a number; the other columns keep
    the text tokens.

    Raises: KeyError when the rows split into fewer than 14 tokens
    """

    #Expand the restructured preference point dataframe, so it matches the format in the pdf
    df_Expanded = df.iloc[:,0].str.split(' ',expand=True)

    #A 15th token (column 14) only exists when at least one row carries the choice in the draw portion.
    #Testing for the column explicitly means a failure inside the shift is raised to the caller
    #instead of being mistaken for the "no choice" layout.
    if 14 in df_Expanded.columns:
        df_Expanded = df_Expanded[list(range(15))].copy()

        #Shift the draw portion of the choice rows one column to the left, overwriting the choice token
        for j in range(7,14):
            df_Expanded.iloc[Choice_Index,j] = df_Expanded.iloc[Choice_Index,j+1]

        #Removes the now surplus column at the end
        del df_Expanded[14]
    else:
        df_Expanded = df_Expanded[list(range(14))].copy()

    #Remove the redundant preference point column that starts the draw portion
    del df_Expanded[7]

    # Remove any residual rows whose preference points value isn't a number, or that have missing tokens
    df_Expanded[0] = pd.to_numeric(df_Expanded[0],errors = 'coerce')
    df_Expanded.dropna(inplace = True)
    df_Expanded[0] = pd.to_numeric(df_Expanded[0],downcast="integer")

    #Rename the columns, so they correlate with the pdf format
    #(the 'Landownder' spelling is kept as is because these names are looked up elsewhere)
    df_Expanded.columns = ['Preference Points','A-Adult-Res','A-Adult-NonRes','A-Youth-Res','A-Youth-NonRes','A-Landowner(LPP)-Unrestricted' \
    ,'A-Landownder(LPP)-Restricted','D-Adult-Res','D-Adult-NonRes','D-Youth-Res','D-Youth-NonRes'
    ,'D-Landowner(LPP)-Unrestricted','D-Landownder(LPP)-Restricted']

    return df_Expanded

def preference_points_finalize(df, Choice_Merge, Hunt_Code, Draw_Year):
    """
    Input: Preference Points Expanded DataFrame, Choice Merge column, Hunt_Code, and Draw Year

    Output: Two dataframes, applicants first and successful draws second. Each
    holds 'Preference Points', its six 'A-' or 'D-' columns, 'Choice',
    'Hunt Code', 'Year' and the 'Hunt Key' built as
    <Hunt Code>-<Year>-<Preference Points>.
    """
    applicant_columns = ['Preference Points','A-Adult-Res','A-Adult-NonRes','A-Youth-Res','A-Youth-NonRes',
                         'A-Landowner(LPP)-Unrestricted','A-Landownder(LPP)-Restricted']
    draw_columns = ['Preference Points','D-Adult-Res','D-Adult-NonRes','D-Youth-Res','D-Youth-NonRes',
                    'D-Landowner(LPP)-Unrestricted','D-Landownder(LPP)-Restricted']

    #Split the table into its two halves and attach the Choice to each one by index
    Applicant_Preference_Points = df[applicant_columns].merge(Choice_Merge, left_index=True, right_index=True)
    Draw_Preference_Points = df[draw_columns].merge(Choice_Merge, left_index=True, right_index=True)

    for frame in (Applicant_Preference_Points, Draw_Preference_Points):
        #Stamp every row with the Hunt Code and Draw year
        frame['Hunt Code'] = Hunt_Code
        frame['Year'] = Draw_Year

        #Primary Key for the dataframe; Hunt Code and Year have to be strings for the concatenation
        frame['Hunt Key'] = [
            code + '-' + year + '-' + str(points)
            for code, year, points in zip(frame['Hunt Code'], frame['Year'], frame['Preference Points'])
        ]

    return Applicant_Preference_Points, Draw_Preference_Points

## Main Processing Section

In [None]:
# Process every draw recap pdf: download it, parse each page's preference point table and
# write one applicant workbook, one draw workbook and (if needed) one problem-page workbook per pdf
for pdf_url in elk_URL_DF_Filtered['URL']:

    stats_url = download_file(pdf_url)

    # Per-pdf accumulators for the output of the processed pages
    Applicant_Frames = []
    Draw_Frames = []
    Pages_with_issues = []

    # Reset the values that are carried from page to page, so nothing leaks in from the previous pdf
    Hunt_Code = None
    Draw_Year = None

    with pdfplumber.open(stats_url) as pdf:
        number_of_pages = len(pdf.pages)
        # Page indexes 0 and 1 are skipped (presumably pages without a hunt table - verify against the pdfs)
        for j in range(2,number_of_pages):
            # Everything for a page, including parsing its text, sits inside the try
            # so that one unreadable page is recorded and does not end the whole run
            try:
                # Export the pdf page's raw text as a dataframe
                page = pdf.pages[j]
                text = page.extract_text()

                # A page without any extractable text has nothing to parse
                if not text:
                    continue

                df = pd.read_csv(StringIO(text))

                #A length of 8 or less skips pages that only have the portions of the summary table below the preference point table
                if len(df) <= 8:
                    continue

                #Find the Hunt Code on the page and leave it the same if there isn't one on the page
                Hunt_Code_Buffer = find_hunt_code(df)
                Hunt_Code = Hunt_Code if Hunt_Code_Buffer is None else Hunt_Code_Buffer
                if Hunt_Code is None:
                    raise ValueError('No hunt code found on this page or on an earlier page')

                # Utilize a regular expression to find the Year of the Draw Recap
                Draw_Year_Search = search(r'\d{4}',df.iloc[0,0])
                Draw_Year = Draw_Year_Search.group(0)

                #Strip whitespaces
                df_stripped = strip_whitespace(df)

                #Isolating where the preference points portion of the dataframe starts
                Preference_Points_Buffer = find_preference_point_table(df_stripped)

                Preference_Points_Buffer.reset_index(inplace=True, drop=True)
                Preference_Points_Buffer.columns = ["Preference Points Table Buffer"]

                #!!!!!!NEED TO LOOK AT UNITS THAT HAVE MULTIPLE CHOICES!!!!!!
                #Isolate the Choice and reformat to a standardized format for separating the preference points
                Preference_Points_Buffer2, Choice_Merge, Choice_Index = choice_finder(Preference_Points_Buffer)

                # Reformat Preference Point DataFrame, so it's easier to interpret
                Preference_Points_Expanded = preference_points_clean_up(Preference_Points_Buffer2, Choice_Index)

                #Perform final clean-up and segregation
                Applicant_Preference_Points, Draw_Preference_Points = preference_points_finalize(Preference_Points_Expanded, Choice_Merge, Hunt_Code, Draw_Year)

                #Collect the page's result; the frames are combined once per pdf below
                #(DataFrame.append no longer exists in pandas 2.0 and later)
                Applicant_Frames.append(Applicant_Preference_Points)
                Draw_Frames.append(Draw_Preference_Points)
            except Exception as error:
                # Best effort: note the page and the reason, then carry on with the next page
                Pages_with_issues.append(j)
                print('Page index', j, 'of', stats_url, 'could not be processed:', repr(error))

    # Consolidate the pages; concat keeps each page's own row index and needs at least one frame
    All_Applicant_Preference_Points = pd.concat(Applicant_Frames) if Applicant_Frames else pd.DataFrame()
    All_Draw_Preference_Points = pd.concat(Draw_Frames) if Draw_Frames else pd.DataFrame()

    # Name the outputs after the draw year; fall back to the pdf's file name when no page yielded a year
    Output_Label = Draw_Year if Draw_Year is not None else os.path.splitext(stats_url)[0]

    # os.path.join gives the same paths as before on Windows and valid ones everywhere else
    Applicant_Directory = os.path.join('Output-Data', 'Applicant-Data')
    Draw_Directory = os.path.join('Output-Data', 'Draw-Data')
    Issues_Directory = os.path.join('Output-Data', 'Pages-with-Issues')

    os.makedirs(Applicant_Directory, exist_ok=True)
    os.makedirs(Draw_Directory, exist_ok=True)

    All_Applicant_Preference_Points.to_excel(os.path.join(Applicant_Directory, Output_Label + '-All-Applicant-Preference-Points.xlsx'))
    All_Draw_Preference_Points.to_excel(os.path.join(Draw_Directory, Output_Label + '-All-Draw-Preference-Points.xlsx'))

    # The problem-page workbook is only written when there is something to report
    Pages_With_Issues_DF = pd.DataFrame(Pages_with_issues, columns=['Pages'])
    if len(Pages_With_Issues_DF) > 0:
        os.makedirs(Issues_Directory, exist_ok=True)
        Pages_With_Issues_DF.to_excel(os.path.join(Issues_Directory, Output_Label + '-Pages-With-Issues.xlsx'))


## Directory Clean-up

In [1]:
# Get current directory
directory = os.getcwd()

# Folder, relative to the current directory, that collects the downloaded input pdf documents
input_directory = 'Input-Data'
os.makedirs(input_directory, exist_ok=True)

#Iterate over the directory looking for the downloaded input pdf documents and put into a separate directory
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".pdf"):
        original = filename
        # os.path.join gives 'Input-Data\<name>' on Windows as before and a valid path elsewhere
        target = os.path.join(input_directory, filename)

        shutil.move(original, target)