In [None]:
# Standard library imports
import time
import pickle
import urllib.request
from itertools import product
from random import randrange

# PyPI imports
import pandas as pd
from bs4 import BeautifulSoup

# Set the data file paths (defined once, after all imports).
# Both are relative paths, so they resolve against the current working directory.
raw_data_path='../data/raw_data_all_positions.pkl'
parsed_data_path='../data/parsed_data_all_positions.pkl'

In [None]:
def download_url(url: str, timeout: float = 30.0) -> bytes:
    '''Downloads a URL and returns the raw response body as bytes.

    The request is sent with a fixed set of browser-like headers. No Host
    header is set explicitly: urllib fills it in from the URL being
    requested, so the header always matches the target server.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on blocking network operations before
            giving up; passed through to urllib.request.urlopen.

    Returns:
        The response body as a bytes object.

    Raises:
        urllib.error.URLError: If the request fails (urllib raises its
            HTTPError subclass for HTTP error responses).
    '''

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Sec-Ch-Ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Linux"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }

    # Create the request
    request_params = urllib.request.Request(
        url=url,
        headers=headers
    )

    # Get the html; the timeout keeps a stalled connection from blocking forever
    with urllib.request.urlopen(request_params, timeout=timeout) as response:
        html = response.read()

    return html

In [None]:
def parse_html_table(html: bytes, year: int, week: int, profile: str) -> pd.DataFrame:
    '''Parses the stats table from a downloaded page into a pandas dataframe.

    Looks up the table with class 'datasmall table', takes the column names
    from the <th> cells of its first row and the values from the <td> cells
    of each following row. Three extra columns ('Year', 'Week' and
    'Scoring profile') are appended to every row from the arguments.

    Args:
        html: Page content to parse.
        year: Value stored in the 'Year' column of every row.
        week: Value stored in the 'Week' column of every row.
        profile: Value stored in the 'Scoring profile' column of every row.

    Returns:
        Dataframe with one row per table row that contains data cells.
        Cell values are the text extracted from the HTML.

    Raises:
        ValueError: If the page has no matching table or the table has
            no rows.
    '''

    # Extract the table rows
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'class': 'datasmall table'})

    # find() gives None when nothing matches; fail with a readable message
    # instead of an AttributeError further down
    if table is None:
        raise ValueError(
            f"No 'datasmall table' table found in page for year {year}, week {week}"
        )

    table_rows = table.find_all('tr')

    if not table_rows:
        raise ValueError(
            f"Stats table has no rows for year {year}, week {week}"
        )

    # Get the column names from the first row
    header_cells = table_rows[0].find_all('th')
    column_names = [cell.get_text() for cell in header_cells]
    column_names.extend(['Year', 'Week', 'Scoring profile'])

    # Get the values for each row
    data = []

    for row in table_rows[1:]:
        cells = row.find_all('td')

        # Rows without any <td> cells carry no data; skip them so the
        # year/week/profile values cannot land under the stat columns
        if not cells:
            continue

        values = [cell.get_text() for cell in cells]
        values.extend([year, week, profile])
        data.append(values)

    # Convert to pandas dataframe and return
    return pd.DataFrame(columns=column_names, data=data)

In [None]:
%%time

# Main script to download data, or to reload a previously saved copy
download_data = False  # Set to True to re-download; False loads the saved pickle

if download_data:
    positions = ['qb', 'rb', 'wr', 'te']
    profile = 'p'
    years = list(range(2020, 2024))
    weeks = list(range(1, 19))

    # Empty dict. to store dataframes for each position
    position_data = {}

    # Loop on positions first, and create a separate dataframe for each
    for position in positions:

        # Empty list to collect data for this position
        results = []

        for year, week in product(years, weeks):
            # end='\r' rewrites the same output line on each iteration
            print(f'Downloading {position.upper()}, {year}, week {week}', end='\r')
            url = f'https://www.footballguys.com/playerhistoricalstats?pos={position}&yr={year}&startwk={week}&stopwk={week}&profile={profile}'

            # Get the HTML
            html = download_url(url)

            # Parse the HTML
            result = parse_html_table(html, year, week, profile)

            # Collect the result
            results.append(result)

            # Wait a random 1-4 seconds before downloading the next page
            time.sleep(randrange(1, 5))

        # Combine the week-by-week dataframes
        data_df = pd.concat(results)

        # Add the dataframe for this position to the collection
        position_data[position] = data_df

    # Save the raw data; the with-block closes the file once it is written
    with open(raw_data_path, 'wb') as output_file:
        pickle.dump(position_data, output_file)

else:
    # NOTE: unpickling can execute arbitrary code, so only load a file that
    # was written by this notebook (or comes from a trusted source)
    with open(raw_data_path, 'rb') as input_file:
        position_data = pickle.load(input_file)

    print('Loaded data from file', end='')

print('\n')

In [None]:
# Preview the first rows of the dataframe stored for each position
for position in position_data:
    data_df = position_data[position]
    print(f'\nPosition: {position}\n')
    print(data_df.head())