In [1]:
# Standard library imports
import time
import pickle
import urllib.request
from itertools import product
from random import randrange



raw_data_path='../data/raw_data_all_positions.pkl'
parsed_data_path='../data/parsed_data_all_positions.pkl'

# PyPI imports
import pandas as pd
from bs4 import BeautifulSoup

# Set the data file paths
raw_data_path='../data/raw_data_all_positions.pkl'
parsed_data_path='../data/parsed_data_all_positions.pkl'

In [2]:
def download_url(url: str) -> bytes:
    '''Takes string url, downloads URL and returns HTML bytes object'''

    headers={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Host": "httpbin.io",
        "Sec-Ch-Ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Linux"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }

    # Create the request
    request_params = urllib.request.Request(
        url=url,
        headers=headers
    )   

    # Get the html
    with urllib.request.urlopen(request_params) as response:
        html=response.read()

    return html

In [3]:
def parse_html_table(html: bytes, year: int, week: int, profile: str) -> pd.DataFrame:
    '''Takes a html bytes object from URL, parses data table, adds
    year, week, position and scoring profile and returns as pandas dataframe'''

    # Extract the table rows
    soup=BeautifulSoup(html, 'html.parser')
    table=soup.find('table',{'class':'datasmall table'})
    table_rows=table.find_all('tr')

    # Get the column names from the first row
    columns=table_rows[0].find_all('th')
    column_names=[column.getText() for column in columns]
    column_names.extend(['Year', 'Week', 'Scoring profile'])

    # Get the values for each row
    data=[]

    for row in table_rows[1:]:
        columns=row.find_all('td')
        values=[column.getText() for column in columns]
        values.extend([year, week, profile])
        data.append(values)

    # Convert to pandas dataframe and return
    return pd.DataFrame(columns=column_names, data=data)

In [4]:
%%time

# Main script to download data
download_data = False  # Or False, depending on what you want to do

if download_data is True:
    positions = ['qb', 'rb', 'wr', 'te']
    profile = 'p'
    years = list(range(2020, 2024))
    weeks = list(range(1, 19))

    # Empty dict. to store dataframes for each position
    position_data={}

    # Loop on positions first, and create a separate dataframe for each
    for position in positions:

        # Empty list to collect data for this position
        results = []

        for year, week in product(years, weeks):
            print(f'Downloading {position.upper()}, {year}, week {week}', end='\r')
            url = f'https://www.footballguys.com/playerhistoricalstats?pos={position}&yr={year}&startwk={week}&stopwk={week}&profile={profile}'
            
            # Get the HTML
            html = download_url(url)
            
            # Parse the HTML
            result = parse_html_table(html, year, week, profile)
            
            # Collect the result
            results.append(result)

            # Wait before downloading the next page
            time.sleep(randrange(1, 5))

        # Combine the week-by-week dataframes
        data_df = pd.concat(results)

        # Add the dataframe for this position to the collection
        position_data[position]=data_df
    
    # Save the raw data
    pickle.dump(position_data, open(raw_data_path, 'wb'))
    
elif download_data is False:
    position_data = pickle.load(open(raw_data_path, 'rb'))
    print('Loaded data from file', end='')

print('\n')

Loaded data from file

CPU times: user 19 ms, sys: 20.8 ms, total: 39.8 ms
Wall time: 49.5 ms


In [5]:
# Take a look at the result
for position, data_df in position_data.items():
    print(f'\nPosition: {position}\n')
    print(data_df.head())


Position: qb

  Rank                Name   Age   Exp  G Cmp Att   Cm%  PYd  Y/Att PTD Int  \
0    1      Josh Allen BUF  24.0   3.0  1  33  46  71.7  312   6.78   2   0   
1    2  Russell Wilson SEA  32.0   9.0  1  31  35  88.6  322   9.20   4   0   
2    3    Aaron Rodgers GB  37.0  16.0  1  32  44  72.7  364   8.27   4   0   
3    4   Lamar Jackson BAL  23.0   3.0  1  20  25  80.0  275  11.00   3   0   
4    5    Kyler Murray ARI  23.0   2.0  1  26  40  65.0  230   5.75   1   1   

  Rsh RshYd RshTD  FP/G FantPt  Year  Week Scoring profile  
0  14    57     1  32.2   32.2  2020     1               p  
1   3    29     0  31.8   31.8  2020     1               p  
2   1     2     0  30.8   30.8  2020     1               p  
3   7    45     0  27.5   27.5  2020     1               p  
4  13    91     1  26.3   26.3  2020     1               p  

Position: rb

  Rank                     Name   Age  Exp  G Rsh RshYd Y/Rsh RshTD Rec RecYd  \
0    1           Josh Jacobs LV  22.0  2.0  1  2

In [6]:
test_df = data_df.copy()
test_df[['Player', 'Team']] = test_df['Name'].str.extract(r'^(.*?\b(?:I{1,3}|IV)?)(?:\s+)([A-Z]{2,3})$')
test_df.drop(columns=['Name'], inplace=True)
test_df.rename(columns={'Position': 'Week', 'Year': 'Position', 'Week': 'Year'}, inplace=True)
test_df.head()



Unnamed: 0,Rank,Age,Exp,G,Rec,RecYd,Y/Rec,RecTD,FP/G,FantPt,Position,Year,Scoring profile,Player,Team
0,1,25.0,3.0,1,8,101,12.6,1,24.1,24.1,2020,1,p,Dallas Goedert,PHI
1,2,25.0,3.0,1,5,58,11.6,2,22.8,22.8,2020,1,p,Mark Andrews,BAL
2,3,23.0,2.0,1,5,81,16.2,1,19.1,19.1,2020,1,p,Noah Fant,DEN
3,4,31.0,8.0,1,6,50,8.3,1,17.0,17.0,2020,1,p,Travis Kelce,KC
4,5,23.0,2.0,1,5,56,11.2,1,16.6,16.6,2020,1,p,T.J. Hockenson,DET


In [7]:
import pandas as pd

parsed_data_path = '../data/parsed_data_all_positions.pkl'


data_df.to_pickle(parsed_data_path)