#### 1.1 Handle imports up-front 

In [1]:
import time
import urllib.request
from itertools import product
from random import randrange
import pandas as pd
from bs4 import BeautifulSoup

# Pickle file locations, given relative to the notebook's working directory.
raw_data_path = '../data/raw_data_all_positions.pkl'
parsed_data_path = '../data/parsed_data_all_positions.pkl'

In [2]:
def download_url(url: str, timeout: float = 30.0) -> bytes:
    '''Downloads a URL using browser-like request headers and returns the raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on blocking network operations, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body as an undecoded bytes object.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (urllib.error.HTTPError, a subclass, for HTTP error responses).
    '''
    # No "Host" entry on purpose: urllib fills it in from the URL. A hard-coded
    # Host would be sent verbatim and would not match the server being contacted.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Sec-Ch-Ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Linux"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }
    # Create the request
    request_params = urllib.request.Request(
        url=url,
        headers=headers
    )
    # Get the html; the context manager closes the connection once it is read
    with urllib.request.urlopen(request_params, timeout=timeout) as response:
        html = response.read()
    return html

In [3]:
def parse_html_table(html: bytes, year: int, week: int, position: str, profile: str) -> pd.DataFrame:
    '''Parses the stats table out of a downloaded page and returns it as a DataFrame.

    The column names come from the <th> cells of the table's first row. Every
    data row is extended with the supplied position, year, week and scoring
    profile, under the columns 'Position', 'Year', 'Week' and 'Scoring profile'.

    Args:
        html: Page content, as returned by download_url.
        year: Value stored in the 'Year' column of every row.
        week: Value stored in the 'Week' column of every row.
        position: Value stored in the 'Position' column of every row.
        profile: Value stored in the 'Scoring profile' column of every row.

    Returns:
        DataFrame with one row per table row that contains <td> cells.

    Raises:
        ValueError: If the page has no table with class 'datasmall table',
            or that table contains no rows.
    '''
    # Extract the table rows
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'class': 'datasmall table'})
    if table is None:
        # Fail with a readable message instead of an AttributeError on None
        raise ValueError("No table with class 'datasmall table' found in the supplied HTML")
    table_rows = table.find_all('tr')
    if not table_rows:
        raise ValueError("The 'datasmall table' table contains no rows")
    # Get the column names from the first row
    column_names = [cell.get_text() for cell in table_rows[0].find_all('th')]
    column_names.extend(['Position', 'Year', 'Week', 'Scoring profile'])
    # Get the values for each row
    data = []
    for row in table_rows[1:]:
        cells = row.find_all('td')
        if not cells:
            # A row without <td> cells would yield only the four metadata
            # values and break DataFrame construction
            continue
        values = [cell.get_text() for cell in cells]
        values.extend([position, year, week, profile])
        data.append(values)
    # Convert to pandas dataframe and return
    return pd.DataFrame(columns=column_names, data=data)

In [4]:
# Main script to download data
# Set download_data to False to load a previously saved file instead of scraping.
download_data = True
if download_data:
    positions = ['qb', 'rb', 'wr', 'te']
    profile = 'p'
    years = list(range(2020, 2024))
    weeks = list(range(1, 19))
    results = []
    for position, year, week in product(positions, years, weeks):
        print(f'Downloading {position.upper()}, {year}, week {week}')
        url = f'https://www.footballguys.com/playerhistoricalstats?pos={position}&yr={year}&startwk={week}&stopwk={week}&profile={profile}'
        # Get the HTML
        html = download_url(url)
        # Parse the HTML
        # NOTE(review): parse_html_table declares (html, year, week, position, profile)
        # but receives (position, year, week) here, so the 'Position', 'Year' and
        # 'Week' columns of data_df hold week, position and year values respectively.
        # The order is left as-is because a later cell renames those columns on the
        # assumption that they are rotated; change both places together.
        result = parse_html_table(html, position, year, week, profile)
        # Collect the result
        results.append(result)
        # Wait a random 1-4 seconds before downloading the next page
        time.sleep(randrange(1, 5))
    # Combine the week-by-week dataframes; ignore_index gives the combined frame
    # one unique running index instead of repeating 0..n for every page
    data_df = pd.concat(results, ignore_index=True)
else:
    # NOTE(review): this path differs from raw_data_path defined in the first
    # cell - confirm which file is intended.
    data_df = pd.read_parquet('../data/raw_qb_data.parquet')
    print('Loaded data from file')
# View the resulting DataFrame
print(data_df.head())

Downloading QB, 2020, week 1
Downloading QB, 2020, week 2
Downloading QB, 2020, week 3
Downloading QB, 2020, week 4
Downloading QB, 2020, week 5
Downloading QB, 2020, week 6
Downloading QB, 2020, week 7
Downloading QB, 2020, week 8
Downloading QB, 2020, week 9
Downloading QB, 2020, week 10
Downloading QB, 2020, week 11
Downloading QB, 2020, week 12
Downloading QB, 2020, week 13
Downloading QB, 2020, week 14
Downloading QB, 2020, week 15
Downloading QB, 2020, week 16
Downloading QB, 2020, week 17
Downloading QB, 2020, week 18
Downloading QB, 2021, week 1
Downloading QB, 2021, week 2
Downloading QB, 2021, week 3
Downloading QB, 2021, week 4
Downloading QB, 2021, week 5
Downloading QB, 2021, week 6
Downloading QB, 2021, week 7
Downloading QB, 2021, week 8
Downloading QB, 2021, week 9
Downloading QB, 2021, week 10
Downloading QB, 2021, week 11
Downloading QB, 2021, week 12
Downloading QB, 2021, week 13
Downloading QB, 2021, week 14
Downloading QB, 2021, week 15
Downloading QB, 2021, week 1

In [8]:
# Split 'Name' into the player's name (including an optional I/II/III/IV suffix)
# and the trailing 2-3 letter upper-case team code.
name_parts = data_df['Name'].str.extract(r'^(.*?\b(?:I{1,3}|IV)?)(?:\s+)([A-Z]{2,3})$')
test_df = (
    data_df
    .assign(Player=name_parts[0], Team=name_parts[1])
    .drop(columns=['Name'])
    # As built by the download cell, 'Position' holds week numbers, 'Year' holds
    # positions and 'Week' holds years; relabel them to match their contents.
    .rename(columns={'Position': 'Week', 'Year': 'Position', 'Week': 'Year'})
)
test_df.head()

Unnamed: 0,Rank,Age,Exp,G,Cmp,Att,Cm%,PYd,Y/Att,PTD,...,Position,Year,Scoring profile,Y/Rsh,Rec,RecYd,RecTD,Y/Rec,Player,Team
0,1,24.0,3.0,1,33,46,71.7,312,6.78,2,...,qb,2020,p,,,,,,Josh Allen,BUF
1,2,32.0,9.0,1,31,35,88.6,322,9.2,4,...,qb,2020,p,,,,,,Russell Wilson,SEA
2,3,37.0,16.0,1,32,44,72.7,364,8.27,4,...,qb,2020,p,,,,,,Aaron Rodgers,GB
3,4,23.0,3.0,1,20,25,80.0,275,11.0,3,...,qb,2020,p,,,,,,Lamar Jackson,BAL
4,5,23.0,2.0,1,26,40,65.0,230,5.75,1,...,qb,2020,p,,,,,,Kyler Murray,ARI


In [9]:
import pandas as pd

# Destination file for the combined DataFrame.
parsed_data_path = '../data/parsed_data_all_positions.pkl'

# Serialize data_df to disk as a pickle.
data_df.to_pickle(path=parsed_data_path)

In [10]:
# Show the column labels of data_df.
print(data_df.columns)

Index(['Rank', 'Name', 'Age', 'Exp', 'G', 'Cmp', 'Att', 'Cm%', 'PYd', 'Y/Att',
       'PTD', 'Int', 'Rsh', 'RshYd', 'RshTD', 'FP/G', 'FantPt', 'Position',
       'Year', 'Week', 'Scoring profile', 'Y/Rsh', 'Rec', 'RecYd', 'RecTD',
       'Y/Rec'],
      dtype='object')
