In [1]:
import json
import os
import time
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from bs4 import MarkupResemblesLocatorWarning

# Suppress bs4's MarkupResemblesLocatorWarning process-wide.
# NOTE(review): presumably raised when fetch_game_stats re-parses HTML comment
# text with BeautifulSoup — confirm before removing this filter.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [2]:
# Full team name -> abbreviation used in the generated file and directory names.
# Several names (e.g. the Oakland / Los Angeles / Las Vegas Raiders) map to the
# same abbreviation.
team_dict = {
    "Arizona Cardinals": "ARI",
    "Atlanta Falcons": "ATL",
    "Baltimore Colts": "IND",
    "Baltimore Ravens": "BAL",
    "Boston Patriots": "NE",
    "Buffalo Bills": "BUF",
    "Carolina Panthers": "CAR",
    "Chicago Bears": "CHI",
    "Cincinnati Bengals": "CIN",
    "Cleveland Browns": "CLE",
    "Dallas Cowboys": "DAL",
    "Denver Broncos": "DEN",
    "Detroit Lions": "DET",
    "Green Bay Packers": "GB",
    "Houston Oilers": "TEN",
    "Houston Texans": "HOU",
    "Indianapolis Colts": "IND",
    "Jacksonville Jaguars": "JAX",
    "Kansas City Chiefs": "KC",
    "Las Vegas Raiders": "LVR",
    "Los Angeles Chargers": "LAC",
    "Los Angeles Raiders": "LVR",
    "Los Angeles Rams": "LAR",
    "Miami Dolphins": "MIA",
    "Minnesota Vikings": "MIN",
    "New England Patriots": "NE",
    "New Orleans Saints": "NO",
    "New York Giants": "NYG",
    "New York Jets": "NYJ",
    "Oakland Raiders": "LVR",
    "Philadelphia Eagles": "PHI",
    "Phoenix Cardinals": "ARI",
    "Pittsburgh Steelers": "PIT",
    "San Diego Chargers": "LAC",
    "San Francisco 49ers": "SF",
    "Seattle Seahawks": "SEA",
    "St. Louis Cardinals": "ARI",
    "St. Louis Rams": "LAR",
    "Tampa Bay Buccaneers": "TB",
    "Tennessee Oilers": "TEN",
    "Tennessee Titans": "TEN",
    "Washington Commanders": "WAS",
    "Washington Football Team": "WAS",
    "Washington Redskins": "WAS",
}

def format_week(week):
    """Return the label used in file and directory names for a "Week" value.

    Purely numeric weeks (``"7"`` or the integer ``7``) become ``"Week7"``.
    Any other value (e.g. ``"WildCard"``) is returned unchanged.

    Parameters
    ----------
    week : str or int
        Value taken from the schedule table's ``Week`` column.

    Returns
    -------
    str or the original value
        ``"Week<n>"`` for all-digit input, otherwise ``week`` itself.
    """
    # Go through str() so integer weeks (possible if pandas infers a numeric
    # column) do not raise AttributeError on .isdigit().
    week_text = str(week)
    if week_text.isdigit():
        return f"Week{week_text}"
    return week

In [3]:
def fetch_game_context(season):
    """Fetch the schedule table for one season from Pro Football Reference.

    Downloads ``/years/{season}/games.htm``, reads each game's name and URL from
    the page's first ``application/ld+json`` script, and attaches them
    row-for-row to the ``games`` table.

    Parameters
    ----------
    season : int or str
        Season year; interpolated into the URL and the ``File`` column.

    Returns
    -------
    pandas.DataFrame or None
        The schedule rows with added ``url``, ``Home``, ``Away`` and ``File``
        columns (``File`` is ``{season}_{Week}_{Home}_vs_{Away}``) and ``Week``
        passed through ``format_week``. ``None`` if the page does not return
        HTTP 200 or lacks the ld+json script / ``games`` table.

    Raises
    ------
    ValueError
        If the number of games in the ld+json metadata differs from the number
        of schedule rows, since they are paired purely by position.
    KeyError
        If a team name from the metadata is missing from ``team_dict``.
    requests.exceptions.RequestException
        On network failure or timeout.
    """
    base_url = f"https://www.pro-football-reference.com/years/{season}/games.htm"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, timeout=30)
    if response.status_code != 200:
        print(f"Error fetching the season page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    games_raw = soup.find_all('script', attrs={'type': 'application/ld+json'})
    games_table = soup.find('table', id='games')
    if not games_raw or games_table is None:
        print(f"Error parsing the season page: no game data found for {season}")
        return None

    games_dict = json.loads(games_raw[0].string)
    game_names = [game['name'] for game in games_dict]
    game_urls = [game['url'] for game in games_dict]

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    games_df = pd.read_html(StringIO(str(games_table)))[0]
    # Drop rows whose Week cell is the literal 'Week' (presumably repeated header
    # rows) and rows with no Week; copy so the column assignments below operate
    # on an independent frame rather than a slice.
    games_df = (
        games_df.loc[games_df['Week'] != 'Week']
        .dropna(subset=['Week'])
        .copy()
    )

    if len(game_urls) != len(games_df):
        raise ValueError(
            f"Found {len(game_urls)} games in the page metadata but "
            f"{len(games_df)} rows in the schedule table for {season}"
        )
    games_df['url'] = game_urls

    aways = []
    homes = []
    for game in game_names:
        # Names are split as "<away> @ <home>"; strip() tolerates any spacing
        # around the '@'.
        away, home = (side.strip() for side in game.split('@'))
        aways.append(team_dict[away])
        homes.append(team_dict[home])
    games_df['Home'] = homes
    games_df['Away'] = aways
    games_df['Week'] = games_df['Week'].apply(format_week)
    games_df['File'] = (
        f"{season}_" + games_df['Week'] + '_' + games_df['Home'] + '_vs_' + games_df['Away']
    )

    return games_df

def fetch_game_stats(game_url):
    """Download one game page and parse every ``<table>`` that has an ``id``.

    Tables are collected both from the regular document and from the contents
    of HTML comments, which are re-parsed as markup.

    Parameters
    ----------
    game_url : str
        URL of the game page to fetch.

    Returns
    -------
    dict[str, pandas.DataFrame] or None
        Mapping of table ``id`` to the parsed table. If the same id occurs more
        than once, the later table wins. ``None`` if the response status is not
        200. The dict may be empty when no table carries an id.

    Raises
    ------
    requests.exceptions.RequestException
        On network failure or timeout.
    ValueError
        Propagated from ``pandas.read_html`` when a table cannot be parsed.
    """
    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(game_url, timeout=30)
    if response.status_code != 200:
        print(f"Error fetching the game page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Tables present directly in the document.
    all_tables = list(soup.find_all('table'))

    # Tables that only exist inside HTML comments: parse each comment's text as
    # its own document and collect the tables found there.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment_soup = BeautifulSoup(str(comment), 'html.parser')
        all_tables.extend(comment_soup.find_all('table'))

    game_data = {}
    for table in all_tables:
        table_id = table.get('id')
        if table_id:
            # Wrap the markup in StringIO: passing literal HTML to read_html is
            # deprecated.
            game_data[table_id] = pd.read_html(StringIO(str(table)))[0]

    return game_data

In [4]:
def collect_season_stats(season, debug = False):
    """Scrape every game of a season and write the results under ``raw_data/``.

    Writes ``raw_data/{season}/{season}games.csv`` with the schedule, then for
    each game writes one ``{table_id}.csv`` per parsed table into
    ``raw_data/{season}/{Week}/{File}/``.

    Parameters
    ----------
    season : int or str
        Season year passed to ``fetch_game_context``.
    debug : bool, default False
        If true, print a line after each game is processed.

    Returns
    -------
    tuple
        ``(schedule_df, all_game_stats)`` where ``all_game_stats`` maps each
        game's ``File`` label to its dict of tables. If the schedule could not
        be fetched, returns ``(None, {})``.
    """
    start_time = time.time()
    df = fetch_game_context(season)

    all_game_stats = {}
    if df is None:
        # fetch_game_context already printed the reason; stop here instead of
        # crashing on df.to_csv below.
        print(f"Could not load the {season} schedule; no games were scraped")
        return df, all_game_stats

    request_count = 0

    season_dir = os.path.join('raw_data', str(season))
    os.makedirs(season_dir, exist_ok=True)
    df.to_csv(os.path.join(season_dir, f"{season}games.csv"), index=False)

    for _, game in df.iterrows():
        # NOTE(review): threshold is 19 while the message says 20 — presumably
        # the season-page request is counted as well; confirm.
        if request_count >= 19:
            print("Reached 20 requests, sleeping for 60 seconds...")
            time.sleep(60)  # sleep for 60 seconds to respect the rate limit
            request_count = 0
        directory_path = os.path.join(season_dir, game['Week'], game['File'])
        os.makedirs(directory_path, exist_ok=True)

        # fetch_game_stats signals a non-200 response (e.g. HTTP 429) by
        # returning None rather than raising, so treat both outcomes as a
        # reason to back off and retry once. `except Exception` leaves
        # KeyboardInterrupt free to stop the scrape.
        try:
            game_stats = fetch_game_stats(game['url'])
            needs_retry = game_stats is None
        except Exception:
            needs_retry = True

        if needs_retry:
            print("Requests being limited, sleeping for 60 seconds...")
            time.sleep(60)
            request_count = 0
            # A second failure propagates to the caller.
            game_stats = fetch_game_stats(game['url'])

        # Count every completed request, whether or not it yielded tables.
        request_count += 1

        if game_stats:
            all_game_stats[game['File']] = game_stats
            for key, table in game_stats.items():
                table.to_csv(os.path.join(directory_path, f"{key}.csv"), index=False)
            if debug:
                print(f"Finished Processing {game['File']}...")
        else:
            print(f"No stats saved for {game['File']}")

    minutes, seconds = divmod(time.time() - start_time, 60)
    formatted_time = f"{int(minutes):02d}:{seconds:06.3f}"
    print(f'Completed Data Scraping of {season} NFL Season in {formatted_time} Seconds')
    return df, all_game_stats

In [None]:
# Scrape each season in the half-open range [2022, 2023), i.e. only 2022.
# The returned values are discarded; collect_season_stats writes its CSV files
# to disk itself.
for season in range(2022, 2023):
    _, _ = collect_season_stats(season=season, debug=True)