In [2]:
import os
import pandas as pd
from urllib.request import urlopen
import re
import random
import time
from io import StringIO

# Download all the games for the years 2020 to 2024

In [8]:
def download_nfl_games(years:range, output_folder: str, year_url: str, timeout: float = 10):
    """
    Download each season's games table and save it as one CSV per year.

    For every year the page at ``year_url.format(year=year)`` is fetched, the
    HTML table with id "games" is parsed, cleaned, and written to
    ``<output_folder>/games_<year>.csv``.

    Args:
        years (range): Seasons to download.
        output_folder (str): Destination folder; created if it does not exist.
        year_url (str): URL template containing a ``{year}`` placeholder.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        None. A failure for one year is printed and that year is skipped.
    """
    # Make the function usable on its own, without a separate "create folder" cell.
    os.makedirs(output_folder, exist_ok=True)

    for year in years:
        print("Downloading NFL games for year:", year)
        url = year_url.format(year=year)
        try:
            # The context manager closes the connection even if read/decode fails;
            # the timeout keeps a stalled request from hanging the whole loop.
            with urlopen(url, timeout=timeout) as response:
                html = response.read().decode('utf-8')
            # Remove comment delimiters so markup wrapped in HTML comments is parsed too.
            html_cleaned = re.sub(r"<!--|-->", "", html)
            df = pd.read_html(StringIO(html_cleaned), header=0, attrs={"id": "games"})[0]
        except Exception as e:
            print(f"Error downloading data for year {year}: {e}")
            # Keep the same delay after a failure so errors do not turn into rapid-fire requests.
            time.sleep(random.uniform(1, 5))
            continue

        # Drop rows that merely repeat the header inside the table body.
        df_aux = df[df["Week"].astype(str) != "Week"].copy()
        # Purely numeric week labels are tagged "regular"; every other label is tagged "playoff".
        df_aux["phase"] = df_aux["Week"].astype(str).apply(lambda w: "regular" if w.isdigit() else "playoff")
        # Drop the row whose Date cell reads "Playoffs" (not a game).
        df_aux = df_aux[df_aux["Date"] != "Playoffs"].copy().reset_index(drop=True)
        # NOTE(review): "PtsLosser" is misspelled, but it is the column name already
        # written to existing CSVs, so it is kept unchanged here.
        df_aux = df_aux.rename(columns={"Pts": "PtsWinner", "Pts.1": "PtsLosser"})
        df_aux["PtsWinner"] = pd.to_numeric(df_aux["PtsWinner"], errors="coerce")
        df_aux["PtsLosser"] = pd.to_numeric(df_aux["PtsLosser"], errors="coerce")

        df_aux["margin"] = df_aux["PtsWinner"] - df_aux["PtsLosser"]
        df_aux["url"] = url

        out_path = os.path.join(output_folder, f"games_{year}.csv")
        df_aux.to_csv(out_path, index=False)
        # Random pause between requests to avoid hammering the site.
        time.sleep( random.uniform(1, 5))
    print("Download completed for all years.")

In [10]:
# Season page template; "{year}" is filled in by download_nfl_games for each season.
year_url = "https://www.pro-football-reference.com/years/{year}/games.htm"
years = range(2020, 2025)  # 2020 through 2024 (the end of a range is exclusive)
output_folder = "raw_data/nfl_games"

# exist_ok=True replaces the check-then-create pair: no race between the check
# and the creation, and the cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)

download_nfl_games(years, output_folder, year_url)

Downloading NFL games for year: 2020
Downloading NFL games for year: 2021
Downloading NFL games for year: 2022
Downloading NFL games for year: 2023
Downloading NFL games for year: 2024
Download completed for all years.


# Download the boxscores for the years 2020 to 2024 (games decided by fewer than 8 points are skipped)

In [8]:
# Maps the full team name found in the "Winner/tie" / "Loser/tie" columns of the
# games table to the short code placed in the boxscore URL and in the saved file
# names (see get_home_abbr_from_row and process_all_games).
# Two Washington names share the same code, so both spellings resolve to it.
# NOTE(review): several codes do not match the team's current city (e.g. "sdg",
# "rai", "oti"); presumably these are the identifiers the site itself uses —
# verify against a real boxscore URL before changing any of them.
team_abbr_map = {
    "Buffalo Bills":        "buf",
    "Miami Dolphins":       "mia",
    "New England Patriots": "nwe",
    "New York Jets":        "nyj",
    "Kansas City Chiefs":   "kan",
    "Denver Broncos":       "den",
    "Las Vegas Raiders":    "rai",
    "Los Angeles Chargers": "sdg",
    "Pittsburgh Steelers":  "pit",
    "Cleveland Browns":     "cle",
    "Baltimore Ravens":     "rav",
    "Cincinnati Bengals":   "cin",
    "Indianapolis Colts":   "clt",
    "Tennessee Titans":     "oti",
    "Houston Texans":       "htx",
    "Jacksonville Jaguars": "jax",
    "Dallas Cowboys":       "dal",
    "Philadelphia Eagles":  "phi",
    "Washington Commanders":"was",
    "Washington Football Team": "was",
    "New York Giants":      "nyg",
    "Green Bay Packers":    "gnb",
    "Minnesota Vikings":    "min",
    "Chicago Bears":        "chi",
    "Detroit Lions":        "det",
    "New Orleans Saints":   "nor",
    "Tampa Bay Buccaneers": "tam",
    "Carolina Panthers":    "car",
    "Atlanta Falcons":      "atl",
    "Los Angeles Rams":     "ram",
    "San Francisco 49ers":  "sfo",
    "Arizona Cardinals":    "crd",
    "Seattle Seahawks":     "sea",
}

In [3]:
# Folder that receives the per-game boxscore CSVs.
output_folder_teams = "raw_data/boxscores_prr"
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(output_folder_teams, exist_ok=True)

In [3]:
def read_games(years: range, output_folder: str):
    """
    Load the per-season game CSVs and stack them into a single DataFrame.

    Args:
        years (range): Seasons to load; each is expected at
            ``<output_folder>/games_<year>.csv``.
        output_folder (str): Folder containing the season CSV files.

    Returns:
        pd.DataFrame: All found seasons concatenated with a fresh index and a
        "season" column holding the year; an empty DataFrame if no file exists.
    """
    season_frames = []
    for season in years:
        csv_path = os.path.join(output_folder, f"games_{season}.csv")
        # Missing seasons are reported and skipped rather than treated as errors.
        if not os.path.exists(csv_path):
            print(f"File not found: {csv_path}")
            continue
        season_df = pd.read_csv(csv_path)
        season_df["season"] = season
        season_frames.append(season_df)

    if not season_frames:
        return pd.DataFrame()
    return pd.concat(season_frames, ignore_index=True)

In [13]:
# Load the season CSVs from raw_data/nfl_games into one frame via read_games.
years = range(2020, 2025)
output_folder = "raw_data/nfl_games"
read_games_df = read_games(years, output_folder)
# Bare last expression: the notebook renders the frame as the cell output.
read_games_df

Unnamed: 0,Week,Day,Date,Time,Winner/tie,Unnamed: 5,Loser/tie,Unnamed: 7,PtsWinner,PtsLosser,YdsW,TOW,YdsL,TOL,phase,margin,url,season
0,1,Thu,2020-09-10,8:20PM,Kansas City Chiefs,,Houston Texans,boxscore,34,20,369,0,360,1,regular,14,https://www.pro-football-reference.com/years/2...,2020
1,1,Sun,2020-09-13,1:00PM,Seattle Seahawks,@,Atlanta Falcons,boxscore,38,25,383,0,506,2,regular,13,https://www.pro-football-reference.com/years/2...,2020
2,1,Sun,2020-09-13,1:00PM,Buffalo Bills,,New York Jets,boxscore,27,17,404,2,254,2,regular,10,https://www.pro-football-reference.com/years/2...,2020
3,1,Sun,2020-09-13,1:00PM,Las Vegas Raiders,@,Carolina Panthers,boxscore,34,30,372,0,388,0,regular,4,https://www.pro-football-reference.com/years/2...,2020
4,1,Sun,2020-09-13,1:00PM,Chicago Bears,@,Detroit Lions,boxscore,27,23,363,0,426,1,regular,4,https://www.pro-football-reference.com/years/2...,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,Division,Sun,2025-01-19,3:00PM,Philadelphia Eagles,,Los Angeles Rams,boxscore,28,22,350,0,402,2,playoff,6,https://www.pro-football-reference.com/years/2...,2024
1404,Division,Sun,2025-01-19,6:30PM,Buffalo Bills,,Baltimore Ravens,boxscore,27,25,273,0,416,3,playoff,2,https://www.pro-football-reference.com/years/2...,2024
1405,ConfChamp,Sun,2025-01-26,3:00PM,Philadelphia Eagles,,Washington Commanders,boxscore,55,23,459,0,350,4,playoff,32,https://www.pro-football-reference.com/years/2...,2024
1406,ConfChamp,Sun,2025-01-26,6:30PM,Kansas City Chiefs,,Buffalo Bills,boxscore,32,29,368,1,374,0,playoff,3,https://www.pro-football-reference.com/years/2...,2024


In [4]:
def get_home_abbr_from_row(row) -> str:
    """
    Return the short code of the home team for one row of the games table.

    The "Unnamed: 5" column holds the location marker: when it is "@" the team
    in "Loser/tie" is taken as the home team, otherwise the team in
    "Winner/tie" is.

    Args:
        row: Mapping-like row (dict or pandas Series) with "Winner/tie" and
            "Loser/tie" entries and, optionally, "Unnamed: 5".

    Returns:
        str: The code from ``team_abbr_map``, or "" when the team name is
        missing, not a string, or not present in the map.
    """
    marker = str(row.get("Unnamed: 5", "")).strip()
    home_fullname = row["Loser/tie"] if marker == "@" else row["Winner/tie"]
    # A blank cell arrives as NaN/None; return "" so callers can apply their
    # own "missing data" handling instead of crashing on .strip().
    if not isinstance(home_fullname, str):
        return ""
    return team_abbr_map.get(home_fullname.strip(), "")

def create_date_param(row) -> str:
    """
    Build the date portion of a boxscore identifier from a row's "Date" value.

    The date is expected as three "-"-separated parts (year-month-day). Month
    and day are left-padded with zeros to two characters, and a trailing "0"
    is appended, e.g. "2020-09-10" -> "202009100".

    Args:
        row: Mapping-like row with a "Date" entry.

    Returns:
        str: The identifier fragment, or "" (after printing a notice) when the
        date does not split into exactly three parts.
    """
    raw_date = str(row["Date"])
    pieces = raw_date.split("-")
    if len(pieces) != 3:
        # Handle cases where the date format is not as expected
        print(f"Unexpected date format: {raw_date}")
        return ""
    year_part, month_part, day_part = pieces
    return year_part + month_part.zfill(2) + day_part.zfill(2) + "0"

def fetch_table_from_boxscore(url:str) -> pd.DataFrame:
    """
    Download a boxscore page and extract its "Passing, Rushing & Receiving" table.

    Args:
        url (str): URL of the boxscore page for one NFL game.

    Returns:
        pd.DataFrame: The player statistics table with repeated header rows
        removed. An empty DataFrame is returned (after printing the reason)
        when the page cannot be fetched, the table is not found, or the table
        lacks the "Player" / "Sk" columns used for cleaning.
    """
    try:
        # The context manager closes the connection even if read/decode fails.
        with urlopen(url, timeout=10) as response:
            html = response.read().decode('utf-8')
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        return pd.DataFrame()

    # Remove comment delimiters so markup wrapped in HTML comments is parsed too.
    html_cleaned = re.sub(r"<!--|-->", "", html)
    try:
        df_prr = pd.read_html(StringIO(html_cleaned),
                                header=1,
                                match="Passing, Rushing")[0]
    except ValueError as e:
        print(f"Error reading HTML table from {url}: {e}")
        return pd.DataFrame()

    if "Rk" in df_prr.columns:
        df_prr = df_prr[df_prr["Rk"].astype(str) != "Rk"].copy()

    # Report an unexpected layout the same way as the other failure modes
    # instead of letting a KeyError escape to the caller's loop.
    missing_columns = [col for col in ("Player", "Sk") if col not in df_prr.columns]
    if missing_columns:
        print(f"Error reading HTML table from {url}: missing columns {missing_columns}")
        return pd.DataFrame()

    # Drop rows that repeat header labels inside the table body: those whose
    # 'Player' cell is 'Passing'/'Player' or whose 'Sk' cell is 'Passing'/'Sk'.
    is_header_row = df_prr['Player'].isin(['Passing', 'Player']) | df_prr['Sk'].isin(['Passing', 'Sk'])
    df_clean = df_prr[~is_header_row].copy().reset_index(drop=True)

    return df_clean


def process_all_games(df:pd.DataFrame, output_folder:str, min_margin: float = 8):
    """
    Download the player statistics table for each qualifying game and save it as CSV.

    A game qualifies when its "margin" is at least ``min_margin``. Each saved
    file goes to ``<output_folder>/<season>/<date>_<home>.csv`` and gets two
    extra columns, "fecha" and "team", holding the date fragment and home-team
    code used to build its URL. Games whose file already exists are not
    downloaded again.

    Args:
        df (pd.DataFrame): Games table with at least the columns "Date",
            "Winner/tie", "Loser/tie", "margin" and "season".
        output_folder (str): Base folder where the CSV files are written.
        min_margin (float): Games with a margin below this value are skipped.
            Defaults to 8, the threshold previously hard-coded.
    """
    for _, row in df.iterrows():
        margin = float(row["margin"])
        # A NaN margin would silently pass the "<" filter below, so treat it
        # as missing data explicitly.
        if pd.isna(margin):
            print(f"Skipping row due to missing data: {row}")
            continue
        # Filter first: no point deriving URL parts for a game that is skipped.
        if margin < min_margin:
            # Name both teams; the earlier message paired the home code with the
            # loser, which printed the same team twice for home losses.
            print(f"Skipping game with margin < {min_margin}: {row['Date']} {row['Winner/tie']} vs {row['Loser/tie']} (Margin: {margin})")
            continue

        home_abbr = get_home_abbr_from_row(row)
        fecha_num = create_date_param(row)
        if not home_abbr or not fecha_num:
            print(f"Skipping row due to missing data: {row}")
            continue

        season = row["season"]
        year_folder = os.path.join(output_folder,str(season))
        os.makedirs(year_folder, exist_ok=True)

        boxscore_url = f"https://www.pro-football-reference.com/boxscores/{fecha_num}{home_abbr}.htm"
        file_name = f"{fecha_num}_{home_abbr}.csv"
        file_path = os.path.join(year_folder, file_name)

        # Existing files act as a cache, so an interrupted run can be resumed.
        if os.path.exists(file_path):
            print(f"File already exists: {file_path}")
            continue

        df_prr = fetch_table_from_boxscore(boxscore_url)
        # Pause after every request, successful or not, so a run of failures
        # does not turn into rapid-fire requests.
        time.sleep(random.uniform(1, 5))
        if df_prr.empty:
            print(f"No data found for URL: {boxscore_url}")
            continue

        df_prr["fecha"] = fecha_num
        df_prr["team"] = home_abbr
        df_prr.to_csv(file_path, index=False)
        print(f"Saved data to {file_path}")

In [27]:
# Reload the saved game tables and fetch the boxscore table for each game that
# process_all_games does not skip; results land under raw_data/boxscores_prr/<season>/.
years = range(2020, 2025)
output_folder = "raw_data/boxscores_prr"
read_games_df = read_games(years, "raw_data/nfl_games")
process_all_games(read_games_df, output_folder)

Saved data to raw_data/boxscores_prr/2020/202009100_kan.csv
Saved data to raw_data/boxscores_prr/2020/202009130_atl.csv
Saved data to raw_data/boxscores_prr/2020/202009130_buf.csv
Skipping game with margin < 8: 2020-09-13 car vs Carolina Panthers (Margin: 4.0)
Skipping game with margin < 8: 2020-09-13 det vs Detroit Lions (Margin: 4.0)
Saved data to raw_data/boxscores_prr/2020/202009130_rav.csv
Skipping game with margin < 8: 2020-09-13 jax vs Indianapolis Colts (Margin: 7.0)
Saved data to raw_data/boxscores_prr/2020/202009130_min.csv
Saved data to raw_data/boxscores_prr/2020/202009130_nwe.csv
Saved data to raw_data/boxscores_prr/2020/202009130_was.csv
Skipping game with margin < 8: 2020-09-13 cin vs Cincinnati Bengals (Margin: 3.0)
Skipping game with margin < 8: 2020-09-13 sfo vs San Francisco 49ers (Margin: 4.0)
Saved data to raw_data/boxscores_prr/2020/202009130_nor.csv
Skipping game with margin < 8: 2020-09-13 ram vs Dallas Cowboys (Margin: 3.0)
Saved data to raw_data/boxscores_prr/

In [5]:
def read_all_boxscores(base_folder="raw_data/boxscores_prr", years=range(2020, 2025)):
    """
    Load every saved boxscore CSV and stack them into a single DataFrame.

    Args:
        base_folder (str): Folder containing one sub-folder per season.
        years: Seasons to load; each is looked up at ``<base_folder>/<year>``.

    Returns:
        pd.DataFrame: All CSV files concatenated with a fresh index, plus a
        "season" column (the year) and a "file" column (the source file name).
        An empty DataFrame is returned when no CSV file is found.
    """
    all_data = []
    for year in years:
        year_folder = os.path.join(base_folder, str(year))
        # isdir (not exists) so a stray file with a year's name is skipped
        # instead of crashing listdir.
        if not os.path.isdir(year_folder):
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order of the result reproducible across runs and machines.
        for file in sorted(os.listdir(year_folder)):
            if file.endswith(".csv"):
                df = pd.read_csv(os.path.join(year_folder, file))
                df["season"] = year
                df["file"] = file
                all_data.append(df)
    return pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()


In [6]:
def merge_games_and_boxscores(games_df: pd.DataFrame, boxscores_df: pd.DataFrame) -> pd.DataFrame:
    """
    Left-join the games table with the boxscore rows on a shared "boxscore_id".

    The key is the date fragment from ``create_date_param`` followed by the
    home-team code from ``get_home_abbr_from_row`` on the games side, and
    "fecha" + "team" on the boxscore side. The inputs are not modified.

    Args:
        games_df (pd.DataFrame): Games table (one row per game).
        boxscores_df (pd.DataFrame): Boxscore rows with "fecha" and "team"
            columns. A frame with no columns at all is accepted and simply
            produces no matches.

    Returns:
        pd.DataFrame: Every game row, repeated once per matching boxscore row;
        games without a boxscore keep a single row with missing values.
        Columns present in both inputs get pandas' default "_x" / "_y" suffixes.
    """
    # Work on copies so the caller's frames do not silently gain a column.
    games_keyed = games_df.copy()
    games_keyed["boxscore_id"] = games_keyed.apply(
        lambda row: str(create_date_param(row)) + str(get_home_abbr_from_row(row)), axis=1
    )

    if len(boxscores_df.columns) == 0:
        # A column-less frame has no "fecha"/"team" to build the key from;
        # use an empty key column so the join yields no matches.
        boxscores_keyed = pd.DataFrame({"boxscore_id": pd.Series(dtype="object")})
    else:
        boxscores_keyed = boxscores_df.copy()
        boxscores_keyed["boxscore_id"] = (
            boxscores_keyed["fecha"].astype(str) + boxscores_keyed["team"].astype(str)
        )

    merged_df = pd.merge(games_keyed, boxscores_keyed, on="boxscore_id", how="left")

    return merged_df

In [None]:

# Rebuild both inputs from the CSV files on disk, then join each game to its
# boxscore rows (read_all_boxscores uses its default folder and years).
games_df = read_games(range(2020, 2025), "raw_data/nfl_games")
boxscores_df = read_all_boxscores()
merged_df = merge_games_and_boxscores(games_df, boxscores_df)

def asignar_local_visitante(row):
    """
    Split a game row into home ("local") and visiting ("visitante") teams.

    When the "Unnamed: 5" marker equals "@", the team in "Loser/tie" is taken
    as home and the team in "Winner/tie" as visitor; otherwise the roles are
    the other way round.

    Args:
        row: Mapping-like row with "Unnamed: 5", "Winner/tie" and "Loser/tie".

    Returns:
        pd.Series: Two entries, "local" and "visitante".
    """
    winner_was_away = row["Unnamed: 5"] == '@'
    home_key, away_key = ("Loser/tie", "Winner/tie") if winner_was_away else ("Winner/tie", "Loser/tie")
    return pd.Series({"local": row[home_key], "visitante": row[away_key]})
# Row-wise apply: asignar_local_visitante returns a two-entry Series per row,
# which pandas expands into the "local" and "visitante" columns.
merged_df[["local", "visitante"]] = merged_df.apply(asignar_local_visitante, axis=1)
# Human-readable matchup label, home team first.
merged_df["versus"] = merged_df["local"] + " vs " + merged_df["visitante"]

# Persist the merged table, then display it as the cell output.
merged_df.to_csv("raw_data/merged_games_boxscores.csv", index=False)
merged_df

Unnamed: 0,Week,Day,Date,Time,Winner/tie,Unnamed: 5,Loser/tie,Unnamed: 7,PtsWinner,PtsLosser,...,Lng.2,Fmb,FL,fecha,team,season_y,file,local,visitante,versus
0,1,Thu,2020-09-10,8:20PM,Kansas City Chiefs,,Houston Texans,boxscore,34,20,...,0.0,0.0,0.0,202009100.0,kan,2020.0,202009100_kan.csv,Kansas City Chiefs,Houston Texans,Kansas City Chiefs vs Houston Texans
1,1,Thu,2020-09-10,8:20PM,Kansas City Chiefs,,Houston Texans,boxscore,34,20,...,15.0,0.0,0.0,202009100.0,kan,2020.0,202009100_kan.csv,Kansas City Chiefs,Houston Texans,Kansas City Chiefs vs Houston Texans
2,1,Thu,2020-09-10,8:20PM,Kansas City Chiefs,,Houston Texans,boxscore,34,20,...,0.0,0.0,0.0,202009100.0,kan,2020.0,202009100_kan.csv,Kansas City Chiefs,Houston Texans,Kansas City Chiefs vs Houston Texans
3,1,Thu,2020-09-10,8:20PM,Kansas City Chiefs,,Houston Texans,boxscore,34,20,...,31.0,0.0,0.0,202009100.0,kan,2020.0,202009100_kan.csv,Kansas City Chiefs,Houston Texans,Kansas City Chiefs vs Houston Texans
4,1,Thu,2020-09-10,8:20PM,Kansas City Chiefs,,Houston Texans,boxscore,34,20,...,20.0,0.0,0.0,202009100.0,kan,2020.0,202009100_kan.csv,Kansas City Chiefs,Houston Texans,Kansas City Chiefs vs Houston Texans
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15307,SuperBowl,Sun,2025-02-09,6:30PM,Philadelphia Eagles,,Kansas City Chiefs,boxscore,40,22,...,46.0,0.0,0.0,202502090.0,phi,2024.0,202502090_phi.csv,Philadelphia Eagles,Kansas City Chiefs,Philadelphia Eagles vs Kansas City Chiefs
15308,SuperBowl,Sun,2025-02-09,6:30PM,Philadelphia Eagles,,Kansas City Chiefs,boxscore,40,22,...,22.0,0.0,0.0,202502090.0,phi,2024.0,202502090_phi.csv,Philadelphia Eagles,Kansas City Chiefs,Philadelphia Eagles vs Kansas City Chiefs
15309,SuperBowl,Sun,2025-02-09,6:30PM,Philadelphia Eagles,,Kansas City Chiefs,boxscore,40,22,...,27.0,0.0,0.0,202502090.0,phi,2024.0,202502090_phi.csv,Philadelphia Eagles,Kansas City Chiefs,Philadelphia Eagles vs Kansas City Chiefs
15310,SuperBowl,Sun,2025-02-09,6:30PM,Philadelphia Eagles,,Kansas City Chiefs,boxscore,40,22,...,20.0,0.0,0.0,202502090.0,phi,2024.0,202502090_phi.csv,Philadelphia Eagles,Kansas City Chiefs,Philadelphia Eagles vs Kansas City Chiefs
