# NFL Analytics Data Pipeline

This notebook installs the project dependencies, points `SSL_CERT_FILE` at the `certifi` CA bundle, runs the pipeline script that fetches NFL datasets via `nfl_data_py`, and then lists and previews the files it wrote.

In [14]:
# SSL + dependencies
# If you created the venv via `make venv`, you can instead select that kernel here.
%pip install -q -r requirements.txt

import os
try:
    import certifi
    os.environ.setdefault('SSL_CERT_FILE', certifi.where())
except Exception:
    pass
print('SSL_CERT_FILE=', os.environ.get('SSL_CERT_FILE', ''))

Note: you may need to restart the kernel to use updated packages.
SSL_CERT_FILE= /Users/aksharravichandran/Documents/GitHub/NFLAnalytics/.venv/lib/python3.12/site-packages/certifi/cacert.pem


In [15]:
# Verify imports
# Smoke test for the kernel's environment: an import of a missing package
# raises ImportError, so reaching the final print means all three libraries
# loaded in the active kernel.
import nfl_data_py as nfl
import pandas as pd
# NOTE(review): `pa` is not referenced in the visible cells; presumably pyarrow
# is imported to confirm Parquet support is installed — confirm.
import pyarrow as pa
print('nfl_data_py ready')

nfl_data_py ready


In [16]:
# Parameters
# Settings read by the later cells: the pipeline cell passes them to
# `scripts/fetch_nfl_data.py` as command-line options, and the inspection and
# preview cells search OUT_DIR for the written files.
YEARS = '2019-2024'  # passed as --years; e.g., '2021,2022,2023' or '2019-2024'
PRESET = 'modeling'  # passed as --preset; 'basic' | 'modeling' | 'all'
OUT_DIR = 'data/raw'  # passed as --out; also the root directory searched for outputs
FORMAT = 'csv'   # passed as --format; 'parquet' | 'csv'
PBP_CACHE = True  # when truthy, the --pbp-cache flag is added to the command
PBP_CACHE_DIR = '.nfl_pbp_cache'  # when non-empty, passed as --pbp-cache-dir


In [17]:
# Run the pipeline script using the current Python kernel
import sys, shlex, subprocess

# Build the argument vector directly instead of formatting one string and
# re-splitting it with shlex.split: an interpreter path or parameter value
# that contains spaces or backslashes would otherwise be split into several
# arguments or have its backslashes consumed.
args = [
    sys.executable, 'scripts/fetch_nfl_data.py',
    '--years', str(YEARS),
    '--preset', str(PRESET),
    '--out', str(OUT_DIR),
    '--format', str(FORMAT),
]
if PBP_CACHE:
    args.append('--pbp-cache')
if PBP_CACHE_DIR:
    args += ['--pbp-cache-dir', str(PBP_CACHE_DIR)]

# `cmd` remains a printable string; shlex.join quotes only the arguments that
# need quoting, so ordinary values print exactly as before.
cmd = shlex.join(args)
print(cmd)

# check=False: a non-zero exit status is reported below rather than raised.
completed = subprocess.run(args, check=False)
print('Return code:', completed.returncode)

/Users/aksharravichandran/Documents/GitHub/NFLAnalytics/.venv/bin/python scripts/fetch_nfl_data.py --years 2019-2024 --preset modeling --out data/raw --format csv --pbp-cache --pbp-cache-dir .nfl_pbp_cache


2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.


weekly_rosters: cannot reindex on an axis with duplicate labels


Done. Output written under: data/raw
Return code: 0


pfr/season_pass: import_seasonal_pfr() got multiple values for argument 's_type'
pfr/season_rec: import_seasonal_pfr() got multiple values for argument 's_type'
pfr/season_rush: import_seasonal_pfr() got multiple values for argument 's_type'
pfr/weekly_pass: import_weekly_pfr() got multiple values for argument 's_type'
pfr/weekly_rec: import_weekly_pfr() got multiple values for argument 's_type'
pfr/weekly_rush: import_weekly_pfr() got multiple values for argument 's_type'


In [18]:
# Inspect outputs
# Count every CSV and Parquet file found recursively under OUT_DIR, then print
# the first 20 paths in sorted order.
from pathlib import Path

root = Path(OUT_DIR)
files = [
    found
    for pattern in ('*.csv', '*.parquet')
    for found in root.rglob(pattern)
]
print(f'Total files: {len(files)}')

listed = sorted(files)[:20]
for p in listed:
    print(p)

Total files: 26
data/raw/depth_charts/depth_charts_2019_2024.csv
data/raw/depth_charts/depth_charts_2019_2024.parquet
data/raw/ids/ids_all.csv
data/raw/ids/ids_all.parquet
data/raw/injuries/injuries_2019_2024.csv
data/raw/injuries/injuries_2019_2024.parquet
data/raw/ngs/passing/ngs_passing_2019_2024.csv
data/raw/ngs/passing/ngs_passing_2019_2024.parquet
data/raw/ngs/receiving/ngs_receiving_2019_2024.csv
data/raw/ngs/receiving/ngs_receiving_2019_2024.parquet
data/raw/ngs/rushing/ngs_rushing_2019_2024.csv
data/raw/ngs/rushing/ngs_rushing_2019_2024.parquet
data/raw/pbp/pbp_2019_2024.csv
data/raw/pbp/pbp_2019_2024.parquet
data/raw/schedules/schedules_2019_2024.csv
data/raw/schedules/schedules_2019_2024.parquet
data/raw/seasonal/seasonal_REG_2019_2024.csv
data/raw/seasonal/seasonal_REG_2019_2024.parquet
data/raw/seasonal_rosters/seasonal_rosters_2019_2024.csv
data/raw/seasonal_rosters/seasonal_rosters_2019_2024.parquet


In [19]:
# Preview a sample file
# Take the first CSV under OUT_DIR in sorted order (or, when there is no CSV,
# the first Parquet file), load it with pandas, and display its leading rows.
import pandas as pd
from pathlib import Path

sample = None
for pattern in ('*.csv', '*.parquet'):
    candidates = sorted(Path(OUT_DIR).rglob(pattern))
    if candidates:
        sample = candidates[0]
        break

if sample is None:
    print('No files found. Run the pipeline cell above.')
else:
    print('Reading', sample)
    # Pick the pandas reader that matches the file extension.
    reader = pd.read_csv if str(sample).endswith('.csv') else pd.read_parquet
    df = reader(sample)
    display(df.head())

Reading data/raw/depth_charts/depth_charts_2019_2024.csv


Unnamed: 0,season,club_code,week,game_type,depth_team,last_name,first_name,football_name,formation,gsis_id,jersey_number,position,elias_id,depth_position,full_name
0,2019,ATL,1.0,REG,3,Miller,Jordan,Jordan,Defense,00-0035285,32,CB,MIL373762,CB,Jordan Miller
1,2019,ATL,1.0,REG,2,Neasman,Sharrod,Sharrod,Defense,00-0032592,35,CB,NEA752713,S,Sharrod Neasman
2,2019,ATL,1.0,REG,1,Bosher,Matt,Matt,Special Teams,00-0028130,5,P,BOS219856,P,Matt Bosher
3,2019,ATL,1.0,REG,1,Bosher,Matt,Matt,Special Teams,00-0028130,5,P,BOS219856,H,Matt Bosher
4,2019,ATL,1.0,REG,1,Bosher,Matt,Matt,Special Teams,00-0028130,5,P,BOS219856,KOS,Matt Bosher
