In [None]:
# 1. Imports
import nflreadpy as nfl
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# 2. Prepare the weekly feature dataframe
# One season list shared by both loaders so the two frames always cover the same years.
SEASONS = [2022, 2023, 2024, 2025]

team_week_df = nfl.load_team_stats(seasons=SEASONS)
sched_df = nfl.load_schedules(seasons=SEASONS)

# Only REG season games
# TODO: the regular-season filter was never filled in, so every game type
# returned by the loaders is currently kept.

# Snapshot the raw downloads. `write_csv` is the Polars writer, so both objects
# must still be Polars frames at this point.
team_week_df.write_csv('team_data.csv')
sched_df.write_csv('schedules.csv')

# Convert to pandas (nflreadpy returns Polars by default).
# The conversion is called directly instead of being wrapped in a broad
# `except Exception` fallback: a failure here (e.g. a missing conversion
# dependency) should surface immediately rather than silently producing a
# malformed frame that breaks further down with a confusing KeyError.
team_week_df = team_week_df.to_pandas()
sched_df = sched_df.to_pandas()

# Safe numeric conversions: the join keys must have the same dtype in both frames.
team_week_df['season'] = team_week_df['season'].astype(int)
team_week_df['week'] = team_week_df['week'].astype(int)

# Completion percentage per team-week. Rows with zero attempts get NaN instead of
# a possible inf, so they cannot poison later averages.
attempts = team_week_df['attempts']
team_week_df['completion_pct'] = team_week_df['completions'] / attempts.where(attempts > 0)

sched_df['season'] = sched_df['season'].astype(int)
sched_df['week']   = sched_df['week'].astype(int)



In [None]:
# 3. Attach same-week team stats to every scheduled game.
# NOTE: these are the stats of the game week itself; `games` is rebuilt from
# lagged/rolling columns further down (section 4b).

# Neutral team column name so the home_/away_ prefixes read cleanly.
tw = team_week_df.rename(columns={'team': 'team_code'})

# Prefix every stat column per side, then restore the shared join keys.
home_features = (
    tw.add_prefix('home_')
      .rename(columns={'home_season': 'season',
                       'home_week': 'week',
                       'home_team_code': 'home_team'})
)
away_features = (
    tw.add_prefix('away_')
      .rename(columns={'away_season': 'season',
                       'away_week': 'week',
                       'away_team_code': 'away_team'})
)

# Left joins keep every scheduled game, even when no stats row matches.
games = (
    sched_df
    .merge(home_features, how='left', on=['season', 'week', 'home_team'])
    .merge(away_features, how='left', on=['season', 'week', 'away_team'])
)

In [None]:
# 4. Avoid leakage: use lagged or rolling features
# Every derived column only looks at games a team played *before* the current
# row, and never crosses a (team, season) boundary.

tw = team_week_df.sort_values(['team', 'season', 'week']).copy()

group = tw.groupby(['team', 'season'], group_keys=False)

# Shift current-week stats to become "prev week" stats.
# The first game of each (team, season) has no predecessor and stays NaN.
for col in [
    'passing_yards', 'passing_epa', 'rushing_epa',
    'receiving_epa', 'def_sacks', 'def_interceptions',
    'penalties', 'penalty_yards', 'fg_pct', 'pat_pct', 'completion_pct'
]:
    tw[f'{col}_prev'] = group[col].shift(1)

# Rolling averages over the last N games.
# The shift AND the rolling window both run inside `transform`, i.e. per group.
# Calling `.rolling()` on the result of `group[col].shift(1)` would roll over the
# whole sorted frame and average in games from the previous season or from a
# different team.
# `completion_pct` is included because the model's feature list expects
# `home_/away_completion_pct_roll{N}` columns.
N = 4
for col in ['passing_epa', 'rushing_epa', 'def_sacks', 'def_interceptions', 'completion_pct']:
    tw[f'{col}_roll{N}'] = group[col].transform(
        lambda s: s.shift(1).rolling(N, min_periods=1).mean()
    )

In [None]:
# 4b. Rebuild `games` with rolling features from `tw`
# Only the join keys plus the lagged (*_prev) and rolling (*_roll{N}) columns
# are carried over, so no same-week stat ends up in `games`.
tw_safe_cols = ['team', 'season', 'week'] + [
    c for c in tw.columns if c.endswith(('_prev', f'_roll{N}'))
]
tw_safe = tw[tw_safe_cols].copy()

# Prefixing turns `team` into `home_team` / `away_team`, which are the join keys
# used below; season and week are renamed back to their shared names.
home_features = tw_safe.add_prefix('home_').rename(
    columns={'home_season': 'season', 'home_week': 'week'}
)
away_features = tw_safe.add_prefix('away_').rename(
    columns={'away_season': 'season', 'away_week': 'week'}
)

# Start from a fresh copy of the schedule, attach both sides with left joins,
# then order chronologically so a later unshuffled split is time-based.
games = (
    sched_df.copy()
    .merge(home_features, how='left', on=['season', 'week', 'home_team'])
    .merge(away_features, how='left', on=['season', 'week', 'away_team'])
    .sort_values(['season', 'week'])
    .reset_index(drop=True)
)

# Quick check for expected rolling columns
expected_home = f"home_passing_epa_roll{N}"
expected_away = f"away_passing_epa_roll{N}"
print('Has home rolling:', expected_home in games.columns)
print('Has away rolling:', expected_away in games.columns)

Has home rolling: True
Has away rolling: True


In [None]:
"""
5. Define target variables for score and win/loss
Now in games, you already have:
- home_score
- away_score
- result (often margin from home team perspective; check your data)
You can create targets:

"""

# Targets are only defined for games that have both scores.
# A comparison against NaN evaluates to False, so casting
# `home_score > away_score` straight to int would label every game without a
# score as a home loss. Such games are masked to NaN instead, which lets a later
# `dropna` on the target remove them.
# `home_win` is therefore float (1.0 / 0.0 / NaN); ties count as 0.0.
has_score = games['home_score'].notna() & games['away_score'].notna()

games['home_win']   = (games['home_score'] > games['away_score']).astype(float).where(has_score)
games['margin']     = games['home_score'] - games['away_score']   # home minus away; NaN without scores
games['total_pts']  = games['home_score'] + games['away_score']

# For direct score prediction
target_cols = ['home_score', 'away_score']
# For win/loss classification
target_cls  = 'home_win'
# For spread-type / regression
target_margin = 'margin'

In [None]:
# 6. Select and engineer game-level features
# Each diff_* column is the home-side value minus the away-side value.
for diff_col, home_col, away_col in [
    ('diff_passing_epa_roll4', 'home_passing_epa_roll4',       'away_passing_epa_roll4'),
    ('diff_rushing_epa_roll4', 'home_rushing_epa_roll4',       'away_rushing_epa_roll4'),
    ('diff_def_sacks_roll4',   'home_def_sacks_roll4',         'away_def_sacks_roll4'),
    ('diff_def_int_roll4',     'home_def_interceptions_roll4', 'away_def_interceptions_roll4'),
    ('diff_rest',              'home_rest',                    'away_rest'),
]:
    games[diff_col] = games[home_col] - games[away_col]

In [None]:
# Persist the game-level table; the row index is written as the first column.
games.to_csv('games_with_team_stats.csv', index=True)

In [None]:


# Example feature set
feature_cols = [
    'home_passing_epa_roll4', 'away_passing_epa_roll4',
    'home_rushing_epa_roll4', 'away_rushing_epa_roll4',
    'home_def_sacks_roll4', 'away_def_sacks_roll4',
    'home_def_interceptions_roll4', 'away_def_interceptions_roll4',
    'home_rest', 'away_rest', 'home_completion_pct_roll4', 'away_completion_pct_roll4',
    'temp', 'wind',
    'diff_passing_epa_roll4', 'diff_rushing_epa_roll4',
    'diff_def_sacks_roll4', 'diff_def_int_roll4', 'diff_rest'
]

# Keep only rows where every feature AND every target is present.
# `margin` is listed explicitly because it is the value the regressor is fitted
# on; it is NaN for any game without both scores, and a NaN target would make
# the fit fail.
# NOTE(review): rows with missing `temp` / `wind` are dropped here as well —
# presumably indoor games; confirm that excluding them is intended.
# `.copy()` detaches the modeling frame from `games`, so later column
# assignments on it do not raise SettingWithCopyWarning.
df_model = games.dropna(subset=feature_cols + ['home_win', 'margin']).copy()
X = df_model[feature_cols]
y = df_model['home_win']



In [None]:

# Regression target: home score minus away score.
y_margin = df_model['margin']

# shuffle=False preserves row order, so the final 20% of rows become the test
# set. `games` was sorted by season and week before modeling, which makes this
# a time-based hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_margin, shuffle=False, test_size=0.2
)

# Fixed seed so the forest is reproducible; `fit` returns the estimator itself.
reg = RandomForestRegressor(
    random_state=42,
    max_depth=8,
    n_estimators=400,
).fit(X_train, y_train)

# Mean absolute error of the predicted margin on the held-out rows.
pred_margin = reg.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred_margin))

MAE: 11.318751750392718
