In [1]:
# Load the libraries used throughout the notebook
import numpy as np
import pandas as pd
import nflreadpy as nfl
import pyarrow

In [2]:
# Fetch the 2015 schedule and convert it to pandas; its results are used to
# build the baseline Elo ratings for the 2016 season
df_ls_2015 = nfl.load_schedules(seasons=2015).to_pandas()

In [3]:
# Formula to calculate the new Elo after each match

def delta(r_home, r_away, points_home, points_away, elo_hfa = 65, K = 20):
    """Return the post-game Elo ratings of the home and away team.

    Parameters
    ----------
    r_home, r_away : pre-game ratings of the home and away team.
    points_home, points_away : final score of each team.
    elo_hfa : rating points added to the home side when computing its expected score.
    K : scaling factor of the update.

    Returns
    -------
    (new_home, new_away) : the home rating plus the rating shift and the away
        rating minus the same shift, so the sum of both ratings is preserved.

    The shift is K * ln(|point difference| + 1) * (actual - expected), which is
    zero for a tied game because ln(1) == 0.
    """
    # Actual result from the home team's point of view: win 1, tie 0.5, loss 0
    if points_home > points_away:
        actual = 1
    elif points_home == points_away:
        actual = 0.5
    else:
        actual = 0

    # Logistic expected score of the home team, including the home-field bonus
    rating_gap = r_home - r_away + elo_hfa
    expected = 1 / (1 + 10 ** -(rating_gap / 400))

    # Larger margins of victory move the ratings more, on a logarithmic scale
    margin_multiplier = np.log(np.abs(points_home - points_away) + 1)

    shift = K * margin_multiplier * (actual - expected)
    return r_home + shift, r_away - shift

In [4]:
# Columns needed to compute the Elo ratings

cols = ['game_type', 'week', 'away_team', 'away_score', 'home_team',
        'home_score', 'result']

# Keep only those columns. An explicit copy is taken because new columns are
# assigned to this frame later on; assigning to a plain slice of df_ls_2015
# would raise pandas' SettingWithCopyWarning.

df_ls_2015_sub = df_ls_2015.loc[:, cols].copy()

In [5]:
# Every team enters the 2015 season with the same default rating. Replaying the
# 2015 games from that starting point produces the baseline ratings for 2016.

elo_init = 1500  # default rating of a team that has no rating yet
rho = 0.75  # share of a rating's distance from elo_init that is kept when a new season starts
elos = {}  # latest rating of each team

r_home_list = []
r_away_list = []

for _, game in df_ls_2015_sub.iterrows():
    home, away = game.home_team, game.away_team

    # Pre-game ratings; a team without a rating starts from the default
    r_home = elos.get(home, elo_init)
    r_away = elos.get(away, elo_init)

    # Games outside the regular season use a K factor 1.2 times larger
    if game.game_type == "REG":
        K_row = 20
    else:
        K_row = 24

    r_home_new, r_away_new = delta(
        r_home, r_away, game.home_score, game.away_score, K=K_row
    )

    # Store the updated ratings so the following games start from them
    elos[home] = r_home_new
    elos[away] = r_away_new

    # Keep the post-game ratings of this row
    r_home_list.append(r_home_new)
    r_away_list.append(r_away_new)

df_ls_2015_sub["r_away"] = r_away_list
df_ls_2015_sub["r_home"] = r_home_list

# Move the ratings stored under the 'SD', 'OAK' and 'STL' keys to 'LAC', 'LV' and 'LA'

rename_map = {"SD": "LAC", "OAK": "LV", "STL": "LA"}

for old in [code for code in rename_map if code in elos]:
    elos[rename_map[old]] = elos.pop(old)

# The elos dictionary now holds the baseline ratings for the 2016 season.

In [6]:
# Fetch the 2015 team stats (converted to pandas); they are the source for the
# off_epa, def_epa_allowed and turnover_rate baselines

df_lts_2015 = nfl.load_team_stats(seasons=2015).to_pandas()

In [7]:
# Columns needed for the 2015 offensive EPA per play

cols_off_epa = ['team', 'attempts', 'passing_epa', 'carries', 'rushing_epa']

# Add up every row of each team, then divide the total EPA (passing + rushing)
# by the total number of plays (attempts + carries)

df_lts_2015_off_epa = (
    df_lts_2015[cols_off_epa]
    .groupby(by="team")
    .sum()
    .assign(
        off_epa_pp_2015=lambda totals: (totals["passing_epa"] + totals["rushing_epa"])
        / (totals["attempts"] + totals["carries"])
    )
)

In [8]:
# Columns needed for the 2015 EPA per play allowed by each defense

cols_def_epa = ['opponent_team', 'attempts', 'passing_epa', 'carries', 'rushing_epa']

# Grouping by opponent_team adds up what every offense produced against that
# team; the total EPA is then divided by the total number of plays faced

df_lts_2015_def_epa = (
    df_lts_2015[cols_def_epa]
    .groupby(by="opponent_team")
    .sum()
    .assign(
        def_epa_allowed_pp_2015=lambda totals: (totals["passing_epa"] + totals["rushing_epa"])
        / (totals["attempts"] + totals["carries"])
    )
)

In [9]:
# Columns needed for the 2015 turnover rate

cols_turnover = ['team', 'passing_interceptions', 'rushing_fumbles_lost', 'receiving_fumbles_lost', 'sack_fumbles_lost', 'attempts', 'carries']
df_lts_2015_turnover = df_lts_2015[cols_turnover]

# Turnover rate = (interceptions + fumbles lost rushing, receiving and on sacks)
# divided by the number of plays (attempts + carries), using each team's totals

df_lts_2015_turn_over_rate = (
    df_lts_2015_turnover
    .groupby(by="team")
    .sum()
    .assign(
        turnover_rate_2015=lambda totals: (
            totals['passing_interceptions']
            + totals['rushing_fumbles_lost']
            + totals['receiving_fumbles_lost']
            + totals['sack_fumbles_lost']
        ) / (totals['attempts'] + totals['carries'])
    )
)

In [10]:
# Turn each 2015 metric into a dictionary keyed by the index of its frame
# (team for off_epa_pp and turnover_rate, opponent_team for def_epa_pp)

off_epa_pp, def_epa_pp, turnover_rate = (
    frame[column].to_dict()
    for frame, column in (
        (df_lts_2015_off_epa, "off_epa_pp_2015"),
        (df_lts_2015_def_epa, 'def_epa_allowed_pp_2015'),
        (df_lts_2015_turn_over_rate, 'turnover_rate_2015'),
    )
)

# Baseline dictionaries available from here on: elos, off_epa_pp, def_epa_pp, turnover_rate

In [11]:
# Seasons covered by the feature table: 2016 through 2025

seasons = [*range(2016, 2026)]

# Schedules of those seasons, converted to pandas

df_ls_16_25 = nfl.load_schedules(seasons=seasons).to_pandas()

# Team stats of those seasons, converted to pandas; they feed the off_epa,
# def_epa and turnover_rate calculations

df_lts_16_25 = nfl.load_team_stats(seasons=seasons).to_pandas()

In [12]:
# Compute, for every row of the team stats, the offensive EPA per play and the
# turnover rate. Start by keeping only the columns that are needed.

cols_support = [
    'season','week','team','opponent_team',
    'attempts','carries','passing_epa','rushing_epa','passing_interceptions',
    'rushing_fumbles_lost','receiving_fumbles_lost','sack_fumbles_lost'
]

df_lts_16_25_sub = df_lts_16_25.loc[:, cols_support].copy()

plays = df_lts_16_25_sub['attempts'] + df_lts_16_25_sub['carries']
off_epa = df_lts_16_25_sub['passing_epa'] + df_lts_16_25_sub['rushing_epa']

# The parentheses are required: without them the statement ends after the first
# term and the lines starting with "+" are evaluated and thrown away, so only
# interceptions would be counted. With them the count uses the same four
# components as the 2015 turnover-rate baseline.
turnover_count = (
    df_lts_16_25_sub['passing_interceptions']
    + df_lts_16_25_sub['rushing_fumbles_lost']
    + df_lts_16_25_sub['receiving_fumbles_lost']
    + df_lts_16_25_sub['sack_fumbles_lost']
)

df_lts_16_25_sub = df_lts_16_25_sub.assign(
    plays =  plays,
    off_epa = off_epa,
    off_epa_pp = off_epa / plays,
    turnover_count = turnover_count,
    turnover_rate = turnover_count / plays
)[['season', 'week', 'team', 'opponent_team', 'plays', 'off_epa', 'off_epa_pp', 'turnover_count', 'turnover_rate']]

In [13]:
# Columns kept for the final model

cols_model = ['game_id', 'season', 'game_type', 'week', 'away_team', 
              'away_score', 'home_team', 'home_score', 'result', 'total', 
              'overtime', 'away_rest', 'home_rest', 'div_game', 'roof', 
              'surface']

# Keep only those columns. An explicit copy is taken because columns of this
# frame are overwritten below and new ones are added later; doing that on a
# plain slice of df_ls_16_25 would raise pandas' SettingWithCopyWarning.

df_ls_16_25_sub = df_ls_16_25.loc[:, cols_model].copy()

# Replace SD with LAC and OAK with LV in both team columns

team_map = {"SD": "LAC", "OAK": "LV"}

df_ls_16_25_sub["home_team"] = df_ls_16_25_sub["home_team"].replace(team_map)
df_ls_16_25_sub["away_team"] = df_ls_16_25_sub["away_team"].replace(team_map)

In [14]:
# Build the pre-game features of every 2016-2025 game. The loop relies on the
# schedule rows being in chronological order: each row first records the ratings
# and rolling team metrics as they stood before the game, and only then updates
# them with what happened in that game.

r_home_list = []
r_away_list = []
off_epa_pp_home_list = []
off_epa_pp_away_list = []
def_epa_pp_home_list = []
def_epa_pp_away_list = []
turnover_rate_home_list = []
turnover_rate_away_list = []


def _blend(prior, observed, weight):
    """Weighted update of a rolling metric: (1 - weight) * prior + weight * observed."""
    return (1 - weight) * prior + weight * observed


# Index the weekly team stats once instead of filtering the whole frame six
# times per game. A (season, week, team) key that is absent is read as 0.0,
# which is the value the sum of an empty selection gives.
off_epa_pp_week = (
    df_lts_16_25_sub.groupby(["season", "week", "team"])["off_epa_pp"].sum().to_dict()
)
# What the offenses facing a team produced is that team's allowed EPA per play
def_epa_pp_week = (
    df_lts_16_25_sub.groupby(["season", "week", "opponent_team"])["off_epa_pp"].sum().to_dict()
)
turnover_rate_week = (
    df_lts_16_25_sub.groupby(["season", "week", "team"])["turnover_rate"].sum().to_dict()
)

# team -> last season for which the start-of-season regression was applied
regressed_season = {}

for _, row in df_ls_16_25_sub.iterrows():

    current_season = row.season
    current_week = row.week

    home = row.home_team
    away = row.away_team

    # The first time a team shows up in a season, pull its rating toward
    # elo_init by the factor rho and store the result. Keying this on the season
    # instead of on "week == 1" also covers a team whose first listed game of a
    # season is in a later week, and keeps the regressed rating for games that
    # have not been played yet. A team without a rating starts from elo_init,
    # as in the 2015 loop.
    for team in (home, away):
        if regressed_season.get(team) != current_season:
            elos[team] = elo_init + rho * (elos.get(team, elo_init) - elo_init)
            regressed_season[team] = current_season

    r_home = elos[home]
    r_away = elos[away]
    off_epa_pp_home = off_epa_pp.get(home)
    off_epa_pp_away = off_epa_pp.get(away)
    def_epa_pp_home = def_epa_pp.get(home)
    def_epa_pp_away = def_epa_pp.get(away)
    turnover_rate_home = turnover_rate.get(home)
    turnover_rate_away = turnover_rate.get(away)

    # Record the pre-game values as the features of this row
    r_home_list.append(r_home)
    r_away_list.append(r_away)
    off_epa_pp_home_list.append(off_epa_pp_home)
    off_epa_pp_away_list.append(off_epa_pp_away)
    def_epa_pp_home_list.append(def_epa_pp_home)
    def_epa_pp_away_list.append(def_epa_pp_away)
    turnover_rate_home_list.append(turnover_rate_home)
    turnover_rate_away_list.append(turnover_rate_away)

    if pd.isna(row["away_score"]):
        continue  # no score yet: keep the pre-game features and skip the updates

    # Calculating the new elo values

    K_row = 24 if row.game_type != "REG" else 20  # 1.2K if not REG

    r_home_new, r_away_new = delta(
        r_home, r_away, row.home_score, row.away_score, K=K_row
    )

    # Weight given to this game's observation when updating the rolling metrics:
    # 0.35 during the first four weeks (week < 5) and 0.28 afterwards.
    epa_lambda = 0.35 if current_week < 5 else 0.28

    key_home = (current_season, current_week, home)
    key_away = (current_season, current_week, away)

    off_epa_pp_home_new = _blend(off_epa_pp_home, off_epa_pp_week.get(key_home, 0.0), epa_lambda)
    off_epa_pp_away_new = _blend(off_epa_pp_away, off_epa_pp_week.get(key_away, 0.0), epa_lambda)

    def_epa_pp_home_new = _blend(def_epa_pp_home, def_epa_pp_week.get(key_home, 0.0), epa_lambda)
    def_epa_pp_away_new = _blend(def_epa_pp_away, def_epa_pp_week.get(key_away, 0.0), epa_lambda)

    turnover_rate_home_new = _blend(turnover_rate_home, turnover_rate_week.get(key_home, 0.0), epa_lambda)
    turnover_rate_away_new = _blend(turnover_rate_away, turnover_rate_week.get(key_away, 0.0), epa_lambda)

    elos[home] = r_home_new
    elos[away] = r_away_new
    off_epa_pp[home] = off_epa_pp_home_new
    off_epa_pp[away] = off_epa_pp_away_new
    def_epa_pp[home] = def_epa_pp_home_new
    def_epa_pp[away] = def_epa_pp_away_new
    turnover_rate[home] = turnover_rate_home_new
    turnover_rate[away] = turnover_rate_away_new

# Attach the features as new columns. assign() returns a new frame, so nothing
# is written into a slice of the schedule frame.
df_ls_16_25_sub = df_ls_16_25_sub.assign(
    r_away=r_away_list,
    r_home=r_home_list,
    off_epa_pp_away=off_epa_pp_away_list,
    off_epa_pp_home=off_epa_pp_home_list,
    def_epa_pp_away=def_epa_pp_away_list,
    def_epa_pp_home=def_epa_pp_home_list,
    turnover_rate_away=turnover_rate_away_list,
    turnover_rate_home=turnover_rate_home_list,
)

In [16]:
# Write the schedule with its pre-game features to a Parquet file, without the index
output_path = "schedule_features_2016_2025.parquet"
df_ls_16_25_sub.to_parquet(output_path, index=False)