In [3]:
### --------------------------------------------------------------------------
### PART 1 - Configuration
### --------------------------------------------------------------------------
# @jbuddavis
# https://github.com/jbuddavis

# Load modules
import pandas as pd 
import time
import cfbd
import os
from sklearn import linear_model

# track time to complete, turn off pandas warning
start = time.time()
pd.options.mode.chained_assignment = None

#%% Configure Inputs
# Choose what year you would like to perform adjustment on
u_year = 2021 # year of interest

# Configure API key authorization
configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = 'SYwgOy4mzUTmokMpWJEPdBthkwwN/H3hNJtVKudClwqWX9jyt8M89imyMk7ssYbi'
configuration.api_key_prefix['Authorization'] = 'Bearer'

# create empty dataframes to be filled
dfCal = pd.DataFrame() # dataframe for calendar
dfPBP = pd.DataFrame() # dataframe for pbp data
dfGame = pd.DataFrame() # dataframe for game information
dfTeam = pd.DataFrame() # dataframe for team information


In [5]:
### --------------------------------------------------------------------------
### PART 2 - Ping the API
### --------------------------------------------------------------------------
# get calendar for season
api_instance = cfbd.GamesApi(cfbd.ApiClient(configuration))
api_response = api_instance.get_calendar(u_year)
dfCal = pd.DataFrame().from_records([g.to_dict()for g in api_response])

In [24]:
# loop through the calendar and get the PBP for each week
print('Getting PBP data for year '+str(u_year)+'...')
for i in range (0,len(dfCal)):
    # iterate through calendar to get week/season_type variables to pass to the API
    u_week = int(dfCal.loc[i,'week']) # get week from calendar
    u_season_type = dfCal.loc[i,'season_type'] # get season type from calendar
    # Get play-by-play Data
    api_instance = cfbd.PlaysApi(cfbd.ApiClient(configuration))
    api_response = api_instance.get_plays(year=u_year, 
                                          week=u_week, 
                                          season_type=u_season_type)
    dfWk = pd.DataFrame().from_records([g.to_dict()for g in api_response])
    dfPBP = dfPBP.append(dfWk)
    # Get game info (used for homefield advantage)
    api_instance = cfbd.GamesApi(cfbd.ApiClient(configuration))
    api_response = api_instance.get_games(year=u_year, 
                                          week=u_week, 
                                          season_type=u_season_type)
    dfGameWk = pd.DataFrame().from_records([g.to_dict()for g in api_response])
    dfGame = dfGame.append(dfGameWk)
    print('PBP data downloaded for '+u_season_type+' week',u_week)

Getting PBP data for year 2021...
PBP data downloaded for regular week 1
PBP data downloaded for regular week 2
PBP data downloaded for regular week 3
PBP data downloaded for regular week 4
PBP data downloaded for regular week 5
PBP data downloaded for regular week 6
PBP data downloaded for regular week 7
PBP data downloaded for regular week 8
PBP data downloaded for regular week 9
PBP data downloaded for regular week 10
PBP data downloaded for regular week 11
PBP data downloaded for regular week 12
PBP data downloaded for regular week 13
PBP data downloaded for regular week 14
PBP data downloaded for regular week 15
PBP data downloaded for postseason week 1


In [13]:
dfGame = pd.read_csv('games'+str(u_year)+'.csv')
dfPBP = pd.read_csv('allPBP'+str(u_year)+'.csv')

# Get FBS teams
api_instance = cfbd.TeamsApi(cfbd.ApiClient(configuration))
api_response = api_instance.get_fbs_teams(year=u_year) 
dfTeam = pd.DataFrame().from_records([g.to_dict()for g in api_response])
dfTeam = dfTeam[['school']]
dfTeam.to_csv('teams.csv',index=False) #print FBS teams to csv for record keeping

In [14]:
### --------------------------------------------------------------------------
### PART 3 - Format the data
### --------------------------------------------------------------------------
print('Formatting data...')

# Drop non-"fbs-vs-fbs" games
dfPBP = dfPBP[dfPBP['home'].isin(dfTeam.school.to_list())] # drop data if home team not in list of teams
dfPBP = dfPBP[dfPBP['away'].isin(dfTeam.school.to_list())] # drop data if away team not in list of teams
dfPBP.reset_index(inplace=True,drop=True)
dfGame = dfGame[dfGame['home_team'].isin(dfTeam.school.to_list())]
dfGame = dfGame[dfGame['away_team'].isin(dfTeam.school.to_list())]
dfGame.to_csv('games'+str(year)+'.csv', index=False) # print game data to csv for record keeping
dfGame.reset_index(inplace=True,drop=True)   

# drop nas
dfPBP.dropna(subset=['ppa'],inplace=True)
dfPBP.reset_index(inplace=True,drop=True)

# create list of neutral site games
neutralGames = dfGame['id'][dfGame['neutral_site']==True].to_list()

# All Plays
df = dfPBP[['game_id','home','offense','defense','ppa']] # columns of interest
df.loc['hfa'] = None # homefield advantage
df.loc[(df.home == df.offense),'hfa']=1 # home team on offense
df.loc[(df.home == df.defense),'hfa']=-1 # away team on offense
df.loc[(df.game_id.isin(neutralGames)),'hfa']=0 # neutral site games
df = df[['offense','hfa','defense','ppa']] # drop unneeded colums
df.dropna(subset=['ppa'],inplace=True) # drop nas
df.reset_index(inplace=True,drop=True) # reset index
df.to_csv('allPBP'+str(u_year)+'.csv',index=False) # output to csv for record keeping

print('Data formatted')

Formatting data...


KeyError: 'home'

In [21]:
### --------------------------------------------------------------------------
### PART 4 - Initialize Opponent Adjustment
### --------------------------------------------------------------------------
#%% Call the opponent adjustment on our dataframes of interest
# if you just need to tweek the opp-adj func, to save time, after the first 
# round of pbp downloading, comment out lines ~85-167 and read in the pbp csvs here
dfTeam = pd.read_csv('teams.csv')
df = pd.read_csv('allPBP'+str(u_year)+'.csv')

# dataframe column names to help guide opponent adjustment function
offStr = 'offense' # Column of interest, either the team/player we want to adjust
hfaStr = 'hfa' # Homefield Advantage column name
defStr = 'defense' # Opponent column name
stat = 'ppa' # stat to adjust on

In [23]:
### --------------------------------------------------------------------------
### PART 5 - Perform Opponent Adjustment 
### --------------------------------------------------------------------------
# loop through our list of dataframes & adjust each for opponent & homefield advantage
print('Performing Opponent-Adjustment...')

# Create dummy variables for each Team/Opponent, plus Homefield Advantage
dfDummies = pd.get_dummies(df[[offStr, hfaStr, defStr]])

# Hyperparameter tuning for alpha (aka lambda, ie the penalty term) 
# for full season PBP data, the alpha will be 150-200, for smaller sample sizes it may find a higher alpha
rdcv = linear_model.RidgeCV(alphas = [75,100,125,150,175,200,225,250,275,300,325], fit_intercept = True)
rdcv.fit(dfDummies,df[stat]);
alf = rdcv.alpha_

# Or set Alpha directly here
# alf = 175

# Set up ridge regression model parameters
reg = linear_model.Ridge(alpha = alf, fit_intercept = True)  

# Run the regression
# X values in the regression will be dummy variables each Offense/Defense, plus Homefield Advantage
# y values will be the raw value from each game for the specific stat we're adjusting
reg.fit(X = dfDummies, y = df[stat])

# Extract regression coefficients
dfRegResults = pd.DataFrame({
    'coef_name': dfDummies.columns.values,
    'ridge_reg_coef': reg.coef_})

# Add intercept back in to reg coef to get 'adjusted' value
dfRegResults['ridge_reg_value'] = (dfRegResults['ridge_reg_coef']+reg.intercept_)

#Print the HFA and Alpha values
print('Homefield Advantage: (alpha: '+str(alf)+')')
print('{:.3f}'.format(dfRegResults[dfRegResults['coef_name'] == hfaStr]['ridge_reg_coef'][0]))

# Offense
dfAdjOff = (dfRegResults[dfRegResults['coef_name'].str.slice(0, len(offStr)) == offStr].
   rename(columns = {"ridge_reg_value": stat}).
  reset_index(drop = True))
dfAdjOff['coef_name'] = dfAdjOff['coef_name'].str.replace(offStr+'_','')
dfAdjOff = dfAdjOff.drop(columns=['ridge_reg_coef'])

# Defense
dfAdjDef = (dfRegResults[dfRegResults['coef_name'].str.slice(0, len(defStr)) == defStr].
   rename(columns = {"ridge_reg_value": stat}).
  reset_index(drop = True))
dfAdjDef['coef_name'] = dfAdjDef['coef_name'].str.replace(defStr+'_','')
dfAdjDef = dfAdjDef.drop(columns=['ridge_reg_coef'])

Performing Opponent-Adjustment...
Homefield Advantage: (alpha: 225)
0.013


In [26]:
### --------------------------------------------------------------------------
### PART 6 - Join Data and Output
### --------------------------------------------------------------------------

# associate the raw and adjusted epa with each team
dfTeam['rawOff'] = dfTeam.join(df.groupby('offense').mean().ppa, on='school').ppa # raw avg ppa
dfTeam['adjOff'] = dfTeam.join(dfAdjOff.set_index('coef_name'), on='school').ppa # adj est ppa
dfTeam['rawDef'] = dfTeam.join(df.groupby('defense').mean().ppa, on='school').ppa
dfTeam['adjDef'] = dfTeam.join(dfAdjDef.set_index('coef_name'), on='school').ppa

# final formatting and output
dfTeam = dfTeam.round(3) # round adjusted value to thousandths 
print(dfTeam)
dfTeam.to_csv('adj'+str(u_year)+'.csv', index=False)
print('Adjusted Data ouput to: adj'+str(u_year)+'.csv')
end = time.time()
print('Time Elapsed (s): ',round(end-start,1))

                school  rawOff  adjOff  rawDef  adjDef
0            Air Force   0.281   0.270   0.178   0.188
1                Akron   0.142   0.145   0.460   0.386
2              Alabama   0.294   0.326   0.069   0.067
3    Appalachian State   0.258   0.238   0.101   0.123
4              Arizona   0.040   0.090   0.248   0.218
..                 ...     ...     ...     ...     ...
125   Western Kentucky   0.354   0.320   0.190   0.206
126   Western Michigan   0.308   0.266   0.246   0.239
127      West Virginia   0.175   0.173   0.186   0.164
128          Wisconsin   0.156   0.187  -0.010   0.040
129            Wyoming   0.215   0.202   0.170   0.192

[130 rows x 5 columns]
Adjusted Data ouput to: adj2021.csv
Time Elapsed (s):  2593.2
