In [None]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np

In [None]:
# Never truncate the contents of wide text columns when a frame is displayed.
# (Optionally also set 'display.width' to None to stop pandas wrapping wide frames.)
pd.options.display.max_colwidth = None

In [None]:
# Season to analyse; also embedded in the names of the CSV files written further down.
year = 2007
# Play-by-play rows for that single season, loaded through nfl_data_py.
pbp = nfl.import_pbp_data([year])

In [None]:
def game_epa(game, penalties=False, weight_by_wp=True):
    """Summarise one game's EPA per team and play type.

    Parameters
    ----------
    game : pandas.DataFrame
        Play-by-play rows of a single game. Reads the columns ``posteam``,
        ``epa``, ``play_type``, ``home_team``, ``away_team``, ``game_id`` and
        ``game_date``, plus ``penalty`` (when ``penalties`` is False) and
        ``wp`` (when ``weight_by_wp`` is True). The frame is NOT modified.
    penalties : bool, default False
        Keep plays flagged ``penalty == 1``. They are excluded by default
        because the expected points of the counterfactual (no penalty) cannot
        be computed, so their impact cannot be measured properly.
    weight_by_wp : bool, default True
        Scale each play's EPA by how lopsided the game is, based on
        ``|wp - 0.5|``: up to 0.45 -> 1.0, (0.45, 0.49] -> 0.5,
        (0.49, 0.5] -> 0.1. Plays with a missing ``wp`` get a NaN weight and
        therefore do not contribute to the sums.

    Returns
    -------
    pandas.DataFrame
        Two rows, index 0 for the home team and 1 for the away team, with
        columns ``game_id``, ``team``, ``game_date`` and, for every play type,
        ``<play_type>`` (EPA summed over the team's own possessions) and
        ``<play_type>_def`` (negated EPA summed over the opponent's
        possessions). Play types that never occurred are reported as 0.
    """
    play_types = ('extra_point', 'field_goal', 'kickoff', 'punt', 'pass', 'run', 'qb_kneel')

    # These are constant for the whole game, so read them before any filtering.
    home_team = game['home_team'].values[0]
    away_team = game['away_team'].values[0]
    game_id = game['game_id'].values[0]
    game_date = game['game_date'].values[0]

    # Rows without a possessing team or without an EPA value (game start/end,
    # timeouts) carry nothing to aggregate.
    keep = game['posteam'].notna() & game['epa'].notna()
    if not penalties:
        keep &= (game['penalty'] != 1)
    # Boolean indexing returns a new frame, so the caller's data stays untouched.
    plays = game.loc[keep]

    if weight_by_wp:
        # A team probably doesn't play differently until the game is ~95% decided,
        # so almost everything keeps weight 1 and only blowouts are discounted.
        # include_lowest=True keeps plays at exactly wp == 0.5 (distance 0) in the
        # first bin; without it they fall outside every bin and are lost.
        weights = pd.cut(
            (plays['wp'] - 0.5).abs(),
            bins=[0, .45, .49, .5],
            labels=[1.0, 0.5, 0.1],
            include_lowest=True,
        ).astype(float)
        plays = plays.assign(epa=plays['epa'] * weights)

    totals = plays.groupby(['posteam', 'play_type'])['epa'].sum()
    teams_with_plays = totals.index.get_level_values('posteam')

    def _offense_totals(team):
        # EPA per play type while `team` had the ball; empty if it has no plays left.
        if team in teams_with_plays:
            return totals.xs(team, level='posteam')
        return pd.Series(dtype=float)

    offense = {team: _offense_totals(team) for team in (home_team, away_team)}
    # The opposing team's EPA during their possessions is the inverse of your defense's EPA.
    defense = {home_team: offense[away_team] * -1, away_team: offense[home_team] * -1}

    columns = ['game_id', 'team', 'game_date']
    for play_type in play_types:
        columns += [play_type, f'{play_type}_def']

    rows = []
    for team in (home_team, away_team):
        row = {'game_id': game_id, 'team': team, 'game_date': game_date}
        for play_type in play_types:
            row[play_type] = offense[team].get(play_type, 0.0)
            row[f'{play_type}_def'] = defense[team].get(play_type, 0.0)
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)


In [None]:
# Distinct game identifiers found in the play-by-play data; the loops below iterate over these.
game_ids = pbp['game_id'].unique()

In [None]:
# Empty accumulator for the per-game summaries: three identifying columns followed by
# an offensive and a defensive ("_def") EPA column for each play type.
game_epas_raw = pd.DataFrame(
    columns=['game_id', 'team', "game_date"] + [
        f'{play_type}{suffix}'
        for play_type in ("extra_point", "field_goal", "kickoff", "punt", "pass", "run", "qb_kneel")
        for suffix in ('', '_def')
    ]
)

In [None]:
# Summarise every game first and concatenate once: calling pd.concat inside the loop
# re-copies all previously collected rows on each iteration.
# Each game gets its own .copy() so game_epa never works on a view of pbp.
per_game_epas = [game_epa(pbp.loc[pbp['game_id'] == game_id].copy()) for game_id in game_ids]
# .iloc[0:0] keeps only the column schema of the accumulator, so re-running this cell
# replaces the rows instead of appending a second copy of every game.
# The per-game 0/1 index labels are left as they are (no ignore_index), exactly as the
# row-by-row concatenation produced them.
game_epas_raw = pd.concat([game_epas_raw.iloc[0:0], *per_game_epas])

In [None]:
# Unadjusted per-team averages of the per-game EPA columns, plus combined
# special-teams / offense / defense / overall totals.
epapg_raw = (
    game_epas_raw
    .drop(columns=['game_id', 'game_date'])
    .groupby('team')
    .agg(['mean'])
    # agg with a list yields (column, 'mean') pairs; keep only the column name.
    .droplevel(1, axis=1)
    .assign(
        special_teams=lambda t: (
            t['extra_point'] + t['extra_point_def']
            + t['field_goal'] + t['field_goal_def']
            + t['kickoff'] + t['kickoff_def']
            + t['punt'] + t['punt_def']
        ),
        total_offense=lambda t: t['pass'] + t['run'],
        total_defense=lambda t: t['pass_def'] + t['run_def'],
        total=lambda t: t['total_offense'] + t['total_defense'] + t['special_teams'],
    )
)
# Persist the ranking, best overall team first.
epapg_raw.sort_values(by='total', ascending=False).to_csv(f'epapg_raw_{year}.csv')

In [None]:
# so now we take these values as our starting point, and we loop through the games again
# but instead of just slapping the two resulting rows from the game onto the full dataframe,
# we adjust them by the values from this epapg df.
# instead of using pass, we use pass + opponent's average pass defense, etc.

# TODO: at some point, I'd like to do a version where we actually calculate epapg after each week, instead of using the end-of-season value, but this will do for now

In [None]:
# Starting point for the opponent adjustment: the raw per-team averages.
# A copy is taken so that epapg and epapg_raw are independent objects.
epapg = epapg_raw.copy()

In [None]:
def adjust_row(r,o):
    """Return a copy of one team's game row shifted by its opponent's averages.

    ``r`` is a team's row for one game and ``o`` holds the opponent's per-team
    averages; both are indexed by column name. For every play type the team's
    offensive value has the opponent's ``<play_type>_def`` average added, and
    the team's ``<play_type>_def`` value has the opponent's offensive average
    added. ``game_id``, ``team`` and ``game_date`` are carried over unchanged.
    """
    adjusted = {key: r[key] for key in ('game_id', 'team', 'game_date')}
    for play_type in ('extra_point', 'field_goal', 'kickoff', 'punt', 'pass', 'run', 'qb_kneel'):
        defense_key = f'{play_type}_def'
        adjusted[play_type] = r[play_type] + o[defense_key]
        adjusted[defense_key] = r[defense_key] + o[play_type]
    return pd.Series(adjusted)

In [None]:
def adjust_game(game_id):
    """Opponent-adjust the two summary rows of one game.

    Reads the module-level ``game_epas_raw`` (per-game rows) and ``epapg``
    (current per-team averages). Each team's row is adjusted with the averages
    of the other team via ``adjust_row``.

    Returns a two-row DataFrame: the adjusted first row, then the adjusted
    second row. Raises ValueError if the game does not have exactly two rows
    in ``game_epas_raw``.
    """
    game_epa_raw = game_epas_raw.loc[game_epas_raw['game_id'] == game_id]
    # Exactly two rows are expected, in the order game_epa emits them (home, away).
    (home_team, away_team) = game_epa_raw['team']
    # Select the rows by position rather than by the labels 0/1, so this keeps
    # working whatever index game_epas_raw ends up with.
    home_row = game_epa_raw.iloc[0]
    away_row = game_epa_raw.iloc[1]
    game_epa_adj = pd.DataFrame([
        adjust_row(home_row, epapg.loc[away_team]),
        adjust_row(away_row, epapg.loc[home_team]),
    ])

    return game_epa_adj

In [None]:
# Number of opponent-adjustment passes. Every pass re-adjusts the raw per-game rows
# with the team averages produced by the previous pass.
N_ADJUSTMENT_PASSES = 25

# Schema-only frame (loop-invariant): guarantees the expected columns exist even when
# there are no games to concatenate.
game_epas_schema = pd.DataFrame(columns=['game_id', 'team', "game_date", "extra_point", "extra_point_def", "field_goal", "field_goal_def", "kickoff", "kickoff_def", "punt", "punt_def", "pass", "pass_def", "run", "run_def", "qb_kneel", "qb_kneel_def"])

for adjustment_pass in range(N_ADJUSTMENT_PASSES):
    # adjust_game reads the current `epapg`, so the adjusted rows are rebuilt on every pass.
    # Collect all games first and concatenate once instead of growing the frame per game.
    adjusted_games = [adjust_game(game_id) for game_id in game_ids]
    game_epas_adj = pd.concat([game_epas_schema, *adjusted_games])

    # Recompute the per-team averages from the adjusted rows; these feed the next pass.
    epapg = game_epas_adj.drop(['game_id','game_date'], axis=1).groupby('team').agg({'mean'})
    # agg produced (column, 'mean') pairs; keep only the column name.
    epapg.columns = [name[0] for name in epapg.columns.to_flat_index()]
    epapg['special_teams'] = epapg['extra_point'] + epapg['extra_point_def'] + epapg['field_goal'] + epapg['field_goal_def'] + epapg['kickoff'] + epapg['kickoff_def'] + epapg['punt'] + epapg['punt_def']
    epapg['total_offense'] = epapg['pass'] + epapg['run']
    epapg['total_defense'] = epapg['pass_def'] + epapg['run_def']
    epapg['total'] = epapg['total_offense'] + epapg['total_defense'] + epapg['special_teams']

In [None]:
# Write the opponent-adjusted per-team averages to CSV, highest 'total' first.
epapg.sort_values(by='total', ascending=False).to_csv(f'epapg_{year}.csv')

In [None]:
# Strength of schedule: how much the opponent adjustment moved each team's overall
# value (adjusted total minus raw total).
sos = pd.DataFrame({
    'epapg_adj': epapg['total'],
    'epapg_raw': epapg_raw['total'],
}).assign(sos=lambda totals: totals['epapg_adj'] - totals['epapg_raw'])
# Largest upward adjustment first.
sos.sort_values(by='sos', ascending=False).to_csv(f'sos_{year}.csv')

In [None]:
from scipy.stats import norm

In [None]:
def win_prob(spread, scale=13.45):
    """Convert an expected point margin into a win probability.

    The final margin is modelled as normally distributed around ``spread``
    with standard deviation ``scale``. A margin above 0.5 counts as a win, a
    margin between -0.5 and 0.5 counts as a tie, and a tie is credited as half
    a win.

    Parameters
    ----------
    spread : float or array-like
        Expected margin from the team's point of view (positive = favoured).
    scale : float, default 13.45
        Standard deviation of the margin; must be positive. The default is the
        value this notebook has always used.

    Returns
    -------
    float or numpy.ndarray
        ``P(win) + 0.5 * P(tie)``.
    """
    below_win = norm.cdf(x=0.5, loc=spread, scale=scale)
    below_tie = norm.cdf(x=-0.5, loc=spread, scale=scale)
    prob_win = 1 - below_win
    prob_tie = below_win - below_tie
    return prob_win + 0.5 * prob_tie

In [None]:
def matchup(team_a, team_b):
    """Compare two teams using the module-level ``epapg`` table.

    Returns a ``(spread, prob)`` tuple: ``spread`` is ``team_a``'s 'total'
    minus ``team_b``'s 'total', and ``prob`` is ``win_prob(spread)``, i.e. the
    win probability from ``team_a``'s point of view.
    """
    total_a = epapg.loc[team_a]['total']
    total_b = epapg.loc[team_b]['total']
    spread = total_a - total_b
    return spread, win_prob(spread)