In [118]:
import numpy as np
import pandas as pd
import gc


In [119]:
seasons = list(range(2011,2020))

df = None

for s in seasons:
    path = './output/{}_boxscores.csv'.format(str(s))
    _df = pd.read_csv(path)
    
    if df is not None:
        df = pd.concat([df,_df],axis=0)
    else:
        df = _df
        
print(len(df))

df = df.drop(columns=['away_ranking','home_ranking'])


102617


In [120]:
all_teams = list(set(list(df['winning_name'].values)+list(df['losing_name'].values)))
print(all_teams[:15])

teams = []
for t in all_teams:
    if '\n\t\t\t' not in t:
        teams.append(t)
        
print(len(teams))
print(teams[:15])

['Saginaw Valley State\n\t\t\t', 'Coppin State', 'Wiley\n\t\t\t', 'Jarvis Christian\n\t\t\t', 'Southern Arkansas\n\t\t\t', 'Catholic\n\t\t\t', 'Northern Michigan\n\t\t\t', 'Brown', 'Louisiana-Monroe', 'Shawnee State\n\t\t\t', 'Madonna\n\t\t\t', 'Erskine\n\t\t\t', 'Iona', 'Hartwick\n\t\t\t', 'Ohio-Chillicothe\n\t\t\t']
355
['Coppin State', 'Brown', 'Louisiana-Monroe', 'Iona', 'UC-Irvine', 'Binghamton', 'Western Kentucky', 'USC Upstate', 'South Dakota State', 'Davidson', 'Lipscomb', 'Arizona', 'Idaho State', 'Baylor', 'Wofford']


In [121]:
def add_season(df):
    # datetime
    df['date'] = pd.to_datetime(df['date'])
    
    df['month'] = df['date'].copy().dt.month.astype(int)
    df['year'] = df['date'].copy().dt.year.astype(int)
    df['season'] = df['year'].copy()
    df['season'] = np.where(df['month']>9, df['season']+1, df['season'])
    
    df = df.drop(columns=['month','year'])
    
    return df

df = add_season(df)


## Features

Time span types
- EWA season
- RA season
- EWA last 5 seasons

Box score feats
- offensive rating
- defensive rating
- pct of points from FTs
- pct of points from 2Pts
- pct of points form 3Pts
- offensive rebound rate
- defensive rebound rate
- pace
- turnover rate
- win pct

*Both for team and opponent



In [122]:
# game ids
def add_ids(df):
    df['wn_copy'] = df['winning_name'].str.replace('\n\t\t\t','').str.replace(' ','').str.lower()
    df['ln_copy'] = df['losing_name'].str.replace('\n\t\t\t','').str.replace(' ','').str.lower()
    
    df['wn_copy2'] = df['wn_copy'].str[:3]
    df['wn_copy3'] = df['wn_copy'].str[-3:]
    
    df['ln_copy2'] = df['ln_copy'].str[:3]
    df['ln_copy3'] = df['ln_copy'].str[-3:]
    
    df['game_id'] = df['date'].dt.strftime('%m%d%y') + df['wn_copy2'] + df['wn_copy3'] + df['ln_copy2']  + df['ln_copy3']
    
#     old = len(df)
    df = df.drop_duplicates(subset=['game_id'])
#     new = len(df)
    # 53184
    
#     print("Successfully dropped {} duplicate box scores".format(old-new))
    
    df = df.drop(columns=['wn_copy2','ln_copy2','wn_copy3','ln_copy3'])
    
    df['team1_id'] = df['season'].astype(str)+df['wn_copy']
    df['team2_id'] = df['season'].astype(str)+df['ln_copy']
    
    df['alt1_id'] = df['wn_copy']
    df['alt2_id'] = df['ln_copy']
    
    df2 = df.copy()
    
    df = df.drop(columns=['team1_id'])
    df2 = df2.drop(columns=['team2_id'])
    df = df.drop(columns=['alt1_id'])
    df2 = df2.drop(columns=['alt2_id'])
    
    df = df.rename(columns={'team2_id':'team_id','alt2_id':'alt_id'})
    df2 = df2.rename(columns={'team1_id':'team_id','alt1_id':'alt_id'})
    
    df = pd.concat([df,df2],axis=0)
    
    df['home_id'] = np.where(df['winner']=='Home', df['season'].astype(str)+df['wn_copy'], df['season'].astype(str)+df['ln_copy'])
    df['away_id'] = np.where(df['winner']=='Away', df['season'].astype(str)+df['wn_copy'], df['season'].astype(str)+df['ln_copy'])
    
    df['win_id'] = df['season'].astype(str)+df['wn_copy']
    df['lose_id'] = df['season'].astype(str)+df['ln_copy']
    
    df = df.drop(columns=['wn_copy','ln_copy'])
    
    print(len(df))
    #106,368
    
    return df

df = add_ids(df)

df[['home_id','away_id','alt_id']].head()

106368


Unnamed: 0,home_id,away_id,alt_id
0,2011airforce,2011colorado-coloradosprings,colorado-coloradosprings
1,2011airforce,2011coloradocollege,airforce
2,2011airforce,2011tennesseestate,tennesseestate
3,2011wofford,2011airforce,wofford
4,2011airforce,2011calstatenorthridge,calstatenorthridge


In [123]:
# need team-specific ratings leading up to game
# opponent ratings will be harder
home_games = df.loc[df['home_id']==df['team_id']]
away_games = df.loc[df['away_id']==df['team_id']]

del df
gc.collect()

118

In [124]:
# change column names
old_cols = list(home_games)
home_cols = []

for oc in old_cols:
    if 'away_' in oc:
        hc = oc.replace('away_','opp_')
    elif 'home_' in oc:
        hc = oc.replace('home_','')
    else:
        hc = oc
    home_cols.append(hc)
    
# opposite for away_cols
away_cols = []

for oc in old_cols:
    if 'home_' in oc:
        ac = oc.replace('home_','opp_')
    elif 'away_' in oc:
        ac = oc.replace('away_','')
    else:
        ac = oc
    away_cols.append(ac)
    
home_games.columns=home_cols
away_games.columns=away_cols

col_order = list(away_games)
home_games = home_games[col_order]

df = pd.concat([home_games,away_games], axis=0)

# already have this column
df = df.drop(columns=['team_id'])

del home_games
del away_games
gc.collect()

14

In [125]:
# cols ready for running averages
dcols = ['offensive_rating','defensive_rating','opp_offensive_rating','opp_defensive_rating','pace']

# cols needed for feature creation
ncols = ['free_throws','two_point_field_goals','three_point_field_goals',
         'offensive_rebounds','defensive_rebounds','opp_offensive_rebounds','opp_defensive_rebounds',
         'turnovers',
         'wins','losses']

# cols necessary to keep 
nncols = ['date','id','alt_id','opp_id','game_id','season','win_id','lose_id','points','opp_points']

cols = nncols+dcols+ncols

old_num_cols = len(list(df))
df = df[cols]
new_num_cols = len(list(df))

print("Dropped {} unnecessary columns".format(old_num_cols-new_num_cols))

Dropped 66 unnecessary columns


In [126]:
df.head()

Unnamed: 0,date,id,alt_id,opp_id,game_id,season,win_id,lose_id,points,opp_points,...,free_throws,two_point_field_goals,three_point_field_goals,offensive_rebounds,defensive_rebounds,opp_offensive_rebounds,opp_defensive_rebounds,turnovers,wins,losses
1,2010-11-17,2011airforce,airforce,2011coloradocollege,111710colegeairrce,2011,2011coloradocollege,2011airforce,57,60,...,16,16.0,3.0,5,30,10,26,16,1,1
3,2010-11-24,2011wofford,wofford,2011airforce,112410airrcewoford,2011,2011airforce,2011wofford,66,72,...,12,15.0,8.0,21,24,2,21,18,2,4
15,2011-01-15,2011airforce,airforce,2011unlv,011511unlnlvairrce,2011,2011unlv,2011airforce,52,64,...,12,11.0,6.0,4,20,9,27,9,10,6
19,2011-01-29,2011tcu,tcu,2011airforce,012911airrcetcutcu,2011,2011airforce,2011tcu,65,66,...,7,17.0,8.0,10,26,3,22,14,10,13
20,2011-02-01,2011airforce,airforce,2011newmexico,020111newicoairrce,2011,2011newmexico,2011airforce,61,75,...,7,18.0,6.0,4,14,9,17,16,12,9


In [127]:
# let's do season by season first
# easy first
gb = df.groupby(['season','alt_id'])['offensive_rating','defensive_rating','pace'].mean().reset_index()

gb = gb.dropna()

gb[['ORtg','DRtg','Pace']] = gb.groupby(['alt_id'])['offensive_rating','defensive_rating','pace'].shift()

gb = gb.dropna()


def team_ewm(team, span=5, alpha=0.85):
    feature_ewm = team.rolling(window=span, min_periods=1).mean()[:span]
    rest = team[span:]
    return pd.concat([feature_ewm, rest]).ewm(alpha=alpha, adjust=False).mean()


gb[['5y_ORtg','5y_DRtg','5y_Pace']] = gb.groupby(['alt_id'])['ORtg','DRtg','Pace'].transform(team_ewm)

gb['id'] = gb['season'].astype(str)+gb['alt_id']

gb.head()


Unnamed: 0,season,alt_id,offensive_rating,defensive_rating,pace,ORtg,DRtg,Pace,5y_ORtg,5y_DRtg,5y_Pace,id
607,2012,airforce,96.251724,97.037931,63.596552,101.38125,101.91875,63.278125,101.38125,101.91875,63.278125,2012airforce
608,2012,akron,103.941176,94.923529,69.105882,100.638889,94.333333,68.527778,100.638889,94.333333,68.527778,2012akron
609,2012,alabama,100.20303,90.230303,64.212121,99.02973,88.462162,67.016216,99.02973,88.462162,67.016216,2012alabama
610,2012,alabama-birmingham,95.109677,96.967742,63.174194,103.645161,94.329032,64.551613,103.645161,94.329032,64.551613,2012alabama-birmingham
611,2012,alabamaa&m,88.796429,99.089286,70.125,87.492857,90.9,73.457143,87.492857,90.9,73.457143,2012alabamaa&m


In [128]:
# now we have 3/10 features. rest will require extra step

gb2 = df.groupby(['season','alt_id'])['free_throws','two_point_field_goals','three_point_field_goals','offensive_rebounds','defensive_rebounds','opp_offensive_rebounds','opp_defensive_rebounds','turnovers','wins','losses','pace'].sum().reset_index()

gb2 = gb2.dropna()

gb2[['fts','2pt','3pt','oreb','dreb','oor','odr','tos','w','l','pace']] = gb2.groupby(['alt_id'])['free_throws','two_point_field_goals','three_point_field_goals','offensive_rebounds','defensive_rebounds','opp_offensive_rebounds','opp_defensive_rebounds','turnovers','wins','losses','pace'].shift()

gb2 = gb2.dropna()


# extra step
gb2['dra'] = gb2['dreb'] + gb2['oor']

# offensive rebounds available
gb2['ora'] = gb2['oreb'] + gb2['odr']

#off/def rebound percentage

gb2['DRebPct'] = gb2['dreb']/gb2['dra']
gb2['ORebPct'] = gb2['oreb']/gb2['ora']
gb2['_points'] = gb2['fts'] + 2*gb2['2pt'] + 3*gb2['3pt']
gb2['FTPct'] = gb2['fts']/gb2['_points']
gb2['2ptPct'] = gb2['2pt']/gb2['_points']
gb2['3ptPct'] = gb2['3pt']/gb2['_points']
gb2['TO%'] = gb2['tos']/gb2['pace']
gb2['gp'] = gb2['w'] + gb2['l']
gb2['W%'] = gb2['w']/gb2['gp']

def team_ewm(team, span=5, alpha=0.85):
    feature_ewm = team.rolling(window=span, min_periods=1).mean()[:span]
    rest = team[span:]
    return pd.concat([feature_ewm, rest]).ewm(alpha=alpha, adjust=False).mean()


gb2[['5y_DReb','5y_OReb','5y_FTPct','5y_2PT','5y_3PT','5y_TO','5y_Win']] = gb2.groupby(['alt_id'])['DRebPct','ORebPct','FTPct','2ptPct','3ptPct','TO%','W%'].transform(team_ewm)

gb2 = gb2.dropna()

gb2['id'] = gb2['season'].astype(str)+gb2['alt_id']

gb2.head()


Unnamed: 0,season,alt_id,free_throws,two_point_field_goals,three_point_field_goals,offensive_rebounds,defensive_rebounds,opp_offensive_rebounds,opp_defensive_rebounds,turnovers,...,gp,W%,5y_DReb,5y_OReb,5y_FTPct,5y_2PT,5y_3PT,5y_TO,5y_Win,id
607,2012,airforce,362,424.0,192.0,153,614,267,659,365,...,528.0,0.583333,0.667954,0.178824,0.201543,0.245902,0.102218,0.185194,0.583333,2012airforce
608,2012,akron,513,661.0,207.0,349,749,330,677,476,...,666.0,0.612613,0.691189,0.263473,0.181062,0.241284,0.112124,0.173085,0.612613,2012akron
609,2012,alabama,443,642.0,136.0,301,765,317,651,415,...,703.0,0.644381,0.687447,0.354839,0.186413,0.314598,0.061464,0.204468,0.644381,2012alabama
610,2012,alabama-birmingham,388,475.0,179.0,306,691,293,591,431,...,496.0,0.75,0.715464,0.313305,0.173667,0.243983,0.112789,0.185398,0.75,2012alabama-birmingham
611,2012,alabamaa&m,359,482.0,147.0,269,580,269,638,448,...,406.0,0.475369,0.682203,0.28934,0.230642,0.280973,0.069137,0.211494,0.475369,2012alabamaa&m


In [129]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,date,id,alt_id,opp_id,game_id,season,win_id,lose_id,points,opp_points,...,free_throws,two_point_field_goals,three_point_field_goals,offensive_rebounds,defensive_rebounds,opp_offensive_rebounds,opp_defensive_rebounds,turnovers,wins,losses
0,2010-11-17,2011airforce,airforce,2011coloradocollege,111710colegeairrce,2011,2011coloradocollege,2011airforce,57,60,...,16,16.0,3.0,5,30,10,26,16,1,1
1,2010-11-24,2011wofford,wofford,2011airforce,112410airrcewoford,2011,2011airforce,2011wofford,66,72,...,12,15.0,8.0,21,24,2,21,18,2,4
2,2011-01-15,2011airforce,airforce,2011unlv,011511unlnlvairrce,2011,2011unlv,2011airforce,52,64,...,12,11.0,6.0,4,20,9,27,9,10,6
3,2011-01-29,2011tcu,tcu,2011airforce,012911airrcetcutcu,2011,2011airforce,2011tcu,65,66,...,7,17.0,8.0,10,26,3,22,14,10,13
4,2011-02-01,2011airforce,airforce,2011newmexico,020111newicoairrce,2011,2011newmexico,2011airforce,61,75,...,7,18.0,6.0,4,14,9,17,16,12,9


In [130]:
# compute vectors for every team at time of each game
df = df.sort_values(by=['date'], ascending=True)

# easy first

# season-to-date avg
df[['ORtg','DRtg','Pace']] = df.groupby(['id'])['offensive_rating','defensive_rating','pace'].shift()

# expanding mean
em = df.groupby(['id'])['ORtg','DRtg','Pace'].expanding().mean().reset_index()

if 'level_1' in list(em):
    em = em.drop(columns=['level_1'])

em.head()

# expanding weighted mean

def team_ewm(team, span=50, alpha=0.65):
    feature_ewm = team.rolling(window=span, min_periods=1).mean()[:span]
    rest = team[span:]
    return pd.concat([feature_ewm, rest]).ewm(alpha=alpha, adjust=False).mean()


ewm = df.groupby(['id'])['ORtg','DRtg','Pace'].apply(team_ewm).reset_index()

if 'level_1' in list(ewm):
    ewm = ewm.drop(columns=['level_1'])

ewm.columns=['id','w_ORtg', 'w_DRtg','w_Pace']

gb3 = pd.concat([em, ewm.drop(columns=['id'])], axis=1)

gb3.head()

Unnamed: 0,id,ORtg,DRtg,Pace,w_ORtg,w_DRtg,w_Pace
0,2011airforce,,,,,,
1,2011airforce,106.5,85.5,61.8,106.5,85.5,61.8
2,2011airforce,93.95,85.6,62.0,98.3425,85.565,61.93
3,2011airforce,103.466667,90.866667,65.133333,101.673208,89.011083,64.012167
4,2011airforce,101.925,90.45,65.375,101.836873,89.946379,64.898008


In [132]:
# now let's do the other

# shift up one
df[['fts','2pt','3pt','oreb','dreb','oor','odr','tos','w','l','Pace']] = df.groupby(['id'])['free_throws','two_point_field_goals','three_point_field_goals','offensive_rebounds','defensive_rebounds','opp_offensive_rebounds','opp_defensive_rebounds','turnovers','wins','losses','pace'].shift()


es = df.groupby(['id'])['fts','2pt','3pt','oreb','dreb','oor','odr','tos','w','l','Pace'].expanding().sum().reset_index()


if 'level_1' in list(es):
    es = es.drop(columns=['level_1'])
    
    
# defensive rebounds available
es['dra'] = es['dreb'] + es['oor']

# offensive rebounds available
es['ora'] = es['oreb'] + es['odr']

#off/def rebound percentage

es['DRebPct'] = es['dreb']/es['dra']
es['ORebPct'] = es['oreb']/es['ora']
es['_points'] = es['fts'] + 2*es['2pt'] + 3*es['3pt']
es['FTPct'] = es['fts']/es['_points']
es['2ptPct'] = es['2pt']/es['_points']
es['3ptPct'] = es['3pt']/es['_points']
es['TO%'] = es['tos']/es['Pace']
es['gp'] = es['w'] + es['l']
es['W%'] = es['w']/es['gp']

es.head()



Unnamed: 0,id,fts,2pt,3pt,oreb,dreb,oor,odr,tos,w,...,ora,DRebPct,ORebPct,_points,FTPct,2ptPct,3ptPct,TO%,gp,W%
0,2011airforce,,,,,,,,,,...,,,,,,,,,,
1,2011airforce,14.0,14.0,8.0,12.0,25.0,11.0,22.0,12.0,1.0,...,34.0,0.694444,0.352941,66.0,0.212121,0.212121,0.121212,0.194175,1.0,1.0
2,2011airforce,30.0,30.0,11.0,17.0,55.0,21.0,48.0,28.0,2.0,...,65.0,0.723684,0.261538,123.0,0.243902,0.243902,0.089431,0.225806,3.0,0.666667
3,2011airforce,49.0,55.0,17.0,24.0,79.0,34.0,63.0,44.0,4.0,...,87.0,0.699115,0.275862,210.0,0.233333,0.261905,0.080952,0.225179,6.0,0.666667
4,2011airforce,67.0,73.0,23.0,26.0,100.0,55.0,87.0,59.0,7.0,...,113.0,0.645161,0.230088,282.0,0.237589,0.258865,0.08156,0.225621,10.0,0.7


In [134]:
# weighted sum

def team_ews(team, span=50, alpha=0.65):
    feature_ewm = team.rolling(window=span, min_periods=1).sum()[:span]
    rest = team[span:]
    return pd.concat([feature_ewm, rest]).ewm(alpha=alpha, adjust=False).mean()


ews = df.groupby(['id'])['fts','2pt','3pt','oreb','dreb','oor','odr','tos','w','l','Pace'].apply(team_ews).reset_index()

if 'level_1' in list(ews):
    ews = ews.drop(columns=['level_1'])
    
    
# defensive rebounds available
ews['dra'] = ews['dreb'] + ews['oor']

# offensive rebounds available
ews['ora'] = ews['oreb'] + ews['odr']

#off/def rebound percentage

ews['DRebPct'] = ews['dreb']/es['dra']
ews['ORebPct'] = ews['oreb']/ews['ora']
ews['_points'] = ews['fts'] + 2*ews['2pt'] + 3*ews['3pt']
ews['FTPct'] = ews['fts']/ews['_points']
ews['2ptPct'] = ews['2pt']/ews['_points']
ews['3ptPct'] = ews['3pt']/ews['_points']
ews['TO%'] = ews['tos']/ews['Pace']
ews['gp'] = ews['w'] + ews['l']
ews['W%'] = ews['w']/ews['gp']

es = es[['id','DRebPct','ORebPct','FTPct','2ptPct','3ptPct','TO%','W%']]
ews = ews[['DRebPct','ORebPct','FTPct','2ptPct','3ptPct','TO%','W%']]

es.columns= ['id','Sea_DReb','Sea_OReb','Sea_FT','Sea_2pt','Sea_3pt','Sea_TO','Sea_W']
ews.columns= ['wSea_DReb','wSea_OReb','wSea_FT','wSea_2pt','wSea_3pt','wSea_TO','wSea_W']

gb4 = pd.concat([es, ews], axis=1)

gb4.head()



Unnamed: 0,id,fts,2pt,3pt,oreb,dreb,oor,odr,tos,w,...,ora,DRebPct,ORebPct,_points,FTPct,2ptPct,3ptPct,TO%,gp,W%
0,2011airforce,,,,,,,,,,...,,,,,,,,,,
1,2011airforce,14.0,14.0,8.0,12.0,25.0,11.0,22.0,12.0,1.0,...,34.0,0.694444,0.352941,66.0,0.212121,0.212121,0.121212,0.194175,1.0,1.0
2,2011airforce,24.4,24.4,9.95,15.25,44.5,17.5,38.9,22.4,1.65,...,54.15,0.585526,0.281625,103.05,0.236778,0.236778,0.096555,0.219114,2.3,0.717391
3,2011airforce,40.39,44.29,14.5325,20.9375,66.925,28.225,54.565,36.44,3.1775,...,75.5025,0.592257,0.277309,172.5675,0.234053,0.256653,0.084213,0.223846,4.705,0.675345
4,2011airforce,57.6865,62.9515,20.036375,24.228125,88.42375,45.62875,75.64775,51.104,5.662125,...,99.875875,0.570476,0.242582,243.698625,0.236712,0.258317,0.082218,0.225176,8.14675,0.695016


In [None]:
print(list(df))