In [7]:
import numpy as np
import pandas as pd
import gc


In [2]:
# Seasons to load: range(2011, 2020) yields 2011 through 2019 inclusive.
seasons = list(range(2011,2020))

# Read every season's box-score CSV first and concatenate once at the end.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# each iteration; a single concat over the list produces the same frame.
# Row labels are kept exactly as read from each file (no ignore_index), so the
# index repeats across seasons just as it did before.
season_frames = [
    pd.read_csv('./output/{}_boxscores.csv'.format(s))
    for s in seasons
]
df = pd.concat(season_frames, axis=0)
del season_frames

print(len(df))

# The ranking columns are not used further down; remove them up front.
df = df.drop(columns=['away_ranking','home_ranking'])


102617


In [3]:
# Unique names appearing on either side of a result (winner or loser).
all_teams = list(set(df['winning_name'].tolist() + df['losing_name'].tolist()))
print(all_teams[:15])

# Keep only the names that do not contain the '\n\t\t\t' whitespace run.
# NOTE(review): presumably that run marks a particular class of scraped team
# names — confirm against the scraper before relying on this filter.
teams = [name for name in all_teams if '\n\t\t\t' not in name]

print(len(teams))
print(teams[:15])

['Ferrum\n\t\t\t', 'Georgian Court\n\t\t\t', 'Martin Methodist\n\t\t\t', 'Idaho', 'Denison\n\t\t\t', 'Louisiana College\n\t\t\t', 'Cumberland\n\t\t\t', 'Milwaukee', 'Dalton State\n\t\t\t', 'Minot State\n\t\t\t', 'St. Thomas Aquinas\n\t\t\t', 'Ohio', 'UConn', 'Quincy\n\t\t\t', 'Louisiana State-Alexandria\n\t\t\t']
355
['Idaho', 'Milwaukee', 'Ohio', 'UConn', 'North Dakota', 'Omaha', 'Colgate', 'Ball State', 'Delaware', 'Lafayette', 'Creighton', 'Oregon', 'Utah State', 'Wake Forest', 'Miami (FL)']


In [4]:
def add_season(df):
    """Return a copy of ``df`` with a parsed ``date`` and a ``season`` column.

    The season is the calendar year of the game, except that games played in
    October, November or December (month > 9) are assigned to the following
    year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``date`` is converted to datetime and ``season`` is
        appended as the last column. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # datetime
    df['date'] = pd.to_datetime(df['date'])

    # Local Series instead of temporary 'month'/'year' columns: nothing has to
    # be dropped afterwards and same-named input columns are not clobbered.
    month = df['date'].dt.month.astype(int)
    year = df['date'].dt.year.astype(int)

    df['season'] = np.where(month > 9, year + 1, year)

    return df

# Rebind df to the frame returned by add_season, which carries the 'season' column.
df = add_season(df)


## Features

Time span types
- EWA season
- RA season
- EWA last 5 seasons

Box score feats
- offensive rating
- defensive rating
- pct of points from FTs
- pct of points from 2Pts
- pct of points from 3Pts
- offensive rebound rate
- defensive rebound rate
- pace
- win pct
- past opp offensive rtg
- past opp defensive rtg

*Both for team and opponent



In [5]:
# game ids
def add_ids(df):
    """Attach game/team identifiers and emit two rows per game (one per team).

    Steps:
      1. Normalise the winner and loser names (remove the newline/tab run,
         remove spaces, lowercase).
      2. Build ``game_id`` from the date (``%m%d%y``) plus the first and last
         three characters of each normalised name, then drop rows that repeat
         a ``game_id`` (the first occurrence is kept).
      3. Build season-prefixed ids: ``win_id``, ``lose_id``, and ``home_id`` /
         ``away_id`` chosen from those two according to ``winner``.
      4. Stack the games twice: first with ``team_id`` set to the loser's id,
         then with ``team_id`` set to the winner's id.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date`` (datetime dtype), ``winning_name``, ``losing_name``,
        ``winner`` (compared against ``'Home'`` / ``'Away'``) and ``season``.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by ``game_id``, ``team_id``, ``home_id``,
        ``away_id``, ``win_id`` and ``lose_id``. Row labels are not reset, so
        each label appears in both halves. The input frame is not modified.

    Side effects
    ------------
    Prints the number of rows in the returned frame.
    """
    def _normalise(names):
        # Literal (non-regex) replacements: strip '\n\t\t\t' and spaces, lowercase.
        return (names.str.replace('\n\t\t\t', '', regex=False)
                     .str.replace(' ', '', regex=False)
                     .str.lower())

    # Copy first so helper columns never leak into the caller's frame.
    games = df.copy()
    games['wn_copy'] = _normalise(games['winning_name'])
    games['ln_copy'] = _normalise(games['losing_name'])

    # NOTE(review): only the first/last three characters of each name enter the
    # key, so two different games on the same date could produce the same
    # game_id and one would be dropped — confirm this is acceptable.
    games['game_id'] = (
        games['date'].dt.strftime('%m%d%y')
        + games['wn_copy'].str[:3] + games['wn_copy'].str[-3:]
        + games['ln_copy'].str[:3] + games['ln_copy'].str[-3:]
    )
    games = games.drop_duplicates(subset=['game_id'])

    # Every id below is a per-game value, so compute each once on the
    # de-duplicated games instead of again after the rows are doubled.
    season = games['season'].astype(str)
    win_id = season + games['wn_copy']
    lose_id = season + games['ln_copy']
    per_game_ids = {
        'home_id': np.where(games['winner'] == 'Home', win_id, lose_id),
        'away_id': np.where(games['winner'] == 'Away', win_id, lose_id),
        'win_id': win_id,
        'lose_id': lose_id,
    }

    games = games.drop(columns=['wn_copy', 'ln_copy'])

    # Loser-perspective rows first, then winner-perspective rows.
    loser_rows = games.assign(team_id=lose_id, **per_game_ids)
    winner_rows = games.assign(team_id=win_id, **per_game_ids)
    df = pd.concat([loser_rows, winner_rows], axis=0)

    print(len(df))

    return df

# Rebind df to the output of add_ids (the function also prints the resulting row count).
df = add_ids(df)

# Preview the home/away identifiers for the first five rows.
df[['home_id','away_id']].head()

106368


Unnamed: 0,home_id,away_id
0,2011airforce,2011colorado-coloradosprings
1,2011airforce,2011coloradocollege
2,2011airforce,2011tennesseestate
3,2011wofford,2011airforce
4,2011airforce,2011calstatenorthridge


In [6]:
# Goal: team-specific ratings leading up to each game
# (opponent ratings will be harder).
# Split the rows by whether the row's team_id matches the home or the away id.
home_mask = df['team_id'] == df['home_id']
away_mask = df['team_id'] == df['away_id']
home_games = df.loc[home_mask]
away_games = df.loc[away_mask]

# Only the two split frames are used from here on; release the rest.
del df, home_mask, away_mask
gc.collect()

NameError: name 'gc' is not defined

In [8]:
# Rename columns so each row reads from its own team's point of view:
# the other side's prefix becomes 'opp_' and the team's own prefix is removed.
old_cols = list(home_games)

# Home rows: 'away_' -> 'opp_'; otherwise strip 'home_' (str.replace is a
# no-op when the name contains neither prefix).
home_cols = [
    name.replace('away_', 'opp_') if 'away_' in name else name.replace('home_', '')
    for name in old_cols
]

# Away rows are the mirror image: 'home_' -> 'opp_'; otherwise strip 'away_'.
# The names are still taken from home_games, so both frames are expected to
# share the same columns in the same order.
away_cols = [
    name.replace('home_', 'opp_') if 'home_' in name else name.replace('away_', '')
    for name in old_cols
]

home_games.columns = home_cols
away_games.columns = away_cols

# Line the home frame's columns up with the away frame's order, then stack.
col_order = list(away_games)
df = pd.concat([home_games[col_order], away_games], axis=0)

# 'team_id' is redundant at this point (already have this column), so drop it.
df = df.drop(columns=['team_id'])

del home_games, away_games
gc.collect()

['assist_percentage', 'assists', 'block_percentage', 'blocks', 'defensive_rating', 'defensive_rebound_percentage', 'defensive_rebounds', 'effective_field_goal_percentage', 'field_goal_attempts', 'field_goal_percentage', 'field_goals', 'free_throw_attempt_rate', 'free_throw_attempts', 'free_throw_percentage', 'free_throws', 'losses', 'minutes_played', 'offensive_rating', 'offensive_rebound_percentage', 'offensive_rebounds', 'personal_fouls', 'points', 'steal_percentage', 'steals', 'three_point_attempt_rate', 'three_point_field_goal_attempts', 'three_point_field_goal_percentage', 'three_point_field_goals', 'total_rebound_percentage', 'total_rebounds', 'true_shooting_percentage', 'turnover_percentage', 'turnovers', 'two_point_field_goal_attempts', 'two_point_field_goal_percentage', 'two_point_field_goals', 'win_percentage', 'wins', 'date', 'opp_assist_percentage', 'opp_assists', 'opp_block_percentage', 'opp_blocks', 'opp_defensive_rating', 'opp_defensive_rebound_percentage', 'opp_defensiv

14

In [21]:
# Columns that can go straight into running averages.
dcols = ['offensive_rating', 'defensive_rating', 'pace']

# Raw columns from which further features are derived.
ncols = [
    'free_throws', 'two_point_field_goals', 'three_point_field_goals',
    'offensive_rebounds', 'defensive_rebounds',
    'opp_offensive_rebounds', 'opp_defensive_rebounds',
    'wins', 'losses',
    'opp_offensive_rating', 'opp_defensive_rating',
]

# Keys, dates and scores that have to stay on every row.
nncols = [
    'date', 'id', 'opp_id', 'game_id', 'season',
    'win_id', 'lose_id', 'points', 'opp_points',
]

cols = [*nncols, *dcols, *ncols]

# Narrow the frame to the selected columns and report how many were removed.
old_num_cols = df.shape[1]
df = df[cols]
new_num_cols = df.shape[1]

print("Dropped {} unnecessary columns".format(old_num_cols - new_num_cols))

Dropped 67 unnecessary columns


In [17]:
# Order rows oldest-first so vectors can be computed for every team as of each game.
df = df.sort_values('date')


In [20]:
# Preview the team and opponent scores for the first five rows.
df[['points', 'opp_points']].head(n=5)

Unnamed: 0,points,opp_points
1299,65,79
7567,75,83
6166,83,52
5349,105,76
6166,52,83


In [22]:
# Print the column labels that remain after the selection.
print(df.columns.tolist())

['date', 'id', 'opp_id', 'game_id', 'season', 'win_id', 'lose_id', 'points', 'opp_points', 'offensive_rating', 'defensive_rating', 'pace', 'free_throws', 'two_point_field_goals', 'three_point_field_goals', 'offensive_rebounds', 'defensive_rebounds', 'opp_offensive_rebounds', 'opp_defensive_rebounds', 'wins', 'losses', 'opp_offensive_rating', 'opp_defensive_rating']
