# NCAA ML Competition - Women's 2018

## Notes

Notes about what's in the data files: https://www.kaggle.com/c/womens-machine-learning-competition-2018/data

Starter Kernel might help: https://www.kaggle.com/juliaelliott/basic-starter-kernel-ncaa-women-s-dataset

In [491]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Load regular season data

In [492]:
df = pd.read_csv('WStage2DataFiles/WRegularSeasonDetailedResults.csv') ## regular season exploration
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14


## Load the league that each team belongs to

In [493]:
df_teams = pd.read_csv('./WStage2DataFiles/' + 'WTeams.csv')
df_leagues = pd.merge(left=df_teams, right=pd.read_csv('./WStage2DataFiles/' + 'WLeagues.csv'), how='left', on=['TeamName'])
df_winning_team_leagues = df_leagues
df_winning_team_leagues = df_winning_team_leagues.rename(columns={'TeamID': 'WTeamID', 'LeagueName': 'WLeagueName'})
df_winning_team_leagues = df_winning_team_leagues.drop('TeamName', 1)
df_winning_team_leagues.head()

Unnamed: 0,WTeamID,WLeagueName
0,3101,Southland
1,3102,MWC
2,3103,MAC
3,3104,SEC
4,3105,SWAC


## Load and bin league performance (to dampen the effect of small variations)

In [494]:
## Bin league performance into 10 bins
df_league_perf = pd.read_pickle('league_performance')
df_league_perf['Leaguebin'] = pd.cut(df_league_perf['PctWins mean'], 24, labels=False)

df_league_perf = df_league_perf[['LeagueName', 'Leaguebin']]
df_team_league_perf = pd.merge(left=df_leagues, right=df_league_perf, how='left', on=['LeagueName'])

df_team_league_perf = df_team_league_perf[['TeamID', 'Leaguebin']]
df_team_league_perf.head()



Unnamed: 0,TeamID,Leaguebin
0,3101,1.0
1,3102,7.0
2,3103,2.0
3,3104,23.0
4,3105,0.0


In [495]:
##df = df[df['Season'] > 2012] ## 
##df = df[df['DayNum'] > 30]

## Inject winning and losing teams' leaugues into df
df_merged = pd.merge(left=df, right=df_winning_team_leagues, how='left', on=['WTeamID'])

df_winning_team_leagues = df_winning_team_leagues.rename(columns={'WTeamID': 'LTeamID', 'WLeagueName': 'LLeagueName'})
df = pd.merge(left=df_merged, right=df_winning_team_leagues, how='left', on=['LTeamID'])
df.head()


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WLeagueName,LLeagueName
0,2010,11,3103,63,3237,49,H,0,23,54,...,10,11,27,11,23,7,6,19,MAC,Horizon
1,2010,11,3104,73,3399,68,N,0,26,62,...,27,14,26,7,20,4,2,27,SEC,OVC
2,2010,11,3110,71,3224,59,A,0,29,62,...,23,17,23,8,15,6,0,15,Patriot,MEAC
3,2010,11,3111,63,3267,58,A,0,27,52,...,25,22,22,15,11,14,5,14,Sun Belt,C-USA
4,2010,11,3119,74,3447,70,H,1,30,74,...,21,21,32,12,14,4,2,14,Patriot,NEC


## Feature Engineering
-  Operationalize the notion of Point Opportunities Developed (POD) and Opportunity Convertion Rate (OCR)
-  Demean and rescale the performance of teams by league and season
-  Bin performance to dampen small variations


In [496]:
df['WPOD'] = df.WFGA3 * 3 + (df.WFGA - df.WFGA3) * 2 + df.WFTA * 1
df['LPOD'] = df.LFGA3 * 3 + (df.LFGA - df.LFGA3) * 2 + df.LFTA * 1

df['WOCR'] = 1.0 * df.WScore / df.WPOD
df['LOCR'] = 1.0 * df.LScore / df.LPOD


In [497]:
## Isolate field goal percentage by team regardless of who won a game
df['WFGP'] = df.WFGM / df.WFGA ## Field goal percentage for the winning team
df['LFGP'] = df.LFGM / df.LFGA ## Field goal percentage for the losing team

df_WTeamFGP_game = df[['Season','WLeagueName', 'WTeamID','WFGP']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WFGP':'FGP'})
df_LTeamFGP_game = df[['Season','LLeagueName', 'LTeamID','LFGP']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LFGP':'FGP'})
df_TeamFGP_game = pd.concat([df_WTeamFGP_game, df_LTeamFGP_game])

df_TeamFGP_season = df_TeamFGP_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'FGP':['mean']})
df_TeamFGP_season.columns = df_TeamFGP_season.columns.droplevel(1)

df_FGP_season = df_TeamFGP_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'FGP':['mean','std']})

df_FGP_season.columns = [' '.join(col).strip() for col in df_FGP_season.columns.values] ## Flatten column name levels

df_TeamFGP_season_merged = pd.merge(left=df_TeamFGP_season, right=df_FGP_season, how='left', on=['Season', 'LeagueName'])

## Normalize field goal by demeaning and standardizing
df_TeamFGP_season_merged['FGP_norm'] = ((df_TeamFGP_season_merged['FGP'] - df_TeamFGP_season_merged['FGP mean'])/df_TeamFGP_season_merged['FGP std'])

## Bin normalized field goal percentage
df_TeamFGP_season_merged['FGPbin'] = pd.cut(df_TeamFGP_season_merged.FGP_norm, 24, labels=False)
df_TeamFGP_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,FGP,FGP mean,FGP std,FGP_norm,FGPbin
0,2010,??,3147,0.343658,0.343658,0.088824,0.0,10
1,2010,AAC,3153,0.386357,0.417876,0.077519,-0.406599,6
2,2010,AAC,3163,0.514743,0.417876,0.077519,1.249591,21
3,2010,AAC,3187,0.467757,0.417876,0.077519,0.643471,15
4,2010,AAC,3222,0.402092,0.417876,0.077519,-0.203606,8


In [498]:
## Isolate free throw percentage by team regardless of who won a game
df['WFTP'] = df.WFTM / df.WFTA ## Field goal percentage for the winning team
df['LFTP'] = df.LFTM / df.LFTA ## Field goal percentage for the losing team

df_WTeamFTP_game = df[['Season','WLeagueName', 'WTeamID','WFTP']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WFTP':'FTP'})
df_LTeamFTP_game = df[['Season','LLeagueName', 'LTeamID','LFTP']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LFTP':'FTP'})
df_TeamFTP_game = pd.concat([df_WTeamFTP_game, df_LTeamFTP_game])

df_TeamFTP_season = df_TeamFTP_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'FTP':['mean']})
df_TeamFTP_season.columns = df_TeamFTP_season.columns.droplevel(1)

df_FTP_season = df_TeamFTP_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'FTP':['mean','std']})

df_FTP_season.columns = [' '.join(col).strip() for col in df_FTP_season.columns.values] ## Flatten column name levels

df_TeamFTP_season_merged = pd.merge(left=df_TeamFTP_season, right=df_FTP_season, how='left', on=['Season', 'LeagueName'])

## Normalize free throw by demeaning and standardizing
df_TeamFTP_season_merged['FTP_norm'] = ((df_TeamFTP_season_merged['FTP'] - df_TeamFTP_season_merged['FTP mean'])/df_TeamFTP_season_merged['FTP std'])

## Bin normalized free throw percentage
df_TeamFTP_season_merged['FTPbin'] = pd.cut(df_TeamFTP_season_merged.FTP_norm, 24, labels=False)
df_TeamFTP_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,FTP,FTP mean,FTP std,FTP_norm,FTPbin
0,2010,??,3147,0.66242,0.66242,0.134582,0.0,13
1,2010,AAC,3153,0.682553,0.659702,0.131597,0.173643,15
2,2010,AAC,3163,0.71756,0.659702,0.131597,0.439659,18
3,2010,AAC,3187,0.69358,0.659702,0.131597,0.257432,16
4,2010,AAC,3222,0.646145,0.659702,0.131597,-0.103019,12


In [499]:
## Isolate FGA by team regardless of who won a game

df_WTeamFGA_game = df[['Season','WLeagueName', 'WTeamID','WFGA']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WFGA':'FGA'})
df_LTeamFGA_game = df[['Season','LLeagueName', 'LTeamID','LFGA']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LFGA':'FGA'})
df_TeamFGA_game = pd.concat([df_WTeamFGA_game, df_LTeamFGA_game])

df_TeamFGA_season = df_TeamFGA_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'FGA':['mean']})
df_TeamFGA_season.columns = df_TeamFGA_season.columns.droplevel(1)

df_FGA_season = df_TeamFGA_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'FGA':['mean','std']})

df_FGA_season.columns = [' '.join(col).strip() for col in df_FGA_season.columns.values] ## Flatten column name levels

df_TeamFGA_season_merged = pd.merge(left=df_TeamFGA_season, right=df_FGA_season, how='left', on=['Season', 'LeagueName'])

## Normalize FGA by demeaning and standardizing
df_TeamFGA_season_merged['FGA_norm'] = ((df_TeamFGA_season_merged['FGA'] - df_TeamFGA_season_merged['FGA mean'])/df_TeamFGA_season_merged['FGA std'])

## Bin normalized FGA
df_TeamFGA_season_merged['FGAbin'] = pd.cut(df_TeamFGA_season_merged.FGA_norm, 24, labels=False)
df_TeamFGA_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,FGA,FGA mean,FGA std,FGA_norm,FGAbin
0,2010,??,3147,47.551724,47.551724,8.086966,0.0,10
1,2010,AAC,3153,52.172414,58.855956,7.818871,-0.854796,4
2,2010,AAC,3163,60.8125,58.855956,7.818871,0.250234,11
3,2010,AAC,3187,55.354839,58.855956,7.818871,-0.447778,7
4,2010,AAC,3222,62.129032,58.855956,7.818871,0.418612,13


In [500]:
## Isolate FTP by team regardless of who won a game

df_WTeamFTA_game = df[['Season','WLeagueName', 'WTeamID','WFTA']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WFTA':'FTA'})
df_LTeamFTA_game = df[['Season','LLeagueName', 'LTeamID','LFTA']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LFTA':'FTA'})
df_TeamFTA_game = pd.concat([df_WTeamFTA_game, df_LTeamFTA_game])

df_TeamFTA_season = df_TeamFTA_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'FTA':['mean']})
df_TeamFTA_season.columns = df_TeamFTA_season.columns.droplevel(1)

df_FTA_season = df_TeamFTA_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'FTA':['mean','std']})

df_FTA_season.columns = [' '.join(col).strip() for col in df_FTA_season.columns.values] ## Flatten column name levels

df_TeamFTA_season_merged = pd.merge(left=df_TeamFTA_season, right=df_FTA_season, how='left', on=['Season', 'LeagueName'])

## Normalize FTP by demeaning and standardizing
df_TeamFTA_season_merged['FTA_norm'] = ((df_TeamFTA_season_merged['FTA'] - df_TeamFTA_season_merged['FTA mean'])/df_TeamFTA_season_merged['FTA std'])

## Bin normalized FTP
df_TeamFTA_season_merged['FTAbin'] = pd.cut(df_TeamFTA_season_merged.FTA_norm, 24, labels=False)
df_TeamFTA_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,FTA,FTA mean,FTA std,FTA_norm,FTAbin
0,2010,??,3147,15.896552,15.896552,6.24125,0.0,12
1,2010,AAC,3153,16.137931,19.531856,7.289239,-0.465608,7
2,2010,AAC,3163,18.28125,19.531856,7.289239,-0.171569,10
3,2010,AAC,3187,21.870968,19.531856,7.289239,0.320899,16
4,2010,AAC,3222,22.225806,19.531856,7.289239,0.369579,17


In [501]:
## Isolate assists by team regardless of who won a game

df_WTeamAst_game = df[['Season','WLeagueName', 'WTeamID','WAst']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WAst':'Ast'})
df_LTeamAst_game = df[['Season','LLeagueName', 'LTeamID','LAst']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LAst':'Ast'})
df_TeamAst_game = pd.concat([df_WTeamAst_game, df_LTeamAst_game])

df_TeamAst_season = df_TeamAst_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'Ast':['mean']})
df_TeamAst_season.columns = df_TeamAst_season.columns.droplevel(1)

df_Ast_season = df_TeamAst_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'Ast':['mean','std']})

df_Ast_season.columns = [' '.join(col).strip() for col in df_Ast_season.columns.values] ## Flatten column name levels

df_TeamAst_season_merged = pd.merge(left=df_TeamAst_season, right=df_Ast_season, how='left', on=['Season', 'LeagueName'])

## Normalize assists by demeaning and standardizing
df_TeamAst_season_merged['Ast_norm'] = ((df_TeamAst_season_merged['Ast'] - df_TeamAst_season_merged['Ast mean'])/df_TeamAst_season_merged['Ast std'])

## Bin normalized assists
df_TeamAst_season_merged['Astbin'] = pd.cut(df_TeamAst_season_merged.Ast_norm, 24, labels=False)
df_TeamAst_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,Ast,Ast mean,Ast std,Ast_norm,Astbin
0,2010,??,3147,9.689655,9.689655,3.818222,0.0,9
1,2010,AAC,3153,11.103448,13.68144,4.749376,-0.542807,5
2,2010,AAC,3163,19.46875,13.68144,4.749376,1.218541,18
3,2010,AAC,3187,16.258065,13.68144,4.749376,0.542518,13
4,2010,AAC,3222,11.935484,13.68144,4.749376,-0.367618,6


In [502]:
## Isolate blocks by team regardless of who won a game

df_WTeamBlk_game = df[['Season','WLeagueName', 'WTeamID','WBlk']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WBlk':'Blk'})
df_LTeamBlk_game = df[['Season','LLeagueName', 'LTeamID','LBlk']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LBlk':'Blk'})
df_TeamBlk_game = pd.concat([df_WTeamBlk_game, df_LTeamBlk_game])

df_TeamBlk_season = df_TeamBlk_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'Blk':['mean']})
df_TeamBlk_season.columns = df_TeamBlk_season.columns.droplevel(1)

df_Blk_season = df_TeamBlk_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'Blk':['mean','std']})

df_Blk_season.columns = [' '.join(col).strip() for col in df_Blk_season.columns.values] ## Flatten column name levels

df_TeamBlk_season_merged = pd.merge(left=df_TeamBlk_season, right=df_Blk_season, how='left', on=['Season', 'LeagueName'])

## Normalize blocks by demeaning and standardizing
df_TeamBlk_season_merged['Blk_norm'] = ((df_TeamBlk_season_merged['Blk'] - df_TeamBlk_season_merged['Blk mean'])/df_TeamBlk_season_merged['Blk std'])

## Bin normalized blocks
df_TeamBlk_season_merged['Blkbin'] = pd.cut(df_TeamBlk_season_merged.Blk_norm, 24, labels=False)
df_TeamBlk_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,Blk,Blk mean,Blk std,Blk_norm,Blkbin
0,2010,??,3147,3.275862,3.275862,1.980198,0.0,9
1,2010,AAC,3153,2.241379,3.31856,2.348975,-0.458575,5
2,2010,AAC,3163,5.5,3.31856,2.348975,0.928678,17
3,2010,AAC,3187,2.483871,3.31856,2.348975,-0.355342,6
4,2010,AAC,3222,3.451613,3.31856,2.348975,0.056643,10


In [503]:
## Isolate offensive rebounds by team regardless of who won a game

df_WTeamOR_game = df[['Season','WLeagueName', 'WTeamID','WOR']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WOR':'OR'})
df_LTeamOR_game = df[['Season','LLeagueName', 'LTeamID','LOR']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LOR':'OR'})
df_TeamOR_game = pd.concat([df_WTeamOR_game, df_LTeamOR_game])

df_TeamOR_season = df_TeamOR_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'OR':['mean']})
df_TeamOR_season.columns = df_TeamOR_season.columns.droplevel(1)

df_OR_season = df_TeamOR_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'OR':['mean','std']})

df_OR_season.columns = [' '.join(col).strip() for col in df_OR_season.columns.values] ## Flatten column name levels

df_TeamOR_season_merged = pd.merge(left=df_TeamOR_season, right=df_OR_season, how='left', on=['Season', 'LeagueName'])

## Normalize offensive rebounds by demeaning and standardizing
df_TeamOR_season_merged['OR_norm'] = ((df_TeamOR_season_merged['OR'] - df_TeamOR_season_merged['OR mean'])/df_TeamOR_season_merged['OR std'])

## Bin normalized offensive rebounds
df_TeamOR_season_merged['ORbin'] = pd.cut(df_TeamOR_season_merged.OR_norm, 24, labels=False)
df_TeamOR_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,OR,OR mean,OR std,OR_norm,ORbin
0,2010,??,3147,10.965517,10.965517,6.304662,0.0,11
1,2010,AAC,3153,10.448276,14.132964,4.592753,-0.802283,5
2,2010,AAC,3163,13.125,14.132964,4.592753,-0.219468,10
3,2010,AAC,3187,12.516129,14.132964,4.592753,-0.35204,9
4,2010,AAC,3222,14.451613,14.132964,4.592753,0.069381,12


In [504]:
## Isolate defensive rebounds by team regardless of who won a game

df_WTeamDR_game = df[['Season','WLeagueName', 'WTeamID','WDR']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WDR':'DR'})
df_LTeamDR_game = df[['Season','LLeagueName', 'LTeamID','LDR']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LDR':'DR'})
df_TeamDR_game = pd.concat([df_WTeamDR_game, df_LTeamDR_game])

df_TeamDR_season = df_TeamDR_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'DR':['mean']})
df_TeamDR_season.columns = df_TeamDR_season.columns.droplevel(1)

df_DR_season = df_TeamDR_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'DR':['mean','std']})

df_DR_season.columns = [' '.join(col).strip() for col in df_DR_season.columns.values] ## Flatten column name levels

df_TeamDR_season_merged = pd.merge(left=df_TeamDR_season, right=df_DR_season, how='left', on=['Season', 'LeagueName'])

## Normalize defensive rebounds by demeaning and standardizing
df_TeamDR_season_merged['DR_norm'] = ((df_TeamDR_season_merged['DR'] - df_TeamDR_season_merged['DR mean'])/df_TeamDR_season_merged['DR std'])

## Bin normalized defensive rebounds
df_TeamDR_season_merged['DRbin'] = pd.cut(df_TeamDR_season_merged.DR_norm, 24, labels=False)
df_TeamDR_season_merged.head()


Unnamed: 0,Season,LeagueName,TeamID,DR,DR mean,DR std,DR_norm,DRbin
0,2010,??,3147,20.724138,20.724138,4.534759,0.0,12
1,2010,AAC,3153,23.793103,25.252078,5.559791,-0.262415,9
2,2010,AAC,3163,30.625,25.252078,5.559791,0.966389,22
3,2010,AAC,3187,27.129032,25.252078,5.559791,0.337594,15
4,2010,AAC,3222,26.548387,25.252078,5.559791,0.233158,14


In [505]:
## Isolate steals by team regardless of who won a game

df_WTeamStl_game = df[['Season','WLeagueName', 'WTeamID','WStl']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WStl':'Stl'})
df_LTeamStl_game = df[['Season','LLeagueName', 'LTeamID','LStl']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LStl':'Stl'})
df_TeamStl_game = pd.concat([df_WTeamStl_game, df_LTeamStl_game])

df_TeamStl_season = df_TeamStl_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'Stl':['mean']})
df_TeamStl_season.columns = df_TeamStl_season.columns.droplevel(1)

df_Stl_season = df_TeamStl_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'Stl':['mean','std']})

df_Stl_season.columns = [' '.join(col).strip() for col in df_Stl_season.columns.values] ## Flatten column name levels

df_TeamStl_season_merged = pd.merge(left=df_TeamStl_season, right=df_Stl_season, how='left', on=['Season', 'LeagueName'])

## Normalize steals by demeaning and standardizing
df_TeamStl_season_merged['Stl_norm'] = ((df_TeamStl_season_merged['Stl'] - df_TeamStl_season_merged['Stl mean'])/df_TeamStl_season_merged['Stl std'])

## Bin normalized steals
df_TeamStl_season_merged['Stlbin'] = pd.cut(df_TeamStl_season_merged.Stl_norm, 24, labels=False)
df_TeamStl_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,Stl,Stl mean,Stl std,Stl_norm,Stlbin
0,2010,??,3147,6.482759,6.482759,3.031038,0.0,8
1,2010,AAC,3153,6.103448,9.066482,3.675143,-0.806236,2
2,2010,AAC,3163,9.375,9.066482,3.675143,0.083947,8
3,2010,AAC,3187,9.032258,9.066482,3.675143,-0.009312,8
4,2010,AAC,3222,8.677419,9.066482,3.675143,-0.105863,7


In [506]:
## Isolate POD by team regardless of who won a game

df_WTeamPOD_game = df[['Season','WLeagueName', 'WTeamID','WPOD']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WPOD':'POD'})
df_LTeamPOD_game = df[['Season','LLeagueName', 'LTeamID','LPOD']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LPOD':'POD'})
df_TeamPOD_game = pd.concat([df_WTeamPOD_game, df_LTeamPOD_game])

df_TeamPOD_season = df_TeamPOD_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'POD':['mean']})
df_TeamPOD_season.columns = df_TeamPOD_season.columns.droplevel(1)

df_POD_season = df_TeamPOD_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'POD':['mean','std']})

df_POD_season.columns = [' '.join(col).strip() for col in df_POD_season.columns.values] ## Flatten column name levels

df_TeamPOD_season_merged = pd.merge(left=df_TeamPOD_season, right=df_POD_season, how='left', on=['Season', 'LeagueName'])

## Normalize POD by demeaning and standardizing
df_TeamPOD_season_merged['POD_norm'] = ((df_TeamPOD_season_merged['POD'] - df_TeamPOD_season_merged['POD mean'])/df_TeamPOD_season_merged['POD std'])

## Bin normalized POD
df_TeamPOD_season_merged['PODbin'] = pd.cut(df_TeamPOD_season_merged.POD_norm, 24, labels=False)
df_TeamPOD_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,POD,POD mean,POD std,POD_norm,PODbin
0,2010,??,3147,126.655172,126.655172,18.79262,0.0,9
1,2010,AAC,3153,136.448276,152.817175,18.424465,-0.888433,4
2,2010,AAC,3163,156.71875,152.817175,18.424465,0.211761,11
3,2010,AAC,3187,146.741935,152.817175,18.424465,-0.329738,7
4,2010,AAC,3222,162.645161,152.817175,18.424465,0.53342,13


In [507]:
## Isolate OCR by team regardless of who won a game

df_WTeamOCR_game = df[['Season','WLeagueName', 'WTeamID','WOCR']].rename(columns = {'WLeagueName':'LeagueName', 'WTeamID':'TeamID', 'WOCR':'OCR'})
df_LTeamOCR_game = df[['Season','LLeagueName', 'LTeamID','LOCR']].rename(columns = {'LLeagueName':'LeagueName', 'LTeamID':'TeamID', 'LOCR':'OCR'})
df_TeamOCR_game = pd.concat([df_WTeamOCR_game, df_LTeamOCR_game])

df_TeamOCR_season = df_TeamOCR_game.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({'OCR':['mean']})
df_TeamOCR_season.columns = df_TeamOCR_season.columns.droplevel(1)

df_OCR_season = df_TeamOCR_game.groupby(['Season', 'LeagueName'], as_index=False).agg({'OCR':['mean','std']})

df_OCR_season.columns = [' '.join(col).strip() for col in df_OCR_season.columns.values] ## Flatten column name levels

df_TeamOCR_season_merged = pd.merge(left=df_TeamOCR_season, right=df_OCR_season, how='left', on=['Season', 'LeagueName'])

## Normalize OCR by demeaning and standardizing
df_TeamOCR_season_merged['OCR_norm'] = ((df_TeamOCR_season_merged['OCR'] - df_TeamOCR_season_merged['OCR mean'])/df_TeamOCR_season_merged['OCR std'])

## Bin normalized OCR
df_TeamOCR_season_merged['OCRbin'] = pd.cut(df_TeamOCR_season_merged.OCR_norm, 24, labels=False)
df_TeamOCR_season_merged.head()

Unnamed: 0,Season,LeagueName,TeamID,OCR,OCR mean,OCR std,OCR_norm,OCRbin
0,2010,??,3147,0.379124,0.379124,0.091734,0.0,10
1,2010,AAC,3153,0.415902,0.439815,0.07209,-0.33171,7
2,2010,AAC,3163,0.519842,0.439815,0.07209,1.110103,21
3,2010,AAC,3187,0.496129,0.439815,0.07209,0.78116,18
4,2010,AAC,3222,0.429771,0.439815,0.07209,-0.139324,9


In [508]:
# right now, only looking at tournament seed / performance, not regular season games
data_dir = './WDataFiles/'
df_seeds = pd.read_csv(data_dir + 'WNCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'WNCAATourneyCompactResults.csv')

df_tour = df_tour[df_tour['Season'] > 2013]
df_tour.head()


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
1008,2014,138,3113,69,3435,61,N,0
1009,2014,138,3124,87,3443,74,H,0
1010,2014,138,3140,72,3301,57,N,0
1011,2014,138,3143,64,3200,63,N,0
1012,2014,138,3177,104,3328,100,N,0


In [509]:
## Create a dataframe with regular season engineeried features
## An engineered features here reflects the performance of a team during a season on a performance type dimension

df_temp = pd.merge(left=df_TeamFGP_season_merged, right=df_TeamAst_season_merged, how='left', on=['Season', 'TeamID'])
df_temp = pd.merge(left=df_temp, right=df_TeamBlk_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_temp, right=df_TeamFGA_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamFTP_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamFTA_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamOR_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamDR_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamStl_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamPOD_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_TeamOCR_season_merged, how='left', on=['Season', 'TeamID'])
df_engineered_features = pd.merge(left=df_engineered_features, right=df_team_league_perf, how='left', on=['TeamID'])

print df_engineered_features.shape
df_engineered_features.head()



(3118, 69)


Unnamed: 0,Season,LeagueName_x,TeamID,FGP,FGP mean,FGP std,FGP_norm,FGPbin,LeagueName_y,Ast,...,POD std,POD_norm,PODbin,LeagueName,OCR,OCR mean,OCR std,OCR_norm,OCRbin,Leaguebin
0,2010,??,3147,0.343658,0.343658,0.088824,0.0,10,??,9.689655,...,18.79262,0.0,9,??,0.379124,0.379124,0.091734,0.0,10,
1,2010,AAC,3153,0.386357,0.417876,0.077519,-0.406599,6,AAC,11.103448,...,18.424465,-0.888433,4,AAC,0.415902,0.439815,0.07209,-0.33171,7,9.0
2,2010,AAC,3163,0.514743,0.417876,0.077519,1.249591,21,AAC,19.46875,...,18.424465,0.211761,11,AAC,0.519842,0.439815,0.07209,1.110103,21,9.0
3,2010,AAC,3187,0.467757,0.417876,0.077519,0.643471,15,AAC,16.258065,...,18.424465,-0.329738,7,AAC,0.496129,0.439815,0.07209,0.78116,18,9.0
4,2010,AAC,3222,0.402092,0.417876,0.077519,-0.203606,8,AAC,11.935484,...,18.424465,0.53342,13,AAC,0.429771,0.439815,0.07209,-0.139324,9,9.0


In [510]:
## Inject regular season engineered feature into touranement data for the corresponding seasons and teams

df_engineered_features = df_engineered_features[['Season', 'TeamID', 'FGPbin', 'FTPbin', 'FGAbin', 'FTAbin', 'Astbin', 'Blkbin', 'ORbin', 'DRbin', 'Leaguebin', 'Stlbin', 'PODbin', 'OCRbin']]

# merge seeds with team IDs for tourney performance
df_W = df_seeds.rename(columns={'TeamID':'WTeamID', 'Seed':'WSeed'})
df_L = df_seeds.rename(columns={'TeamID':'LTeamID', 'Seed':'LSeed'})

df_engineered_features_W = df_engineered_features.rename(columns={'Leaguebin':'WLeaguebin', 'TeamID':'WTeamID', 'FGPbin':'WFGPbin', 'FTPbin':'WFTPbin', 'FGAbin':'WFGAbin', 'FTAbin':'WFTAbin', 'Astbin':'WAstbin', 'Blkbin':'WBlkbin', 'ORbin':'WORbin', 'DRbin':'WDRbin', 'Stlbin':'WStlbin', 'PODbin':'WPODbin', 'OCRbin':'WOCRbin'})
df_engineered_features_L = df_engineered_features.rename(columns={'Leaguebin':'LLeaguebin', 'TeamID':'LTeamID', 'FGPbin':'LFGPbin', 'FTPbin':'LFTPbin', 'FGAbin':'LFGAbin', 'FTAbin':'LFTAbin', 'Astbin':'LAstbin', 'Blkbin':'LBlkbin', 'ORbin':'LORbin', 'DRbin':'LDRbin', 'Stlbin':'LStlbin', 'PODbin':'LPODbin', 'OCRbin':'LOCRbin'})


df_dummy = pd.merge(left=df_tour, right=df_W, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_L, on=['Season', 'LTeamID']) ## 

df_dummy2 = pd.merge(left=df_concat, right=df_engineered_features_W, on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy2, right=df_engineered_features_L, how='left', on=['Season', 'LTeamID'])


# at the beginning of the tourney, teams play within their region
# final 3 games = between regions

df_concat.head()



Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,...,LFGAbin,LFTAbin,LAstbin,LBlkbin,LORbin,LDRbin,LLeaguebin,LStlbin,LPODbin,LOCRbin
0,2014,138,3113,69,3435,61,N,0,Z09,Z08,...,6,12,12,0,8,7,23.0,6,6,15
1,2014,138,3124,87,3443,74,H,0,Z02,Z15,...,9,13,10,10,10,9,12.0,12,9,15
2,2014,140,3124,75,3143,56,N,0,Z02,Z07,...,11,17,7,7,14,16,21.0,8,10,9
3,2014,145,3124,90,3246,72,N,0,Z02,Z03,...,16,18,9,9,15,13,23.0,11,15,11
4,2014,138,3140,72,3301,57,N,0,W12,W05,...,8,11,13,5,11,9,22.0,4,10,12


### Calculate a win probability for each pair of teams during regular season based on regular season data


In [511]:
### Recast regular season games a pairwise team cobinations within seasons
df_wins = pd.DataFrame()
df_wins['Season'] = df['Season']
df_wins['First_TeamID'] = df['WTeamID']
df_wins['Second_TeamID'] = df['LTeamID']
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['Season'] = df['Season']
df_losses['First_TeamID'] = df['LTeamID']
df_losses['Second_TeamID'] = df['WTeamID']
df_losses['Result'] = 0

df_regular_season_games = pd.concat((df_wins, df_losses))
df_regular_season_games.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result
0,2010,3103,3237,1
1,2010,3104,3399,1
2,2010,3110,3224,1
3,2010,3111,3267,1
4,2010,3119,3447,1


### Inject engineered features

In [512]:
df_first_team_features = df_engineered_features_W
df_first_team_features = df_first_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_regular_season_games_with_features = pd.merge(left=df_regular_season_games, right=df_first_team_features, on=['Season', 'First_TeamID'])

df_second_team_features = df_engineered_features_L
df_second_team_features = df_second_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_regular_season_games_with_features = pd.merge(left=df_regular_season_games_with_features, right=df_second_team_features, on=['Season', 'Second_TeamID'])
df_regular_season_games_with_features.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result,First_FGPbin,First_FTPbin,First_FGAbin,First_FTAbin,First_Astbin,First_Blkbin,...,LFGAbin,LFTAbin,Second_Astbin,Second_Blkbin,Second_ORbin,Second_DRbin,Second_Leaguebin,Second_Stlbin,Second_PODbin,Second_OCRbin
0,2010,3103,3237,1,10,14,7,12,10,6,...,5,14,2,15,12,11,4.0,7,5,6
1,2010,3231,3237,1,6,12,14,14,6,2,...,5,14,2,15,12,11,4.0,7,5,6
2,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,4.0,7,5,6
3,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,4.0,7,5,6
4,2010,3293,3237,1,12,19,13,12,12,13,...,5,14,2,15,12,11,4.0,7,5,6


### Train model on regular season data to predict wins based on regular season performance

In [513]:
df_temp = df_regular_season_games_with_features.dropna(how='any')
df_temp = df_temp[['Result', 'Season', 'First_TeamID', 'Second_TeamID', 'First_PODbin', 'Second_PODbin', 'First_OCRbin', 'Second_OCRbin', 'First_Leaguebin', 'Second_Leaguebin', 'First_Astbin', 'Second_Astbin', 'First_Blkbin', 'Second_Blkbin', 'First_ORbin', 'Second_ORbin', 'First_Stlbin', 'Second_Stlbin']]

##cat1 = pd.get_dummies(df_temp['First_TeamID'], prefix = 'First_TeamID')
##cat2 = pd.get_dummies(df_temp['Second_TeamID'], prefix = 'Second_TeamID')
##cat3 = pd.get_dummies(df_temp['Season'], prefix = 'Season')

##df_temp = pd.concat([df_temp, cat1, cat2],axis=1)
##df_temp = df_temp.drop('First_TeamID', 1)
##df_temp = df_temp.drop('Second_TeamID', 1)

##df_temp = df_temp.drop({ORbinDiff','DRbinDiff', 'FGAbinDiff', 'FGPbinDiff', 'FTAbinDiff', 'AstbinDiff', 'StlbinDiff', 'FTPbinDiff'}, 1)

X_train = df_temp[df_temp['Season'] < 2017]
X_test = df_temp[df_temp['Season'] == 2017]

##X_train = X_train.drop({'SeedDiff'}, 1)
##X_test = X_test.drop({'SeedDiff'}, 1)


## cat5 = pd.get_dummies(X_train['Season'], prefix = 'Season')
## X_train = pd.concat([X_train, cat5],axis=1)

##X_train = X_train.drop('Season', 1)
##X_test = X_test.drop('Season', 1)


##X_train = X_train.drop('Season', 1)
##X_test = X_test.drop('Season', 1)

y_train = X_train.Result.values
y_test = X_test.Result.values

X_train = X_train.drop('Result', 1)
X_test = X_test.drop('Result', 1)

##X_train['RbinDiff'] = X_train.ORbinDiff + X_train.DRbinDiff

## X_temp = df_temp.drop({'ORbinDiff','DRbinDiff', 'AstbinDiff', 'FGPbinDiff', 'FGAbinDiff', 'FTPbinDiff', 'FTAbinDiff', 'LeaguebinDiff'}, 1)
## X_temp = df_temp[['SeedDiff', 'FGPbinDiff', 'AstbinDiff', 'BlkbinDiff']]


X_train.head()


Unnamed: 0,Season,First_TeamID,Second_TeamID,First_PODbin,Second_PODbin,First_OCRbin,Second_OCRbin,First_Leaguebin,Second_Leaguebin,First_Astbin,Second_Astbin,First_Blkbin,Second_Blkbin,First_ORbin,Second_ORbin,First_Stlbin,Second_Stlbin
0,2010,3103,3237,6,5,11,6,2.0,4.0,10,2,6,15,12,12,5,7
1,2010,3231,3237,13,5,6,6,17.0,4.0,6,2,2,15,13,12,10,7
2,2010,3282,3237,9,5,12,6,0.0,4.0,10,2,7,15,14,12,6,7
3,2010,3282,3237,9,5,12,6,0.0,4.0,10,2,7,15,14,12,6,7
4,2010,3293,3237,13,5,13,6,0.0,4.0,12,2,13,15,9,12,6,7


In [514]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

def eval(clf):
    #Get just the digits from the seeding. Return as int
    clf.fit(X_train,y_train)
    print "score = " + str(clf.score(X_test, y_test))

    pred = clf.predict_proba(X_test)
    print "log loss = " + str(metrics.log_loss(y_test,pred))

print "fitting logistic regression model"
modelLR = LogisticRegression()
eval(modelLR)

print "\nfitting Naive Bayes model"
modelNB = GaussianNB()
eval(modelNB)

print "\nfitting Random Forest model"
modelRF = RandomForestClassifier(n_jobs=2, random_state=0)
eval(modelRF)

fitting logistic regression model
score = 0.738003838772
log loss = 0.51689183183

fitting Naive Bayes model
score = 0.702303262956
log loss = 0.594573921007

fitting Random Forest model
score = 0.693857965451
log loss = 0.695652211622


In [515]:
## Fit a logistic regression model through grid search
lr = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
lr = GridSearchCV(lr, params, scoring='neg_log_loss', refit=True)
lr.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(lr.best_score_, lr.best_params_['C']))

Best log_loss: -0.5394, with best C: 0.001


In [516]:
lr = LogisticRegression(C=.01)

lr.fit(X_train,y_train)
lr.score(X_test,y_test)
##lr.predict_proba(X_test)

0.73800383877159303

In [517]:
pred = lr.predict_proba(X_test)
print "log loss = " + str(metrics.log_loss(y_test,pred))

log loss = 0.516913916426


### Engineer a feature 'winprob' for tournament matchups based on regular season data

In [518]:
df_tour_matchups = df_concat[['Season', 'WTeamID', 'LTeamID']]
df_tour_matchups.shape

(252, 3)

In [519]:
df_wins = pd.DataFrame()
df_wins['Season'] = df_tour_matchups['Season']
df_wins['First_TeamID'] = df_tour_matchups['WTeamID']
df_wins['Second_TeamID'] = df_tour_matchups['LTeamID']
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['Season'] = df_tour_matchups['Season']
df_losses['First_TeamID'] = df_tour_matchups['LTeamID']
df_losses['Second_TeamID'] = df_tour_matchups['WTeamID']
df_losses['Result'] = 0

df_tour_matchups = pd.concat((df_wins, df_losses))
df_tour_matchups.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result
0,2014,3113,3435,1
1,2014,3124,3443,1
2,2014,3124,3143,1
3,2014,3124,3246,1
4,2014,3140,3301,1


In [520]:
### Inject engineered features into Tournament matchups
df_first_team_features = df_engineered_features_W
df_first_team_features = df_first_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_tour_matchups_with_features = pd.merge(left=df_tour_matchups, right=df_first_team_features, on=['Season', 'First_TeamID'])

df_second_team_features = df_engineered_features_L
df_second_team_features = df_second_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_tour_matchups_with_features = pd.merge(left=df_tour_matchups_with_features, right=df_second_team_features, on=['Season', 'Second_TeamID'])
df_tour_matchups_with_features.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result,First_FGPbin,First_FTPbin,First_FGAbin,First_FTAbin,First_Astbin,First_Blkbin,...,LFGAbin,LFTAbin,Second_Astbin,Second_Blkbin,Second_ORbin,Second_DRbin,Second_Leaguebin,Second_Stlbin,Second_PODbin,Second_OCRbin
0,2014,3113,3435,1,11,13,8,18,10,5,...,6,12,12,0,8,7,23.0,6,6,15
1,2014,3113,3323,0,11,13,8,18,10,5,...,12,14,18,11,11,16,22.0,10,10,21
2,2014,3124,3323,0,13,15,18,17,16,10,...,12,14,18,11,11,16,22.0,10,10,21
3,2014,3329,3323,0,11,12,12,11,9,6,...,12,14,18,11,11,16,22.0,10,10,21
4,2014,3163,3323,1,20,20,12,8,21,22,...,12,14,18,11,11,16,22.0,10,10,21


In [521]:
## Use Naive Bayes probs
df_tour_matchups = df_tour_matchups_with_features[['Result', 'Season', 'First_TeamID', 'Second_TeamID', 'First_PODbin', 'Second_PODbin', 'First_OCRbin', 'Second_OCRbin', 'First_Astbin', 'Second_Astbin', 'First_Blkbin', 'Second_Blkbin', 'First_ORbin', 'Second_ORbin', 'First_Stlbin', 'Second_Stlbin', 'First_Leaguebin', 'Second_Leaguebin']]
df_tour_matchups = df_tour_matchups.drop('Result', 1)
df_temp = pd.DataFrame(modelNB.predict_proba(df_tour_matchups), columns=list('xy'))
df_temp.head()

Unnamed: 0,x,y
0,0.600978,0.399022
1,0.978684,0.021316
2,0.814556,0.185444
3,0.981458,0.018542
4,0.145267,0.854733


In [522]:
df_pass['winprob'] = df_temp.x
df_pass['lossprob'] = df_temp.y

df_pass = df_pass[['Season', 'First_TeamID', 'Second_TeamID', 'winprob', 'lossprob']]
df_pass.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,winprob,lossprob
0,2014,3113,3435,0.600978,0.399022
1,2014,3113,3323,0.978684,0.021316
2,2014,3124,3323,0.814556,0.185444
3,2014,3329,3323,0.981458,0.018542
4,2014,3163,3323,0.145267,0.854733


### Calculate a win probability for each pair of teams during a tournament season based on regular season data


In [523]:
# calculate difference in performance based for all features  
df_concat['SeedDiff'] = df_concat.apply(lambda row: int(row['WSeed'][1:3]) - int(row['LSeed'][1:3]), axis=1)
df_concat['SeedDiffPct'] = df_concat['SeedDiff'] / (df_concat.apply(lambda row: int(row['WSeed'][1:3]) + int(row['LSeed'][1:3]), axis=1))
df_concat['FGPbinDiff'] = df_concat.apply(lambda row: int(row['WFGPbin']) - int(row['LFGPbin']), axis=1)
df_concat['FTPbinDiff'] = df_concat.apply(lambda row: int(row['WFTPbin']) - int(row['LFTPbin']), axis=1)
df_concat['FGAbinDiff'] = df_concat.apply(lambda row: int(row['WFGAbin']) - int(row['LFGAbin']), axis=1)
df_concat['FTAbinDiff'] = df_concat.apply(lambda row: int(row['WFTAbin']) - int(row['LFTAbin']), axis=1)
df_concat['AstbinDiff'] = df_concat.apply(lambda row: int(row['WAstbin']) - int(row['LAstbin']), axis=1)
df_concat['BlkbinDiff'] = df_concat.apply(lambda row: int(row['WBlkbin']) - int(row['LBlkbin']), axis=1)
df_concat['ORbinDiff'] = df_concat.apply(lambda row: int(row['WORbin']) - int(row['LORbin']), axis=1)
df_concat['DRbinDiff'] = df_concat.apply(lambda row: int(row['WDRbin']) - int(row['LDRbin']), axis=1)
df_concat['StlbinDiff'] = df_concat.apply(lambda row: int(row['WStlbin']) - int(row['LStlbin']), axis=1)
df_concat['PODbinDiff'] = df_concat.apply(lambda row: int(row['WPODbin']) - int(row['LPODbin']), axis=1)
df_concat['OCRbinDiff'] = df_concat.apply(lambda row: int(row['WOCRbin']) - int(row['LOCRbin']), axis=1)
df_concat['LeaguebinDiff'] = df_concat.apply(lambda row: int(row['WLeaguebin']) - int(row['LLeaguebin']), axis=1)

df_concat.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,...,FGAbinDiff,FTAbinDiff,AstbinDiff,BlkbinDiff,ORbinDiff,DRbinDiff,StlbinDiff,PODbinDiff,OCRbinDiff,LeaguebinDiff
242,2017,140,3390,69,3243,48,A,0,Y02,Y07,...,3,-2,1,6,2,6,-3,2,3,0
243,2017,144,3390,77,3400,66,N,0,Y02,Y03,...,-4,5,2,2,-4,3,-2,0,1,0
244,2017,146,3390,76,3323,75,N,0,Y02,Y01,...,-2,1,-7,1,2,-1,-3,0,-6,-1
245,2017,138,3393,85,3235,65,N,0,W08,W09,...,7,5,1,6,7,-3,8,7,-3,1
246,2017,138,3397,66,3173,57,N,0,X05,X12,...,3,-3,1,-3,-4,2,-1,0,1,13
247,2017,138,3401,63,3335,61,N,0,W05,W12,...,1,3,2,-9,-1,2,-1,-1,5,21
248,2017,138,3417,83,3129,56,H,0,W04,W13,...,0,8,-4,2,0,2,2,3,-2,14
249,2017,140,3417,75,3401,43,H,0,W04,W05,...,5,5,-4,4,2,0,6,8,-5,-2
250,2017,138,3449,91,3286,63,H,0,X03,X14,...,0,3,-3,-5,-6,1,-2,4,6,21
251,2017,140,3449,108,3328,82,H,0,X03,X06,...,0,3,1,-6,-3,9,-7,4,7,0


In [524]:
df_wins = pd.DataFrame()

df_wins['SeedDiff'] = df_concat['SeedDiff']
df_wins['SeedDiffPct'] = df_concat['SeedDiffPct']
df_wins['FGPbinDiff'] = df_concat['FGPbinDiff']
df_wins['FTPbinDiff'] = df_concat['FTPbinDiff']
df_wins['FGAbinDiff'] = df_concat['FGAbinDiff']
df_wins['FTAbinDiff'] = df_concat['FTAbinDiff']
df_wins['AstbinDiff'] = df_concat['AstbinDiff']
df_wins['BlkbinDiff'] = df_concat['BlkbinDiff']
df_wins['ORbinDiff'] = df_concat['ORbinDiff']
df_wins['DRbinDiff'] = df_concat['DRbinDiff']
df_wins['StlbinDiff'] = df_concat['StlbinDiff']
df_wins['PODbinDiff'] = df_concat['PODbinDiff']
df_wins['OCRbinDiff'] = df_concat['OCRbinDiff']
df_wins['LeaguebinDiff'] = df_concat['LeaguebinDiff']
df_wins['Season'] = df_concat['Season']
df_wins['First_TeamID'] = df_concat['WTeamID']
df_wins['Second_TeamID'] = df_concat['LTeamID']


## df_wins['DayNum'] = df_concat['DayNum']



df_wins['Result'] = 1

df_losses = pd.DataFrame()

df_losses['SeedDiff'] = -df_concat['SeedDiff']
df_losses['SeedDiffPct'] = -df_concat['SeedDiffPct']
df_losses['FGPbinDiff'] = -df_concat['FGPbinDiff']
df_losses['FTPbinDiff'] = -df_concat['FTPbinDiff']
df_losses['FGAbinDiff'] = -df_concat['FGAbinDiff']
df_losses['FTAbinDiff'] = -df_concat['FTAbinDiff']
df_losses['AstbinDiff'] = -df_concat['AstbinDiff']
df_losses['BlkbinDiff'] = -df_concat['BlkbinDiff']
df_losses['ORbinDiff'] = -df_concat['ORbinDiff']
df_losses['DRbinDiff'] = -df_concat['DRbinDiff']
df_losses['StlbinDiff'] = -df_concat['StlbinDiff']
df_losses['PODbinDiff'] = -df_concat['PODbinDiff']
df_losses['OCRbinDiff'] = -df_concat['OCRbinDiff']
df_losses['LeaguebinDiff'] = -df_concat['LeaguebinDiff']
df_losses['Season'] = df_concat['Season']
df_losses['First_TeamID'] = df_concat['LTeamID']
df_losses['Second_TeamID'] = df_concat['WTeamID']
## df_losses['DayNum'] = df_concat['DayNum']



df_losses['Result'] = 0

df_predictions = pd.concat((df_wins, df_losses))
df_predictions.head()


Unnamed: 0,SeedDiff,SeedDiffPct,FGPbinDiff,FTPbinDiff,FGAbinDiff,FTAbinDiff,AstbinDiff,BlkbinDiff,ORbinDiff,DRbinDiff,StlbinDiff,PODbinDiff,OCRbinDiff,LeaguebinDiff,Season,First_TeamID,Second_TeamID,Result
0,1,0.058824,-2,-5,2,6,-2,5,5,-1,4,2,-3,-2,2014,3113,3435,1
1,-13,-0.764706,0,-2,9,4,6,0,8,9,-5,7,-1,9,2014,3124,3443,1
2,-5,-0.555556,4,4,7,0,9,3,4,2,-1,6,5,0,2014,3124,3143,1
3,-1,-0.2,3,4,2,-1,7,1,3,5,-4,1,3,-2,2014,3124,3246,1
4,7,0.411765,0,-3,-2,5,1,13,-3,11,-1,-2,1,-16,2014,3140,3301,1


In [525]:
## Convert to arrays
## X_train = df_predictions.SeedDiff.values.reshape(-1,1)
## y_train = df_predictions.Result.values

df_temp = df_predictions[['First_TeamID', 'Second_TeamID', 'SeedDiffPct', 'Season', 'Result', 'LeaguebinDiff']]
df_temp = pd.merge(left=df_temp, right=df_pass, how='left', on=['Season', 'First_TeamID', 'Second_TeamID'])
df_temp = df_temp.drop({ 'lossprob' , 'First_TeamID', 'Second_TeamID',}, 1)
##df_temp['stage2pred'] = stage2pred


##df_temp = df_temp.drop({ORbinDiff','DRbinDiff', 'FGAbinDiff', 'FGPbinDiff', 'FTAbinDiff', 'AstbinDiff', 'StlbinDiff', 'FTPbinDiff'}, 1)

X_train = df_temp[df_temp['Season'] < 2017]
X_test = df_temp[df_temp['Season'] == 2017]

##X_train = X_train.drop({'SeedDiff'}, 1)
##X_test = X_test.drop({'SeedDiff'}, 1)


## cat5 = pd.get_dummies(X_train['Season'], prefix = 'Season')
## X_train = pd.concat([X_train, cat5],axis=1)

X_train = X_train.drop('Season', 1)
X_test = X_test.drop('Season', 1)


y_train = X_train.Result.values
y_test = X_test.Result.values

X_train = X_train.drop('Result', 1)
X_test = X_test.drop('Result', 1)

##X_train['RbinDiff'] = X_train.ORbinDiff + X_train.DRbinDiff

## X_temp = df_temp.drop({'ORbinDiff','DRbinDiff', 'AstbinDiff', 'FGPbinDiff', 'FGAbinDiff', 'FTPbinDiff', 'FTAbinDiff', 'LeaguebinDiff'}, 1)
## X_temp = df_temp[['SeedDiff', 'FGPbinDiff', 'AstbinDiff', 'BlkbinDiff']]


print X_train.head()


   SeedDiffPct  LeaguebinDiff   winprob
0     0.058824             -2  0.600978
1    -0.764706              9  0.061416
2    -0.555556              0  0.068480
3    -0.200000             -2  0.290245
4     0.411765            -16  0.399836


In [526]:
##clf = RandomForestClassifier(n_jobs=2, random_state=0)
##clf = svm.SVC(probability=True)
##clf = LogisticRegression()
##clf = GaussianNB()
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

def eval(clf):
    clf.fit(X_train,y_train)
    print "score = " + str(clf.score(X_test, y_test))

    pred = clf.predict_proba(X_test)
    print "log loss = " + str(metrics.log_loss(y_test,pred))

print "fitting logistic regression model"
model = LogisticRegression()
eval(model)




fitting logistic regression model
score = 0.873015873016
log loss = 0.39545287466
