# NCAA ML Competition - Women's 2018

## Notes

Notes about what's in the data files: https://www.kaggle.com/c/womens-machine-learning-competition-2018/data

Starter Kernel might help: https://www.kaggle.com/juliaelliott/basic-starter-kernel-ncaa-women-s-dataset

In [532]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools as it

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Load regular season data

In [533]:
# Load the data
data_dir = './WStage2DataFiles/'
df_seeds = pd.read_csv(data_dir + 'WNCAATourneySeeds.csv')
df_tour_compact = pd.read_csv(data_dir + 'WNCAATourneyCompactResults.csv')
df_tour_detail = pd.read_csv(data_dir + 'WNCAATourneyDetailedResults.csv')
df_reg_compact = pd.read_csv(data_dir + 'WRegularSeasonCompactResults.csv')
df_reg_detail = pd.read_csv(data_dir + 'WRegularSeasonDetailedResults.csv')
df_teams = pd.read_csv(data_dir + 'WTeams.csv')
df_teams_leagues = pd.merge(left=df_teams, right=pd.read_csv(data_dir + 'WLeagues.csv'), how='left', on=['TeamName'])

## Load the league that each team belongs to

In [534]:
df_winning_team_leagues = df_teams_leagues.rename(columns={'TeamID' : 'WTeamID', 'LeagueName' : 'WLeagueName'}).drop('TeamName',1)
df_winning_team_leagues.head()

Unnamed: 0,WTeamID,WLeagueName
0,3101,Southland
1,3102,MWC
2,3103,MAC
3,3104,SEC
4,3105,SWAC


## Load and bin league performance (to dampen the effect of small variations)

In [535]:
## Bin league performance into 10 bins
df_league_perf = pd.read_pickle('league_performance')
df_league_perf['Leaguebin'] = pd.cut(df_league_perf['PctWins mean'], 24, labels=False)

df_league_perf = df_league_perf[['LeagueName', 'Leaguebin']]
df_team_league_perf = pd.merge(left=df_teams_leagues, right=df_league_perf, how='left', on=['LeagueName'])

df_team_league_perf = df_team_league_perf[['TeamID', 'Leaguebin']]
df_team_league_perf.head()

Unnamed: 0,TeamID,Leaguebin
0,3101,1.0
1,3102,7.0
2,3103,2.0
3,3104,23.0
4,3105,0.0


In [536]:
## Inject winning and losing teams' leaugues into df
# Total of 36 columns
df_reg_detail = pd.merge(left=df_reg_detail, right=df_winning_team_leagues, how='left', on=['WTeamID'])

df_losing_team_leagues = df_winning_team_leagues.rename(columns={'WTeamID': 'LTeamID', 'WLeagueName': 'LLeagueName'})
df_reg_detail = pd.merge(left=df_reg_detail, right=df_losing_team_leagues, how='left', on=['LTeamID'])
df_reg_detail.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WLeagueName,LLeagueName
0,2010,11,3103,63,3237,49,H,0,23,54,...,10,11,27,11,23,7,6,19,MAC,Horizon
1,2010,11,3104,73,3399,68,N,0,26,62,...,27,14,26,7,20,4,2,27,SEC,OVC
2,2010,11,3110,71,3224,59,A,0,29,62,...,23,17,23,8,15,6,0,15,Patriot,MEAC
3,2010,11,3111,63,3267,58,A,0,27,52,...,25,22,22,15,11,14,5,14,Sun Belt,C-USA
4,2010,11,3119,74,3447,70,H,1,30,74,...,21,21,32,12,14,4,2,14,Patriot,NEC


## Feature Engineering
-  Operationalize the notion of Point Opportunities Developed (POD) and Opportunity Convertion Rate (OCR)
-  Demean and rescale the performance of teams by league and season
-  Bin performance to dampen small variations


In [537]:
## Add a few more columns 
## Now have a total of 44 columns

df_reg_detail['WPOD'] = df_reg_detail.WFGA3 * 3 + (df_reg_detail.WFGA - df_reg_detail.WFGA3) * 2 + df_reg_detail.WFTA * 1
df_reg_detail['LPOD'] = df_reg_detail.LFGA3 * 3 + (df_reg_detail.LFGA - df_reg_detail.LFGA3) * 2 + df_reg_detail.LFTA * 1

df_reg_detail['WOCR'] = 1.0 * df_reg_detail.WScore / df_reg_detail.WPOD
df_reg_detail['LOCR'] = 1.0 * df_reg_detail.LScore / df_reg_detail.LPOD

df_reg_detail['WFGP'] = df_reg_detail.WFGM / df_reg_detail.WFGA ## Field goal percentage for the winning team
df_reg_detail['LFGP'] = df_reg_detail.LFGM / df_reg_detail.LFGA ## Field goal percentage for the losing team

df_reg_detail['WFTP'] = df_reg_detail.WFTM / df_reg_detail.WFTA ## Field goal percentage for the winning team
df_reg_detail['LFTP'] = df_reg_detail.LFTM / df_reg_detail.LFTA ## Field goal percentage for the losing team

In [538]:
features_todo = ['FGP','FTP','FGA','FTA','Ast','Blk','OR','DR','POD','OCR','Stl']

In [539]:
# take in string of feature name
def calc_feats(dat,feat_list):
    
    # get just the list of teams, to build our features from
    w_cols = dat[[col for col in dat if col.startswith('W')]].copy()
    w_cols.columns = [s[1:] for s in w_cols.columns]
    w_cols['Season'] = dat['Season']
    
    l_cols = dat[[col for col in dat if col.startswith('L')]].copy()
    l_cols.columns = [s[1:] for s in l_cols.columns]
    l_cols['Season'] = dat['Season']
    
    # this is a dataframe with all games for each season, with relevant stats for each team on a row
    all_teams = pd.concat([w_cols,l_cols])
    
    all_tm_lg_szn = all_teams[['TeamID','LeagueName','Season']].drop_duplicates()
    
    for f in feat_list:
        # aggregate data
        team_agg = all_teams.groupby(['Season', 'LeagueName', 'TeamID'], as_index=False).agg({f:['mean']})
        team_agg.columns = team_agg.columns.droplevel(1)

        league_agg = all_teams.groupby(['Season', 'LeagueName'], as_index=False).agg({f:['mean','std']})
        league_agg.columns = [''.join(col).strip() for col in league_agg.columns.values]
        
        team_agg = pd.merge(left=team_agg,right=league_agg,how='left',on=['Season','LeagueName'])
        
        team_agg[f+'norm'] = ((team_agg[f] - team_agg[f+'mean'])/ team_agg[f+'std'])
        team_agg[f+'bin'] = pd.cut(team_agg[f+'norm'], 24, labels=False)
        
        all_tm_lg_szn = pd.merge(left=all_tm_lg_szn,right=team_agg,how='left',on=['Season','LeagueName','TeamID'])
            
    return all_tm_lg_szn  

In [540]:
# note: this includes teams from 2018
df_engineered_features = calc_feats(df_reg_detail,features_todo)
df_engineered_features = pd.merge(left=df_engineered_features,right=df_team_league_perf,how='left',on='TeamID')

In [543]:
# This has 10 fewer columns than Prashant's because it doesn't have duplicate LeagueName columns
df_engineered_features.head()

Unnamed: 0,TeamID,LeagueName,Season,FGP,FGPmean,FGPstd,FGPnorm,FGPbin,FTP,FTPmean,...,OCRmean,OCRstd,OCRnorm,OCRbin,Stl,Stlmean,Stlstd,Stlnorm,Stlbin,Leaguebin
0,3103,MAC,2010,0.405346,0.402665,0.072138,0.037166,10,0.704329,0.698111,...,0.432482,0.068308,0.104973,11,8.033333,9.304709,3.193953,-0.398057,5,2.0
1,3104,SEC,2010,0.40077,0.415768,0.074202,-0.202128,8,0.621109,0.67691,...,0.43703,0.07129,-0.327385,7,7.37931,8.371495,3.85994,-0.257047,6,23.0
2,3110,Patriot,2010,0.39955,0.381652,0.067985,0.263277,12,0.740438,0.686326,...,0.410244,0.065445,0.343623,14,7.952381,7.6,3.381968,0.104194,9,0.0
3,3111,Sun Belt,2010,0.435525,0.394316,0.076786,0.53667,14,0.622631,0.671585,...,0.422639,0.073208,0.376765,14,9.935484,8.425287,3.595169,0.420063,11,1.0
4,3119,Patriot,2010,0.358097,0.381652,0.067985,-0.346465,7,0.673364,0.686326,...,0.410244,0.065445,-0.471252,6,6.458333,7.6,3.381968,-0.337575,5,0.0


In [542]:
# Create 'winning team' and 'losing team' versions of engineered features DF
# TO align with Prashant, this needs to only contain 'Bin' features. Season, and teamID

df_engineered_features_w = df_engineered_features[[col for col in df_engineered_features if col.endswith('bin')]].copy()
df_engineered_features_w.columns = map(lambda x:'1'+x,df_engineered_features_w.columns)
df_engineered_features_w['Season'] = df_engineered_features['Season']
df_engineered_features_w['1TeamID'] = df_engineered_features['TeamID']

df_engineered_features_l = df_engineered_features[[col for col in df_engineered_features if col.endswith('bin')]].copy()
df_engineered_features_l.columns = map(lambda x:'2'+x,df_engineered_features_l.columns)
df_engineered_features_l['Season'] = df_engineered_features['Season']
df_engineered_features_l['2TeamID'] = df_engineered_features['TeamID']

In [544]:
df_engineered_features_l.head()

Unnamed: 0,2FGPbin,2FTPbin,2FGAbin,2FTAbin,2Astbin,2Blkbin,2ORbin,2DRbin,2PODbin,2OCRbin,2Stlbin,2Leaguebin,Season,2TeamID
0,10,14,7,12,10,6,12,11,6,11,5,2.0,2010,3103
1,8,9,11,10,9,6,11,14,10,7,6,23.0,2010,3104
2,12,17,9,13,8,13,14,9,9,14,9,0.0,2010,3110
3,14,10,11,9,16,22,13,17,9,14,11,1.0,2010,3111
4,7,12,9,8,5,10,13,10,8,6,5,0.0,2010,3119


## Build Regular Season and Tournament Datasets

In [545]:
# Recast regular season games a pairwise team combinations within seasons
df_wins = pd.DataFrame()
df_wins['Season'] = df_reg_detail['Season']
df_wins['1TeamID'] = df_reg_detail['WTeamID']
df_wins['2TeamID'] = df_reg_detail['LTeamID']
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['Season'] = df_reg_detail['Season']
df_losses['1TeamID'] = df_reg_detail['LTeamID']
df_losses['2TeamID'] = df_reg_detail['WTeamID']
df_losses['Result'] = 0

df_regular_season_games = pd.concat((df_wins, df_losses))
df_regular_season_games.head()

Unnamed: 0,Season,1TeamID,2TeamID,Result
0,2010,3103,3237,1
1,2010,3104,3399,1
2,2010,3110,3224,1
3,2010,3111,3267,1
4,2010,3119,3447,1


In [547]:
df_regular_season_games_with_features = pd.merge(left=df_regular_season_games, right=df_engineered_features_w, on=['Season', '1TeamID'])
df_regular_season_games_with_features = pd.merge(left=df_regular_season_games_with_features, right=df_engineered_features_l, on=['Season', '2TeamID'])

In [548]:
df_regular_season_games_with_features.head()

Unnamed: 0,Season,1TeamID,2TeamID,Result,1FGPbin,1FTPbin,1FGAbin,1FTAbin,1Astbin,1Blkbin,...,2FGAbin,2FTAbin,2Astbin,2Blkbin,2ORbin,2DRbin,2PODbin,2OCRbin,2Stlbin,2Leaguebin
0,2010,3103,3237,1,10,14,7,12,10,6,...,5,14,2,15,12,11,5,6,7,4.0
1,2010,3231,3237,1,6,12,14,14,6,2,...,5,14,2,15,12,11,5,6,7,4.0
2,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,5,6,7,4.0
3,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,5,6,7,4.0
4,2010,3293,3237,1,12,19,13,12,12,13,...,5,14,2,15,12,11,5,6,7,4.0


## THIS ENDS WHERE JULIA WAS WORKING... BELOW IS FROM THE ORIGINAL

In [313]:
## Inject regular season engineered feature into touranement data for the corresponding seasons and teams

df_engineered_features = df_engineered_features[['Season', 'TeamID', 'FGPbin', 'FTPbin', 'FGAbin', 'FTAbin', 'Astbin', 'Blkbin', 'ORbin', 'DRbin', 'Leaguebin', 'Stlbin', 'PODbin', 'OCRbin']]

# merge seeds with team IDs for tourney performance
df_W = df_seeds.rename(columns={'TeamID':'WTeamID', 'Seed':'WSeed'})
df_L = df_seeds.rename(columns={'TeamID':'LTeamID', 'Seed':'LSeed'})

df_engineered_features_W = df_engineered_features.rename(columns={'Leaguebin':'WLeaguebin', 'TeamID':'WTeamID', 'FGPbin':'WFGPbin', 'FTPbin':'WFTPbin', 'FGAbin':'WFGAbin', 'FTAbin':'WFTAbin', 'Astbin':'WAstbin', 'Blkbin':'WBlkbin', 'ORbin':'WORbin', 'DRbin':'WDRbin', 'Stlbin':'WStlbin', 'PODbin':'WPODbin', 'OCRbin':'WOCRbin'})
df_engineered_features_L = df_engineered_features.rename(columns={'Leaguebin':'LLeaguebin', 'TeamID':'LTeamID', 'FGPbin':'LFGPbin', 'FTPbin':'LFTPbin', 'FGAbin':'LFGAbin', 'FTAbin':'LFTAbin', 'Astbin':'LAstbin', 'Blkbin':'LBlkbin', 'ORbin':'LORbin', 'DRbin':'LDRbin', 'Stlbin':'LStlbin', 'PODbin':'LPODbin', 'OCRbin':'LOCRbin'})


# merge seed info with 
df_dummy = pd.merge(left=df_tour, right=df_W, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_L, how='left', on=['Season', 'LTeamID']) ## 

df_dummy2 = pd.merge(left=df_concat, right=df_engineered_features_W, on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy2, right=df_engineered_features_L, how='left', on=['Season', 'LTeamID'])


# at the beginning of the tourney, teams play within their region
# final 3 games = between regions

df_concat.head()



Unnamed: 0,LTeamID,Season,WTeamID,WSeed,LSeed,WFGPbin,WFTPbin,WFGAbin,WFTAbin,WAstbin,...,LFGAbin,LFTAbin,LAstbin,LBlkbin,LORbin,LDRbin,LLeaguebin,LStlbin,LPODbin,LOCRbin
0,3201,2010,3124,X04,X13,14,16,8,20,11,...,14,14,12,6,12,13,7.0,12,15,13
1,3207,2010,3124,X04,X05,14,16,8,20,11,...,12,16,13,6,16,2,15.0,17,13,10
2,3397,2010,3124,X04,X01,14,16,8,20,11,...,11,11,12,17,13,18,23.0,5,9,16
3,3181,2010,3124,X04,X02,14,16,8,20,11,...,12,12,10,14,14,12,22.0,14,11,11
4,3395,2010,3173,X08,X09,13,14,15,16,13,...,13,11,13,8,12,11,21.0,12,13,10


In [314]:
df_concat.columns

Index([u'LTeamID', u'Season', u'WTeamID', u'WSeed', u'LSeed', u'WFGPbin',
       u'WFTPbin', u'WFGAbin', u'WFTAbin', u'WAstbin', u'WBlkbin', u'WORbin',
       u'WDRbin', u'WLeaguebin', u'WStlbin', u'WPODbin', u'WOCRbin',
       u'LFGPbin', u'LFTPbin', u'LFGAbin', u'LFTAbin', u'LAstbin', u'LBlkbin',
       u'LORbin', u'LDRbin', u'LLeaguebin', u'LStlbin', u'LPODbin',
       u'LOCRbin'],
      dtype='object')

In [315]:
df_concat_old = df_concat[df_concat['Season'] < 2018]
df_concat_18 = df_concat[df_concat['Season'] == 2018]

In [306]:
df_concat_old['Result'] = 1

df_concat_old_neg = pd.DataFrame({
    'LAstbin' : df_concat_old['WAstbin'],
    'LBlkbin' : df_concat_old['WBlkbin'],
    'LDRbin' : df_concat_old['WDRbin'],
    'LFGAbin' : df_concat_old['WFGAbin'],
    'LFGPbin' : df_concat_old['WFGPbin'],
    'LFTAbin' : df_concat_old['WFTAbin'],
    'LFTPbin' : df_concat_old['WFTPbin'],
    'LLeaguebin' : df_concat_old['WLeaguebin'],
    'LOCRbin' : df_concat_old['WOCRbin'],
    'LORbin' : df_concat_old['WORbin'],
    'LPODbin' : df_concat_old['WPODbin'],
    'LSeed' : df_concat_old['WSeed'],
    'LStlbin' : df_concat_old['WStlbin'],
    'LTeamID' : df_concat_old['WTeamID'],
    'Season' : df_concat_old['Season'],
    'WAstbin' : df_concat_old['LAstbin'],
    'WBlkbin' : df_concat_old['LBlkbin'],
    'WDRbin' : df_concat_old['LDRbin'],
    'WFGAbin' : df_concat_old['LFGAbin'],
    'WFGPbin' : df_concat_old['LFGPbin'],
    'WFTAbin' : df_concat_old['LFTAbin'],
    'WFTPbin' : df_concat_old['LFTPbin'],
    'WLeaguebin' : df_concat_old['LLeaguebin'],
    'WOCRbin' : df_concat_old['LOCRbin'],
    'WORbin' : df_concat_old['LORbin'],
    'WPODbin' : df_concat_old['LPODbin'],
    'WSeed' : df_concat_old['LSeed'],
    'WStlbin' : df_concat_old['LStlbin'],
    'WTeamID' : df_concat_old['LTeamID'],
    'Result' : 0
})

df_concat = pd.concat([df_concat_old,df_concat_old_neg])

In [307]:
df_concat.columns

Index([u'LAstbin', u'LBlkbin', u'LDRbin', u'LFGAbin', u'LFGPbin', u'LFTAbin',
       u'LFTPbin', u'LLeaguebin', u'LOCRbin', u'LORbin', u'LPODbin', u'LSeed',
       u'LStlbin', u'LTeamID', u'Result', u'Season', u'WAstbin', u'WBlkbin',
       u'WDRbin', u'WFGAbin', u'WFGPbin', u'WFTAbin', u'WFTPbin',
       u'WLeaguebin', u'WOCRbin', u'WORbin', u'WPODbin', u'WSeed', u'WStlbin',
       u'WTeamID'],
      dtype='object')

In [317]:
df_actual_18 = pd.read_csv('NCAA_Result_2018 copy.csv')
df_actual_18 = df_actual_18.rename(columns={'Team1':'WTeamID','Team2':'LTeamID'})

df_concat_18 = pd.merge(left=df_actual_18, right=df_concat_18, how='left', on=['Season','WTeamID','LTeamID'])

In [318]:
df_concat_18.columns

Index([u'Season', u'WTeamID', u'LTeamID', u'Result', u'WSeed', u'LSeed',
       u'WFGPbin', u'WFTPbin', u'WFGAbin', u'WFTAbin', u'WAstbin', u'WBlkbin',
       u'WORbin', u'WDRbin', u'WLeaguebin', u'WStlbin', u'WPODbin', u'WOCRbin',
       u'LFGPbin', u'LFTPbin', u'LFGAbin', u'LFTAbin', u'LAstbin', u'LBlkbin',
       u'LORbin', u'LDRbin', u'LLeaguebin', u'LStlbin', u'LPODbin',
       u'LOCRbin'],
      dtype='object')

In [319]:
df_concat.head()

Unnamed: 0,LTeamID,Season,WTeamID,WSeed,LSeed,WFGPbin,WFTPbin,WFGAbin,WFTAbin,WAstbin,...,LFGAbin,LFTAbin,LAstbin,LBlkbin,LORbin,LDRbin,LLeaguebin,LStlbin,LPODbin,LOCRbin
0,3201,2010,3124,X04,X13,14,16,8,20,11,...,14,14,12,6,12,13,7.0,12,15,13
1,3207,2010,3124,X04,X05,14,16,8,20,11,...,12,16,13,6,16,2,15.0,17,13,10
2,3397,2010,3124,X04,X01,14,16,8,20,11,...,11,11,12,17,13,18,23.0,5,9,16
3,3181,2010,3124,X04,X02,14,16,8,20,11,...,12,12,10,14,14,12,22.0,14,11,11
4,3395,2010,3173,X08,X09,13,14,15,16,13,...,13,11,13,8,12,11,21.0,12,13,10


In [276]:
df_concat.shape

(2520, 29)

### Calculate a win probability for each pair of teams during regular season based on regular season data


In [270]:
df.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WLeagueName,LLeagueName,WPOD,LPOD,WOCR,LOCR,WFGP,LFGP,WFTP,LFTP
46337,2018,132,3195,68,3239,58,H,0,24,58,...,ASUN,ASUN,150,137,0.453333,0.423358,0.413793,0.418182,0.789474,0.692308
46338,2018,132,3251,60,3421,42,H,0,21,56,...,Big South,Big South,146,120,0.410959,0.35,0.375,0.297872,0.647059,0.8
46339,2018,132,3311,69,3372,65,N,0,24,47,...,Southland,Southland,132,156,0.522727,0.416667,0.510638,0.396552,0.7,0.857143
46340,2018,132,3343,63,3335,34,N,0,21,54,...,Ivy League,Ivy League,149,140,0.422819,0.242857,0.388889,0.222222,0.6,0.666667
46341,2018,132,3384,66,3352,56,H,0,19,46,...,NEC,NEC,138,162,0.478261,0.345679,0.413043,0.338462,0.769231,0.5


In [271]:
### Recast regular season games a pairwise team cobinations within seasons
df_wins = pd.DataFrame()
df_wins['Season'] = df['Season']
df_wins['First_TeamID'] = df['WTeamID']
df_wins['Second_TeamID'] = df['LTeamID']
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['Season'] = df['Season']
df_losses['First_TeamID'] = df['LTeamID']
df_losses['Second_TeamID'] = df['WTeamID']
df_losses['Result'] = 0

df_regular_season_games = pd.concat((df_wins, df_losses))
df_regular_season_games.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result
0,2010,3103,3237,1
1,2010,3104,3399,1
2,2010,3110,3224,1
3,2010,3111,3267,1
4,2010,3119,3447,1


In [272]:
df_regular_season_games.tail()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result
46337,2018,3239,3195,0
46338,2018,3421,3251,0
46339,2018,3372,3311,0
46340,2018,3335,3343,0
46341,2018,3352,3384,0


### Inject engineered features

In [273]:
df_engineered_features_W.head()

Unnamed: 0,Season,WTeamID,WFGPbin,WFTPbin,WFGAbin,WFTAbin,WAstbin,WBlkbin,WORbin,WDRbin,WLeaguebin,WStlbin,WPODbin,WOCRbin
0,2010,3147,10,13,10,12,9,9,11,12,,8,9,10
1,2010,3153,6,15,4,7,5,5,5,9,9.0,2,4,7
2,2010,3163,21,18,11,10,18,17,10,22,9.0,8,11,21
3,2010,3187,15,16,7,16,13,6,9,15,9.0,8,7,18
4,2010,3222,8,12,13,17,6,10,12,14,9.0,7,13,9


In [168]:
df_first_team_features = df_engineered_features_W
df_first_team_features = df_first_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_regular_season_games_with_features = pd.merge(left=df_regular_season_games, right=df_first_team_features, on=['Season', 'First_TeamID'])

df_second_team_features = df_engineered_features_L
df_second_team_features = df_second_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_regular_season_games_with_features = pd.merge(left=df_regular_season_games_with_features, right=df_second_team_features, on=['Season', 'Second_TeamID'])
df_regular_season_games_with_features.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result,First_FGPbin,First_FTPbin,First_FGAbin,First_FTAbin,First_Astbin,First_Blkbin,...,LFGAbin,LFTAbin,Second_Astbin,Second_Blkbin,Second_ORbin,Second_DRbin,Second_Leaguebin,Second_Stlbin,Second_PODbin,Second_OCRbin
0,2010,3103,3237,1,10,14,7,12,10,6,...,5,14,2,15,12,11,4.0,7,5,6
1,2010,3231,3237,1,6,12,14,14,6,2,...,5,14,2,15,12,11,4.0,7,5,6
2,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,4.0,7,5,6
3,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,4.0,7,5,6
4,2010,3293,3237,1,12,19,13,12,12,13,...,5,14,2,15,12,11,4.0,7,5,6


### Train model on regular season data to predict wins based on regular season performance

In [175]:
df_temp = df_regular_season_games_with_features.dropna(how='any')
df_temp = df_temp[['Result', 'Season', 'First_TeamID', 'Second_TeamID', 'First_PODbin', 'Second_PODbin', 'First_OCRbin', 'Second_OCRbin', 'First_Leaguebin', 'Second_Leaguebin', 'First_Astbin', 'Second_Astbin', 'First_Blkbin', 'Second_Blkbin', 'First_ORbin', 'Second_ORbin', 'First_Stlbin', 'Second_Stlbin']]

##cat1 = pd.get_dummies(df_temp['First_TeamID'], prefix = 'First_TeamID')
##cat2 = pd.get_dummies(df_temp['Second_TeamID'], prefix = 'Second_TeamID')
##cat3 = pd.get_dummies(df_temp['Season'], prefix = 'Season')

##df_temp = pd.concat([df_temp, cat1, cat2],axis=1)
##df_temp = df_temp.drop('First_TeamID', 1)
##df_temp = df_temp.drop('Second_TeamID', 1)

##df_temp = df_temp.drop({ORbinDiff','DRbinDiff', 'FGAbinDiff', 'FGPbinDiff', 'FTAbinDiff', 'AstbinDiff', 'StlbinDiff', 'FTPbinDiff'}, 1)

X_train = df_temp[df_temp['Season'] < 2018]
X_test = df_temp[df_temp['Season'] == 2018]

##X_train = X_train.drop({'SeedDiff'}, 1)
##X_test = X_test.drop({'SeedDiff'}, 1)


## cat5 = pd.get_dummies(X_train['Season'], prefix = 'Season')
## X_train = pd.concat([X_train, cat5],axis=1)

##X_train = X_train.drop('Season', 1)
##X_test = X_test.drop('Season', 1)


##X_train = X_train.drop('Season', 1)
##X_test = X_test.drop('Season', 1)

y_train = X_train.Result.values
y_test = X_test.Result.values

X_train = X_train.drop('Result', 1)
X_test = X_test.drop('Result', 1)

##X_train['RbinDiff'] = X_train.ORbinDiff + X_train.DRbinDiff

## X_temp = df_temp.drop({'ORbinDiff','DRbinDiff', 'AstbinDiff', 'FGPbinDiff', 'FGAbinDiff', 'FTPbinDiff', 'FTAbinDiff', 'LeaguebinDiff'}, 1)
## X_temp = df_temp[['SeedDiff', 'FGPbinDiff', 'AstbinDiff', 'BlkbinDiff']]


X_train.head()


Unnamed: 0,Season,First_TeamID,Second_TeamID,First_PODbin,Second_PODbin,First_OCRbin,Second_OCRbin,First_Leaguebin,Second_Leaguebin,First_Astbin,Second_Astbin,First_Blkbin,Second_Blkbin,First_ORbin,Second_ORbin,First_Stlbin,Second_Stlbin
0,2010,3103,3237,6,5,11,6,2.0,4.0,10,2,6,15,12,12,5,7
1,2010,3231,3237,13,5,6,6,17.0,4.0,6,2,2,15,13,12,10,7
2,2010,3282,3237,9,5,12,6,0.0,4.0,10,2,7,15,14,12,6,7
3,2010,3282,3237,9,5,12,6,0.0,4.0,10,2,7,15,14,12,6,7
4,2010,3293,3237,13,5,13,6,0.0,4.0,12,2,13,15,9,12,6,7


In [176]:
def eval(clf):
    clf.fit(X_train,y_train)
    print "score = " + str(clf.score(X_test, y_test))

    pred = clf.predict_proba(X_test)
    print "log loss = " + str(metrics.log_loss(y_test,pred))

print "fitting logistic regression model"
modelLR = LogisticRegression()
eval(modelLR)

print "\nfitting Naive Bayes model"
modelNB = GaussianNB()
eval(modelNB)

print "\nfitting Random Forest model"
modelRF = RandomForestClassifier(n_jobs=2, random_state=0)
eval(modelRF)

fitting logistic regression model
score = 0.736417738529
log loss = 0.517229243381

fitting Naive Bayes model
score = 0.717796122096
log loss = 0.579345348845

fitting Random Forest model
score = 0.700998272221
log loss = 0.692909645744


In [171]:
## Fit a logistic regression model through grid search
lr = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
lr = GridSearchCV(lr, params, scoring='neg_log_loss', refit=True)
lr.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(lr.best_score_, lr.best_params_['C']))

Best log_loss: -0.5394, with best C: 0.001


In [133]:
lr = LogisticRegression(C=.01)

lr.fit(X_train,y_train)
lr.score(X_test,y_test)
##lr.predict_proba(X_test)

0.73800383877159303

In [134]:
pred = lr.predict_proba(X_test)
print "log loss = " + str(metrics.log_loss(y_test,pred))

log loss = 0.516913916426


### Engineer a feature 'winprob' for tournament matchups based on regular season data

In [325]:
df_concat.head()

Unnamed: 0,LTeamID,Season,WTeamID,WSeed,LSeed,WFGPbin,WFTPbin,WFGAbin,WFTAbin,WAstbin,...,LFGAbin,LFTAbin,LAstbin,LBlkbin,LORbin,LDRbin,LLeaguebin,LStlbin,LPODbin,LOCRbin
0,3201,2010,3124,X04,X13,14,16,8,20,11,...,14,14,12,6,12,13,7.0,12,15,13
1,3207,2010,3124,X04,X05,14,16,8,20,11,...,12,16,13,6,16,2,15.0,17,13,10
2,3397,2010,3124,X04,X01,14,16,8,20,11,...,11,11,12,17,13,18,23.0,5,9,16
3,3181,2010,3124,X04,X02,14,16,8,20,11,...,12,12,10,14,14,12,22.0,14,11,11
4,3395,2010,3173,X08,X09,13,14,15,16,13,...,13,11,13,8,12,11,21.0,12,13,10


In [275]:
df_tour_matchups = df_concat[['Season', 'WTeamID', 'LTeamID']]


df_tour_matchups_18 = df_tour_matchups[df_tour_matchups['Season'] == 2018]
df_tour_matchups = df_tour_matchups[df_tour_matchups['Season'] < 2018]
print df_tour_matchups.shape

(504, 3)


In [194]:
df_wins = pd.DataFrame()
df_wins['Season'] = df_tour_matchups['Season']
df_wins['First_TeamID'] = df_tour_matchups['WTeamID']
df_wins['Second_TeamID'] = df_tour_matchups['LTeamID']
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['Season'] = df_tour_matchups['Season']
df_losses['First_TeamID'] = df_tour_matchups['LTeamID']
df_losses['Second_TeamID'] = df_tour_matchups['WTeamID']
df_losses['Result'] = 0

df_tour_matchups = pd.concat((df_wins, df_losses))
df_tour_matchups.tail()

Unnamed: 0,Season,First_TeamID,Second_TeamID,Result
2452,2017,3335,3401,0
2453,2017,3129,3417,0
2454,2017,3401,3417,0
2455,2017,3286,3449,0
2456,2017,3328,3449,0


In [212]:
df_tour_matchups_18 = pd.read_csv('NCAA_Result_2018 copy.csv')
df_tour_matchups_18 = df_tour_matchups_18.rename(columns={'Team1':'First_TeamID','Team2':'Second_TeamID'})

df_tour_matchups = pd.concat([df_tour_matchups,df_tour_matchups_18])

In [213]:
### Inject engineered features into Tournament matchups
df_first_team_features = df_engineered_features_W
df_first_team_features = df_first_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_tour_matchups_with_features = pd.merge(left=df_tour_matchups, right=df_first_team_features, on=['Season', 'First_TeamID'])

df_second_team_features = df_engineered_features_L
df_second_team_features = df_second_team_features.rename(columns = {'WTeamID':'First_TeamID', 'LTeamID':'Second_TeamID', 'WFGPbin':'First_FGPbin', 'WFTPbin':'First_FTPbin', 'WFGAbin':'First_FGAbin', 'WPODbin':'First_PODbin', 'WOCRbin':'First_OCRbin', 'WFTAbin':'First_FTAbin', 'WAstbin':'First_Astbin', 'WBlkbin':'First_Blkbin', 'WORbin':'First_ORbin', 'WDRbin':'First_DRbin', 'WStlbin':'First_Stlbin', 'WLeaguebin':'First_Leaguebin', 'WSeed':'First_Seed', 'LFGPbin':'Second_FGPbin', 'LFTPbin':'Second_FTPbin', 'LPODbin':'Second_PODbin', 'LOCRbin':'Second_OCRbin', 'LAstbin':'Second_Astbin', 'LBlkbin':'Second_Blkbin', 'LORbin':'Second_ORbin', 'LDRbin':'Second_DRbin', 'LStlbin':'Second_Stlbin', 'LLeaguebin':'Second_Leaguebin', 'LSeed':'Second_Seed'})

df_tour_matchups_with_features = pd.merge(left=df_tour_matchups_with_features, right=df_second_team_features, on=['Season', 'Second_TeamID'])
df_tour_matchups_with_features.head()

Unnamed: 0,First_TeamID,LTeamID,Result,Season,Second_TeamID,WTeamID,First_FGPbin,First_FTPbin,First_FGAbin,First_FTAbin,...,LFGAbin,LFTAbin,Second_Astbin,Second_Blkbin,Second_ORbin,Second_DRbin,Second_Leaguebin,Second_Stlbin,Second_PODbin,Second_OCRbin
0,3110,,0.0,2018,3417,,12,18,10,14,...,15,16,11,11,18,12,21.0,12,13,10
1,3166,,0.0,2018,3417,,11,17,9,11,...,15,16,11,11,18,12,21.0,12,13,10
2,3280,,1.0,2018,3417,,14,16,13,14,...,15,16,11,11,18,12,21.0,12,13,10
3,3400,,0.0,2018,3417,,13,14,16,9,...,15,16,11,11,18,12,21.0,12,13,10
4,3113,,1.0,2018,3304,,11,13,8,11,...,9,10,11,12,11,15,17.0,6,10,8


In [214]:
## Use Naive Bayes probs
df_tour_matchups = df_tour_matchups_with_features[['Result', 'Season', 'First_TeamID', 'Second_TeamID', 'First_PODbin', 'Second_PODbin', 'First_OCRbin', 'Second_OCRbin', 'First_Astbin', 'Second_Astbin', 'First_Blkbin', 'Second_Blkbin', 'First_ORbin', 'Second_ORbin', 'First_Stlbin', 'Second_Stlbin', 'First_Leaguebin', 'Second_Leaguebin']]
df_tour_matchups = df_tour_matchups.drop('Result', 1)
df_temp = pd.DataFrame(modelNB.predict_proba(df_tour_matchups), columns=list('xy'))
df_temp.head()

Unnamed: 0,x,y
0,0.962222,0.037778
1,0.861062,0.138938
2,0.307419,0.692581
3,0.590905,0.409095
4,0.494106,0.505894


In [215]:
df_tour_matchups_with_features['winprob'] = df_temp.x
df_tour_matchups_with_features['lossprob'] = df_temp.y

df_tour_matchups_with_features = df_tour_matchups_with_features[['Season', 'First_TeamID', 'Second_TeamID', 'winprob', 'lossprob']]
df_tour_matchups_with_features.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,winprob,lossprob
0,2018,3110,3417,0.962222,0.037778
1,2018,3166,3417,0.861062,0.138938
2,2018,3280,3417,0.307419,0.692581
3,2018,3400,3417,0.590905,0.409095
4,2018,3113,3304,0.494106,0.505894


### Calculate a win probability for each pair of teams during a tournament season based on regular season data


In [326]:
df_concat.head()

Unnamed: 0,LTeamID,Season,WTeamID,WSeed,LSeed,WFGPbin,WFTPbin,WFGAbin,WFTAbin,WAstbin,...,LFGAbin,LFTAbin,LAstbin,LBlkbin,LORbin,LDRbin,LLeaguebin,LStlbin,LPODbin,LOCRbin
0,3201,2010,3124,X04,X13,14,16,8,20,11,...,14,14,12,6,12,13,7.0,12,15,13
1,3207,2010,3124,X04,X05,14,16,8,20,11,...,12,16,13,6,16,2,15.0,17,13,10
2,3397,2010,3124,X04,X01,14,16,8,20,11,...,11,11,12,17,13,18,23.0,5,9,16
3,3181,2010,3124,X04,X02,14,16,8,20,11,...,12,12,10,14,14,12,22.0,14,11,11
4,3395,2010,3173,X08,X09,13,14,15,16,13,...,13,11,13,8,12,11,21.0,12,13,10


In [216]:
# calculate difference in performance based for all features  
df_concat['SeedDiff'] = df_concat.apply(lambda row: int(row['WSeed'][1:3]) - int(row['LSeed'][1:3]), axis=1)
df_concat['SeedDiffPct'] = df_concat['SeedDiff'] / (df_concat.apply(lambda row: int(row['WSeed'][1:3]) + int(row['LSeed'][1:3]), axis=1))
df_concat['FGPbinDiff'] = df_concat.apply(lambda row: int(row['WFGPbin']) - int(row['LFGPbin']), axis=1)
df_concat['FTPbinDiff'] = df_concat.apply(lambda row: int(row['WFTPbin']) - int(row['LFTPbin']), axis=1)
df_concat['FGAbinDiff'] = df_concat.apply(lambda row: int(row['WFGAbin']) - int(row['LFGAbin']), axis=1)
df_concat['FTAbinDiff'] = df_concat.apply(lambda row: int(row['WFTAbin']) - int(row['LFTAbin']), axis=1)
df_concat['AstbinDiff'] = df_concat.apply(lambda row: int(row['WAstbin']) - int(row['LAstbin']), axis=1)
df_concat['BlkbinDiff'] = df_concat.apply(lambda row: int(row['WBlkbin']) - int(row['LBlkbin']), axis=1)
df_concat['ORbinDiff'] = df_concat.apply(lambda row: int(row['WORbin']) - int(row['LORbin']), axis=1)
df_concat['DRbinDiff'] = df_concat.apply(lambda row: int(row['WDRbin']) - int(row['LDRbin']), axis=1)
df_concat['StlbinDiff'] = df_concat.apply(lambda row: int(row['WStlbin']) - int(row['LStlbin']), axis=1)
df_concat['PODbinDiff'] = df_concat.apply(lambda row: int(row['WPODbin']) - int(row['LPODbin']), axis=1)
df_concat['OCRbinDiff'] = df_concat.apply(lambda row: int(row['WOCRbin']) - int(row['LOCRbin']), axis=1)
df_concat['LeaguebinDiff'] = df_concat.apply(lambda row: int(row['WLeaguebin']) - int(row['LLeaguebin']), axis=1)

df_concat.tail(10)

Unnamed: 0,LTeamID,Season,WTeamID,WSeed,LSeed,WFGPbin,WFTPbin,WFGAbin,WFTAbin,WAstbin,...,FGAbinDiff,FTAbinDiff,AstbinDiff,BlkbinDiff,ORbinDiff,DRbinDiff,StlbinDiff,PODbinDiff,OCRbinDiff,LeaguebinDiff
2447,3243,2017,3390,Y02,Y07,13,13,9,13,10,...,3,-2,1,6,2,6,-3,2,3,0
2448,3400,2017,3390,Y02,Y03,13,13,9,13,10,...,-4,5,2,2,-4,3,-2,0,1,0
2449,3323,2017,3390,Y02,Y01,13,13,9,13,10,...,-2,1,-7,1,2,-1,-3,0,-6,-1
2450,3235,2017,3393,W08,W09,7,18,17,16,11,...,7,5,1,6,7,-3,8,7,-3,1
2451,3173,2017,3397,X05,X12,11,17,13,14,11,...,3,-3,1,-3,-4,2,-1,0,1,13
2452,3335,2017,3401,W05,W12,12,19,9,12,13,...,1,3,2,-9,-1,2,-1,-1,5,21
2453,3129,2017,3417,W04,W13,8,17,14,17,9,...,0,8,-4,2,0,2,2,3,-2,14
2454,3401,2017,3417,W04,W05,8,17,14,17,9,...,5,5,-4,4,2,0,6,8,-5,-2
2455,3286,2017,3449,X03,X14,15,18,11,18,8,...,0,3,-3,-5,-6,1,-2,4,6,21
2456,3328,2017,3449,X03,X06,15,18,11,18,8,...,0,3,1,-6,-3,9,-7,4,7,0


In [141]:
df_wins = pd.DataFrame()

df_wins['SeedDiff'] = df_concat['SeedDiff']
df_wins['SeedDiffPct'] = df_concat['SeedDiffPct']
df_wins['FGPbinDiff'] = df_concat['FGPbinDiff']
df_wins['FTPbinDiff'] = df_concat['FTPbinDiff']
df_wins['FGAbinDiff'] = df_concat['FGAbinDiff']
df_wins['FTAbinDiff'] = df_concat['FTAbinDiff']
df_wins['AstbinDiff'] = df_concat['AstbinDiff']
df_wins['BlkbinDiff'] = df_concat['BlkbinDiff']
df_wins['ORbinDiff'] = df_concat['ORbinDiff']
df_wins['DRbinDiff'] = df_concat['DRbinDiff']
df_wins['StlbinDiff'] = df_concat['StlbinDiff']
df_wins['PODbinDiff'] = df_concat['PODbinDiff']
df_wins['OCRbinDiff'] = df_concat['OCRbinDiff']
df_wins['LeaguebinDiff'] = df_concat['LeaguebinDiff']
df_wins['Season'] = df_concat['Season']
df_wins['First_TeamID'] = df_concat['WTeamID']
df_wins['Second_TeamID'] = df_concat['LTeamID']


## df_wins['DayNum'] = df_concat['DayNum']



df_wins['Result'] = 1

df_losses = pd.DataFrame()

df_losses['SeedDiff'] = -df_concat['SeedDiff']
df_losses['SeedDiffPct'] = -df_concat['SeedDiffPct']
df_losses['FGPbinDiff'] = -df_concat['FGPbinDiff']
df_losses['FTPbinDiff'] = -df_concat['FTPbinDiff']
df_losses['FGAbinDiff'] = -df_concat['FGAbinDiff']
df_losses['FTAbinDiff'] = -df_concat['FTAbinDiff']
df_losses['AstbinDiff'] = -df_concat['AstbinDiff']
df_losses['BlkbinDiff'] = -df_concat['BlkbinDiff']
df_losses['ORbinDiff'] = -df_concat['ORbinDiff']
df_losses['DRbinDiff'] = -df_concat['DRbinDiff']
df_losses['StlbinDiff'] = -df_concat['StlbinDiff']
df_losses['PODbinDiff'] = -df_concat['PODbinDiff']
df_losses['OCRbinDiff'] = -df_concat['OCRbinDiff']
df_losses['LeaguebinDiff'] = -df_concat['LeaguebinDiff']
df_losses['Season'] = df_concat['Season']
df_losses['First_TeamID'] = df_concat['LTeamID']
df_losses['Second_TeamID'] = df_concat['WTeamID']
## df_losses['DayNum'] = df_concat['DayNum']



df_losses['Result'] = 0

df_predictions = pd.concat((df_wins, df_losses))
df_predictions.tail()


Unnamed: 0,SeedDiff,SeedDiffPct,FGPbinDiff,FTPbinDiff,FGAbinDiff,FTAbinDiff,AstbinDiff,BlkbinDiff,ORbinDiff,DRbinDiff,StlbinDiff,PODbinDiff,OCRbinDiff,LeaguebinDiff,Season,First_TeamID,Second_TeamID,Result
0,-11,-0.647059,4,1,5,5,4,4,0,0,8,4,5,15,2011,3177,3298,1
1,-3,-0.333333,2,-3,-2,-2,7,4,-2,-3,0,0,1,-2,2011,3177,3336,1
2,-13,-0.764706,-3,-4,-3,-4,-2,2,2,-6,4,-4,-3,22,2011,3181,3404,1
3,-8,-0.666667,-6,-10,-2,1,-5,-1,6,-8,4,-2,-7,19,2011,3181,3265,1
4,-1,-0.2,-5,-2,-3,-1,-5,0,2,-1,0,-3,-5,7,2011,3181,3177,1


In [143]:
jb = pd.read_csv('jb_features.csv')
jb.head()

Unnamed: 0,LeaguePerfDiff,Season,SeedDiff,Team1,Team1AvgGames,Team1AvgSeed,Team1LeaguePerf,Team1PrevSeed,Team1Seed,Team2,Team2AvgGames,Team2AvgSeed,Team2LeaguePerf,Team2PrevSeed,Team2Seed
0,0.41777,1999,7,3104,1.0,9.5,0.50788,2.0,5,3212,0.0,16.5,0.09011,16.0,12
1,-0.020362,1999,5,3112,1.0,10.0,0.487518,3.0,6,3196,1.0,10.0,0.50788,3.0,11
2,0.451126,1999,13,3155,0.5,11.5,0.501126,6.0,2,3197,0.0,17.0,0.05,17.0,15
3,0.142226,1999,13,3161,0.5,14.5,0.299423,12.0,2,3169,0.0,17.0,0.157197,17.0,15
4,0.200148,1999,15,3163,1.5,9.5,0.227926,2.0,1,3384,0.0,16.5,0.027778,16.0,16


In [144]:
# match on season and TeamIDs, Seed Diff (note our directions are switched)
jb = jb.rename(columns = {
    'Team2' : 'Second_TeamID',
    'Team1' : 'First_TeamID'
})

jb['SeedDiff'] = -jb['SeedDiff']

In [145]:
test = pd.merge(left=df_predictions,right=jb, how='left',on=['Season','First_TeamID','Second_TeamID','SeedDiff'])

In [44]:
test.head()

Unnamed: 0,SeedDiff,SeedDiffPct,FGPbinDiff,FTPbinDiff,FGAbinDiff,FTAbinDiff,AstbinDiff,BlkbinDiff,ORbinDiff,DRbinDiff,...,Team1AvgGames,Team1AvgSeed,Team1LeaguePerf,Team1PrevSeed,Team1Seed,Team2AvgGames,Team2AvgSeed,Team2LeaguePerf,Team2PrevSeed,Team2Seed
0,1,0.058824,-2,-5,2,6,-2,5,5,-1,...,0.0,17.0,0.487518,17.0,9,1.0,7.5,0.50788,8.0,8
1,-13,-0.764706,0,-2,9,4,6,0,8,9,...,4.0,1.0,0.490433,1.0,2,0.0,17.0,0.282576,17.0,15
2,-5,-0.555556,4,4,7,0,9,3,4,2,...,4.0,1.0,0.490433,1.0,2,2.5,5.0,0.487518,2.0,7
3,-1,-0.2,3,4,2,-1,7,1,3,5,...,4.0,1.0,0.490433,1.0,2,3.0,2.0,0.50788,2.0,3
4,7,0.411765,0,-3,-2,5,1,13,-3,11,...,0.0,13.5,0.165578,17.0,12,0.0,17.0,0.501126,17.0,5


In [320]:
test = pd.concat([df_concat,df_concat_18])

In [324]:
test.head()

Unnamed: 0,LAstbin,LBlkbin,LDRbin,LFGAbin,LFGPbin,LFTAbin,LFTPbin,LLeaguebin,LOCRbin,LORbin,...,WFGPbin,WFTAbin,WFTPbin,WLeaguebin,WOCRbin,WORbin,WPODbin,WSeed,WStlbin,WTeamID
0,12,6,13,14,12,14,14,7.0,13,12,...,14,20,16,21.0,16,9,7,X04,7,3124
1,13,6,2,12,9,16,16,15.0,10,16,...,14,20,16,21.0,16,9,7,X04,7,3124
2,12,17,18,11,15,11,13,23.0,16,13,...,14,20,16,21.0,16,9,7,X04,7,3124
3,10,14,12,12,11,12,11,22.0,11,14,...,14,20,16,21.0,16,9,7,X04,7,3124
4,13,8,11,13,9,11,18,21.0,10,12,...,13,16,14,10.0,12,16,15,X08,7,3173


In [148]:
# Other potential columns:
## average seed of either team, Seed Diff, Previous Seeds of each team, Avg. number of games in tournament
df_temp2 = test[['First_TeamID', 'Second_TeamID','SeedDiff' ,'SeedDiffPct', 'Season', 'Result', 'LeaguebinDiff','Team1AvgGames','Team2AvgGames','Team1AvgSeed','Team2AvgSeed','Team1PrevSeed','Team2PrevSeed']]
df_temp2 = pd.merge(left=df_temp2, right=df_tour_matchups_with_features, how='left', on=['Season', 'First_TeamID', 'Second_TeamID'])
df_temp = df_temp2.drop({ 'lossprob' , 'First_TeamID', 'Second_TeamID','Team1PrevSeed','Team2PrevSeed','Team1AvgGames','Team2AvgGames','Team1AvgSeed','Team2AvgSeed','SeedDiff'}, 1)
print df_temp.head()

X_train = df_temp[df_temp['Season'] < 2017]
X_test = df_temp[df_temp['Season'] == 2017]

y_train = X_train.Result.values
y_test = X_test.Result.values

X_train = X_train.drop(['Result','Season'], 1)
X_test = X_test.drop(['Result','Season'], 1)

print X_train.head()

   SeedDiffPct  Season  Result  LeaguebinDiff   winprob
0    -0.647059    2011       1             15  0.025570
1    -0.333333    2011       1             -2  0.302635
2    -0.764706    2011       1             22  0.093802
3    -0.666667    2011       1             19  0.303333
4    -0.200000    2011       1              7  0.496440
   SeedDiffPct  LeaguebinDiff   winprob
0    -0.647059             15  0.025570
1    -0.333333             -2  0.302635
2    -0.764706             22  0.093802
3    -0.666667             19  0.303333
4    -0.200000              7  0.496440


In [218]:
X_test.head()

Unnamed: 0,Season,First_TeamID,Second_TeamID,First_PODbin,Second_PODbin,First_OCRbin,Second_OCRbin,First_Leaguebin,Second_Leaguebin,First_Astbin,Second_Astbin,First_Blkbin,Second_Blkbin,First_ORbin,Second_ORbin,First_Stlbin,Second_Stlbin
82266,2018,3104,3105,9,9,8,6,23.0,0.0,6,11,10,6,13,13,10,4
82267,2018,3108,3105,12,9,8,6,0.0,0.0,9,11,6,6,12,13,12,4
82268,2018,3108,3105,12,9,8,6,0.0,0.0,9,11,6,6,12,13,12,4
82269,2018,3412,3105,12,9,14,6,12.0,0.0,16,11,11,6,10,13,5,4
82270,2018,3191,3105,7,9,9,6,4.0,0.0,6,11,6,6,4,13,1,4


In [149]:
## Fit to training tournament matchups and predict test matchups



def eval(clf):
    clf.fit(X_train,y_train)
    print "score = " + str(clf.score(X_test, y_test))

    pred = clf.predict_proba(X_test)
    print "log loss = " + str(metrics.log_loss(y_test,pred))

print "fitting logistic regression model"
model = LogisticRegression()
eval(model)

print "\nfitting SVM model"
model = svm.SVC(probability=True)
eval(model)

print "\nfitting Naive Bayes model"
model = GaussianNB()
eval(model)

print "\nfitting Random Forest model"
model = RandomForestClassifier(n_jobs=2, random_state=0)
eval(model)

print "\nfitting Gradient Boosting Classifier"
model = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0, max_depth=2, random_state=0)
eval(model)


fitting logistic regression model
score = 0.873015873016
log loss = 0.389504255304

fitting SVM model
score = 0.888888888889
log loss = 0.381631263473

fitting Naive Bayes model
score = 0.857142857143
log loss = 0.465268892775

fitting Random Forest model
score = 0.698412698413
log loss = 1.77257203829

fitting Gradient Boosting Classifier
score = 0.801587301587
log loss = 0.487281654588


In [59]:
X_train.shape
X_test.shape

(126, 9)

In [49]:
## Convert to arrays
## X_train = df_predictions.SeedDiff.values.reshape(-1,1)
## y_train = df_predictions.Result.values

df_temp = df_predictions[['First_TeamID', 'Second_TeamID', 'SeedDiffPct', 'Season', 'Result', 'LeaguebinDiff']]
print df_temp.head()
df_temp = pd.merge(left=df_temp, right=df_tour_matchups_with_features, how='left', on=['Season', 'First_TeamID', 'Second_TeamID'])
print df_temp.head()
df_temp = df_temp.drop({ 'lossprob' , 'First_TeamID', 'Second_TeamID',}, 1)
##df_temp['stage2pred'] = stage2pred


##df_temp = df_temp.drop({ORbinDiff','DRbinDiff', 'FGAbinDiff', 'FGPbinDiff', 'FTAbinDiff', 'AstbinDiff', 'StlbinDiff', 'FTPbinDiff'}, 1)

X_train = df_temp[df_temp['Season'] < 2017]
X_test = df_temp[df_temp['Season'] == 2017]

##X_train = X_train.drop({'SeedDiff'}, 1)
##X_test = X_test.drop({'SeedDiff'}, 1)


## cat5 = pd.get_dummies(X_train['Season'], prefix = 'Season')
## X_train = pd.concat([X_train, cat5],axis=1)

X_train = X_train.drop('Season', 1)
X_test = X_test.drop('Season', 1)


y_train = X_train.Result.values
y_test = X_test.Result.values

X_train = X_train.drop('Result', 1)
X_test = X_test.drop('Result', 1)

##X_train['RbinDiff'] = X_train.ORbinDiff + X_train.DRbinDiff

## X_temp = df_temp.drop({'ORbinDiff','DRbinDiff', 'AstbinDiff', 'FGPbinDiff', 'FGAbinDiff', 'FTPbinDiff', 'FTAbinDiff', 'LeaguebinDiff'}, 1)
## X_temp = df_temp[['SeedDiff', 'FGPbinDiff', 'AstbinDiff', 'BlkbinDiff']]


print X_train.head()


   First_TeamID  Second_TeamID  SeedDiffPct  Season  Result  LeaguebinDiff
0          3113           3435     0.058824    2014       1             -2
1          3124           3443    -0.764706    2014       1              9
2          3124           3143    -0.555556    2014       1              0
3          3124           3246    -0.200000    2014       1             -2
4          3140           3301     0.411765    2014       1            -16
   First_TeamID  Second_TeamID  SeedDiffPct  Season  Result  LeaguebinDiff  \
0          3113           3435     0.058824    2014       1             -2   
1          3124           3443    -0.764706    2014       1              9   
2          3124           3143    -0.555556    2014       1              0   
3          3124           3246    -0.200000    2014       1             -2   
4          3140           3301     0.411765    2014       1            -16   

    winprob  lossprob  
0  0.600978  0.399022  
1  0.061416  0.938584  
2  0.0684