In [561]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools as it

import seaborn as sns

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [395]:

# right now, only looking at tournament seed / performance, not regular season games
data_dir = './WDataFiles/'
data_dir_1 = './WPrelimData2018/'
data_dir_2 = './WStage2DataFiles/'
df_seeds = pd.read_csv(data_dir_2 + 'WNCAATourneySeeds.csv')
df_tour_compact = pd.read_csv(data_dir_2 + 'WNCAATourneyCompactResults.csv')

df_tour_compact['WstdScore'] = df_tour_compact['WScore']/(df_tour_compact['WScore']+df_tour_compact['LScore'])
df_tour_compact['LstdScore'] = df_tour_compact['LScore']/(df_tour_compact['WScore']+df_tour_compact['LScore'])
df_tour_compact['stdScorediff'] = (df_tour_compact['WScore']-df_tour_compact['LScore'])/(df_tour_compact['WScore']+df_tour_compact['LScore'])


df_tour = pd.read_csv(data_dir_2 + 'WNCAATourneyDetailedResults.csv')
df_reg_compact = pd.read_csv(data_dir_2 +'WRegularSeasonCompactResults.csv')


df_reg_compact['WstdScore'] = df_reg_compact['WScore']/(df_reg_compact['WScore']+df_reg_compact['LScore'])
df_reg_compact['LstdScore'] = df_reg_compact['LScore']/(df_reg_compact['WScore']+df_reg_compact['LScore'])
df_reg_compact['stdScorediff'] = (df_reg_compact['WScore']-df_reg_compact['LScore'])/(df_reg_compact['WScore']+df_reg_compact['LScore'])



df_reg_detail = pd.read_csv(data_dir_2 + 'WRegularSeasonDetailedResults.csv')

df_reg_detail['WstdScore'] = df_reg_detail['WScore']/(df_reg_detail['WScore']+df_reg_detail['LScore'])
df_reg_detail['LstdScore'] = df_reg_detail['LScore']/(df_reg_detail['WScore']+df_reg_detail['LScore'])
df_reg_detail['stdScorediff'] = (df_reg_detail['WScore']-df_reg_detail['LScore'])/(df_reg_detail['WScore']+df_reg_detail['LScore'])

df_tour_detail = pd.read_csv(data_dir_2 + 'WNCAATourneyDetailedResults.csv')

df_tour_detail['WstdScore'] = df_tour_detail['WScore']/(df_tour_detail['WScore']+df_tour_detail['LScore'])
df_tour_detail['LstdScore'] = df_tour_detail['LScore']/(df_tour_detail['WScore']+df_tour_detail['LScore'])
df_tour_detail['stdScorediff'] = (df_tour_detail['WScore']-df_tour_detail['LScore'])/(df_tour_detail['WScore']+df_tour_detail['LScore'])

df_reg_compact['Game_Type'] = "R"

df_tour_compact['Game_Type'] = "T"

df_reg_detail['Game_Type'] = "R"

df_tour_detail['Game_Type'] = "T"

df_reg_tour_compact = pd.concat([df_reg_compact,df_tour_compact],ignore_index=True)
df_reg_tour_compact.sort_values(by=['Season','DayNum','Game_Type'],inplace=True)
df_reg_tour_compact=df_reg_tour_compact.reset_index()



df_reg_tour_detail = pd.concat([df_tour_detail,df_reg_detail],ignore_index=True)

df_reg_tour_detail.sort_values(by=['Season','DayNum','Game_Type'],inplace=True)

df_reg_tour_detail=df_reg_tour_detail.reset_index()

df_reg_tour_detail['index'] = df_reg_tour_detail.index

df_teams = pd.read_csv(data_dir_2 + 'WTeams.csv')

df_teams_leagues = pd.merge(left=df_teams, right=pd.read_csv(data_dir_2 + 'WLeagues.csv'), how='left', on=['TeamName'])

df_teams_leagues = df_teams_leagues[(df_teams_leagues.LeagueName.notnull().values)]

df_teams_leagues.head()

Unnamed: 0,TeamID,TeamName,LeagueName
0,3101,Abilene Chr,Southland
1,3102,Air Force,MWC
2,3103,Akron,MAC
3,3104,Alabama,SEC
4,3105,Alabama A&M,SWAC


# Get Matchups of interest for Training & Test Data¶


In [396]:

## Get 2018 data

# sorting ensures that we work through smallest to largest 
# no repeats, smallest always first in pair
current_year = df_seeds[df_seeds['Season'] == 2018].sort_values(by=['TeamID'])
current_year_pairs = list(it.combinations(current_year['TeamID'],2))

team1, team2 = map(list, zip(*current_year_pairs))

df_test_pairs = pd.DataFrame({
    'Season' : [2018] * len(team1),
    'Team1' : team1,
    'Team2' : team2
})

In [397]:

# also need previous matchups (1998 - 2017)
df_train_pairs = pd.DataFrame(
    { 'Team1' : df_tour_compact['WTeamID'],
     'Team2' : df_tour_compact['LTeamID'],
     'Season' : df_tour_compact['Season']
    })

df_train_pairs.tail()

Unnamed: 0,Season,Team1,Team2
1255,2017,3163,3332
1256,2017,3376,3199
1257,2017,3280,3163
1258,2017,3376,3390
1259,2017,3376,3280


In [398]:
# this is what we will use as a base to build our dataset
df_train_test_matchups = df_train_pairs.append(df_test_pairs)

# Feature Engineering for Training & Test Data

## Current Year Seeds


In [479]:
# get just integer value of seed (exclude region information)
df_seeds = pd.read_csv(data_dir_2 + 'WNCAATourneySeeds.csv')

def seed_to_int(seed):
    #Get just the digits from the seeding. Return as int
    s_int = int(seed[1:3])
    return s_int
df_seeds['SeedInt'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(labels=['Seed'], inplace=True, axis=1) # This is the string label

df_seeds.head()

df_seeds[df_seeds.TeamID==3104]

Unnamed: 0,Season,TeamID,SeedInt
17,1998,3104,2
100,1999,3104,5


In [480]:
# merge seeds with team IDs for tourney performance
df_W = df_seeds.rename(columns={'TeamID':'Team1', 'SeedInt':'Team1Seed'})
df_L = df_seeds.rename(columns={'TeamID':'Team2', 'SeedInt':'Team2Seed'})
df_dummy = pd.merge(left=df_train_test_matchups, right=df_W, how='left', on=['Season', 'Team1'])
df_matchups = pd.merge(left=df_dummy, right=df_L, how='left', on=['Season', 'Team2'])

df_matchups.head()

Unnamed: 0,Season,Team1,Team2,Team1Seed,Team2Seed
0,1998,3104,3422,2,15
1,1998,3112,3365,3,14
2,1998,3163,3193,2,15
3,1998,3198,3266,7,10
4,1998,3203,3208,10,7


## Current Year Seed Difference


In [481]:
# negative number indicates that team 2 (loser, in this case) has higher seed
df_matchups['SeedDiff'] = df_matchups['Team2Seed']
df_matchups['SeedDiff'] = df_matchups['Team2Seed'].sub(df_matchups['Team1Seed'], axis=0)

df_matchups.tail()

Unnamed: 0,Season,Team1,Team2,Team1Seed,Team2Seed,SeedDiff
3271,2018,3437,3443,9,11,2
3272,2018,3437,3453,9,7,-2
3273,2018,3438,3443,10,11,1
3274,2018,3438,3453,10,7,-3
3275,2018,3443,3453,11,7,-4


## Last Year's Tournament Seed

In [482]:

# what to do for games where we had no previous year (1998)?

# add 1 so easily referenced
df_seeds['Season'] = df_seeds['Season'].apply(lambda x: x + 1)
df_W = df_seeds.rename(columns={'TeamID':'Team1','SeedInt':'Team1PrevSeed'})
df_L = df_seeds.rename(columns={'TeamID':'Team2','SeedInt':'Team2PrevSeed'})
df_dummy = pd.merge(left=df_matchups, right=df_W, how='left', on=['Season', 'Team1'])
df_matchups = pd.merge(left=df_dummy, right=df_L, how='left', on=['Season', 'Team2'])

## what to do if they weren't in the tournament last year (17?)
df_matchups = df_matchups.fillna(value=17)

df_matchups.head()

Unnamed: 0,Season,Team1,Team2,Team1Seed,Team2Seed,SeedDiff,Team1PrevSeed,Team2PrevSeed
0,1998,3104,3422,2,15,13,17.0,17.0
1,1998,3112,3365,3,14,11,17.0,17.0
2,1998,3163,3193,2,15,13,17.0,17.0
3,1998,3198,3266,7,10,3,17.0,17.0
4,1998,3203,3208,10,7,-3,17.0,17.0


In [473]:
df_seeds[df_seeds.TeamID==3104]

Unnamed: 0,Season,TeamID,SeedInt
17,1999,3104,2
100,2000,3104,5


In [403]:
df_reg_tour_compact.columns = ['index', 'Season', 'DayNum', 'Team1', 'Score1', 'Team2', 'Score2',
       'Loc1', 'NumOT', 'stdScore1', 'stdScore2', 'stdScorediff', 'Game_Type']

In [404]:
def getteam2loc(x):
    if (x == "H"):
        l = "A"    
    if (x == "A"):
        l = "H"
    if (x == "N"):
        l = "N"    
    return(l)
    
df_reg_tour_compact['Loc2'] = df_reg_tour_compact.Loc1.apply(getteam2loc)

In [405]:
df_reg_tour_compact.columns

df_reg_tour_compact=df_reg_tour_compact.drop(['NumOT'], axis=1)



In [406]:
df_reg_tour_compact.head()

Unnamed: 0,index,Season,DayNum,Team1,Score1,Team2,Score2,Loc1,stdScore1,stdScore2,stdScorediff,Game_Type,Loc2
0,0,1998,18,3104,91,3202,41,H,0.689394,0.310606,0.378788,R,A
1,1,1998,18,3163,87,3221,76,H,0.533742,0.466258,0.067485,R,A
2,2,1998,18,3222,66,3261,59,H,0.528,0.472,0.056,R,A
3,3,1998,18,3307,69,3365,62,H,0.526718,0.473282,0.053435,R,A
4,4,1998,18,3349,115,3411,35,H,0.766667,0.233333,0.533333,R,A


In [407]:
df_test_pairs['DayNum'] = 140
df_test_pairs['Score1'] = 0
df_test_pairs['Score2'] = 0
df_test_pairs['stdScore1'] = 0
df_test_pairs['stdScore2'] = 0
df_test_pairs['Game_Type'] = "T"
df_test_pairs['Loc1'] = "N"
df_test_pairs['Loc2'] = "N"



df_reg_tour_compact = pd.concat([df_reg_tour_compact[['Season', 'DayNum', 'Team1', 'Score1', 'Team2', 'Score2','Loc1','Loc2','stdScore1', 'stdScore2','Game_Type']],
  df_test_pairs]).reset_index()        


df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,stdScore2
105164,2011,140,T,N,N,0,0,2018,3437,3443,0.0,0.0
105165,2012,140,T,N,N,0,0,2018,3437,3453,0.0,0.0
105166,2013,140,T,N,N,0,0,2018,3438,3443,0.0,0.0
105167,2014,140,T,N,N,0,0,2018,3438,3453,0.0,0.0
105168,2015,140,T,N,N,0,0,2018,3443,3453,0.0,0.0


In [408]:
# Add winning an losing team names to the regular compact and tour compact dataframes

df_teams_w = df_teams_leagues.rename(columns={'TeamID':'Team1','TeamName':'TeamName1','LeagueName':'LeagueName1'})
df_teams_l = df_teams_leagues.rename(columns={'TeamID':'Team2','TeamName':'TeamName2','LeagueName':'LeagueName2'})

#df_reg_compact = pd.merge(left=df_reg_compact, right=df_teams_w, how='left', on=['WTeamID'])
#df_reg_compact = pd.merge(left=df_reg_compact, right=df_teams_l, how='left', on=['LTeamID'])


#df_tour_compact = pd.merge(left=df_tour_compact, right=df_teams_w, how='left', on=['WTeamID'])
#df_tour_compact = pd.merge(left=df_tour_compact, right=df_teams_l, how='left', on=['LTeamID'])

df_reg_tour_compact = pd.merge(left=df_reg_tour_compact, right=df_teams_w, how='left', on=['Team1'])
df_reg_tour_compact = pd.merge(left=df_reg_tour_compact, right=df_teams_l, how='left', on=['Team2'])

#df_reg_tour_detail = pd.merge(left=df_reg_tour_detail, right=df_teams_w, how='left', on=['WTeamID'])
#df_reg_tour_detail = pd.merge(left=df_reg_tour_detail, right=df_teams_l, how='left', on=['LTeamID'])

In [409]:
df_reg_tour_compact['margin'] = df_reg_tour_compact.Score1 - df_reg_tour_compact.Score2

df_reg_tour_compact['index'] = df_reg_tour_compact.index

df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,stdScore2,TeamName1,LeagueName1,TeamName2,LeagueName2,margin
105164,105164,140,T,N,N,0,0,2018,3437,3443,0.0,0.0,Villanova,Big East,WKU,C-USA,0
105165,105165,140,T,N,N,0,0,2018,3437,3453,0.0,0.0,Villanova,Big East,WI Green Bay,Horizon,0
105166,105166,140,T,N,N,0,0,2018,3438,3443,0.0,0.0,Virginia,ACC,WKU,C-USA,0
105167,105167,140,T,N,N,0,0,2018,3438,3453,0.0,0.0,Virginia,ACC,WI Green Bay,Horizon,0
105168,105168,140,T,N,N,0,0,2018,3443,3453,0.0,0.0,WKU,C-USA,WI Green Bay,Horizon,0


In [410]:
# Caluclation of Elo Rating
K = 20.
HOME_ADVANTAGE = 100.
team_ids = set(df_reg_tour_compact.Team1).union(set(df_reg_tour_compact.Team2))
# This dictionary will be used as a lookup for current
# scores while the algorithm is iterating through each game
elo_dict = dict(zip(list(team_ids), [1500] * len(team_ids)))

elo_dict_count = dict(zip(list(team_ids), [0] * len(team_ids)))

df_teams_leagues = df_teams_leagues[df_teams_leagues['TeamID'].isin( list(team_ids))]

#df_teams_leagues = df_teams_leagues[(df_teams_leagues.LeagueName.notnull().values)]

a = list(df_teams_leagues.LeagueName.unique())

elo_league_dict = dict(zip(a,[1500]*len(a)))


Reg_weight = 0.8

Tour_weight = 1.0

def elo_pred(elo1, elo2):
    return(1. / (10. ** (-(elo1 - elo2) / 400.) + 1.))

def expected_margin(elo_diff):
    return((7.5 + 0.006 * elo_diff))

def elo_update(w_elo, l_elo, margin,Game_Type):
    elo_diff = w_elo - l_elo
    pred = elo_pred(w_elo, l_elo)
    if Game_Type == "R":
        
        mult = (((margin + 3.) ** 0.8) / expected_margin(elo_diff)) * Reg_weight
    else:
        mult = (((margin + 3.) ** 0.8) / expected_margin(elo_diff)) * Tour_weight
        
    update = K * mult * (1 - pred)
    return(pred, update)

def calc_league_elo_rating():
    
    for l in df_teams_leagues.LeagueName.unique():
            teams = df_teams_leagues.loc[df_teams_leagues.LeagueName == l,'TeamID']
            v = 0
            for t in teams:
                v = v + elo_dict[t]
            elo_league_dict[l] = v / len(teams) 
            
            for t in teams:                
                elo_dict[t] = elo_league_dict[l]

In [411]:
preds = []
w_elo = []
l_elo = []

current_season = 1998
# Loop over all rows of the games dataframe
for row in df_reg_tour_compact.itertuples():
    
    # Get key data from current row
    
    w = row.Team1
    l = row.Team2
    margin = row.margin
    wloc = row.Loc1
    Game_Type = row.Game_Type
    season = row.Season
    wLeague = row.LeagueName1
    lLeague = row.LeagueName2
    
    
    # Does either team get a home-court advantage?
    w_ad, l_ad, = 0., 0.
    if wloc == "H":
        w_ad += HOME_ADVANTAGE
    elif wloc == "A":
        l_ad += HOME_ADVANTAGE
    
    
    if (season == current_season):
        
        w_elo.append(elo_dict[w])
        l_elo.append(elo_dict[l])
        
        # Get elo updates as a result of the game
        pred, update = elo_update(elo_dict[w] + w_ad,
                              elo_dict[l] + l_ad, 
                              margin,Game_Type)
        preds.append(pred)

        elo_dict[w] += update
        elo_dict[l] -= update
        
    else:
        current_season = season
        calc_league_elo_rating()
        w_elo.append(elo_dict[w])
        l_elo.append(elo_dict[l])
        
        # Get elo updates as a result of the game
        pred, update = elo_update(elo_dict[w] + w_ad,
                              elo_dict[l] + l_ad, 
                              margin,Game_Type)
        preds.append(pred)
        
        elo_dict[w] += update
        elo_dict[l] -= update
            
    # Save prediction and new Elos for each round

df_reg_tour_compact['elo1'] = w_elo
df_reg_tour_compact['elo2'] = l_elo

In [412]:
df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,stdScore2,TeamName1,LeagueName1,TeamName2,LeagueName2,margin,elo1,elo2
105164,105164,140,T,N,N,0,0,2018,3437,3443,0.0,0.0,Villanova,Big East,WKU,C-USA,0,1568.829373,1536.37097
105165,105165,140,T,N,N,0,0,2018,3437,3453,0.0,0.0,Villanova,Big East,WI Green Bay,Horizon,0,1571.667533,1604.318569
105166,105166,140,T,N,N,0,0,2018,3438,3443,0.0,0.0,Virginia,ACC,WKU,C-USA,0,1585.165783,1533.532809
105167,105167,140,T,N,N,0,0,2018,3438,3453,0.0,0.0,Virginia,ACC,WI Green Bay,Horizon,0,1587.794463,1600.712537
105168,105168,140,T,N,N,0,0,2018,3443,3453,0.0,0.0,WKU,C-USA,WI Green Bay,Horizon,0,1530.904129,1597.347462


In [413]:
# Find per season Elo score

In [414]:
#a = df_reg_tour_compact[['Season','WTeamID',]].groupby(['season','WTeamID']).rolling(5, min_periods=1).mean().reset_index([0,1],drop=True)

a = df_reg_tour_compact[['index','Season','DayNum','Team1','Score1','Loc1','stdScore1','Game_Type','TeamName1','LeagueName1']]

a.columns=['index','Season','DayNum','TeamID','Score','Loc','stdScore','Game_Type','TeamName','LeagueName']

a = a.reset_index()
a.head()

Unnamed: 0,level_0,index,Season,DayNum,TeamID,Score,Loc,stdScore,Game_Type,TeamName,LeagueName
0,0,0,1998,18,3104,91,H,0.689394,R,Alabama,SEC
1,1,1,1998,18,3163,87,H,0.533742,R,Connecticut,AAC
2,2,2,1998,18,3222,66,H,0.528,R,Houston,AAC
3,3,3,1998,18,3307,69,H,0.526718,R,New Mexico,MWC
4,4,4,1998,18,3349,115,H,0.766667,R,Rice,C-USA


In [415]:
a['OC'] = 1

In [416]:
b = df_reg_tour_compact[['index','Season','DayNum','Team2','Score2','Loc2','stdScore2','Game_Type','TeamName2','LeagueName2']]

b.columns=['index','Season','DayNum','TeamID','Score','Loc','stdScore','Game_Type','TeamName','LeagueName']
b = b.reset_index()
b['OC'] = 0

In [417]:
c = pd.concat([a,b])    
c['rollingavg'] = c['OC']

In [418]:
c.sort_values(by=['Season','DayNum','index','TeamID'],inplace=True)

In [419]:
c = c.reset_index(drop=True)

In [420]:
d = c[['Season','TeamID','rollingavg']].groupby(['Season','TeamID']).rolling(5, min_periods=1).mean().reset_index([0,1],drop=True)

d.head()

Unnamed: 0,Season,TeamID,rollingavg
284,1998.0,3102.0,0.0
432,1998.0,3102.0,0.5
692,1998.0,3102.0,0.666667
934,1998.0,3102.0,0.5
1306,1998.0,3102.0,0.6


In [421]:
d['rollingavg_shifted'] = d.groupby(['Season','TeamID']).shift(1)

In [422]:
import math
d['rollingavg_final'] = d.apply(lambda x: x['rollingavg'] if (math.isnan(x['rollingavg_shifted'])) else x['rollingavg_shifted'],axis=1)

In [423]:
c = pd.concat([c, d[['rollingavg_final']]], axis=1)

In [424]:
c.tail()

Unnamed: 0,level_0,index,Season,DayNum,TeamID,Score,Loc,stdScore,Game_Type,TeamName,LeagueName,OC,rollingavg,rollingavg_final
210333,105166,105166,2018,140,3443,0,N,0.0,T,WKU,C-USA,0,0,0.0
210334,105167,105167,2018,140,3438,0,N,0.0,T,Virginia,ACC,1,1,0.2
210335,105167,105167,2018,140,3453,0,N,0.0,T,WI Green Bay,Horizon,0,0,0.0
210336,105168,105168,2018,140,3443,0,N,0.0,T,WKU,C-USA,1,1,0.0
210337,105168,105168,2018,140,3453,0,N,0.0,T,WI Green Bay,Horizon,0,0,0.0


In [425]:
c = pd.merge(c.loc[c.OC==1,['index','Season','DayNum','TeamID','Score','Loc','stdScore','Game_Type','TeamName','LeagueName','OC','rollingavg_final']].\
rename(columns={'TeamID':'Team1','Score':'Score1','Loc':'Loc1','stdScore':'stdScore1','TeamName':'TeamName1','LeagueName':'LeagueName1','OC':'OC1','rollingavg_final':'rollingavg_final1'}),
         c.loc[c.OC==0,['index','TeamID','Score','stdScore','TeamName','LeagueName','OC','rollingavg_final']].\
rename(columns={'TeamID':'Team2','Score':'Score2','stdScore':'stdScore2','TeamName':'TeamName2','LeagueName':'LeagueName2','OC':'OC2','rollingavg_final':'rollingavg_final2'}),on='index')

In [426]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,c[['index','rollingavg_final1','rollingavg_final2']],on='index')


In [427]:
df_reg_tour_compact.head()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,...,stdScore2,TeamName1,LeagueName1,TeamName2,LeagueName2,margin,elo1,elo2,rollingavg_final1,rollingavg_final2
0,0,18,R,H,A,91,41,1998,3104,3202,...,0.310606,Alabama,SEC,Furman,SoCon,50,1500.0,1500.0,1.0,0.0
1,1,18,R,H,A,87,76,1998,3163,3221,...,0.466258,Connecticut,AAC,Holy Cross,Patriot,11,1500.0,1500.0,1.0,0.0
2,2,18,R,H,A,66,59,1998,3222,3261,...,0.472,Houston,AAC,LSU,SEC,7,1500.0,1500.0,1.0,0.0
3,3,18,R,H,A,69,62,1998,3307,3365,...,0.473282,New Mexico,MWC,Santa Clara,WCC,7,1500.0,1500.0,1.0,0.0
4,4,18,R,H,A,115,35,1998,3349,3411,...,0.233333,Rice,C-USA,TX Southern,SWAC,80,1500.0,1500.0,1.0,0.0


In [428]:
# Create joint team Id
def createteam(row):
    z=[]
    z.append(row['Team1'])
    z.append(row['Team2'])
    z.sort()
    x = ''.join(z)
    return (x)

In [429]:
df_reg_tour_compact['JointTeamID']=df_reg_tour_compact[['Team1','Team2']].astype(str).apply(createteam,axis=1)

In [430]:
a = df_reg_tour_compact[['index','JointTeamID','Season','Team1','Score1','stdScore1']]

a.columns=['index','JointTeamID','Season','TeamID','Score','stdScore']

a = a.reset_index()
a['HOC'] = 1

In [431]:
b = df_reg_tour_compact[['index','JointTeamID','Season','Team2','Score2','stdScore2']]

b.columns=['index','JointTeamID','Season','TeamID','Score','stdScore']

b = b.reset_index()
b['HOC'] = 0

In [432]:
c = pd.concat([a,b])    
c['OC_dup'] = c['HOC']

In [433]:
c.head()

Unnamed: 0,level_0,index,JointTeamID,Season,TeamID,Score,stdScore,HOC,OC_dup
0,0,0,31043202,1998,3104,91,0.689394,1,1
1,1,1,31633221,1998,3163,87,0.533742,1,1
2,2,2,32223261,1998,3222,66,0.528,1,1
3,3,3,33073365,1998,3307,69,0.526718,1,1
4,4,4,33493411,1998,3349,115,0.766667,1,1


In [434]:
c.sort_values(by=['index','Season','JointTeamID'],inplace=True)
c = c.reset_index(drop=True)
c.head()

c['JointTeamID'] = c['JointTeamID'].astype('int64') 
c.dtypes

level_0          int64
index            int64
JointTeamID      int64
Season           int64
TeamID           int64
Score            int64
stdScore       float64
HOC              int64
OC_dup           int64
dtype: object

In [435]:
d = c[['Season','JointTeamID','TeamID','OC_dup','stdScore']].groupby(['Season','JointTeamID','TeamID']).rolling(5, min_periods=1).mean().reset_index([0,2],drop=True)


In [436]:
d.reset_index(0,drop=True).head()

Unnamed: 0,Season,JointTeamID,TeamID,OC_dup,stdScore
692,1998.0,31023119.0,3102.0,1.0,0.531532
693,1998.0,31023119.0,3119.0,0.0,0.468468
4693,1998.0,31023140.0,3102.0,0.0,0.368098
6691,1998.0,31023140.0,3102.0,0.0,0.413216
4692,1998.0,31023140.0,3140.0,1.0,0.631902


In [437]:
d[['H2Hrollingavg','H2HrollingstdScore']] = d.groupby(['Season','JointTeamID','TeamID']).shift(1)



Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


In [438]:
d = d.reset_index(0,drop=True)

In [439]:
import math
d['H2Hrollingavg_final'] = d.apply(lambda x: x['OC_dup'] if (math.isnan(x['H2Hrollingavg'])) else x['H2Hrollingavg'],axis=1)

In [440]:
d['H2HrollingstdScore_final'] = d.apply(lambda x: x['stdScore'] if (math.isnan(x['H2HrollingstdScore'])) else x['H2HrollingstdScore'],axis=1)

d.head()
#c = pd.concat([c, d[['rollingavg_final']]], axis=1)

Unnamed: 0,Season,JointTeamID,TeamID,OC_dup,stdScore,H2Hrollingavg,H2HrollingstdScore,H2Hrollingavg_final,H2HrollingstdScore_final
692,1998.0,31023119.0,3102.0,1.0,0.531532,,,1.0,0.531532
693,1998.0,31023119.0,3119.0,0.0,0.468468,,,0.0,0.468468
4693,1998.0,31023140.0,3102.0,0.0,0.368098,,,0.0,0.368098
6691,1998.0,31023140.0,3102.0,0.0,0.413216,0.0,0.368098,0.0,0.368098
4692,1998.0,31023140.0,3140.0,1.0,0.631902,,,1.0,0.631902


In [441]:
e = c[['Season','JointTeamID','TeamID','OC_dup']].groupby(['Season','JointTeamID','TeamID']).rolling(5, min_periods=1).count()


In [442]:

f=e.reset_index([0,1,2],drop=True)[['OC_dup']]
f.head()

Unnamed: 0,OC_dup
692,1.0
693,1.0
4693,1.0
6691,2.0
4692,1.0


In [443]:
c = pd.concat([c, pd.concat([d,f],axis=1)[['H2Hrollingavg_final','H2HrollingstdScore_final']]], axis=1)
c.head()

Unnamed: 0,level_0,index,JointTeamID,Season,TeamID,Score,stdScore,HOC,OC_dup,H2Hrollingavg_final,H2HrollingstdScore_final
0,0,0,31043202,1998,3104,91,0.689394,1,1,1.0,0.689394
1,0,0,31043202,1998,3202,41,0.310606,0,0,0.0,0.310606
2,1,1,31633221,1998,3163,87,0.533742,1,1,1.0,0.533742
3,1,1,31633221,1998,3221,76,0.466258,0,0,0.0,0.466258
4,2,2,32223261,1998,3222,66,0.528,1,1,1.0,0.528


In [444]:
g = pd.merge(c.loc[c.HOC==1,['index','JointTeamID','Season','TeamID','H2Hrollingavg_final','H2HrollingstdScore_final']].\
rename(columns={'TeamID':'Team1','H2Hrollingavg_final':'H2Hrollingavg_final1','H2HrollingstdScore_final':'H2HrollingstdScore_final1'}),
c.loc[c.HOC==0,['index','TeamID','H2Hrollingavg_final','H2HrollingstdScore_final']].\
rename(columns={'TeamID':'Team2','H2Hrollingavg_final':'H2Hrollingavg_final2','H2HrollingstdScore_final':'H2HrollingstdScore_final2'}),on='index')

In [445]:
g.tail()

Unnamed: 0,index,JointTeamID,Season,Team1,H2Hrollingavg_final1,H2HrollingstdScore_final1,Team2,H2Hrollingavg_final2,H2HrollingstdScore_final2
105164,105164,34373443,2018,3437,1.0,0.0,3443,0.0,0.0
105165,105165,34373453,2018,3437,1.0,0.0,3453,0.0,0.0
105166,105166,34383443,2018,3438,1.0,0.0,3443,0.0,0.0
105167,105167,34383453,2018,3438,1.0,0.0,3453,0.0,0.0
105168,105168,34433453,2018,3443,1.0,0.0,3453,0.0,0.0


In [446]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,g[['index','H2Hrollingavg_final1','H2Hrollingavg_final2','H2HrollingstdScore_final1','H2HrollingstdScore_final2']],on='index')


In [447]:
df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,...,margin,elo1,elo2,rollingavg_final1,rollingavg_final2,JointTeamID,H2Hrollingavg_final1,H2Hrollingavg_final2,H2HrollingstdScore_final1,H2HrollingstdScore_final2
105164,105164,140,T,N,N,0,0,2018,3437,3443,...,0,1568.829373,1536.37097,0.2,0.0,34373443,1.0,0.0,0.0,0.0
105165,105165,140,T,N,N,0,0,2018,3437,3453,...,0,1571.667533,1604.318569,0.4,0.0,34373453,1.0,0.0,0.0,0.0
105166,105166,140,T,N,N,0,0,2018,3438,3443,...,0,1585.165783,1533.532809,0.0,0.0,34383443,1.0,0.0,0.0,0.0
105167,105167,140,T,N,N,0,0,2018,3438,3453,...,0,1587.794463,1600.712537,0.2,0.0,34383453,1.0,0.0,0.0,0.0
105168,105168,140,T,N,N,0,0,2018,3443,3453,...,0,1530.904129,1597.347462,0.0,0.0,34433453,1.0,0.0,0.0,0.0


In [448]:
winning_games_w = df_reg_tour_compact.loc[df_reg_tour_compact.Game_Type=="T",['Team1','Season','index']].groupby(['Team1','Season']).agg('count').rename({'Team1':'Team1','index':'wincount1'}).reset_index()
winning_games_l = df_reg_tour_compact.loc[df_reg_tour_compact.Game_Type=="T",['Team1','Season','index']].groupby(['Team1','Season']).agg('count').rename({'Team1':'Team2','index':'wincount2'}).reset_index()

losing_games_w = df_reg_tour_compact.loc[df_reg_tour_compact.Game_Type=="T",['Team2','Season','index']].groupby(['Team2','Season']).agg('count').rename({'Team2':'Team1','index':'losscount1'}).reset_index()
losing_games_l = df_reg_tour_compact.loc[df_reg_tour_compact.Game_Type=="T",['Team2','Season','index']].groupby(['Team2','Season']).agg('count').rename({'Team2':'Team2','index':'losscount2'}).reset_index()

winning_games_w.columns = ['Team1','Season','wincount1']
winning_games_l.columns = ['Team2','Season','wincount2']
losing_games_w.columns = ['Team1','Season','losscount1']
losing_games_l.columns = ['Team2','Season','losscount2']


In [449]:
teams_tour_win_loss = pd.merge(winning_games_w,losing_games_w,how='outer',on=['Team1','Season']).fillna(0)

teams_tour_win_loss = teams_tour_win_loss.rename(columns = {'Team1':'TeamID','wincount1':'twincount','losscount1':'tlosscount'})

teams_tour_win_loss['Season'] = teams_tour_win_loss.Season + 1

teams_tour_win_loss = pd.merge(teams_tour_win_loss,df_teams_leagues,how='left',on=['TeamID'])
League_tour_win_loss= teams_tour_win_loss[['Season','LeagueName','twincount','tlosscount']].groupby(['Season','LeagueName']).sum().reset_index()

In [450]:
League_tour_win_loss=League_tour_win_loss.loc[~(League_tour_win_loss.Season==2019)]

In [451]:
teams_tour_win_loss=teams_tour_win_loss.loc[~(teams_tour_win_loss.Season==2019)]

In [452]:
teams_tour_win_loss.loc[teams_tour_win_loss.TeamID==3438]

Unnamed: 0,TeamID,Season,twincount,tlosscount,TeamName,LeagueName
665,3438,1999,1.0,1.0,Virginia,ACC
666,3438,2001,2.0,1.0,Virginia,ACC
667,3438,2004,1.0,1.0,Virginia,ACC
668,3438,2006,1.0,1.0,Virginia,ACC
669,3438,2009,1.0,1.0,Virginia,ACC
670,3438,2010,1.0,1.0,Virginia,ACC
1292,3438,2000,0.0,1.0,Virginia,ACC
1293,3438,2002,0.0,1.0,Virginia,ACC
1294,3438,2003,0.0,1.0,Virginia,ACC
1295,3438,2011,0.0,1.0,Virginia,ACC


In [453]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,teams_tour_win_loss[['Season','TeamID','twincount','tlosscount']],how='left',left_on=['Team1','Season'],right_on=['TeamID','Season']).fillna(0)

In [472]:
df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,...,rollingavg_final2,JointTeamID,H2Hrollingavg_final1,H2Hrollingavg_final2,H2HrollingstdScore_final1,H2HrollingstdScore_final2,twincount1,tlosscount1,twincount2,tlosscount2
105164,105164,140,T,N,N,0,0,2018,3437,3443,...,0.0,34373443,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
105165,105165,140,T,N,N,0,0,2018,3437,3453,...,0.0,34373453,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
105166,105166,140,T,N,N,0,0,2018,3438,3443,...,0.0,34383443,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
105167,105167,140,T,N,N,0,0,2018,3438,3453,...,0.0,34383453,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
105168,105168,140,T,N,N,0,0,2018,3443,3453,...,0.0,34433453,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [457]:
df_reg_tour_compact = df_reg_tour_compact.drop(['TeamID'],axis=1)

In [460]:
df_reg_tour_compact=df_reg_tour_compact.rename(columns={'twincount':'twincount1','tlosscount':'tlosscount1'})

In [461]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,teams_tour_win_loss[['Season','TeamID','twincount','tlosscount']],how='left',left_on=['Team2','Season'],right_on=['TeamID','Season']).fillna(0)

In [463]:
df_reg_tour_compact = df_reg_tour_compact.drop(['TeamID'],axis=1)
df_reg_tour_compact=df_reg_tour_compact.rename(columns={'twincount':'twincount2','tlosscount':'tlosscount2'})

In [None]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,df_matchups[['Season','Team1','Team1Seed','Team1PrevSeed']],how='left',on=['Season','Team1'])

df_reg_tour_compact = pd.merge(df_reg_tour_compact,df_matchups[['Season','Team2','Team2Seed','Team2PrevSeed']],how='left',on=['Season','Team2'])

In [659]:
df_reg_tour_compact.shape

(3853239, 34)

In [666]:
#df_train = df_reg_tour_compact.loc[df_reg_tour_compact.Season > 2012]

df_train = df_reg_tour_compact.loc[~((df_reg_tour_compact.Season==2018)&(df_reg_tour_compact.Game_Type=='T'))]

df_train.shape

(466023, 34)

In [667]:
df_train=df_train.loc[~(df_train.Season==1998)]
df_train.shape

(461177, 34)

In [668]:
df_train = df_train[['elo1', 'elo2','Game_Type',
       'rollingavg_final1', 'rollingavg_final2',
       'H2Hrollingavg_final1', 'H2Hrollingavg_final2',
       'H2HrollingstdScore_final1', 'H2HrollingstdScore_final2','twincount1','twincount2','tlosscount1','tlosscount2','Team1Seed','Team1PrevSeed','Team2Seed','Team2PrevSeed']]

In [669]:
df_train_neg = pd.DataFrame(

    {
        'elo1' : df_train['elo2'],
        'elo2' : df_train['elo1'],
        'rollingavg_final1': df_train['rollingavg_final2'],
        'rollingavg_final2': df_train['rollingavg_final1'],
        'H2Hrollingavg_final1': df_train['H2Hrollingavg_final2'],
        'H2Hrollingavg_final2':df_train['H2Hrollingavg_final1'],
        'H2HrollingstdScore_final1':df_train['H2HrollingstdScore_final2'],
        'H2HrollingstdScore_final2':df_train['H2HrollingstdScore_final1'],
        'twincount1':df_train['twincount2'],
        'twincount2':df_train['twincount1'],
        'tlosscount1':df_train['tlosscount2'],
        'tlosscount2':df_train['tlosscount1'],
        'Team1Seed':df_train['Team2Seed'],
        'Team1PrevSeed':df_train['Team2PrevSeed'],
        'Team2Seed':df_train['Team1Seed'],
        'Team2PrevSeed':df_train['Team1PrevSeed'],
        'Game_Type':df_train['Game_Type']
        
    }
)

In [677]:
x_train = df_train.append(df_train_neg)

x_train[['Team1Seed','Team1PrevSeed','Team2Seed','Team2PrevSeed']] = x_train[['Team1Seed','Team1PrevSeed','Team2Seed','Team2PrevSeed']].fillna(17.0)

x_train['SeedDiff'] = x_train.Team1Seed - x_train.Team2Seed
x_train['twincountdiff'] = x_train.twincount1 - x_train.twincount2
x_train['tlosscountdiff'] = x_train.tlosscount1 - x_train.tlosscount2


# build outcome = first 1/2 of data is positive, second 1/2 is negative 
y_train = ([1] * len(df_train)) + ([0] * len(df_train_neg))

In [678]:
x_train.shape

(922354, 20)

In [679]:
x_train = x_train[['twincountdiff','tlosscountdiff','elo1','elo2']]

#x_train['Game_Type'] = x_train['Game_Type'].apply(lambda x : 2 if x =="R" else 1)

In [680]:
train_x, dev_x, train_y, dev_y = train_test_split(x_train,y_train)

In [681]:
train_x.head()

Unnamed: 0,twincountdiff,tlosscountdiff,elo1,elo2
400585,-3.0,0.0,1742.688137,1805.334744
101623,0.0,-1.0,1226.372271,1301.336811
285026,3.0,0.0,1734.285688,1803.451338
406064,-1.0,-1.0,1709.357773,1738.839293
306941,-1.0,-1.0,1772.313483,1794.483649


In [690]:
lr = LogisticRegression(C=0.1)

lr.fit(train_x,train_y)
# we achieve 77% accuracy on training data - how does it generalize to dev?
lr.score(train_x,train_y)

0.73101992728744591

In [692]:
lr.score(dev_x,dev_y) 
pred_lr = lr.predict_proba(dev_x)
metrics.log_loss(dev_y,pred_lr)

0.53017449990998056

In [693]:
pred_lr

array([[ 0.85530376,  0.14469624],
       [ 0.47475492,  0.52524508],
       [ 0.78329937,  0.21670063],
       ..., 
       [ 0.62461537,  0.37538463],
       [ 0.38637919,  0.61362081],
       [ 0.3687265 ,  0.6312735 ]])

In [684]:

## Fit a logistic regression model through grid search
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(train_x, train_y)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

Best log_loss: -0.5296, with best C: 0.1


In [685]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [686]:
clf.score(train_x, train_y)

0.99473809747529873

In [688]:
clf.score(dev_x, dev_y)
pred = clf.predict_proba(dev_x)

In [638]:
pred

array([[ 0. ,  1. ],
       [ 0. ,  1. ],
       [ 1. ,  0. ],
       ..., 
       [ 0. ,  1. ],
       [ 1. ,  0. ],
       [ 0.2,  0.8]])

In [689]:
metrics.log_loss(dev_y,pred)

0.41548276145360274

In [565]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2,scoring='neg_log_loss')

In [567]:
grid_search.fit(train_x, train_y)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300 
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300 
[CV]  bootstrap=True, max_depth=80, max_features=2, min_s

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 28.9min


[CV]  bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=1000 -25.8min
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000 
[CV]  bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=200 - 5.9min
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000 
[CV]  bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300 -12.1min
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000 
[CV]  bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300 -12.2min
[CV] bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100 
[CV]  bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, mi

KeyboardInterrupt: 