In [424]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools as it

import seaborn as sns

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [425]:

# right now, only looking at tournament seed / performance, not regular season games
data_dir = './WDataFiles/'
data_dir_1 = './WPrelimData2018/'
data_dir_2 = './WStage2DataFiles/'
df_seeds = pd.read_csv(data_dir_2 + 'WNCAATourneySeeds.csv')
df_tour_compact = pd.read_csv(data_dir_2 + 'WNCAATourneyCompactResults.csv')

df_tour_compact['WstdScore'] = df_tour_compact['WScore']/(df_tour_compact['WScore']+df_tour_compact['LScore'])
df_tour_compact['LstdScore'] = df_tour_compact['LScore']/(df_tour_compact['WScore']+df_tour_compact['LScore'])
df_tour_compact['stdScorediff'] = (df_tour_compact['WScore']-df_tour_compact['LScore'])/(df_tour_compact['WScore']+df_tour_compact['LScore'])


df_tour = pd.read_csv(data_dir_2 + 'WNCAATourneyDetailedResults.csv')
df_reg_compact = pd.read_csv(data_dir_2 +'WRegularSeasonCompactResults.csv')


df_reg_compact['WstdScore'] = df_reg_compact['WScore']/(df_reg_compact['WScore']+df_reg_compact['LScore'])
df_reg_compact['LstdScore'] = df_reg_compact['LScore']/(df_reg_compact['WScore']+df_reg_compact['LScore'])
df_reg_compact['stdScorediff'] = (df_reg_compact['WScore']-df_reg_compact['LScore'])/(df_reg_compact['WScore']+df_reg_compact['LScore'])



df_reg_detail = pd.read_csv(data_dir_2 + 'WRegularSeasonDetailedResults.csv')

df_reg_detail['WstdScore'] = df_reg_detail['WScore']/(df_reg_detail['WScore']+df_reg_detail['LScore'])
df_reg_detail['LstdScore'] = df_reg_detail['LScore']/(df_reg_detail['WScore']+df_reg_detail['LScore'])
df_reg_detail['stdScorediff'] = (df_reg_detail['WScore']-df_reg_detail['LScore'])/(df_reg_detail['WScore']+df_reg_detail['LScore'])

df_tour_detail = pd.read_csv(data_dir_2 + 'WNCAATourneyDetailedResults.csv')

df_tour_detail['WstdScore'] = df_tour_detail['WScore']/(df_tour_detail['WScore']+df_tour_detail['LScore'])
df_tour_detail['LstdScore'] = df_tour_detail['LScore']/(df_tour_detail['WScore']+df_tour_detail['LScore'])
df_tour_detail['stdScorediff'] = (df_tour_detail['WScore']-df_tour_detail['LScore'])/(df_tour_detail['WScore']+df_tour_detail['LScore'])

df_reg_compact['Game_Type'] = "R"

df_tour_compact['Game_Type'] = "T"

df_reg_detail['Game_Type'] = "R"

df_tour_detail['Game_Type'] = "T"

df_reg_tour_compact = pd.concat([df_reg_compact,df_tour_compact],ignore_index=True)
df_reg_tour_compact.sort_values(by=['Season','DayNum','Game_Type'],inplace=True)
df_reg_tour_compact=df_reg_tour_compact.reset_index()



df_reg_tour_detail = pd.concat([df_tour_detail,df_reg_detail],ignore_index=True)

df_reg_tour_detail.sort_values(by=['Season','DayNum','Game_Type'],inplace=True)

df_reg_tour_detail=df_reg_tour_detail.reset_index()

df_reg_tour_detail['index'] = df_reg_tour_detail.index

df_teams = pd.read_csv(data_dir_2 + 'WTeams.csv')

df_teams_leagues = pd.merge(left=df_teams, right=pd.read_csv(data_dir_2 + 'WLeagues.csv'), how='left', on=['TeamName'])

df_teams_leagues = df_teams_leagues[(df_teams_leagues.LeagueName.notnull().values)]

df_teams_leagues.head()

Unnamed: 0,TeamID,TeamName,LeagueName
0,3101,Abilene Chr,Southland
1,3102,Air Force,MWC
2,3103,Akron,MAC
3,3104,Alabama,SEC
4,3105,Alabama A&M,SWAC


# Get Matchups of interest for Training & Test Data¶


In [426]:

## Get 2018 data

# sorting ensures that we work through smallest to largest 
# no repeats, smallest always first in pair
current_year = df_seeds[df_seeds['Season'] == 2018].sort_values(by=['TeamID'])
current_year_pairs = list(it.combinations(current_year['TeamID'],2))

team1, team2 = map(list, zip(*current_year_pairs))

df_test_pairs = pd.DataFrame({
    'Season' : [2018] * len(team1),
    'Team1' : team1,
    'Team2' : team2
})

In [427]:

# also need previous matchups (1998 - 2017)
df_train_pairs = pd.DataFrame(
    { 'Team1' : df_tour_compact['WTeamID'],
     'Team2' : df_tour_compact['LTeamID'],
     'Season' : df_tour_compact['Season']
    })

df_train_pairs.tail()

Unnamed: 0,Season,Team1,Team2
1255,2017,3163,3332
1256,2017,3376,3199
1257,2017,3280,3163
1258,2017,3376,3390
1259,2017,3376,3280


In [428]:
# this is what we will use as a base to build our dataset
df_train_test_matchups = df_train_pairs.append(df_test_pairs)

# Feature Engineering for Training & Test Data

## Current Year Seeds


In [429]:
# get just integer value of seed (exclude region information)
df_seeds = pd.read_csv(data_dir_2 + 'WNCAATourneySeeds.csv')

def seed_to_int(seed):
    #Get just the digits from the seeding. Return as int
    s_int = int(seed[1:3])
    return s_int
df_seeds['SeedInt'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(labels=['Seed'], inplace=True, axis=1) # This is the string label

df_seeds.head()

df_seeds[df_seeds.TeamID==3104]

Unnamed: 0,Season,TeamID,SeedInt
17,1998,3104,2
100,1999,3104,5


In [430]:
# merge seeds with team IDs for tourney performance
df_W = df_seeds.rename(columns={'TeamID':'Team1', 'SeedInt':'Team1Seed'})
df_L = df_seeds.rename(columns={'TeamID':'Team2', 'SeedInt':'Team2Seed'})
df_dummy = pd.merge(left=df_train_test_matchups, right=df_W, how='left', on=['Season', 'Team1'])
df_matchups = pd.merge(left=df_dummy, right=df_L, how='left', on=['Season', 'Team2'])

df_matchups.head()

Unnamed: 0,Season,Team1,Team2,Team1Seed,Team2Seed
0,1998,3104,3422,2,15
1,1998,3112,3365,3,14
2,1998,3163,3193,2,15
3,1998,3198,3266,7,10
4,1998,3203,3208,10,7


## Current Year Seed Difference


In [431]:
# negative number indicates that team 2 (loser, in this case) has higher seed
df_matchups['SeedDiff'] = df_matchups['Team2Seed']
df_matchups['SeedDiff'] = df_matchups['Team2Seed'].sub(df_matchups['Team1Seed'], axis=0)

df_matchups.tail()

Unnamed: 0,Season,Team1,Team2,Team1Seed,Team2Seed,SeedDiff
3271,2018,3437,3443,9,11,2
3272,2018,3437,3453,9,7,-2
3273,2018,3438,3443,10,11,1
3274,2018,3438,3453,10,7,-3
3275,2018,3443,3453,11,7,-4


## Last Year's Tournament Seed

In [432]:

# what to do for games where we had no previous year (1998)?

# add 1 so easily referenced
df_seeds['Season'] = df_seeds['Season'].apply(lambda x: x + 1)
df_W = df_seeds.rename(columns={'TeamID':'Team1','SeedInt':'Team1PrevSeed'})
df_L = df_seeds.rename(columns={'TeamID':'Team2','SeedInt':'Team2PrevSeed'})
df_dummy = pd.merge(left=df_matchups, right=df_W, how='left', on=['Season', 'Team1'])
df_matchups = pd.merge(left=df_dummy, right=df_L, how='left', on=['Season', 'Team2'])

## what to do if they weren't in the tournament last year (17?)
df_matchups = df_matchups.fillna(value=17)

df_matchups.head()

Unnamed: 0,Season,Team1,Team2,Team1Seed,Team2Seed,SeedDiff,Team1PrevSeed,Team2PrevSeed
0,1998,3104,3422,2,15,13,17.0,17.0
1,1998,3112,3365,3,14,11,17.0,17.0
2,1998,3163,3193,2,15,13,17.0,17.0
3,1998,3198,3266,7,10,3,17.0,17.0
4,1998,3203,3208,10,7,-3,17.0,17.0


In [433]:
df_seeds[df_seeds.TeamID==3104]

Unnamed: 0,Season,TeamID,SeedInt
17,1999,3104,2
100,2000,3104,5


In [434]:
df_reg_tour_compact.columns = ['index', 'Season', 'DayNum', 'Team1', 'Score1', 'Team2', 'Score2',
       'Loc1', 'NumOT', 'stdScore1', 'stdScore2', 'stdScorediff', 'Game_Type']

In [435]:
def getteam2loc(x):
    if (x == "H"):
        l = "A"    
    if (x == "A"):
        l = "H"
    if (x == "N"):
        l = "N"    
    return(l)
    
df_reg_tour_compact['Loc2'] = df_reg_tour_compact.Loc1.apply(getteam2loc)

In [436]:
df_reg_tour_compact.columns

df_reg_tour_compact=df_reg_tour_compact.drop(['NumOT'], axis=1)



In [437]:
df_reg_tour_compact.head()

Unnamed: 0,index,Season,DayNum,Team1,Score1,Team2,Score2,Loc1,stdScore1,stdScore2,stdScorediff,Game_Type,Loc2
0,0,1998,18,3104,91,3202,41,H,0.689394,0.310606,0.378788,R,A
1,1,1998,18,3163,87,3221,76,H,0.533742,0.466258,0.067485,R,A
2,2,1998,18,3222,66,3261,59,H,0.528,0.472,0.056,R,A
3,3,1998,18,3307,69,3365,62,H,0.526718,0.473282,0.053435,R,A
4,4,1998,18,3349,115,3411,35,H,0.766667,0.233333,0.533333,R,A


In [438]:
df_test_pairs['DayNum'] = 140
df_test_pairs['Score1'] = 0
df_test_pairs['Score2'] = 0
df_test_pairs['stdScore1'] = 0
df_test_pairs['stdScore2'] = 0
df_test_pairs['Game_Type'] = "T"
df_test_pairs['Loc1'] = "N"
df_test_pairs['Loc2'] = "N"



df_reg_tour_compact = pd.concat([df_reg_tour_compact[['Season', 'DayNum', 'Team1', 'Score1', 'Team2', 'Score2','Loc1','Loc2','stdScore1', 'stdScore2','Game_Type']],
  df_test_pairs]).reset_index()        


df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,stdScore2
105164,2011,140,T,N,N,0,0,2018,3437,3443,0.0,0.0
105165,2012,140,T,N,N,0,0,2018,3437,3453,0.0,0.0
105166,2013,140,T,N,N,0,0,2018,3438,3443,0.0,0.0
105167,2014,140,T,N,N,0,0,2018,3438,3453,0.0,0.0
105168,2015,140,T,N,N,0,0,2018,3443,3453,0.0,0.0


In [1172]:
df_reg_tour_detail.columns=['index', 'Season', 'DayNum', 'Team1', 'Score1', 'Team2', 'Score2',
       'Loc1', 'NumOT', 'FGM1', 'FGA1', 'threeptFGM1', 'threeptFGA1', 'FTM1', 'FTA1',
       'OR1', 'DR1', 'Ast1', 'TO1', 'Stl1', 'Blk1', 'PF1', 'FGM2', 'FGA2',
       'threeptFGM2', 'threeptFGA2', 'FTM2', 'FTA2', 'OR2', 'DR2', 'Ast2', 'TO2', 'Stl2',
       'Blk2', 'PF2', 'stdScore1', 'stdScore2', 'stdScorediff', 'Game_Type']

df_reg_tour_detail.columns

Index(['index', 'Season', 'DayNum', 'Team1', 'Score1', 'Team2', 'Score2',
       'Loc1', 'NumOT', 'FGM1', 'FGA1', 'threeptFGM1', 'threeptFGA1', 'FTM1',
       'FTA1', 'OR1', 'DR1', 'Ast1', 'TO1', 'Stl1', 'Blk1', 'PF1', 'FGM2',
       'FGA2', 'threeptFGM2', 'threeptFGA2', 'FTM2', 'FTA2', 'OR2', 'DR2',
       'Ast2', 'TO2', 'Stl2', 'Blk2', 'PF2', 'stdScore1', 'stdScore2',
       'stdScorediff', 'Game_Type', 'TeamName1', 'LeagueName1', 'TeamName2',
       'LeagueName2'],
      dtype='object')

In [1169]:
# Add winning an losing team names to the regular compact and tour compact dataframes

df_teams_w = df_teams_leagues.rename(columns={'TeamID':'Team1','TeamName':'TeamName1','LeagueName':'LeagueName1'})
df_teams_l = df_teams_leagues.rename(columns={'TeamID':'Team2','TeamName':'TeamName2','LeagueName':'LeagueName2'})

#df_reg_compact = pd.merge(left=df_reg_compact, right=df_teams_w, how='left', on=['WTeamID'])
#df_reg_compact = pd.merge(left=df_reg_compact, right=df_teams_l, how='left', on=['LTeamID'])


#df_tour_compact = pd.merge(left=df_tour_compact, right=df_teams_w, how='left', on=['WTeamID'])
#df_tour_compact = pd.merge(left=df_tour_compact, right=df_teams_l, how='left', on=['LTeamID'])

df_reg_tour_compact = pd.merge(left=df_reg_tour_compact, right=df_teams_w, how='left', on=['Team1'])
df_reg_tour_compact = pd.merge(left=df_reg_tour_compact, right=df_teams_l, how='left', on=['Team2'])

df_reg_tour_detail = pd.merge(left=df_reg_tour_detail, right=df_teams_w, how='left', on=['Team1'])
df_reg_tour_detail = pd.merge(left=df_reg_tour_detail, right=df_teams_l, how='left', on=['Team2'])

In [440]:
df_reg_tour_compact['margin'] = df_reg_tour_compact.Score1 - df_reg_tour_compact.Score2

df_reg_tour_compact['index'] = df_reg_tour_compact.index

df_reg_tour_compact.tail()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,stdScore2,TeamName1,LeagueName1,TeamName2,LeagueName2,margin
105164,105164,140,T,N,N,0,0,2018,3437,3443,0.0,0.0,Villanova,Big East,WKU,C-USA,0
105165,105165,140,T,N,N,0,0,2018,3437,3453,0.0,0.0,Villanova,Big East,WI Green Bay,Horizon,0
105166,105166,140,T,N,N,0,0,2018,3438,3443,0.0,0.0,Virginia,ACC,WKU,C-USA,0
105167,105167,140,T,N,N,0,0,2018,3438,3453,0.0,0.0,Virginia,ACC,WI Green Bay,Horizon,0
105168,105168,140,T,N,N,0,0,2018,3443,3453,0.0,0.0,WKU,C-USA,WI Green Bay,Horizon,0


In [1429]:
# Caluclation of Elo Rating
K = 20.
HOME_ADVANTAGE = 100.
team_ids = set(df_reg_tour_compact.Team1).union(set(df_reg_tour_compact.Team2))
# This dictionary will be used as a lookup for current
# scores while the algorithm is iterating through each game
elo_dict = dict(zip(list(team_ids), [1500] * len(team_ids)))

elo_dict_count = dict(zip(list(team_ids), [0] * len(team_ids)))

df_teams_leagues = df_teams_leagues[df_teams_leagues['TeamID'].isin( list(team_ids))]

#df_teams_leagues = df_teams_leagues[(df_teams_leagues.LeagueName.notnull().values)]

a = list(df_teams_leagues.LeagueName.unique())

elo_league_dict = dict(zip(a,[1500]*len(a)))


Reg_weight = 1.0

Tour_weight = 1.0

def elo_pred(elo1, elo2):
    return(1. / (10. ** (-(elo1 - elo2) / 400.) + 1.))

def expected_margin(elo_diff):
    return((7.5 + 0.006 * elo_diff))

def elo_update(w_elo, l_elo, margin,Game_Type):
    elo_diff = w_elo - l_elo
    pred = elo_pred(w_elo, l_elo)
    if Game_Type == "R":
        
        mult = (((margin + 3.) ** 0.8) / expected_margin(elo_diff)) * Reg_weight
    else:
        mult = (((margin + 3.) ** 0.8) / expected_margin(elo_diff)) * Tour_weight
        
    update = K * mult * (1 - pred)
    return(pred, update)

def calc_league_elo_rating():
    
    for l in df_teams_leagues.LeagueName.unique():
            teams = df_teams_leagues.loc[df_teams_leagues.LeagueName == l,'TeamID']
            v = 0
            for t in teams:
                v = v + elo_dict[t]
            elo_league_dict[l] = v / len(teams) 
            
            for t in teams:                
                elo_dict[t] = elo_league_dict[l]

In [1430]:
preds = []
w_elo = []
l_elo = []
win_team=[]
loss_team=[]
season_elo=[]
index =[]
game_type=[]

current_season = 1998
# Loop over all rows of the games dataframe
for row in df_reg_tour_compact.itertuples():
    
    # Get key data from current row
    w = row.Team1
    l = row.Team2
    margin = row.margin
    wloc = row.Loc1
    Game_Type = row.Game_Type
    season = row.Season
    wLeague = row.LeagueName1
    lLeague = row.LeagueName2
    
    season_elo.append(season)
    game_type.append(row.Game_Type)
    win_team.append(w)
    index.append(row.index)
    loss_team.append(l)
    # Does either team get a home-court advantage?
    w_ad, l_ad, = 0., 0.
    if wloc == "H":
        w_ad += HOME_ADVANTAGE
    elif wloc == "A":
        l_ad += HOME_ADVANTAGE
    
    if (season == current_season):
        
        w_elo.append(elo_dict[w])
        l_elo.append(elo_dict[l])
        
        # Get elo updates as a result of the game
        pred, update = elo_update(elo_dict[w] + w_ad,
                              elo_dict[l] + l_ad, 
                              margin,Game_Type)
        preds.append(pred)

        elo_dict[w] += update
        elo_dict[l] -= update
        
    else:
        current_season = season
        calc_league_elo_rating()
        w_elo.append(elo_dict[w])
        l_elo.append(elo_dict[l])
        
        # Get elo updates as a result of the game
        pred, update = elo_update(elo_dict[w] + w_ad,
                              elo_dict[l] + l_ad, 
                              margin,Game_Type)
        preds.append(pred)
        
        elo_dict[w] += update
        elo_dict[l] -= update
            
    # Save prediction and new Elos for each round

df_season_elo = pd.DataFrame({'Season':season_elo,'Index':index,'Game_Type':game_type,'Team1':win_team,'w_elo_reset':w_elo,'Team2':loss_team,'l_elo_reset':l_elo})
   
#df_reg_tour_compact['elo1'] = w_elo
#df_reg_tour_compact['elo2'] = l_elo

In [1435]:
df_season_elo_v1 = pd.concat([df_season_elo[['Season','Game_Type','Index','Team1','w_elo_reset']].rename(columns={'Team1':'Team','w_elo_reset':'elo_reset'}),
           df_season_elo[['Season','Game_Type','Index','Team2','l_elo_reset']].rename(columns={'Team2':'Team','l_elo_reset':'elo_reset'})],axis=0)

#df_season_elo_v1.shape
df_season_elo_v1 = df_season_elo_v1.loc[df_season_elo_v1.Game_Type=="T",['Season','Team','elo_reset']].drop_duplicates(['Season','Team'],keep='first')

df_season_elo_v1.to_csv('/Users/arganesa/MIDS/W207/Kaggle/mids-w207-kaggle/season_elo_v1.csv')

In [443]:
df_reg_tour_compact['elodiff'] = df_reg_tour_compact['elo1'] - df_reg_tour_compact['elo2'] 
df_reg_tour_compact.loc[(df_reg_tour_compact.Season<2018)&(df_reg_tour_compact.Game_Type=='T') & (df_reg_tour_compact.elodiff<-20)].sort_values(by=['elodiff'])

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,stdScore2,TeamName1,LeagueName1,TeamName2,LeagueName2,margin,elo1,elo2,elodiff
45549,45549,138,T,A,H,67,63,2007,3265,3326,0.515385,0.484615,Marist,MAAC,Ohio St,Big Ten,4,1628.892215,1881.249402,-252.357187
76796,76796,146,T,N,N,82,81,2013,3257,3124,0.503067,0.496933,Louisville,ACC,Baylor,Big 12,1,1839.519182,2091.163410,-251.644228
71521,71521,139,T,N,N,76,70,2012,3265,3208,0.520548,0.479452,Marist,MAAC,Georgia,SEC,6,1598.839871,1840.263548,-241.423677
97925,97925,140,T,A,H,85,78,2017,3346,3274,0.521472,0.478528,Quinnipiac,MAAC,Miami FL,ACC,7,1607.117551,1841.557034,-234.439484
87353,87353,138,T,N,N,69,60,2015,3114,3401,0.534884,0.465116,Ark Little Rock,Sun Belt,Texas A&M,SEC,9,1631.506060,1832.717156,-201.211096
45578,45578,140,T,A,H,73,59,2007,3265,3292,0.553030,0.446970,Marist,MAAC,MTSU,C-USA,14,1644.458250,1842.085232,-197.626982
71529,71529,140,T,N,N,65,54,2012,3211,3274,0.546218,0.453782,Gonzaga,WCC,Miami FL,ACC,11,1692.427761,1887.927097,-195.499336
26390,26390,140,T,H,A,73,61,2003,3307,3280,0.544776,0.455224,New Mexico,MWC,Mississippi St,SEC,12,1657.001323,1850.811291,-193.809968
82091,82091,140,T,N,N,80,76,2014,3140,3304,0.512821,0.487179,BYU,WCC,Nebraska,Big Ten,4,1703.072864,1896.811771,-193.738907
17219,17219,145,T,A,H,84,67,2001,3449,3328,0.556291,0.443709,Washington,Pac-12,Oklahoma,Big 12,17,1645.638873,1837.869183,-192.230309


In [444]:
# Find per season Elo score

In [445]:
#a = df_reg_tour_compact[['Season','WTeamID',]].groupby(['season','WTeamID']).rolling(5, min_periods=1).mean().reset_index([0,1],drop=True)

a = df_reg_tour_compact[['index','Season','DayNum','Team1','Score1','Loc1','stdScore1','Game_Type','TeamName1','LeagueName1']]

a.columns=['index','Season','DayNum','TeamID','Score','Loc','stdScore','Game_Type','TeamName','LeagueName']

a = a.reset_index()
a.head()

Unnamed: 0,level_0,index,Season,DayNum,TeamID,Score,Loc,stdScore,Game_Type,TeamName,LeagueName
0,0,0,1998,18,3104,91,H,0.689394,R,Alabama,SEC
1,1,1,1998,18,3163,87,H,0.533742,R,Connecticut,AAC
2,2,2,1998,18,3222,66,H,0.528,R,Houston,AAC
3,3,3,1998,18,3307,69,H,0.526718,R,New Mexico,MWC
4,4,4,1998,18,3349,115,H,0.766667,R,Rice,C-USA


In [446]:
a['OC'] = 1

In [447]:
b = df_reg_tour_compact[['index','Season','DayNum','Team2','Score2','Loc2','stdScore2','Game_Type','TeamName2','LeagueName2']]

b.columns=['index','Season','DayNum','TeamID','Score','Loc','stdScore','Game_Type','TeamName','LeagueName']
b = b.reset_index()
b['OC'] = 0

In [448]:
c = pd.concat([a,b])    
c['rollingavg'] = c['OC']

In [449]:
c.sort_values(by=['Season','DayNum','index','TeamID'],inplace=True)

In [450]:
c = c.reset_index(drop=True)

In [451]:
d = c[['Season','TeamID','rollingavg']].groupby(['Season','TeamID']).rolling(10, min_periods=1).mean().reset_index([0,1],drop=True)

d.head()

Unnamed: 0,Season,TeamID,rollingavg
284,1998.0,3102.0,0.0
432,1998.0,3102.0,0.5
692,1998.0,3102.0,0.666667
934,1998.0,3102.0,0.5
1306,1998.0,3102.0,0.6


In [452]:
d['rollingavg_shifted'] = d.groupby(['Season','TeamID']).shift(1)

In [453]:
import math
d['rollingavg_final'] = d.apply(lambda x: x['rollingavg'] if (math.isnan(x['rollingavg_shifted'])) else x['rollingavg_shifted'],axis=1)

In [454]:
c = pd.concat([c, d[['rollingavg_final']]], axis=1)

In [455]:
c.tail()

Unnamed: 0,level_0,index,Season,DayNum,TeamID,Score,Loc,stdScore,Game_Type,TeamName,LeagueName,OC,rollingavg,rollingavg_final
210333,105166,105166,2018,140,3443,0,N,0.0,T,WKU,C-USA,0,0,0.0
210334,105167,105167,2018,140,3438,0,N,0.0,T,Virginia,ACC,1,1,0.1
210335,105167,105167,2018,140,3453,0,N,0.0,T,WI Green Bay,Horizon,0,0,0.0
210336,105168,105168,2018,140,3443,0,N,0.0,T,WKU,C-USA,1,1,0.0
210337,105168,105168,2018,140,3453,0,N,0.0,T,WI Green Bay,Horizon,0,0,0.0


In [456]:
c = pd.merge(c.loc[c.OC==1,['index','Season','DayNum','TeamID','Score','Loc','stdScore','Game_Type','TeamName','LeagueName','OC','rollingavg_final']].\
rename(columns={'TeamID':'Team1','Score':'Score1','Loc':'Loc1','stdScore':'stdScore1','TeamName':'TeamName1','LeagueName':'LeagueName1','OC':'OC1','rollingavg_final':'rollingavg_final1'}),
         c.loc[c.OC==0,['index','TeamID','Score','stdScore','TeamName','LeagueName','OC','rollingavg_final']].\
rename(columns={'TeamID':'Team2','Score':'Score2','stdScore':'stdScore2','TeamName':'TeamName2','LeagueName':'LeagueName2','OC':'OC2','rollingavg_final':'rollingavg_final2'}),on='index')

In [457]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,c[['index','rollingavg_final1','rollingavg_final2']],on='index')
df_reg_tour_compact['rollingavgdiff'] = df_reg_tour_compact['rollingavg_final1'] - df_reg_tour_compact['rollingavg_final2']

In [391]:
df_reg_tour_compact.loc[(df_reg_tour_compact.Season<2018)&(df_reg_tour_compact.Game_Type=='T') & (df_reg_tour_compact.elodiff<-20)&(df_reg_tour_compact.rollingavgdiff<0)].sort_values(by=['elodiff']).head()

Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,...,LeagueName1,TeamName2,LeagueName2,margin,elo1,elo2,elodiff,rollingavg_final1,rollingavg_final2,rollingavgdiff
76796,76796,146,T,N,N,82,81,2013,3257,3124,...,ACC,Baylor,Big 12,1,1839.519182,2091.16341,-251.644228,0.6,1.0,-0.4
45578,45578,140,T,A,H,73,59,2007,3265,3292,...,MAAC,MTSU,C-USA,14,1644.45825,1842.085232,-197.626982,0.9,1.0,-0.1
17219,17219,145,T,A,H,84,67,2001,3449,3328,...,Pac-12,Oklahoma,Big 12,17,1645.638873,1837.869183,-192.230309,0.7,0.9,-0.2
92655,92655,140,T,A,H,74,65,2016,3449,3268,...,Pac-12,Maryland,Big Ten,9,1788.538283,1974.400883,-185.8626,0.7,1.0,-0.3
97935,97935,145,T,N,N,77,63,2017,3332,3268,...,Pac-12,Maryland,Big Ten,14,1791.088302,1967.944967,-176.856665,0.6,0.9,-0.3


In [1175]:
# Calculate Field Goal Percentage 

df_reg_tour_detail['FGP1'] = df_reg_tour_detail.FGM1 / df_reg_tour_detail.FGA1

# Calculate 2Pt Field Goal Percentage

df_reg_tour_detail['twoptFGP1'] = (df_reg_tour_detail.FGM1 - df_reg_tour_detail.threeptFGM1) / (df_reg_tour_detail.FGA1 - df_reg_tour_detail.threeptFGA1)

# Calculate 3Pt Field Goal Percentage

df_reg_tour_detail['threeptFGP1'] = df_reg_tour_detail.threeptFGM1 / df_reg_tour_detail.threeptFGA1

# Calculate Free Throw Shooting

df_reg_tour_detail['FTP1'] = df_reg_tour_detail.FTM1 / df_reg_tour_detail.FTA1

# Calculate Field Goal Percentage 

df_reg_tour_detail['FGP2'] = df_reg_tour_detail.FGM2 / df_reg_tour_detail.FGA2

# Calculate 2Pt Field Goal Percentage

df_reg_tour_detail['twoptFGP2'] = (df_reg_tour_detail.FGM2 - df_reg_tour_detail.threeptFGM2) / (df_reg_tour_detail.FGA2 - df_reg_tour_detail.threeptFGA2)

# Calculate 3Pt Field Goal Percentage

df_reg_tour_detail['threeptFGP2'] = df_reg_tour_detail.threeptFGM2 / df_reg_tour_detail.threeptFGA2

# Calculate Free Throw Shooting

df_reg_tour_detail['FTP2'] = df_reg_tour_detail.FTM2 / df_reg_tour_detail.FTA2

In [1203]:
df_reg_tour_detail.columns

a = df_reg_tour_detail[['index','Season','DayNum','Team1','Score1','stdScore1','FGP1', 'twoptFGP1', 'threeptFGP1', 'FTP1','Ast1','Game_Type','TeamName1','LeagueName1']]

a.columns=[['index','Season','DayNum','Team','Score','stdScore','FGP', 'twoptFGP', 'threeptFGP', 'FTP','Ast','Game_Type','TeamName','LeagueName']]

a = a.reset_index()
a['OC'] = 1

In [1204]:
df_reg_tour_detail.columns

b = df_reg_tour_detail[['index','Season','DayNum','Team2','Score2','stdScore2','FGP2', 'twoptFGP2', 'threeptFGP2', 'FTP2','Ast2','Game_Type','TeamName2','LeagueName2']]

b.columns=[['index','Season','DayNum','Team','Score','stdScore','FGP', 'twoptFGP', 'threeptFGP', 'FTP','Ast','Game_Type','TeamName','LeagueName']]

b = b.reset_index()
b['OC'] = 0

In [1205]:
c = pd.concat([a,b])
c.sort_values(by=['Season','DayNum','index','Team'],inplace=True)
c= c.reset_index(drop=True)

In [1209]:
d = c[['Season','Team','FGP','twoptFGP','threeptFGP','FTP','Ast']].groupby(['Season','Team']).rolling(5, min_periods=1).mean().reset_index([0,1],drop=True)

d.columns= ['Season','Team','FGP_r','twoptFGP_r','threeptFGP_r','FTP_r','Ast_r']

In [1210]:
season_per = pd.concat([c,d[['FGP_r','twoptFGP_r','threeptFGP_r','FTP_r','Ast_r']]],axis=1)

In [1212]:
season_per.drop_duplicates(['Season','Team'], keep='last', inplace=True)

In [1213]:
d[['Ast_s','FGP_s','FTP_s','threeptFGP_s','twoptFGP_s']] =d.groupby(['Season','Team']).shift(1)

In [1215]:
import math
d['Ast_rolling'] = d.apply(lambda x: x['Ast_r'] if (math.isnan(x['Ast_s'])) else x['Ast_s'],axis=1)
d['FGP_rolling'] = d.apply(lambda x: x['FGP_r'] if (math.isnan(x['FGP_s'])) else x['FGP_s'],axis=1)
d['FTP_rolling'] = d.apply(lambda x: x['FTP_r'] if (math.isnan(x['FTP_s'])) else x['FTP_s'],axis=1)
d['threeptFGP_rolling'] = d.apply(lambda x: x['threeptFGP_r'] if (math.isnan(x['threeptFGP_s'])) else x['threeptFGP_s'],axis=1)
d['twoptFGP_rolling'] = d.apply(lambda x: x['twoptFGP_r'] if (math.isnan(x['twoptFGP_s'])) else x['twoptFGP_s'],axis=1)




In [1216]:
d.columns

Index(['Season', 'Team', 'FGP_r', 'twoptFGP_r', 'threeptFGP_r', 'FTP_r',
       'Ast_r', 'Ast_s', 'FGP_s', 'FTP_s', 'threeptFGP_s', 'twoptFGP_s',
       'Ast_rolling', 'FGP_rolling', 'FTP_rolling', 'threeptFGP_rolling',
       'twoptFGP_rolling'],
      dtype='object')

In [1217]:
c = pd.concat([c, d[['Ast_rolling',
       'FGP_rolling', 'FTP_rolling', 'threeptFGP_rolling', 'twoptFGP_rolling']]], axis=1)

In [1219]:
c.columns

Index(['level_0', 'index', 'Season', 'DayNum', 'Team', 'Score', 'stdScore',
       'FGP', 'twoptFGP', 'threeptFGP', 'FTP', 'Ast', 'Game_Type', 'TeamName',
       'LeagueName', 'OC', 'Ast_rolling', 'FGP_rolling', 'FTP_rolling',
       'threeptFGP_rolling', 'twoptFGP_rolling'],
      dtype='object')

In [1220]:
c = pd.merge(c.loc[c.OC==1,['index', 'Season', 'DayNum', 'Team', 'Ast_rolling', 'FGP_rolling', 'FTP_rolling',
       'threeptFGP_rolling', 'twoptFGP_rolling']].\
rename(columns={'Team':'Team1','Ast_rolling':'Ast_rolling1', 'FGP_rolling':'FGP_rolling1', 'FTP_rolling':'FTP_rolling1',
       'threeptFGP_rolling':'threeptFGP_rolling1', 'twoptFGP_rolling':'twoptFGP_rolling1'}),
         c.loc[c.OC==0,['index','Team', 'Ast_rolling', 'FGP_rolling', 'FTP_rolling',
       'threeptFGP_rolling', 'twoptFGP_rolling']].\
rename(columns={'Team':'Team2','Ast_rolling':'Ast_rolling2', 'FGP_rolling':'FGP_rolling2', 'FTP_rolling':'FTP_rolling2',
       'threeptFGP_rolling':'threeptFGP_rolling2', 'twoptFGP_rolling':'twoptFGP_rolling2'}),on='index')

In [1222]:
df_train_rolling_performance = c

In [458]:
# Create joint team Id
def createteam(row):
    z=[]
    z.append(row['Team1'])
    z.append(row['Team2'])
    z.sort()
    x = ''.join(z)
    return (x)

In [459]:
df_reg_tour_compact['JointTeamID']=df_reg_tour_compact[['Team1','Team2']].astype(str).apply(createteam,axis=1)

In [460]:
a = df_reg_tour_compact[['index','JointTeamID','Season','Team1','Score1','stdScore1']]

a.columns=['index','JointTeamID','Season','TeamID','Score','stdScore']

a = a.reset_index()
a['HOC'] = 1

In [461]:
b = df_reg_tour_compact[['index','JointTeamID','Season','Team2','Score2','stdScore2']]

b.columns=['index','JointTeamID','Season','TeamID','Score','stdScore']

b = b.reset_index()
b['HOC'] = 0

In [462]:
c = pd.concat([a,b])    
c['OC_dup'] = c['HOC']

In [463]:
c.head()

Unnamed: 0,level_0,index,JointTeamID,Season,TeamID,Score,stdScore,HOC,OC_dup
0,0,0,31043202,1998,3104,91,0.689394,1,1
1,1,1,31633221,1998,3163,87,0.533742,1,1
2,2,2,32223261,1998,3222,66,0.528,1,1
3,3,3,33073365,1998,3307,69,0.526718,1,1
4,4,4,33493411,1998,3349,115,0.766667,1,1


In [464]:
c.sort_values(by=['index','Season','JointTeamID'],inplace=True)
c = c.reset_index(drop=True)
c['JointTeamID'] = c['JointTeamID'].astype('int64') 


In [465]:
d = c[['Season','JointTeamID','TeamID','OC_dup','stdScore']].groupby(['Season','JointTeamID','TeamID']).rolling(5, min_periods=1).mean().reset_index([0,2],drop=True)


In [466]:
d.reset_index(0,drop=True).head()

Unnamed: 0,Season,JointTeamID,TeamID,OC_dup,stdScore
692,1998.0,31023119.0,3102.0,1.0,0.531532
693,1998.0,31023119.0,3119.0,0.0,0.468468
4693,1998.0,31023140.0,3102.0,0.0,0.368098
6691,1998.0,31023140.0,3102.0,0.0,0.413216
4692,1998.0,31023140.0,3140.0,1.0,0.631902


In [467]:
d[['H2Hrollingavg','H2HrollingstdScore']] = d.groupby(['Season','JointTeamID','TeamID']).shift(1)



Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


In [468]:
d = d.reset_index(0,drop=True)

In [469]:
import math
d['H2Hrollingavg_final'] = d.apply(lambda x: x['OC_dup'] if (math.isnan(x['H2Hrollingavg'])) else x['H2Hrollingavg'],axis=1)

In [470]:
d['H2HrollingstdScore_final'] = d.apply(lambda x: x['stdScore'] if (math.isnan(x['H2HrollingstdScore'])) else x['H2HrollingstdScore'],axis=1)

d.head()
#c = pd.concat([c, d[['rollingavg_final']]], axis=1)

Unnamed: 0,Season,JointTeamID,TeamID,OC_dup,stdScore,H2Hrollingavg,H2HrollingstdScore,H2Hrollingavg_final,H2HrollingstdScore_final
692,1998.0,31023119.0,3102.0,1.0,0.531532,,,1.0,0.531532
693,1998.0,31023119.0,3119.0,0.0,0.468468,,,0.0,0.468468
4693,1998.0,31023140.0,3102.0,0.0,0.368098,,,0.0,0.368098
6691,1998.0,31023140.0,3102.0,0.0,0.413216,0.0,0.368098,0.0,0.368098
4692,1998.0,31023140.0,3140.0,1.0,0.631902,,,1.0,0.631902


In [471]:
e = c[['Season','JointTeamID','TeamID','OC_dup']].groupby(['Season','JointTeamID','TeamID']).rolling(5, min_periods=1).count()


In [472]:

f=e.reset_index([0,1,2],drop=True)
f.head()

Unnamed: 0,Season,JointTeamID,TeamID,OC_dup
692,1.0,1.0,1.0,1.0
693,1.0,1.0,1.0,1.0
4693,1.0,1.0,1.0,1.0
6691,2.0,2.0,2.0,2.0
4692,1.0,1.0,1.0,1.0


In [473]:
f['H2HCount'] = f['OC_dup']


In [474]:
c = pd.concat([c, pd.concat([d,f],axis=1)[['H2Hrollingavg_final','H2HrollingstdScore_final','H2HCount']]], axis=1)
c.head()

Unnamed: 0,level_0,index,JointTeamID,Season,TeamID,Score,stdScore,HOC,OC_dup,H2Hrollingavg_final,H2HrollingstdScore_final,H2HCount
0,0,0,31043202,1998,3104,91,0.689394,1,1,1.0,0.689394,1.0
1,0,0,31043202,1998,3202,41,0.310606,0,0,0.0,0.310606,1.0
2,1,1,31633221,1998,3163,87,0.533742,1,1,1.0,0.533742,1.0
3,1,1,31633221,1998,3221,76,0.466258,0,0,0.0,0.466258,1.0
4,2,2,32223261,1998,3222,66,0.528,1,1,1.0,0.528,1.0


In [475]:
g = pd.merge(c.loc[c.HOC==1,['index','JointTeamID','Season','TeamID','H2HCount','H2Hrollingavg_final','H2HrollingstdScore_final']].\
rename(columns={'TeamID':'Team1','H2HCount':'H2HCount','H2Hrollingavg_final':'H2Hrollingavg_final1','H2HrollingstdScore_final':'H2HrollingstdScore_final1'}),
c.loc[c.HOC==0,['index','TeamID','H2Hrollingavg_final','H2HrollingstdScore_final']].\
rename(columns={'TeamID':'Team2','H2Hrollingavg_final':'H2Hrollingavg_final2','H2HrollingstdScore_final':'H2HrollingstdScore_final2'}),on='index')

In [476]:
g.tail()

Unnamed: 0,index,JointTeamID,Season,Team1,H2HCount,H2Hrollingavg_final1,H2HrollingstdScore_final1,Team2,H2Hrollingavg_final2,H2HrollingstdScore_final2
105164,105164,34373443,2018,3437,1.0,1.0,0.0,3443,0.0,0.0
105165,105165,34373453,2018,3437,1.0,1.0,0.0,3453,0.0,0.0
105166,105166,34383443,2018,3438,1.0,1.0,0.0,3443,0.0,0.0
105167,105167,34383453,2018,3438,1.0,1.0,0.0,3453,0.0,0.0
105168,105168,34433453,2018,3443,1.0,1.0,0.0,3453,0.0,0.0


In [477]:
df_reg_tour_compact = pd.merge(df_reg_tour_compact,g[['index','H2Hrollingavg_final1','H2Hrollingavg_final2','H2HrollingstdScore_final1','H2HrollingstdScore_final2','H2HCount']],on='index')


In [480]:
df_reg_tour_compact['H2HrollingstdScorediff'] = df_reg_tour_compact['H2HrollingstdScore_final1'] - df_reg_tour_compact['H2HrollingstdScore_final2']
df_reg_tour_compact['H2Hrollingavgdiff'] = df_reg_tour_compact['H2Hrollingavg_final1'] - df_reg_tour_compact['H2Hrollingavg_final2']

In [481]:
df_reg_tour_compact_checked = df_reg_tour_compact

In [251]:
df_reg_tour_compact.loc[(df_reg_tour_compact.Season<2018)&(df_reg_tour_compact.Game_Type=='T') & (df_reg_tour_compact.elodiff<-20)&(df_reg_tour_compact.rollingavgdiff<0)].sort_values(by=['elodiff']).head()


Unnamed: 0,index,DayNum,Game_Type,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,...,rollingavg_final2,rollingavgdiff,JointTeamID,H2Hrollingavg_final1,H2Hrollingavg_final2,H2HrollingstdScore_final1,H2HrollingstdScore_final2,H2HCount,H2HrollingstdScorediff,H2Hrollingavgdiff
76796,76796,146,T,N,N,82,81,2013,3257,3124,...,1.0,-0.4,31243257,1.0,0.0,0.503067,0.496933,1.0,0.006135,1.0
45578,45578,140,T,A,H,73,59,2007,3265,3292,...,1.0,-0.1,32653292,1.0,0.0,0.55303,0.44697,1.0,0.106061,1.0
17219,17219,145,T,A,H,84,67,2001,3449,3328,...,0.9,-0.2,33283449,1.0,0.0,0.556291,0.443709,1.0,0.112583,1.0
92655,92655,140,T,A,H,74,65,2016,3449,3268,...,1.0,-0.3,32683449,1.0,0.0,0.532374,0.467626,1.0,0.064748,1.0
97935,97935,145,T,N,N,77,63,2017,3332,3268,...,0.9,-0.3,32683332,1.0,0.0,0.55,0.45,1.0,0.1,1.0


In [483]:
df_reg_tour_compact.groupby('Game_Type').count()

Unnamed: 0_level_0,index,DayNum,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,stdScore1,...,rollingavg_final2,rollingavgdiff,JointTeamID,H2Hrollingavg_final1,H2Hrollingavg_final2,H2HrollingstdScore_final1,H2HrollingstdScore_final2,H2HCount,H2HrollingstdScorediff,H2Hrollingavgdiff
Game_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R,101893,101893,101893,101893,101893,101893,101893,101893,101893,101893,...,101893,101893,101893,101893,101893,101893,101893,101893,101893,101893
T,3276,3276,3276,3276,3276,3276,3276,3276,3276,3276,...,3276,3276,3276,3276,3276,3276,3276,3276,3276,3276


In [500]:
df_tour_compact_checked = df_reg_tour_compact.loc[df_reg_tour_compact.Game_Type=="T"]
df_tour_compact_checked= df_tour_compact_checked.reset_index()

In [504]:
df_tour_compact_checked.tail()
df_tour_compact_checked.groupby('Game_Type').count()

Unnamed: 0_level_0,level_0,index,DayNum,Loc1,Loc2,Score1,Score2,Season,Team1,Team2,...,rollingavg_final2,rollingavgdiff,JointTeamID,H2Hrollingavg_final1,H2Hrollingavg_final2,H2HrollingstdScore_final1,H2HrollingstdScore_final2,H2HCount,H2HrollingstdScorediff,H2Hrollingavgdiff
Game_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T,3276,3276,3276,3276,3276,3276,3276,3276,3276,3276,...,3276,3276,3276,3276,3276,3276,3276,3276,3276,3276


In [522]:
df_tour_compact_checked['H2Hrollingavg_final1n'] = 0
df_tour_compact_checked['H2Hrollingavg_final2n'] = 0

In [523]:
def func3(x):
    mavg = 5
    
    if((x['H2Hrollingavg_final1']==0) | (x['H2Hrollingavg_final2']==0)):
        if((x['H2Hrollingavg_final1']==0) & (x['H2HCount'] <=5)):
            
            #print((0.5 - ((x['aH2HCount'])/mavg)),(0.5 + ((x['aH2HCount'])/mavg)),(x['aH2HCount']/mavg))
            
            x['H2Hrollingavg_final1n'] = (0.5 - ((x['H2HCount'])/mavg))
            x['H2Hrollingavg_final2n'] = (0.5 + ((x['H2HCount'])/mavg))
            x['gamecountweight'] = x['H2HCount']/mavg
            
        if((x['H2Hrollingavg_final1']==0) & (x['H2HCount'] > 5)):
            
            #print((0.5 - ((x['aH2HCount'])/mavg)),(0.5 + ((x['aH2HCount'])/mavg)),(x['aH2HCount']/mavg))
            
            x['H2Hrollingavg_final1n'] = x['H2Hrollingavg_final1'] + 0.05
            x['H2Hrollingavg_final2n'] = x['H2Hrollingavg_final2'] - 0.05
            x['gamecountweight'] = x['H2HCount']/mavg            
            
        
        if((x['H2Hrollingavg_final2']==0)&(x['H2HCount'] <=5)):
            
            x['H2Hrollingavg_final2n'] = (0.5 - ((x['H2HCount'])/mavg))
            x['H2Hrollingavg_final1n'] = (0.5 + ((x['H2HCount'])/mavg))
            x['gamecountweight'] = x['H2HCount']/mavg
        
        if((x['H2Hrollingavg_final2']==0)&(x['H2HCount'] > 5)):
            
            x['H2Hrollingavg_final1n'] = x['H2Hrollingavg_final1'] - 0.05
            x['H2Hrollingavg_final2n'] = x['H2Hrollingavg_final2'] + 0.05
            x['gamecountweight'] = x['H2HCount']/mavg  
        
    else:
        x['H2Hrollingavg_final1n'] = x['H2Hrollingavg_final1']
        x['H2Hrollingavg_final2n'] = x['H2Hrollingavg_final2']
        x['gamecountweight'] = x['H2HCount']/mavg
        
    return x
               
    

df_tour_compact_checked_v1 = df_tour_compact_checked.apply(func3,axis=1)

#sk2.loc[((sk2.AwH2H==0)|(sk2.BwH2H==0)),['A','B','Season', 'Game_Type', 'DayNum','AstdScoremean', 'Amwp', 'BstdScoremean', 'Bmwp','AwH2H', 'BwH2H', 'AstdscoreH2H',
#       'BStdscoreH2H','aH2HCount','gamecountweight','AnwH2H','BnwH2H','gamecountweight']]

In [528]:
df_tour_compact_checked_v1.shape

(3276, 35)

In [530]:
df_tour_compact_checked_v1.Season.unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

In [531]:
winning_games_w = df_tour_compact_checked_v1.loc[(df_tour_compact_checked_v1.Season<2018),['Team1','Season','index']].groupby(['Team1','Season']).agg('count').rename({'Team1':'Team1','index':'wincount1'}).reset_index()
winning_games_l = df_tour_compact_checked_v1.loc[(df_tour_compact_checked_v1.Season<2018),['Team1','Season','index']].groupby(['Team1','Season']).agg('count').rename({'Team1':'Team2','index':'wincount2'}).reset_index()

losing_games_w = df_tour_compact_checked_v1.loc[(df_tour_compact_checked_v1.Season<2018),['Team2','Season','index']].groupby(['Team2','Season']).agg('count').rename({'Team2':'Team1','index':'losscount1'}).reset_index()
losing_games_l = df_tour_compact_checked_v1.loc[(df_tour_compact_checked_v1.Season<2018),['Team2','Season','index']].groupby(['Team2','Season']).agg('count').rename({'Team2':'Team2','index':'losscount2'}).reset_index()

winning_games_w.columns = ['Team1','Season','wincount1']
winning_games_l.columns = ['Team2','Season','wincount2']
losing_games_w.columns = ['Team1','Season','losscount1']
losing_games_l.columns = ['Team2','Season','losscount2']


In [534]:
teams_tour_win_loss = pd.merge(winning_games_w,losing_games_w,how='outer',on=['Team1','Season']).fillna(0)

teams_tour_win_loss = teams_tour_win_loss.rename(columns = {'Team1':'TeamID','wincount1':'twincount','losscount1':'tlosscount'})

teams_tour_win_loss.Season.unique()

array([1998, 1999, 2016, 2000, 2005, 2002, 2006, 2007, 2008, 2009, 2014,
       2015, 2017, 2010, 2001, 2003, 2012, 2004, 2011, 2013])

In [535]:
teams_tour_win_loss['Season'] = teams_tour_win_loss.Season + 1

teams_tour_win_loss = pd.merge(teams_tour_win_loss,df_teams_leagues,how='left',on=['TeamID'])
League_tour_win_loss= teams_tour_win_loss[['Season','LeagueName','twincount','tlosscount']].groupby(['Season','LeagueName']).sum().reset_index()

In [537]:
teams_tour_win_loss.head()

Unnamed: 0,TeamID,Season,twincount,tlosscount,TeamName,LeagueName
0,3104,1999,2.0,1.0,Alabama,SEC
1,3104,2000,1.0,1.0,Alabama,SEC
2,3107,2017,1.0,1.0,Albany NY,America East
3,3112,1999,2.0,1.0,Arizona,Pac-12
4,3112,2000,1.0,1.0,Arizona,Pac-12


In [63]:
teams_tour_win_loss.loc[teams_tour_win_loss.TeamID==3438]

Unnamed: 0,TeamID,Season,twincount,tlosscount,TeamName,LeagueName
665,3438,1999,1.0,1.0,Virginia,ACC
666,3438,2001,2.0,1.0,Virginia,ACC
667,3438,2004,1.0,1.0,Virginia,ACC
668,3438,2006,1.0,1.0,Virginia,ACC
669,3438,2009,1.0,1.0,Virginia,ACC
670,3438,2010,1.0,1.0,Virginia,ACC
1292,3438,2000,0.0,1.0,Virginia,ACC
1293,3438,2002,0.0,1.0,Virginia,ACC
1294,3438,2003,0.0,1.0,Virginia,ACC
1295,3438,2011,0.0,1.0,Virginia,ACC


In [539]:
#Remove 1998

df_tour_compact_checked_v2 = df_tour_compact_checked_v1.loc[~(df_tour_compact_checked_v1.Season==1998)]
df_tour_compact_checked_v2.shape

(3213, 35)

In [552]:
df_tour_compact_checked_v3 = pd.merge(df_tour_compact_checked_v2,teams_tour_win_loss[['Season','TeamID','twincount','tlosscount']],how='left',left_on=['Team1','Season'],right_on=['TeamID','Season']).fillna(0)

In [553]:
df_tour_compact_checked_v3.shape

(3213, 38)

In [554]:
df_tour_compact_checked_v3 = df_tour_compact_checked_v3.drop(['TeamID'],axis=1)

In [555]:
df_tour_compact_checked_v3=df_tour_compact_checked_v3.rename(columns={'twincount':'twincount1','tlosscount':'tlosscount1'})

In [556]:
df_tour_compact_checked_v3 = pd.merge(df_tour_compact_checked_v3,teams_tour_win_loss[['Season','TeamID','twincount','tlosscount']],how='left',left_on=['Team2','Season'],right_on=['TeamID','Season']).fillna(0)

In [557]:
df_tour_compact_checked_v3 = df_tour_compact_checked_v3.drop(['TeamID'],axis=1)
df_tour_compact_checked_v3=df_tour_compact_checked_v3.rename(columns={'twincount':'twincount2','tlosscount':'tlosscount2'})

In [960]:
df_tour_compact_checked_v4 = df_tour_compact_checked_v3

In [961]:
#df_train = df_reg_tour_compact.loc[df_reg_tour_compact.Season > 2012]
df_tour_compact_checked_v4['H2Hrollingavg_finalndiff'] = df_tour_compact_checked_v4['H2Hrollingavg_final1n'] - df_tour_compact_checked_v4['H2Hrollingavg_final2n']
df_tour_compact_checked_v4['twincountdiff'] = df_tour_compact_checked_v4.twincount1 - df_tour_compact_checked_v4.twincount2
df_tour_compact_checked_v4['tlosscountdiff'] = df_tour_compact_checked_v4.tlosscount1 - df_tour_compact_checked_v4.tlosscount2
#df_train = df_reg_tour_compact.loc[~((df_reg_tour_compact.Season==2018)&(df_reg_tour_compact.Game_Type=='T'))]

#df_train=df_train.loc[~(df_train.Season==1998)]

df_tour_compact_checked_v4.columns

Index(['level_0', 'index', 'DayNum', 'Game_Type', 'Loc1', 'Loc2', 'Score1',
       'Score2', 'Season', 'Team1', 'Team2', 'stdScore1', 'stdScore2',
       'TeamName1', 'LeagueName1', 'TeamName2', 'LeagueName2', 'margin',
       'elo1', 'elo2', 'elodiff', 'rollingavg_final1', 'rollingavg_final2',
       'rollingavgdiff', 'JointTeamID', 'H2Hrollingavg_final1',
       'H2Hrollingavg_final2', 'H2HrollingstdScore_final1',
       'H2HrollingstdScore_final2', 'H2HCount', 'H2HrollingstdScorediff',
       'H2Hrollingavgdiff', 'H2Hrollingavg_final1n', 'H2Hrollingavg_final2n',
       'gamecountweight', 'twincount1', 'tlosscount1', 'twincount2',
       'tlosscount2', 'H2Hrollingavg_finalndiff', 'twincountdiff',
       'tlosscountdiff'],
      dtype='object')

In [962]:
# sEEDING
# get just integer value of seed (exclude region information)
df_seeds = pd.read_csv(data_dir_2 + 'WNCAATourneySeeds.csv')

def seed_to_int(seed):
    #Get just the digits from the seeding. Return as int
    s_int = int(seed[1:3])
    return s_int
df_seeds['SeedInt'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(labels=['Seed'], inplace=True, axis=1) # This is the string label

df_seeds.head()

Unnamed: 0,Season,TeamID,SeedInt
0,1998,3330,1
1,1998,3163,2
2,1998,3112,3
3,1998,3301,4
4,1998,3272,5


In [963]:
# merge seeds with team IDs for tourney performance
df_W = df_seeds.rename(columns={'TeamID':'Team1', 'SeedInt':'Team1Seed'})
df_L = df_seeds.rename(columns={'TeamID':'Team2', 'SeedInt':'Team2Seed'})
df_tour_compact_checked_v4 = pd.merge(left=df_tour_compact_checked_v4, right=df_W, how='left', on=['Season', 'Team1'])
df_tour_compact_checked_v4 = pd.merge(left=df_tour_compact_checked_v4, right=df_L, how='left', on=['Season', 'Team2'])

In [965]:
df_tour_compact_checked_v4['Seeddiff'] = df_tour_compact_checked_v4['Team1Seed'] - df_tour_compact_checked_v4['Team2Seed']

In [1229]:
df_train_rolling_performance.columns

Index(['index', 'Season', 'DayNum', 'Team1', 'Ast_rolling1', 'FGP_rolling1',
       'FTP_rolling1', 'threeptFGP_rolling1', 'twoptFGP_rolling1', 'Team2',
       'Ast_rolling2', 'FGP_rolling2', 'FTP_rolling2', 'threeptFGP_rolling2',
       'twoptFGP_rolling2'],
      dtype='object')

In [1329]:

df_train = df_tour_compact_checked_v4.loc[(df_tour_compact_checked_v4.Season>=2010)&(df_tour_compact_checked_v4.Season<2017)]

df_dev = df_tour_compact_checked_v4.loc[df_tour_compact_checked_v4.Season==2017]

df_test = df_tour_compact_checked_v4.loc[df_tour_compact_checked_v4.Season==2018]


In [1330]:
df_train.shape

(441, 45)

In [1331]:
df_train = pd.merge(df_train,df_train_rolling_performance[['Season', 'DayNum', 'Team1', 'Ast_rolling1', 'FGP_rolling1',
      'FTP_rolling1', 'threeptFGP_rolling1', 'twoptFGP_rolling1']],how='left',on=['Team1','DayNum','Season'])

df_train = pd.merge(df_train,df_train_rolling_performance[['Season', 'DayNum', 'Team2', 'Ast_rolling2', 'FGP_rolling2',
       'FTP_rolling2', 'threeptFGP_rolling2', 'twoptFGP_rolling2']],how='left',on=['Team2','DayNum','Season'])


df_train['Astdiff'] = df_train['Ast_rolling1'] - df_train['Ast_rolling2']
df_train['twoptFGPdiff'] = df_train['twoptFGP_rolling1'] - df_train['twoptFGP_rolling2']
df_train['threeptFGPdiff'] = df_train['threeptFGP_rolling1'] - df_train['threeptFGP_rolling2']
df_train['FGPdiff'] = df_train['FGP_rolling1'] - df_train['FGP_rolling2']

In [1332]:
df_train_pos = df_train[['Season','Seeddiff','elodiff','rollingavgdiff',
      'H2Hrollingavg_finalndiff',
       'H2HrollingstdScorediff','twincountdiff','tlosscountdiff','Ast_rolling1', 'FGP_rolling1',
       'FTP_rolling1', 'threeptFGP_rolling1', 'twoptFGP_rolling1','Ast_rolling2', 'FGP_rolling2',
       'FTP_rolling2', 'threeptFGP_rolling2', 'twoptFGP_rolling2','Astdiff',
    'twoptFGPdiff',
    'threeptFGPdiff',
    'FGPdiff']]

In [1333]:
df_train_neg = pd.DataFrame(

    {
        'elodiff' : -df_train['elodiff'],
        'rollingavgdiff': -df_train['rollingavgdiff'],
        'H2Hrollingavg_finalndiff': -df_train['H2Hrollingavg_finalndiff'],
        'H2HrollingstdScorediff': -df_train['H2HrollingstdScorediff'],
        'twincountdiff':-df_train['twincountdiff'],
        'tlosscountdiff':-df_train['tlosscountdiff'],
        'Season':df_train['Season'],
        'Seeddiff':-df_train['Seeddiff'],
        'Ast_rolling1':df_train['Ast_rolling2'], 
        'FGP_rolling1':df_train['FGP_rolling2'],
       'FTP_rolling1':df_train['FTP_rolling2'], 
        'threeptFGP_rolling1':df_train['threeptFGP_rolling2'],
        'twoptFGP_rolling1':df_train['twoptFGP_rolling2'],
        'Ast_rolling2':df_train['Ast_rolling1'], 
        'FGP_rolling2':df_train['FGP_rolling1'],
       'FTP_rolling2':df_train['FTP_rolling1'], 
        'threeptFGP_rolling2':df_train['threeptFGP_rolling1'],
        'twoptFGP_rolling2':df_train['twoptFGP_rolling1'],
            'Astdiff':-df_train['Astdiff'],
    'twoptFGPdiff':-df_train['twoptFGPdiff'],
    'threeptFGPdiff':-df_train['threeptFGPdiff'],
    'FGPdiff':-df_train['FGPdiff']
        
        
    }
)



In [1334]:
df_train_pos.columns

Index(['Season', 'Seeddiff', 'elodiff', 'rollingavgdiff',
       'H2Hrollingavg_finalndiff', 'H2HrollingstdScorediff', 'twincountdiff',
       'tlosscountdiff', 'Ast_rolling1', 'FGP_rolling1', 'FTP_rolling1',
       'threeptFGP_rolling1', 'twoptFGP_rolling1', 'Ast_rolling2',
       'FGP_rolling2', 'FTP_rolling2', 'threeptFGP_rolling2',
       'twoptFGP_rolling2', 'Astdiff', 'twoptFGPdiff', 'threeptFGPdiff',
       'FGPdiff'],
      dtype='object')

In [1363]:
x_train = df_train_pos.append(df_train_neg)


# build outcome = first 1/2 of data is positive, second 1/2 is negative 
y_train = ([1] * len(df_train_pos)) + ([0] * len(df_train_neg))

x_train['Outcome'] = y_train

x_train.columns

train_x = x_train[['Seeddiff','H2Hrollingavg_finalndiff', 'H2HrollingstdScorediff','elodiff', 'rollingavgdiff', 'tlosscountdiff', 'twincountdiff','Ast_rolling1', 'FGP_rolling1', 'FTP_rolling1',
       'threeptFGP_rolling1', 'twoptFGP_rolling1', 'Ast_rolling2',
       'FGP_rolling2', 'FTP_rolling2', 'threeptFGP_rolling2',
       'twoptFGP_rolling2','Astdiff', 'twoptFGPdiff', 'threeptFGPdiff',
       'FGPdiff']]
train_y = x_train[['Outcome']]

In [1364]:
train_x.shape

(882, 21)

In [1365]:
df_tour_compact_checked_dev_1 = df_dev[['Team1','index','DayNum', 'elo1',
                                                                          'rollingavg_final1',
        'JointTeamID', 'H2HrollingstdScore_final1',
       'H2Hrollingavg_final1n', 
       'gamecountweight', 'twincount1', 'tlosscount1']]

df_tour_compact_checked_dev_1.columns = ['Team','index','DayNum', 'elo',
                                                                          'rollingavg_final',
        'JointTeamID', 'H2HrollingstdScore_final',
       'H2Hrollingavg_finaln', 
       'gamecountweight', 'twincount', 'tlosscount']

df_tour_compact_checked_dev_2 = df_dev[['Team2','index','DayNum', 'elo2',
                                                                           'rollingavg_final2',
        'JointTeamID', 
       'H2HrollingstdScore_final2', 'H2Hrollingavg_final2n',
       'gamecountweight','twincount2',
       'tlosscount2']]
df_tour_compact_checked_dev_2.columns = ['Team','index','DayNum', 'elo',
                                                                          'rollingavg_final',
        'JointTeamID', 'H2HrollingstdScore_final',
       'H2Hrollingavg_finaln', 
       'gamecountweight', 'twincount', 'tlosscount']

In [1366]:
pd.concat([df_tour_compact_checked_dev_2['Team'],df_tour_compact_checked_dev_1['Team']]).nunique()

64

In [1367]:
df_tour_compact_checked_dev = pd.concat([df_tour_compact_checked_dev_1,df_tour_compact_checked_dev_2])
df_tour_compact_checked_dev.sort_values(['DayNum','Team','index'], inplace=True)
df_tour_compact_checked_dev.drop_duplicates(['Team'], keep='first', inplace=True)

In [1368]:
df_tour_compact_checked_dev.head()

Unnamed: 0,Team,index,DayNum,elo,rollingavg_final,JointTeamID,H2HrollingstdScore_final,H2Hrollingavg_finaln,gamecountweight,twincount,tlosscount
1134,3113,97881,137,1770.411258,0.5,31133277,0.544776,0.7,0.2,1.0,1.0
1142,3120,97889,137,1749.77284,0.3,31203301,0.436364,0.3,0.2,1.0,1.0
1138,3125,97885,137,1614.532925,1.0,31253246,0.48951,0.3,0.2,0.0,1.0
1139,3137,97886,137,1613.881379,1.0,31373268,0.371951,0.3,0.2,0.0,0.0
1148,3146,97895,137,1523.929997,1.0,31463400,0.390625,0.3,0.2,0.0,1.0


In [1369]:
dev_x = df_dev[['Team1','Season', 'Team2','Seeddiff','H2Hrollingavg_finalndiff', 'H2HrollingstdScorediff', 'tlosscountdiff', 'twincountdiff']]


In [1370]:
dev_x_v1 = pd.merge(dev_x,df_tour_compact_checked_dev[['Team','elo','rollingavg_final']],how='left',left_on = 'Team1',right_on='Team')
dev_x_v1 = dev_x_v1.drop(['Team'],axis=1)
dev_x_v1 = dev_x_v1.rename(columns={'elo':'elo1','rollingavg_final':'rollingavg_final1'})

In [1371]:
dev_x_v1 = pd.merge(dev_x_v1,df_tour_compact_checked_dev[['Team','elo','rollingavg_final']],how='left',left_on = 'Team2',right_on='Team')
dev_x_v1 = dev_x_v1.drop(['Team'],axis=1)
dev_x_v1=dev_x_v1.rename(columns={'elo':'elo2','rollingavg_final':'rollingavg_final2'})

In [1372]:
dev_x_v1['elodiff'] = dev_x_v1['elo1'] -dev_x_v1['elo2'] 
dev_x_v1['rollingavgdiff'] = dev_x_v1['rollingavg_final1'] -dev_x_v1['rollingavg_final2'] 


In [1373]:
dev_x_v1 = pd.merge(dev_x_v1,season_per[['Season', 'DayNum', 'Team','FGP_r', 'twoptFGP_r', 'threeptFGP_r', 'FTP_r',
       'Ast_r']],how='left',left_on=['Team1','Season'],right_on=['Team','Season']).rename(columns={'FGP_r':'FGP_rolling1', 'twoptFGP_r':'twoptFGP_rolling1', 'threeptFGP_r':'threeptFGP_rolling1',
                                                                                                   'FTP_r':'FTP_rolling1',
       'Ast_r':'Ast_rolling1'})

dev_x_v1 = dev_x_v1.drop(['Team','DayNum'],axis=1)

dev_x_v1 = pd.merge(dev_x_v1,season_per[['Season', 'DayNum', 'Team','FGP_r', 'twoptFGP_r', 'threeptFGP_r', 'FTP_r',
       'Ast_r']],how='left',left_on=['Team2','Season'],right_on=['Team','Season']).rename(columns={'FGP_r':'FGP_rolling2', 'twoptFGP_r':'twoptFGP_rolling2', 'threeptFGP_r':'threeptFGP_rolling2',
                                                                                                   'FTP_r':'FTP_rolling2',
       'Ast_r':'Ast_rolling2'})
dev_x_v1 = dev_x_v1.drop(['Team','DayNum'],axis=1)



In [1374]:
dev_x_v1.columns

Index(['Team1', 'Season', 'Team2', 'Seeddiff', 'H2Hrollingavg_finalndiff',
       'H2HrollingstdScorediff', 'tlosscountdiff', 'twincountdiff', 'elo1',
       'rollingavg_final1', 'elo2', 'rollingavg_final2', 'elodiff',
       'rollingavgdiff', 'FGP_rolling1', 'twoptFGP_rolling1',
       'threeptFGP_rolling1', 'FTP_rolling1', 'Ast_rolling1', 'FGP_rolling2',
       'twoptFGP_rolling2', 'threeptFGP_rolling2', 'FTP_rolling2',
       'Ast_rolling2'],
      dtype='object')

In [1375]:
dev_x_v2 = dev_x_v1[['H2Hrollingavg_finalndiff',
       'H2HrollingstdScorediff', 'tlosscountdiff', 'twincountdiff','elodiff',
       'rollingavgdiff','Seeddiff','FGP_rolling1',
       'twoptFGP_rolling1', 'threeptFGP_rolling1', 'FTP_rolling1',
       'Ast_rolling1', 'FGP_rolling2',
       'twoptFGP_rolling2', 'threeptFGP_rolling2', 'FTP_rolling2',
       'Ast_rolling2']]


dev_x_v2['Astdiff'] = dev_x_v2['Ast_rolling1'] - dev_x_v2['Ast_rolling2']
dev_x_v2['twoptFGPdiff'] = dev_x_v2['twoptFGP_rolling1'] - dev_x_v2['twoptFGP_rolling2']
dev_x_v2['threeptFGPdiff'] = dev_x_v2['threeptFGP_rolling1'] - dev_x_v2['threeptFGP_rolling2']
dev_x_v2['FGPdiff'] = dev_x_v2['FGP_rolling1'] - dev_x_v2['FGP_rolling2']

dev_x_v2_neg = pd.DataFrame(

{
    'H2Hrollingavg_finalndiff':-dev_x_v2['H2Hrollingavg_finalndiff'],
       'H2HrollingstdScorediff':-dev_x_v2['H2HrollingstdScorediff'],
    'tlosscountdiff':-dev_x_v2['tlosscountdiff'],
    'twincountdiff':-dev_x_v2['twincountdiff'],
    'elodiff':-dev_x_v2['elodiff'],
       'rollingavgdiff':-dev_x_v2['rollingavgdiff'],
    'Seeddiff':-dev_x_v2['Seeddiff'],
            'Ast_rolling1':dev_x_v2['Ast_rolling2'], 
        'FGP_rolling1':dev_x_v2['FGP_rolling2'],
       'FTP_rolling1':dev_x_v2['FTP_rolling2'], 
        'threeptFGP_rolling1':dev_x_v2['threeptFGP_rolling2'],
        'twoptFGP_rolling1':dev_x_v2['twoptFGP_rolling2'],
        'Ast_rolling2':dev_x_v2['Ast_rolling1'], 
        'FGP_rolling2':dev_x_v2['FGP_rolling1'],
       'FTP_rolling2':dev_x_v2['FTP_rolling1'], 
        'threeptFGP_rolling2':dev_x_v2['threeptFGP_rolling1'],
        'twoptFGP_rolling2':dev_x_v2['twoptFGP_rolling1'],
    'Astdiff':-dev_x_v2['Astdiff'],
    'twoptFGPdiff':-dev_x_v2['twoptFGPdiff'],
    'threeptFGPdiff':-dev_x_v2['threeptFGPdiff'],
    'FGPdiff':-dev_x_v2['FGPdiff']
    
        
    
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [1376]:
dev_x_all = pd.concat([dev_x_v2,dev_x_v2_neg])
dev_y = np.array([1] * len(dev_x_v2) + [0] * len(dev_x_v2_neg))

dev_x_mix = pd.concat([dev_x_all[:31],dev_x_all[94:]])

dev_y_mix = np.concatenate((dev_y[:31],dev_y[94:]))



In [1377]:
df_tour_compact_checked_test_1 = df_test[['Team1','index','DayNum', 'elo1',
                                                                          'rollingavg_final1',
        'JointTeamID', 'H2HrollingstdScore_final1',
       'H2Hrollingavg_final1n', 
       'gamecountweight', 'twincount1', 'tlosscount1']]

df_tour_compact_checked_test_1.columns = ['Team','index','DayNum', 'elo',
                                                                          'rollingavg_final',
        'JointTeamID', 'H2HrollingstdScore_final',
       'H2Hrollingavg_finaln', 
       'gamecountweight', 'twincount', 'tlosscount']

df_tour_compact_checked_test_2 = df_test[['Team2','index','DayNum', 'elo2',
                                                                           'rollingavg_final2',
        'JointTeamID', 
       'H2HrollingstdScore_final2', 'H2Hrollingavg_final2n',
       'gamecountweight','twincount2',
       'tlosscount2']]

df_tour_compact_checked_test_2.columns = ['Team','index','DayNum', 'elo',
                                                                          'rollingavg_final',
        'JointTeamID', 'H2HrollingstdScore_final',
       'H2Hrollingavg_finaln', 
       'gamecountweight', 'twincount', 'tlosscount']

In [1378]:
df_tour_compact_checked_test = pd.concat([df_tour_compact_checked_test_1,df_tour_compact_checked_test_2])
df_tour_compact_checked_test.sort_values(['index','Team',], inplace=True)
df_tour_compact_checked_test.drop_duplicates(['Team'], keep='first', inplace=True)

In [1379]:
df_results = pd.read_csv('/Users/arganesa/MIDS/W207/Kaggle/mids-w207-kaggle/WStage2DataFiles/NCAA_2018_Results.csv')


In [1380]:
df_results['JointTeamID']=df_results[['Team1','Team2']].astype(str).apply(createteam,axis=1)

In [1381]:
df_results.shape

(63, 5)

In [1382]:
df_test['Outcome'] = 0
df_test_r = pd.merge(df_test,df_results,how='left',on='JointTeamID')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [1383]:
test_bool = ~(df_test_r['Result'].isnull())

In [1384]:
test_x = df_test[['Team1','Season', 'Team2','Seeddiff','H2Hrollingavg_finalndiff', 'H2HrollingstdScorediff', 'tlosscountdiff', 'twincountdiff']]

test_x_v1 = pd.merge(test_x,df_tour_compact_checked_test[['Team','elo','rollingavg_final']],how='left',left_on = 'Team1',right_on='Team')
test_x_v1 = test_x_v1.drop(['Team'],axis=1)
test_x_v1 = test_x_v1.rename(columns={'elo':'elo1','rollingavg_final':'rollingavg_final1'})

test_x_v1 = pd.merge(test_x_v1,df_tour_compact_checked_test[['Team','elo','rollingavg_final']],how='left',left_on = 'Team2',right_on='Team')
test_x_v1 = test_x_v1.drop(['Team'],axis=1)
test_x_v1 = test_x_v1.rename(columns={'elo':'elo2','rollingavg_final':'rollingavg_final2'})

test_x_v1['elodiff'] = test_x_v1['elo1'] -test_x_v1['elo2'] 
test_x_v1['rollingavgdiff'] = test_x_v1['rollingavg_final1'] -test_x_v1['rollingavg_final2'] 

test_x_v1 = pd.merge(test_x_v1,season_per[['Season', 'DayNum', 'Team','FGP_r', 'twoptFGP_r', 'threeptFGP_r', 'FTP_r',
       'Ast_r']],how='left',left_on=['Team1','Season'],right_on=['Team','Season']).rename(columns={'FGP_r':'FGP_rolling1', 'twoptFGP_r':'twoptFGP_rolling1', 'threeptFGP_r':'threeptFGP_rolling1',
                                                                                                   'FTP_r':'FTP_rolling1',
       'Ast_r':'Ast_rolling1'})

test_x_v1 = test_x_v1.drop(['Team','DayNum'],axis=1)

test_x_v1 = pd.merge(test_x_v1,season_per[['Season', 'DayNum', 'Team','FGP_r', 'twoptFGP_r', 'threeptFGP_r', 'FTP_r',
       'Ast_r']],how='left',left_on=['Team2','Season'],right_on=['Team','Season']).rename(columns={'FGP_r':'FGP_rolling2', 'twoptFGP_r':'twoptFGP_rolling2', 'threeptFGP_r':'threeptFGP_rolling2',
                                                                                                   'FTP_r':'FTP_rolling2',
       'Ast_r':'Ast_rolling2'})
test_x_v1 = test_x_v1.drop(['Team','DayNum'],axis=1)


test_x_v2 = test_x_v1[['H2Hrollingavg_finalndiff',
       'H2HrollingstdScorediff', 'tlosscountdiff', 'twincountdiff','elodiff',
       'rollingavgdiff','Seeddiff','FGP_rolling1',
       'twoptFGP_rolling1', 'threeptFGP_rolling1', 'FTP_rolling1',
       'Ast_rolling1', 'FGP_rolling2',
       'twoptFGP_rolling2', 'threeptFGP_rolling2', 'FTP_rolling2',
       'Ast_rolling2']]

test_x_v2['Astdiff'] = test_x_v2['Ast_rolling1'] - test_x_v2['Ast_rolling2']
test_x_v2['twoptFGPdiff'] = test_x_v2['twoptFGP_rolling1'] - test_x_v2['twoptFGP_rolling2']
test_x_v2['threeptFGPdiff'] = test_x_v2['threeptFGP_rolling1'] - test_x_v2['threeptFGP_rolling2']
test_x_v2['FGPdiff'] = test_x_v2['FGP_rolling1'] - test_x_v2['FGP_rolling2']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [1385]:
test_x_v3 = test_x_v2[test_bool]

test_y = df_test_r.loc[test_bool,'Result']


In [1386]:
#train_y=train_y.values.flatten().reshape(2268)
train_y=train_y.values.flatten().reshape(train_y.shape[0])

In [1387]:
train_x.columns

Index(['Seeddiff', 'H2Hrollingavg_finalndiff', 'H2HrollingstdScorediff',
       'elodiff', 'rollingavgdiff', 'tlosscountdiff', 'twincountdiff',
       'Ast_rolling1', 'FGP_rolling1', 'FTP_rolling1', 'threeptFGP_rolling1',
       'twoptFGP_rolling1', 'Ast_rolling2', 'FGP_rolling2', 'FTP_rolling2',
       'threeptFGP_rolling2', 'twoptFGP_rolling2', 'Astdiff', 'twoptFGPdiff',
       'threeptFGPdiff', 'FGPdiff'],
      dtype='object')

In [1388]:

features = ['elodiff','rollingavgdiff','twincountdiff','Astdiff',
    'twoptFGPdiff',
    'threeptFGPdiff',
    'FGPdiff']
feature_comb = []
for L in range(0, len(features)+1):
    for subset in it.combinations(features, L):
        feature_comb.append(list(subset))
feature_comb[1:]

[['elodiff'],
 ['rollingavgdiff'],
 ['twincountdiff'],
 ['Astdiff'],
 ['twoptFGPdiff'],
 ['threeptFGPdiff'],
 ['FGPdiff'],
 ['elodiff', 'rollingavgdiff'],
 ['elodiff', 'twincountdiff'],
 ['elodiff', 'Astdiff'],
 ['elodiff', 'twoptFGPdiff'],
 ['elodiff', 'threeptFGPdiff'],
 ['elodiff', 'FGPdiff'],
 ['rollingavgdiff', 'twincountdiff'],
 ['rollingavgdiff', 'Astdiff'],
 ['rollingavgdiff', 'twoptFGPdiff'],
 ['rollingavgdiff', 'threeptFGPdiff'],
 ['rollingavgdiff', 'FGPdiff'],
 ['twincountdiff', 'Astdiff'],
 ['twincountdiff', 'twoptFGPdiff'],
 ['twincountdiff', 'threeptFGPdiff'],
 ['twincountdiff', 'FGPdiff'],
 ['Astdiff', 'twoptFGPdiff'],
 ['Astdiff', 'threeptFGPdiff'],
 ['Astdiff', 'FGPdiff'],
 ['twoptFGPdiff', 'threeptFGPdiff'],
 ['twoptFGPdiff', 'FGPdiff'],
 ['threeptFGPdiff', 'FGPdiff'],
 ['elodiff', 'rollingavgdiff', 'twincountdiff'],
 ['elodiff', 'rollingavgdiff', 'Astdiff'],
 ['elodiff', 'rollingavgdiff', 'twoptFGPdiff'],
 ['elodiff', 'rollingavgdiff', 'threeptFGPdiff'],
 ['elodiff',

In [1389]:
Featurelist = []
train_accuracy=[]
test_accuracy=[]
dev_accuracy=[]
dev_logloss=[]
test_logloss=[]


for feature in feature_comb[1:]:
    
    Featurelist.append(', '.join(feature))
    
    lr = LogisticRegression()

    lr.fit(train_x[feature],train_y)
# we achieve 77% accuracy on training data - how does it generalize to dev?
    #print("The training accuracy is : ",lr.score(train_x[feature],train_y))
    #print("The dev accuracy is : ",lr.score(dev_x_mix[feature],dev_y_mix))
    train_accuracy.append(lr.score(train_x[feature],train_y))
    dev_accuracy.append(lr.score(dev_x_mix[feature],dev_y_mix))
    pred_lr = lr.predict_proba(dev_x_mix[feature])
    
    #print("The dev log loss is : ", metrics.log_loss(dev_y_mix,pred_lr))
    dev_logloss.append(metrics.log_loss(dev_y_mix,pred_lr))
    #print("The test accuracy is : ", lr.score(test_x_v3[feature],test_y))
    test_accuracy.append(lr.score(test_x_v3[feature],test_y))
#pred_lr = lr.predict_proba(dev_x)
    pred_lr = lr.predict_proba(test_x_v3[feature])
    #print("The test log loss is : ", metrics.log_loss(test_y,pred_lr))
    test_logloss.append(metrics.log_loss(test_y,pred_lr))
#print(metrics.log_loss(dev_y,pred_lr))

result = pd.DataFrame({'Feature':Featurelist,'train_accuracy':train_accuracy,'dev_accuracy':dev_accuracy,'dev_logloss':dev_logloss,'test_accuracy':test_accuracy,'test_logloss':test_logloss})

result.sort_values(by=(['test_logloss','test_accuracy']))

Unnamed: 0,Feature,dev_accuracy,dev_logloss,test_accuracy,test_logloss,train_accuracy
101,"elodiff, rollingavgdiff, twincountdiff, twoptF...",0.825397,0.419195,0.777778,0.431529,0.793651
122,"elodiff, rollingavgdiff, twincountdiff, twoptF...",0.825397,0.420039,0.777778,0.431536,0.795918
64,"elodiff, rollingavgdiff, twincountdiff, twoptF...",0.825397,0.417457,0.777778,0.431606,0.795918
102,"elodiff, rollingavgdiff, twincountdiff, twoptF...",0.825397,0.417703,0.777778,0.432012,0.798186
124,"elodiff, twincountdiff, Astdiff, twoptFGPdiff,...",0.825397,0.413974,0.793651,0.433197,0.793651
109,"elodiff, twincountdiff, Astdiff, twoptFGPdiff,...",0.825397,0.413953,0.793651,0.433200,0.793651
108,"elodiff, twincountdiff, Astdiff, twoptFGPdiff,...",0.825397,0.413970,0.793651,0.433200,0.793651
126,"elodiff, rollingavgdiff, twincountdiff, Astdif...",0.825397,0.414067,0.793651,0.433201,0.793651
73,"elodiff, twincountdiff, Astdiff, twoptFGPdiff",0.825397,0.413949,0.793651,0.433203,0.793651
120,"elodiff, rollingavgdiff, twincountdiff, Astdif...",0.825397,0.414046,0.793651,0.433204,0.793651


In [1390]:
result.iloc[101]['Feature']

'elodiff, rollingavgdiff, twincountdiff, twoptFGPdiff, threeptFGPdiff'

In [1049]:
feature =['elodiff', 'rollingavgdiff', 'twincountdiff']
lr = LogisticRegression()

lr.fit(train_x[feature],train_y)
print(lr.score(train_x[feature],train_y))
print(lr.score(dev_x_mix[feature],dev_y_mix))
pred_lr = lr.predict_proba(dev_x_mix[feature])
print(metrics.log_loss(dev_y_mix,pred_lr))
print(lr.score(test_x_v3[feature],test_y))
pred_lr = lr.predict_proba(test_x_v3[feature])
print(metrics.log_loss(test_y,pred_lr))

0.779541446208
0.84126984127
0.415876036972
0.777777777778
0.439617740891


In [1087]:
pred_lr = lr.predict_proba(test_x_v2[feature])
pred_sub = []
r =[]
for p in pred_lr:
    if (p[1] > p[0]):
        r.append(1)
    
    else:
        r.append(0)
    pred_sub.append(p[1])
    

In [1082]:
print(df_tour_compact_checked_v4.shape)
print(df_train.shape)
print(df_train_pos.shape)
print(df_test.shape)
print(x_train.shape)
print(train_x.shape)
print(dev_x.shape)

(3213, 45)
(1134, 45)
(1134, 8)
(2016, 46)
(2268, 9)
(2268, 7)
(63, 8)


In [1091]:
test_output = test_x

In [1092]:
test_output['Pred'] = pred_sub

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [1108]:
def genID(row):
    z=[]
    z.append('2018')
    z.append(row['Team1'])
    z.append(row['Team2'])
    z.sort()
    x = '_'.join(z)
    return (x)
    
test_output['ID'] = test_output[['Team1','Team2']].astype('str').apply(genID,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [1110]:
test_output['ID_us'] = '2018' + '_' + test_output.Team1.astype('str') + '_'+test_output.Team2.astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [1115]:
test_output[['ID','Pred']].to_csv('/Users/arganesa/MIDS/W207/Kaggle/mids-w207-kaggle/WStage2DataFiles/WSampleSubmissionStage2.csv')

In [1129]:
test_output.loc[list(test_bool)]

Unnamed: 0,Team1,Season,Team2,Seeddiff,H2Hrollingavg_finalndiff,H2HrollingstdScorediff,tlosscountdiff,twincountdiff,Pred,ID,ID_us
1255,3110,2018,3417,11,0.4,0.000000,-1.0,-2.0,0.027618,2018_3110_3417,2018_3110_3417
1297,3113,2018,3304,-3,0.4,0.000000,1.0,1.0,0.606825,2018_3113_3304,2018_3113_3304
1315,3113,2018,3400,5,0.4,0.000000,0.0,-1.0,0.153449,2018_3113_3400,2018_3113_3400
1337,3114,2018,3199,11,0.4,0.000000,-1.0,-3.0,0.027213,2018_3114_3199,2018_3114_3199
1401,3124,2018,3212,-13,0.4,0.000000,1.0,3.0,0.999701,2018_3124_3212,2018_3124_3212
1411,3124,2018,3276,-5,0.4,0.000000,1.0,3.0,0.962449,2018_3124_3276,2018_3124_3276
1425,3124,2018,3333,-4,0.4,0.000000,0.0,1.0,0.906900,2018_3124_3333,2018_3124_3333
1453,3125,2018,3181,7,0.4,0.000000,0.0,-1.0,0.160276,2018_3125_3181,2018_3125_3181
1521,3129,2018,3257,15,0.4,0.000000,0.0,-2.0,0.011656,2018_3129_3257,2018_3129_3257
1571,3138,2018,3199,8,0.4,0.000000,-1.0,-3.0,0.141342,2018_3138_3199,2018_3138_3199


In [938]:

## Fit a logistic regression model through grid search
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(train_x[['elodiff','rollingavgdiff','H2HrollingstdScorediff','twincountdiff']],train_y)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

Best log_loss: -0.1723, with best C: 100.0


In [947]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train_x[['elodiff','rollingavgdiff','H2HrollingstdScorediff','twincountdiff']],train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [951]:
clf.score(train_x[['elodiff','rollingavgdiff','H2HrollingstdScorediff','twincountdiff']],train_y)

0.99470899470899465

In [952]:
print(clf.score(dev_x_mix[['elodiff','rollingavgdiff','H2HrollingstdScorediff','twincountdiff']],dev_y_mix))
pred = clf.predict_proba(dev_x_mix[['elodiff','rollingavgdiff','H2HrollingstdScorediff','twincountdiff']])

1.0


In [954]:
print(metrics.log_loss(dev_y_mix,pred))

0.0419611475427


In [955]:
print(clf.score(test_x_v3[['elodiff','rollingavgdiff','H2HrollingstdScorediff','twincountdiff']],test_y))


0.532258064516


In [1128]:
list(test_bool)

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa