# Part 3 - Stage 2 Data Manipulation

The purpose of this notebook is to manipulate the Stage 2 data into usable training data.

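The inputs you need for this are:
- `/data/MNCAATourneySeeds.csv`
- `/data/MRegularSeasonDetailedResults.csv`
- `/data/SampleSubmissionStage2.csv`
- `training_set.csv` and `record.csv` (read from the working directory)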

## Library Imports

In [9]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pkg_resources

from binaryTree import Node
from PIL import Image, ImageDraw

from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [10]:
# Current working directory; the data files under /data are read relative to it.
cwd = os.getcwd()

## Data Manipulation

### Regular Season Data Analysis

In [12]:
# Existing code from the data manipulation section. Only run if needed.
cwd = os.getcwd()
targetYear = 2003

# Tournament compact results are currently not loaded; the two lines are kept for reference.
# tourney_cresults = pd.read_csv(cwd + '/data/MNCAATourneyCompactResults.csv')
# tourney_cresults = tourney_cresults.loc[tourney_cresults['Season'] >= targetYear]

# Raw inputs read from the data folder under the working directory.
seeds = pd.read_csv(cwd + '/data/MNCAATourneySeeds.csv')
season_dresults = pd.read_csv(cwd +'/data/MRegularSeasonDetailedResults.csv')

# Intermediate files read with relative paths.
training_set = pd.read_csv("training_set.csv")
record = pd.read_csv('record.csv')

# Keep characters 1-2 of each seed string and convert them to a number;
# values that cannot be parsed become NaN (errors='coerce').
seeds['Seed'] = pd.to_numeric(
    seeds['Seed'].str.slice(1, 3),
    errors='coerce',
    downcast='integer',
)

def delta_seed(row):
    """Return Team1's seed minus Team2's seed for the season in ``row``.

    Reads the module-level ``seeds`` frame (columns ``Season``, ``TeamID``,
    ``Seed``). ``row`` must provide ``Season``, ``Team1`` and ``Team2``.

    Raises:
        IndexError: if either team has no row in ``seeds`` for that season.
    """
    season_seeds = seeds.loc[seeds['Season'] == row['Season']]
    # The first matching row is used for each team.
    team1_seed = season_seeds.loc[season_seeds['TeamID'] == row['Team1'], 'Seed'].iloc[0]
    team2_seed = season_seeds.loc[season_seeds['TeamID'] == row['Team2'], 'Seed'].iloc[0]
    return team1_seed - team2_seed

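# delta_seed (above): given a matchup row, return the difference between the two teams' seeds.
# delta_winPct (below): given a matchup row, look up both teams' records and return the difference in win percentage.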
def delta_winPct(row):
    """Return Team1's win fraction minus Team2's for the season in ``row``.

    Reads the module-level ``record`` frame (columns ``Season``, ``WTeamID``,
    ``wins``, ``games``). For each team, ``wins / games`` is averaged over the
    matching rows; a team with no matching rows yields NaN.
    """
    same_season = record['Season'] == row['Season']
    team1_rows = record.loc[same_season & (record['WTeamID'] == row['Team1'])]
    team2_rows = record.loc[same_season & (record['WTeamID'] == row['Team2'])]
    team1_pct = (team1_rows['wins'] / team1_rows['games']).mean()
    team2_pct = (team2_rows['wins'] / team2_rows['games']).mean()
    return team1_pct - team2_pct

def get_points_against(row):
    """Sum the opponent scores for the team/season identified by ``row``.

    Adds ``LScore`` from the ``dfW`` rows where the team is ``WTeamID`` to
    ``WScore`` from the ``dfL`` rows where the team is ``LTeamID``. The team
    is taken from ``row['WTeamID']`` and the season from ``row['Season']``.
    ``dfW`` and ``dfL`` are module-level frames defined elsewhere in the
    notebook.
    """
    season, team = row['Season'], row['WTeamID']
    won = (dfW['Season'] == season) & (dfW['WTeamID'] == team)
    lost = (dfL['Season'] == season) & (dfL['LTeamID'] == team)

    total = dfW.loc[won, 'LScore'].sum()
    winner_scores_in_losses = dfL.loc[lost, 'WScore']
    if winner_scores_in_losses.empty:
        return total
    return total + winner_scores_in_losses.sum()

def get_points_for(row):
    """Sum the team's own scores for the team/season identified by ``row``.

    Adds ``WScore`` from the ``dfW`` rows where the team is ``WTeamID`` to
    ``LScore`` from the ``dfL`` rows where the team is ``LTeamID``. The team
    is taken from ``row['WTeamID']`` and the season from ``row['Season']``.
    ``dfW`` and ``dfL`` are module-level frames defined elsewhere in the
    notebook.
    """
    season, team = row['Season'], row['WTeamID']
    won = (dfW['Season'] == season) & (dfW['WTeamID'] == team)
    lost = (dfL['Season'] == season) & (dfL['LTeamID'] == team)

    total = dfW.loc[won, 'WScore'].sum()
    own_scores_in_losses = dfL.loc[lost, 'LScore']
    if own_scores_in_losses.empty:
        return total
    return total + own_scores_in_losses.sum()

def get_remaining_stats(row, field):
    """Sum one box-score stat for the team/season identified by ``row``.

    Adds column ``'W' + field`` from the ``dfW`` rows where the team is
    ``WTeamID`` to column ``'L' + field`` from the ``dfL`` rows where the team
    is ``LTeamID``. The team is taken from ``row['WTeamID']`` and the season
    from ``row['Season']``. ``dfW`` and ``dfL`` are module-level frames
    defined elsewhere in the notebook.

    Args:
        row: mapping/Series providing ``Season`` and ``WTeamID``.
        field: stat name without the ``W``/``L`` prefix (e.g. ``'FGM'``).
    """
    season, team = row['Season'], row['WTeamID']
    in_wins = (dfW['Season'] == season) & (dfW['WTeamID'] == team)
    in_losses = (dfL['Season'] == season) & (dfL['LTeamID'] == team)

    total = dfW.loc[in_wins, 'W' + field].sum()
    loss_values = dfL.loc[in_losses, 'L' + field]
    if not loss_values.empty:
        total = total + loss_values.sum()
    return total

def delta_stat(row, field):
    """Return Team1's per-game value of ``field`` minus Team2's.

    Reads the module-level ``record`` frame (columns ``Season``, ``WTeamID``,
    ``games`` and ``field``). For each team, ``field / games`` is averaged
    over the rows matching the season in ``row``; a team with no matching
    rows yields NaN.

    Args:
        row: mapping/Series providing ``Season``, ``Team1`` and ``Team2``.
        field: name of the ``record`` column to compare.
    """
    same_season = record['Season'] == row['Season']

    def per_game(team_id):
        # Rows of ``record`` for this team in the row's season.
        team_rows = record.loc[same_season & (record['WTeamID'] == team_id)]
        return (team_rows[field] / team_rows['games']).mean()

    return per_game(row['Team1']) - per_game(row['Team2'])

OK, so now we have a trained model. Next we need to find the submission data.

The Kaggle competition provides a sample submission CSV file that contains a matchup ID and a default prediction value for each possible matchup.

In [13]:
# Load the sample submission file from the data folder under the working directory.
sub = pd.read_csv(cwd + '/data/SampleSubmissionStage2.csv')
# Last expression of the cell: displays the frame.
sub

Unnamed: 0,ID,Pred
0,2019_1101_1113,0.5
1,2019_1101_1120,0.5
2,2019_1101_1124,0.5
3,2019_1101_1125,0.5
4,2019_1101_1133,0.5
...,...,...
2273,2019_1449_1459,0.5
2274,2019_1449_1463,0.5
2275,2019_1458_1459,0.5
2276,2019_1458_1463,0.5


Split each `ID` string (`Season_Team1_Team2`) into the season (year) and the two team IDs.

In [14]:
# Split each "Season_Team1_Team2" ID into its three parts and convert them to numbers.
# `expand=True` returns one column per part. The previous form unpacked the `.str`
# accessor itself, which relies on iterating the accessor; pandas deprecated that and
# removed it in 2.0, so the old line fails on current releases.
id_parts = sub['ID'].str.split('_', expand=True).apply(pd.to_numeric)
sub['Season'], sub['Team1'], sub['Team2'] = id_parts[0], id_parts[1], id_parts[2]
sub

Unnamed: 0,ID,Pred,Season,Team1,Team2
0,2019_1101_1113,0.5,2019,1101,1113
1,2019_1101_1120,0.5,2019,1101,1120
2,2019_1101_1124,0.5,2019,1101,1124
3,2019_1101_1125,0.5,2019,1101,1125
4,2019_1101_1133,0.5,2019,1101,1133
...,...,...,...,...,...
2273,2019_1449_1459,0.5,2019,1449,1459
2274,2019_1449_1463,0.5,2019,1449,1463
2275,2019_1458_1459,0.5,2019,1458,1459
2276,2019_1458_1463,0.5,2019,1458,1463


Calculate the deltaSeed and deltaWinPct features

In [15]:
# Row-wise feature computation: with axis=1 each helper receives one matchup row.
sub['deltaSeed'] = sub.apply(delta_seed,axis=1)
# Disabled feature; delta_ord is not defined in the cells shown here.
# sub['deltaMO'] = sub.apply(delta_ord,axis=1)
sub['deltaWinPct'] = sub.apply(delta_winPct,axis=1)
# Last expression of the cell: displays the frame with the new columns.
sub

Unnamed: 0,ID,Pred,Season,Team1,Team2,deltaSeed,deltaWinPct
0,2019_1101_1113,0.5,2019,1101,1113,4,0.105603
1,2019_1101_1120,0.5,2019,1101,1120,10,0.057809
2,2019_1101_1124,0.5,2019,1101,1124,6,0.199353
3,2019_1101_1125,0.5,2019,1101,1125,4,-0.040230
4,2019_1101_1133,0.5,2019,1101,1133,0,0.217346
...,...,...,...,...,...,...,...
2273,2019_1449_1459,0.5,2019,1449,1459,2,-0.101961
2274,2019_1449_1463,0.5,2019,1449,1463,-5,0.014706
2275,2019_1458_1459,0.5,2019,1458,1459,-2,-0.169697
2276,2019_1458_1463,0.5,2019,1458,1463,-9,-0.053030


Now, calculate the rest of our stats. This will take a while.

In [7]:
# cut to slides
# Columns of `record` for which a per-game Team1-minus-Team2 delta feature is built.
rawCols = [
    'PointsFor', 'PointsAgainst',
    'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
    'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF',
]

for rawCol in rawCols:
    print("Processing", rawCol)
    # One delta_stat call per matchup row; the new column is named 'delta' + stat name.
    sub['delta' + rawCol] = sub.apply(lambda matchup: delta_stat(matchup, rawCol), axis=1)

Processing PointsFor
Processing PointsAgainst
Processing FGM
Processing FGA
Processing FGM3
Processing FGA3
Processing FTM
Processing FTA
Processing OR
Processing DR
Processing Ast
Processing TO
Processing Stl
Processing Blk
Processing PF


In [8]:
# Write the feature frame to disk; index=False leaves the row index out of the CSV.
sub.to_csv("training_set_stage2.csv", index=False)
# Last expression of the cell: displays the saved frame.
sub

Unnamed: 0,ID,Pred,Season,Team1,Team2,deltaSeed,deltaWinPct,deltaPointsFor,deltaPointsAgainst,deltaFGM,...,deltaFGA3,deltaFTM,deltaFTA,deltaOR,deltaDR,deltaAst,deltaTO,deltaStl,deltaBlk,deltaPF
0,2019_1101_1113,0.5,2019,1101,1113,4,0.105603,-6.088362,-8.165948,-1.248922,...,-2.353448,-3.581897,-6.837284,-3.087284,-4.915948,1.026940,-1.938578,1.781250,-0.667026,-0.768319
1,2019_1101_1120,0.5,2019,1101,1120,10,0.057809,-7.158215,-3.691684,-1.684584,...,-11.074037,0.381339,0.333671,-2.666329,0.955375,0.208925,-0.491886,-1.294118,-2.212982,0.755578
2,2019_1101_1124,0.5,2019,1101,1124,6,0.199353,0.067888,-2.290948,-0.155172,...,-4.478448,1.074353,0.193966,-4.306034,-1.697198,0.776940,-1.626078,1.875000,-2.198276,0.356681
3,2019_1101_1125,0.5,2019,1101,1125,4,-0.040230,-15.142529,-9.770115,-6.321839,...,-9.070115,0.626437,1.168966,0.168966,-6.770115,-4.979310,0.055172,1.333333,-1.248276,3.437931
4,2019_1101_1133,0.5,2019,1101,1133,0,0.217346,5.360502,-0.285266,2.314525,...,0.138976,0.399164,-0.294671,0.038662,-2.194357,2.560084,-0.890282,2.848485,-1.205852,1.531870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,2019_1449_1459,0.5,2019,1449,1459,2,-0.101961,-11.376471,-3.150980,-4.741176,...,-4.545098,1.501961,2.425490,-0.829412,-2.315686,-3.023529,2.119608,2.866667,2.801961,0.911765
2274,2019_1449_1463,0.5,2019,1449,1463,-5,0.014706,-11.069328,-9.331933,-5.262605,...,0.766807,-0.228992,0.701681,0.792017,-7.703782,-5.323529,0.102941,3.250000,1.413866,1.411765
2275,2019_1458_1459,0.5,2019,1458,1459,-2,-0.169697,-12.139394,-6.109091,-3.421212,...,-6.678788,-1.551515,-0.936364,-2.148485,2.627273,-1.942424,-1.687879,-1.012121,1.278788,-2.439394
2276,2019_1458_1463,0.5,2019,1458,1463,-9,-0.053030,-11.832251,-12.290043,-3.942641,...,-1.366883,-3.282468,-2.660173,-0.527056,-2.760823,-4.242424,-3.704545,-0.628788,-0.109307,-1.939394
