# Part 1 - Stage 1 Data Manipulation

The purpose of this notebook is to analyze the stage 1 data and convert it into a csv file that can be loaded in model expoloration.

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pkg_resources

from binaryTree import Node
from PIL import Image, ImageDraw

from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
cwd = os.getcwd()

## Load Data

In [3]:
tourney_cresults = pd.read_csv(cwd + '/data/MNCAATourneyCompactResults.csv')
seeds = pd.read_csv(cwd + '/data/MNCAATourneySeeds.csv')
seeds['Seed'] =  pd.to_numeric(seeds['Seed'].str[1:3], downcast='integer',errors='coerce')
season_dresults = pd.read_csv(cwd +'/data/MRegularSeasonDetailedResults.csv')

In [4]:
tourney_cresults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [5]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


In [6]:
season_dresults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


## Analysis

Only analyze tournament games post 2003. There was a change to the tournament format in this year, so data before this is not as useful.

In [7]:
targetYear = 2003
tourney_cresults = tourney_cresults.loc[tourney_cresults['Season'] >= targetYear]

Initialize a dataframe for the training set

In [8]:
training_set = pd.DataFrame()

Add base features to training set

In [9]:
training_set['Result'] = np.random.randint(0,2,len(tourney_cresults.index))
training_set['Season'] = tourney_cresults['Season'].values
training_set['Team1'] = training_set['Result'].values * tourney_cresults['WTeamID'].values + (1-training_set['Result'].values) * tourney_cresults['LTeamID'].values 
training_set['Team2'] = (1-training_set['Result'].values) * tourney_cresults['WTeamID'].values + training_set['Result'].values * tourney_cresults['LTeamID'].values
training_set.head()

Unnamed: 0,Result,Season,Team1,Team2
0,1,2003,1421,1411
1,0,2003,1436,1112
2,0,2003,1272,1113
3,1,2003,1141,1166
4,0,2003,1301,1143


Define helper functions for populating the rest of the data in the training set.

In [10]:
def delta_seed(row):
    cond = (seeds['Season'] == row['Season'])
    return seeds[cond & (seeds['TeamID'] == row['Team1'])]['Seed'].iloc[0] - seeds[cond & (seeds['TeamID'] == row['Team2'])]['Seed'].iloc[0]

def delta_winPct(row):
    cond1 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team1'])
    cond2 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team2'])
    return (record[cond1]['wins']/record[cond1]['games']).mean() - (record[cond2]['wins']/record[cond2]['games']).mean()

def get_points_against(row):
    wcond = (dfW['Season'] == row['Season']) & (dfW['WTeamID'] == row['WTeamID']) 
    fld1 = 'LScore'
    lcond = (dfL['Season'] == row['Season']) & (dfL['LTeamID'] == row['WTeamID']) 
    fld2 = 'WScore'
    retVal = dfW[wcond][fld1].sum()
    if len(dfL[lcond][fld2]) > 0:
        retVal = retVal + dfL[lcond][fld2].sum() 
    return retVal

def get_points_for(row):
    wcond = (dfW['Season'] == row['Season']) & (dfW['WTeamID'] == row['WTeamID']) 
    fld1 = 'WScore'
    lcond = (dfL['Season'] == row['Season']) & (dfL['LTeamID'] == row['WTeamID']) 
    fld2 = 'LScore'
    retVal = dfW[wcond][fld1].sum()
    if len(dfL[lcond][fld2]) > 0:
        retVal = retVal + dfL[lcond][fld2].sum() 
    return retVal

def get_remaining_stats(row, field):
    wcond = (dfW['Season'] == row['Season']) & (dfW['WTeamID'] == row['WTeamID']) 
    fld1 = 'W' + field
    lcond = (dfL['Season'] == row['Season']) & (dfL['LTeamID'] == row['WTeamID']) 
    fld2 = 'L'+ field
    retVal = dfW[wcond][fld1].sum()
    if len(dfL[lcond][fld2]) > 0:
        retVal = retVal + dfL[lcond][fld2].sum()
    return retVal

def delta_stat(row, field):
    cond1 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team1'])
    cond2 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team2'])
    return (record[cond1][field]/record[cond1]['games']).mean() - (record[cond2][field]/record[cond2]['games']).mean()
  

In [11]:
training_set['deltaSeed'] = training_set.apply(delta_seed,axis=1)
training_set.head()

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed
0,1,2003,1421,1411,0
1,0,2003,1436,1112,15
2,0,2003,1272,1113,-3
3,1,2003,1141,1166,5
4,0,2003,1301,1143,1


In [12]:
record = pd.DataFrame({'wins': season_dresults.groupby(['Season','WTeamID']).size()}).reset_index();
losses = pd.DataFrame({'losses': season_dresults.groupby(['Season','LTeamID']).size()}).reset_index();

record = record.merge(losses, how='outer', left_on=['Season','WTeamID'], right_on=['Season','LTeamID'])
record = record.fillna(0)
record['games'] = record['wins'] + record['losses']

In [13]:
# create dataframes of both winners and losers
dfW = season_dresults.groupby(['Season','WTeamID']).sum().reset_index()
dfL = season_dresults.groupby(['Season','LTeamID']).sum().reset_index()

In [14]:
# add points for and points against data
record['PointsFor'] = record.apply(get_points_for, axis=1)
record['PointsAgainst'] = record.apply(get_points_against, axis=1)

In [15]:
# This cell takes ~3 min. To slides
cols = ['FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR','Ast','TO','Stl','Blk','PF']

for col in cols:
    print("Processing",col)
    record[col] = record.apply(get_remaining_stats, args=(col,), axis=1)

Processing FGM
Processing FGA
Processing FGM3
Processing FGA3
Processing FTM
Processing FTA
Processing OR
Processing DR
Processing Ast
Processing TO
Processing Stl
Processing Blk
Processing PF


You can calculate 

In [16]:
record['FGprct'] = record['FGM'] / record['FGA']  
record.head()

Unnamed: 0,Season,WTeamID,wins,LTeamID,losses,games,PointsFor,PointsAgainst,FGM,FGA,...,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGprct
0,2003,1102.0,12.0,1102.0,16.0,28.0,1603,1596,536,1114,...,312,479,117,471,364,320,167,50,525,0.481149
1,2003,1103.0,13.0,1103.0,14.0,27.0,2127,2110,733,1508,...,514,698,264,538,411,341,196,63,536,0.486074
2,2003,1104.0,17.0,1104.0,11.0,28.0,1940,1820,673,1601,...,416,586,380,670,339,372,185,106,505,0.420362
3,2003,1105.0,7.0,1105.0,19.0,26.0,1866,1993,634,1602,...,401,568,351,601,378,485,242,54,526,0.395755
4,2003,1106.0,13.0,1106.0,15.0,28.0,1781,1785,656,1548,...,298,461,344,668,327,477,234,88,509,0.423773


In [17]:
# This will take ~ 3 min. To slides
cols = ['PointsFor','PointsAgainst','FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR','Ast','TO','Stl','Blk','PF']

for col in cols:
    print("Processing",col)
    training_set['delta' + col] = training_set.apply(delta_stat,args=(col,),axis=1)

Processing PointsFor
Processing PointsAgainst
Processing FGM
Processing FGA
Processing FGM3
Processing FGA3
Processing FTM
Processing FTA
Processing OR
Processing DR
Processing Ast
Processing TO
Processing Stl
Processing Blk
Processing PF


In [18]:
training_set['deltaWinPct'] = training_set.apply(delta_winPct,axis=1)
training_set.head()

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed,deltaPointsFor,deltaPointsAgainst,deltaFGM,deltaFGA,deltaFGM3,...,deltaFTM,deltaFTA,deltaOR,deltaDR,deltaAst,deltaTO,deltaStl,deltaBlk,deltaPF,deltaWinPct
0,1,2003,1421,1411,0,-1.593103,7.614943,-0.354023,1.526437,0.549425,...,-1.434483,-7.135632,-0.890805,-1.627586,-1.165517,0.973563,0.635632,0.766667,0.803448,-0.151724
1,0,2003,1436,1112,15,-17.421182,-7.112069,-5.493842,-9.852217,-1.759852,...,-4.673645,-5.448276,-2.213054,-1.918719,-3.435961,-0.716749,-1.602217,-1.248768,-1.853448,-0.237685
2,0,2003,1272,1113,-3,-1.448276,-3.344828,-0.931034,3.103448,3.0,...,-2.586207,-3.310345,0.37931,2.655172,1.068966,-0.206897,2.172414,0.827586,-0.655172,0.172414
3,1,2003,1141,1166,5,0.102403,8.908046,-2.07628,-4.76489,-1.142111,...,5.397074,5.142111,-0.292581,0.094044,-1.197492,4.877743,-1.290491,-0.454545,3.69279,-0.085684
4,0,2003,1301,1143,1,-2.082759,-1.758621,-3.011494,-5.390805,1.552874,...,2.387356,0.949425,-1.508046,-2.345977,-1.333333,0.027586,1.214943,0.273563,1.563218,-0.124138


In [19]:
training_set.describe()

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed,deltaPointsFor,deltaPointsAgainst,deltaFGM,deltaFGA,deltaFGM3,...,deltaFTM,deltaFTA,deltaOR,deltaDR,deltaAst,deltaTO,deltaStl,deltaBlk,deltaPF,deltaWinPct
count,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,...,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0
mean,0.478924,2011.096861,1299.471749,1290.362332,0.391928,-0.020348,0.217385,-0.020923,0.052145,0.062951,...,-0.041453,-0.024109,0.05826,0.004984,0.065204,0.103474,-0.006053,-0.005006,-0.03512,-0.006201
std,0.49978,4.896921,103.849238,104.400295,7.454213,7.293794,6.404939,2.914838,4.878651,1.8039,...,2.70822,3.659529,2.561961,2.564043,2.59737,2.067775,1.851985,1.856093,2.53309,0.144362
min,0.0,2003.0,1102.0,1101.0,-15.0,-21.173797,-19.41129,-8.379679,-13.846774,-6.007315,...,-8.059629,-10.827586,-7.692235,-8.543672,-7.526565,-7.666667,-6.089015,-6.242424,-7.896552,-0.633333
25%,0.0,2007.0,1211.0,1211.0,-5.0,-5.067853,-4.139706,-2.027629,-3.196496,-1.14411,...,-1.879277,-2.408617,-1.645833,-1.641309,-1.753501,-1.294199,-1.247963,-1.227718,-1.824778,-0.099615
50%,0.0,2011.0,1295.0,1277.0,0.0,-0.265597,0.179221,-0.057576,-0.172727,0.113725,...,-0.023897,-0.027233,0.022727,0.034091,0.121212,0.111742,-0.006466,0.029412,-0.095833,-0.002653
75%,1.0,2015.0,1396.5,1390.0,7.0,4.704014,4.514131,1.948468,3.337049,1.24269,...,1.743304,2.337676,1.75,1.683333,1.768951,1.399413,1.170582,1.137327,1.73467,0.08338
max,1.0,2019.0,1463.0,1462.0,15.0,22.892857,25.035714,9.357143,22.071429,6.0625,...,7.731034,10.709677,9.214286,9.285714,9.220143,7.580645,6.283681,6.752688,7.230303,0.491657


In [20]:
training_set.to_csv("training_set.csv", index=False)
record.to_csv("record.csv", index=False)