# Part 1 - Stage 1 Data Manipulation

The purpose of this notebook is to analyze the stage 1 data and convert it into a csv file that can be loaded in model expoloration.

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pkg_resources

from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
cwd = os.getcwd()

## Load Data

In [3]:
tourney_cresults = pd.read_csv(cwd + '/data_stage2/MNCAATourneyCompactResults.csv')
seeds = pd.read_csv(cwd + '/data_stage2/MNCAATourneySeeds.csv')
seeds['Seed'] =  pd.to_numeric(seeds['Seed'].str[1:3], downcast='integer',errors='coerce')
season_dresults = pd.read_csv(cwd +'/data_stage2/MRegularSeasonDetailedResults.csv')

In [4]:
tourney_cresults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
2379,2022,146,1242,76,1274,50,N,0
2380,2022,146,1314,69,1389,49,N,0
2381,2022,152,1242,81,1437,65,N,0
2382,2022,152,1314,81,1181,77,N,0
2383,2022,154,1242,72,1314,69,N,0


In [5]:
seeds.tail()

Unnamed: 0,Season,Seed,TeamID
2485,2023,12,1433
2486,2023,13,1233
2487,2023,14,1213
2488,2023,15,1421
2489,2023,16,1224


In [6]:
season_dresults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


## Analysis

Only analyze tournament games post 2003. There was a change to the tournament format in this year, so data before this is not as useful.

In [7]:
targetYear = 2003
tourney_cresults = tourney_cresults.loc[tourney_cresults['Season'] >= targetYear]

Initialize a dataframe for the training set

In [8]:
training_set = pd.DataFrame()

Add base features to training set

In [9]:
training_set['Result'] = np.random.randint(0,2,len(tourney_cresults.index))
training_set['Season'] = tourney_cresults['Season'].values
training_set['Team1'] = training_set['Result'].values * tourney_cresults['WTeamID'].values + (1-training_set['Result'].values) * tourney_cresults['LTeamID'].values 
training_set['Team2'] = (1-training_set['Result'].values) * tourney_cresults['WTeamID'].values + training_set['Result'].values * tourney_cresults['LTeamID'].values
training_set.head()

Unnamed: 0,Result,Season,Team1,Team2
0,1,2003,1421,1411
1,1,2003,1112,1436
2,1,2003,1113,1272
3,1,2003,1141,1166
4,1,2003,1143,1301


Define helper functions for populating the rest of the data in the training set.

In [10]:
def delta_seed(row):
    cond = (seeds['Season'] == row['Season'])
    return seeds[cond & (seeds['TeamID'] == row['Team1'])]['Seed'].iloc[0] - seeds[cond & (seeds['TeamID'] == row['Team2'])]['Seed'].iloc[0]

def delta_winPct(row):
    cond1 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team1'])
    cond2 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team2'])
    return (record[cond1]['wins']/record[cond1]['games']).mean() - (record[cond2]['wins']/record[cond2]['games']).mean()

def get_points_against(row):
    wcond = (dfW['Season'] == row['Season']) & (dfW['WTeamID'] == row['WTeamID']) 
    fld1 = 'LScore'
    lcond = (dfL['Season'] == row['Season']) & (dfL['LTeamID'] == row['WTeamID']) 
    fld2 = 'WScore'
    retVal = dfW[wcond][fld1].sum()
    if len(dfL[lcond][fld2]) > 0:
        retVal = retVal + dfL[lcond][fld2].sum() 
    return retVal

def get_points_for(row):
    wcond = (dfW['Season'] == row['Season']) & (dfW['WTeamID'] == row['WTeamID']) 
    fld1 = 'WScore'
    lcond = (dfL['Season'] == row['Season']) & (dfL['LTeamID'] == row['WTeamID']) 
    fld2 = 'LScore'
    retVal = dfW[wcond][fld1].sum()
    if len(dfL[lcond][fld2]) > 0:
        retVal = retVal + dfL[lcond][fld2].sum() 
    return retVal

def get_remaining_stats(row, field):
    wcond = (dfW['Season'] == row['Season']) & (dfW['WTeamID'] == row['WTeamID']) 
    fld1 = 'W' + field
    lcond = (dfL['Season'] == row['Season']) & (dfL['LTeamID'] == row['WTeamID']) 
    fld2 = 'L'+ field
    retVal = dfW[wcond][fld1].sum()
    if len(dfL[lcond][fld2]) > 0:
        retVal = retVal + dfL[lcond][fld2].sum()
    return retVal

def delta_stat(row, field):
    cond1 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team1'])
    cond2 = (record['Season'] == row['Season']) & (record['WTeamID'] == row['Team2'])
    return (record[cond1][field]/record[cond1]['games']).mean() - (record[cond2][field]/record[cond2]['games']).mean()
  

In [11]:
training_set['deltaSeed'] = training_set.apply(delta_seed,axis=1)
training_set.head()

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed
0,1,2003,1421,1411,0
1,1,2003,1112,1436,-15
2,1,2003,1113,1272,3
3,1,2003,1141,1166,5
4,1,2003,1143,1301,-1


In [12]:
record = pd.DataFrame({'wins': season_dresults.groupby(['Season','WTeamID']).size()}).reset_index();
losses = pd.DataFrame({'losses': season_dresults.groupby(['Season','LTeamID']).size()}).reset_index();

record = record.merge(losses, how='outer', left_on=['Season','WTeamID'], right_on=['Season','LTeamID'])
record = record.fillna(0)
record['games'] = record['wins'] + record['losses']

In [13]:
# create dataframes of both winners and losers
dfW = season_dresults.groupby(['Season','WTeamID']).sum().reset_index()
dfL = season_dresults.groupby(['Season','LTeamID']).sum().reset_index()

  dfW = season_dresults.groupby(['Season','WTeamID']).sum().reset_index()
  dfL = season_dresults.groupby(['Season','LTeamID']).sum().reset_index()


In [14]:
# add points for and points against data
record['PointsFor'] = record.apply(get_points_for, axis=1)
record['PointsAgainst'] = record.apply(get_points_against, axis=1)

In [15]:
# This cell takes ~3 min. To slides
cols = ['FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR','Ast','TO','Stl','Blk','PF']

for col in cols:
    print("Processing",col)
    record[col] = record.apply(get_remaining_stats, args=(col,), axis=1)

Processing FGM
Processing FGA
Processing FGM3
Processing FGA3
Processing FTM
Processing FTA
Processing OR
Processing DR
Processing Ast
Processing TO
Processing Stl
Processing Blk
Processing PF


You can calculate 

In [16]:
record['FGprct'] = record['FGM'] / record['FGA']  
record.tail()

Unnamed: 0,Season,WTeamID,wins,LTeamID,losses,games,PointsFor,PointsAgainst,FGM,FGA,...,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGprct
7250,2015,0.0,0.0,1212.0,27.0,27.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
7251,2015,0.0,0.0,1363.0,28.0,28.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
7252,2021,0.0,0.0,1152.0,9.0,9.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
7253,2022,0.0,0.0,1175.0,25.0,25.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
7254,2022,0.0,0.0,1249.0,27.0,27.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [32]:
# This will take ~ 3 min. To slides
cols = ['PointsFor','PointsAgainst','FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR','Ast','TO','Stl','Blk','PF']

for col in cols:
    print("Processing",col)
    training_set['delta' + col] = training_set.apply(delta_stat,args=(col,),axis=1)

Processing PointsFor
Processing PointsAgainst
Processing FGM
Processing FGA
Processing FGM3
Processing FGA3
Processing FTM
Processing FTA
Processing OR
Processing DR
Processing Ast
Processing TO
Processing Stl
Processing Blk
Processing PF


In [33]:
training_set['deltaWinPct'] = training_set.apply(delta_winPct,axis=1)
training_set.head()

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed,deltaPointsFor,deltaPointsAgainst,deltaFGM,deltaFGA,deltaFGM3,...,deltaFTM,deltaFTA,deltaOR,deltaDR,deltaAst,deltaTO,deltaStl,deltaBlk,deltaPF,deltaWinPct
0,0,2003,1411,1421,0,1.593103,-7.614943,0.354023,-1.526437,-0.549425,...,1.434483,7.135632,0.890805,1.627586,1.165517,-0.973563,-0.635632,-0.766667,-0.803448,0.151724
1,1,2003,1112,1436,-15,17.421182,7.112069,5.493842,9.852217,1.759852,...,4.673645,5.448276,2.213054,1.918719,3.435961,0.716749,1.602217,1.248768,1.853448,0.237685
2,1,2003,1113,1272,3,1.448276,3.344828,0.931034,-3.103448,-3.0,...,2.586207,3.310345,-0.37931,-2.655172,-1.068966,0.206897,-2.172414,-0.827586,0.655172,-0.172414
3,1,2003,1141,1166,5,0.102403,8.908046,-2.07628,-4.76489,-1.142111,...,5.397074,5.142111,-0.292581,0.094044,-1.197492,4.877743,-1.290491,-0.454545,3.69279,-0.085684
4,0,2003,1301,1143,1,-2.082759,-1.758621,-3.011494,-5.390805,1.552874,...,2.387356,0.949425,-1.508046,-2.345977,-1.333333,0.027586,1.214943,0.273563,1.563218,-0.124138


In [17]:
training_set.describe()

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed
count,1248.0,1248.0,1248.0,1248.0,1248.0
mean,0.494391,2012.205929,1291.951923,1295.580128,-0.080128
std,0.500169,5.636409,102.475647,106.163202,7.500654
min,0.0,2003.0,1103.0,1101.0,-15.0
25%,0.0,2007.0,1211.0,1211.0,-6.0
50%,0.0,2012.0,1281.0,1285.5,0.0
75%,1.0,2017.0,1388.0,1396.25,5.0
max,1.0,2022.0,1463.0,1463.0,15.0


In [18]:
training_set.to_csv("training_set.csv", index=False)
record.to_csv("record.csv", index=False)

In [39]:
training_set[training_set['Season']==2019]

Unnamed: 0,Result,Season,Team1,Team2,deltaSeed,deltaPointsFor,deltaPointsAgainst,deltaFGM,deltaFGA,deltaFGM3,...,deltaFTM,deltaFTA,deltaOR,deltaDR,deltaAst,deltaTO,deltaStl,deltaBlk,deltaPF,deltaWinPct
1048,0,2019,1396,1125,0,-12.022917,-3.510417,-5.260417,-3.514583,-2.866667,...,1.364583,1.975000,0.818750,-5.104167,-5.131250,-0.443750,1.989583,-1.550000,1.768750,-0.114583
1049,1,2019,1192,1341,0,-0.106549,-0.243402,0.678397,-3.380254,1.624633,...,-3.087977,-6.330401,-1.986315,1.127077,1.632454,0.845552,-1.302053,2.039101,-4.955034,-0.055718
1050,1,2019,1113,1385,0,0.297348,-1.725379,-1.345644,-2.076705,-1.265152,...,4.253788,7.815341,4.519886,2.903409,-0.436553,3.230114,-2.599432,-0.114583,0.845644,0.051136
1051,1,2019,1295,1300,0,4.590323,5.875269,0.313978,-0.223656,3.443011,...,0.519355,-0.784946,-4.952688,-0.739785,-3.110753,-4.854839,-0.524731,-0.082796,-2.904301,0.016129
1052,1,2019,1120,1308,-7,1.226103,3.775735,-0.314338,1.159926,2.036765,...,-0.181985,-1.452206,-0.889706,-3.183824,-0.088235,0.115809,3.762868,2.295956,0.288603,-0.139706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0,2019,1246,1120,-3,-2.155080,-3.163993,-0.423351,-4.835116,-5.381462,...,4.073084,4.901070,-0.129234,4.877005,-0.563280,0.489305,-3.263815,0.295900,-2.079323,0.082888
1111,1,2019,1277,1181,1,-4.676471,-2.029412,-2.735294,-6.735294,1.029412,...,-0.235294,-2.088235,-2.558824,1.705882,3.029412,-0.264706,-4.235294,-1.352941,1.058824,-0.029412
1112,0,2019,1277,1403,-1,5.729779,6.248162,1.608456,1.707721,1.044118,...,1.468750,1.406250,1.979779,4.650735,4.909926,0.477941,-2.139706,0.564338,-0.869485,0.011029
1113,0,2019,1120,1438,4,7.038603,13.525735,1.404412,6.847426,2.943015,...,1.286765,2.485294,2.860294,-3.808824,-0.088235,3.115809,3.669118,0.952206,3.726103,-0.170956
