# NCAA ML Competition - Women's 2018

## Notes

Notes about what's in the data files: https://www.kaggle.com/c/womens-machine-learning-competition-2018/data

Starter Kernel might help: https://www.kaggle.com/juliaelliott/basic-starter-kernel-ncaa-women-s-dataset

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.grid_search module was removed in 0.20. Prefer the current location and
# fall back to the legacy module only on very old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report



## Load Data

In [4]:
# Folder holding the competition CSV files -- update this as needed!
data_dir = './WDataFiles/'


def _read_table(csv_name):
    """Read one competition CSV located under ``data_dir`` into a DataFrame."""
    return pd.read_csv(data_dir + csv_name)


# The analysis currently focuses on tournament seeds / results; the regular
# season and game-city tables are loaded as well so they are at hand.
df_seeds = _read_table('WNCAATourneySeeds.csv')
df_tour = _read_table('WNCAATourneyCompactResults.csv')
df_reg = _read_table('WRegularSeasonCompactResults.csv')
df_cities = _read_table('WGameCities.csv')

In [8]:
# Attach the host-city record to each tournament game by matching on season,
# day number and both team IDs.
# City info is only available for 2015 on, and this is pandas' default (inner)
# join, so earlier tournaments are absent from the result -- maybe not as helpful?
game_keys = ['Season', 'WTeamID', 'LTeamID', 'DayNum']
df_combo = df_tour.merge(df_cities, on=game_keys)
df_combo.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,CRType,CityID
179,2017,145,3199,66,3333,53,N,0,NCAA,4336
180,2017,145,3332,77,3268,63,N,0,NCAA,4044
181,2017,145,3376,100,3346,58,N,0,NCAA,4336
182,2017,146,3280,94,3124,85,N,1,NCAA,4254
183,2017,146,3390,76,3323,75,N,0,NCAA,4193
184,2017,147,3163,90,3332,52,H,0,NCAA,4044
185,2017,147,3376,71,3199,64,N,0,NCAA,4336
186,2017,151,3280,66,3163,64,N,1,NCAA,4088
187,2017,151,3376,62,3390,53,N,0,NCAA,4088
188,2017,153,3376,67,3280,55,N,0,NCAA,4088


In [39]:
# Peek at the most recent seed rows.
# A Seed value is a region letter followed by the seed within that region (01-16).
df_seeds.tail(n=10)

Unnamed: 0,Season,Seed,TeamID
1270,2017,Z07,3166
1271,2017,Z08,3113
1272,2017,Z09,3277
1273,2017,Z10,3405
1274,2017,Z11,3378
1275,2017,Z12,3346
1276,2017,Z13,3195
1277,2017,Z14,3442
1278,2017,Z15,3253
1279,2017,Z16,3421


In [40]:
# Peek at the most recent tournament results. Each row holds:
#   - WTeamID / LTeamID: winning and losing team IDs (must check against list of teams)
#   - WScore / LScore:   winning and losing team scores
#   - WLoc:              home (H), away (A), neutral (N)
df_tour.tail(n=10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
1250,2017,145,3199,66,3333,53,N,0
1251,2017,145,3332,77,3268,63,N,0
1252,2017,145,3376,100,3346,58,N,0
1253,2017,146,3280,94,3124,85,N,1
1254,2017,146,3390,76,3323,75,N,0
1255,2017,147,3163,90,3332,52,H,0
1256,2017,147,3376,71,3199,64,N,0
1257,2017,151,3280,66,3163,64,N,1
1258,2017,151,3376,62,3390,53,N,0
1259,2017,153,3376,67,3280,55,N,0


## Merge

In [41]:
# Build two views of the seed table -- one keyed for the winning team, one for
# the losing team -- so each can be joined onto the tournament results.
df_W = df_seeds.rename(columns={'Seed': 'WSeed', 'TeamID': 'WTeamID'})
df_L = df_seeds.rename(columns={'Seed': 'LSeed', 'TeamID': 'LTeamID'})

# Winner seeds are attached with a left join; loser seeds with the default
# (inner) join.
# NOTE(review): the two joins use different `how` values -- confirm whether
# that asymmetry is intended.
df_dummy = df_tour.merge(df_W, how='left', on=['Season', 'WTeamID'])
df_concat = df_dummy.merge(df_L, on=['Season', 'LTeamID'])

# at the beginning of the tourney, teams play within their region
# final 3 games = between regions
df_concat.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed
1250,2017,145,3199,66,3333,53,N,0,Z03,Z02
1251,2017,145,3332,77,3268,63,N,0,W10,W03
1252,2017,145,3376,100,3346,58,N,0,Z01,Z12
1253,2017,146,3280,94,3124,85,N,1,X02,X01
1254,2017,146,3390,76,3323,75,N,0,Y02,Y01
1255,2017,147,3163,90,3332,52,H,0,W01,W10
1256,2017,147,3376,71,3199,64,N,0,Z01,Z03
1257,2017,151,3280,66,3163,64,N,1,X02,W01
1258,2017,151,3376,62,3390,53,N,0,Z01,Y02
1259,2017,153,3376,67,3280,55,N,0,Z01,X02


In [42]:
# calculate seed differential: positive = higher seed won, negative = lower seed won
# ("higher seed" = the better-ranked team, i.e. the smaller seed number).
#
# Characters 1-2 of a seed string such as 'Z07' are the numeric seed. They are
# extracted with vectorized string slicing instead of a row-wise apply, which
# is faster and also works on an empty frame. astype('int64') raises if any
# seed is missing, so bad joins fail loudly rather than yielding NaN diffs.
winner_seed_num = df_concat['WSeed'].str[1:3].astype('int64')
loser_seed_num = df_concat['LSeed'].str[1:3].astype('int64')
df_concat['SeedDiff'] = loser_seed_num - winner_seed_num

df_concat.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,SeedDiff
1250,2017,145,3199,66,3333,53,N,0,Z03,Z02,-1
1251,2017,145,3332,77,3268,63,N,0,W10,W03,-7
1252,2017,145,3376,100,3346,58,N,0,Z01,Z12,11
1253,2017,146,3280,94,3124,85,N,1,X02,X01,-1
1254,2017,146,3390,76,3323,75,N,0,Y02,Y01,-1
1255,2017,147,3163,90,3332,52,H,0,W01,W10,9
1256,2017,147,3376,71,3199,64,N,0,Z01,Z03,2
1257,2017,151,3280,66,3163,64,N,1,X02,W01,-1
1258,2017,151,3376,62,3390,53,N,0,Z01,Y02,1
1259,2017,153,3376,67,3280,55,N,0,Z01,X02,1


In [77]:
# Baseline: just use seed information? 
# Baseline: 50/50 chance? (even easier)

AttributeError: 'list' object has no attribute 'permutations'

In [45]:
# Stage 1: submission file
# Make predictions for all possible matchups from the past four NCAA® tournaments 2014-2017
# (64*63/2 = 2,016 predictions per year)

# Keep only seasons after 2013, renumbering rows from 0.
# The result is bound to recent_tourney_teams ONLY: df_seeds is deliberately
# left untouched so that cells reading the full seed table (e.g. the seed
# merge) still see every season when they are re-run.
recent_tourney_teams = df_seeds[df_seeds['Season'] > 2013].reset_index(drop=True)
recent_tourney_teams.head(10)

# how to match these up pairwise each year??

# ID - string SSSS_XXXX_YYYY, SSSS = 4-digit season yr, 
# XXXX  = 4-digit TeamID of lower-ID team, YYYY = 4-digit TeamID of higher-ID team.
# Pred - predicted winning percentage for the first team in the ID field

Unnamed: 0,Season,Seed,TeamID
0,2014,W01,3163
1,2014,W02,3181
2,2014,W03,3401
3,2014,W04,3304
4,2014,W05,3301
5,2014,W06,3211
6,2014,W07,3177
7,2014,W08,3208
8,2014,W09,3386
9,2014,W10,3328
