# NCAA ML Competition - Women's 2018

## Notes

Notes about what's in the data files: https://www.kaggle.com/c/womens-machine-learning-competition-2018/data

Starter Kernel might help: https://www.kaggle.com/juliaelliott/basic-starter-kernel-ncaa-women-s-dataset

In [17]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Load Data

In [76]:
# update this as needed!

# right now, only looking at tournament seed / performance, not regular season games
data_dir = './WStage2DataFiles/'
df_seeds = pd.read_csv(data_dir + 'WNCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'WNCAATourneyCompactResults.csv')
df_tour_long = pd.read_csv(data_dir + 'WNCAATourneyDetailedResults.csv')
df_teams = pd.read_csv(data_dir + 'WTeams.csv')
#df_tour = pd.read_csv(data_dir + 'WNCAATourneyCompactResults.csv')

In [37]:
# seed = region + seed within region (1-16)
df_seeds.head(10)

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272
5,1998,W06,3438
6,1998,W07,3208
7,1998,W08,3307
8,1998,W09,3304
9,1998,W10,3203


In [20]:
# contains winning and losing team IDs (must check against list of teams)
# winning and losing team scores
# WLoc = home (H), away (A), neutral (N)
df_tour.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
1250,2017,145,3199,66,3333,53,N,0
1251,2017,145,3332,77,3268,63,N,0
1252,2017,145,3376,100,3346,58,N,0
1253,2017,146,3280,94,3124,85,N,1
1254,2017,146,3390,76,3323,75,N,0
1255,2017,147,3163,90,3332,52,H,0
1256,2017,147,3376,71,3199,64,N,0
1257,2017,151,3280,66,3163,64,N,1
1258,2017,151,3376,62,3390,53,N,0
1259,2017,153,3376,67,3280,55,N,0


## Merge

In [21]:
# merge seeds with team IDs for tourney performance
df_W = df_seeds.rename(columns={'TeamID':'WTeamID', 'Seed':'WSeed'})
df_L = df_seeds.rename(columns={'TeamID':'LTeamID', 'Seed':'LSeed'})
df_dummy = pd.merge(left=df_tour, right=df_W, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_L, on=['Season', 'LTeamID'])

# at the beginning of the tourney, teams play within their region
# final 3 games = between regions
df_concat.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed
1250,2017,145,3199,66,3333,53,N,0,Z03,Z02
1251,2017,145,3332,77,3268,63,N,0,W10,W03
1252,2017,145,3376,100,3346,58,N,0,Z01,Z12
1253,2017,146,3280,94,3124,85,N,1,X02,X01
1254,2017,146,3390,76,3323,75,N,0,Y02,Y01
1255,2017,147,3163,90,3332,52,H,0,W01,W10
1256,2017,147,3376,71,3199,64,N,0,Z01,Z03
1257,2017,151,3280,66,3163,64,N,1,X02,W01
1258,2017,151,3376,62,3390,53,N,0,Z01,Y02
1259,2017,153,3376,67,3280,55,N,0,Z01,X02


In [22]:
# calculate seed differential: positive = higher seed won, negative = lower seed won
df_concat['SeedDiff'] = df_concat.apply(lambda row: int(row['LSeed'][1:3]) - int(row['WSeed'][1:3]), axis=1)

df_concat.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,SeedDiff
1250,2017,145,3199,66,3333,53,N,0,Z03,Z02,-1
1251,2017,145,3332,77,3268,63,N,0,W10,W03,-7
1252,2017,145,3376,100,3346,58,N,0,Z01,Z12,11
1253,2017,146,3280,94,3124,85,N,1,X02,X01,-1
1254,2017,146,3390,76,3323,75,N,0,Y02,Y01,-1
1255,2017,147,3163,90,3332,52,H,0,W01,W10,9
1256,2017,147,3376,71,3199,64,N,0,Z01,Z03,2
1257,2017,151,3280,66,3163,64,N,1,X02,W01,-1
1258,2017,151,3376,62,3390,53,N,0,Z01,Y02,1
1259,2017,153,3376,67,3280,55,N,0,Z01,X02,1


## Previous Appearances

Indicate a team's previous experience in tournaments. The number of games played will be a proxy for this (more games played = more appearances). (Technically a team that wins one year is the same as a team that's only made it to the first round for several years in a row, but we're looking for longer-term trends.)

In [82]:
df_teams_w = df_teams.rename(columns={'TeamID':'WTeamID','TeamName':'WTeamName'})
df_teams_l = df_teams.rename(columns={'TeamID':'LTeamID','TeamName':'LTeamName'})

df_tour = pd.merge(left=df_concat, right=df_teams_w, how='left', on=['WTeamID'])
df_tour_hist = pd.merge(left=df_tour, right=df_teams_l, how='left', on=['LTeamID'])
df_tour_hist.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,SeedDiff,WTeamName,LTeamName
0,1998,137,3104,94,3422,46,H,0,X02,X15,13,Alabama,UNC Greensboro
1,1998,137,3112,75,3365,63,H,0,W03,W14,11,Arizona,Santa Clara
2,1998,137,3163,93,3193,52,H,0,W02,W15,13,Connecticut,Fairfield
3,1998,137,3198,59,3266,45,H,0,Y07,Y10,3,Florida Intl,Marquette
4,1998,137,3203,74,3208,72,A,0,W10,W07,-3,G Washington,Georgia


In [93]:
#

winning_games_w = df_tour_hist.groupby('WTeamID').count().reset_index()[['WTeamID','Season']].rename(columns={'Season':'WWins'})
winning_games_l = df_tour_hist.groupby('WTeamID').count().reset_index()[['WTeamID','Season']].rename(columns={'Season':'LWins','WTeamID':'LTeamID'})

losing_games_w = df_concat.groupby('LTeamID').count().reset_index()[['LTeamID','Season']].rename(columns={'Season':'WLosses','LTeamID':'WTeamID'})
losing_games_l = df_concat.groupby('LTeamID').count().reset_index()[['LTeamID','Season']].rename(columns={'Season':'LLosses'})

# get total appearances
total_apps = pd.merge(left=df_tour_hist, right=winning_games_w, how='left', on=['WTeamID']).fillna(0)
total_apps = pd.merge(left=total_apps, right=winning_games_l, how='left', on=['LTeamID']).fillna(0)
total_apps = pd.merge(left=total_apps, right=losing_games_w, how='left', on=['WTeamID']).fillna(0)
total_apps = pd.merge(left=total_apps, right=losing_games_l, how='left', on=['LTeamID']).fillna(0)


total_apps['WTotApps'] = total_apps['WWins'] + total_apps['WLosses']
total_apps['WPctWins'] = total_apps['WWins'] / total_apps['WTotApps'] 

total_apps['LTotApps'] = total_apps['LWins'] + total_apps['LLosses']
total_apps['LPctWins'] = total_apps['LWins'] / total_apps['LTotApps'] 
total_apps.head()


# per team win

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,...,WTeamName,LTeamName,WWins,LWins,WLosses,LLosses,WTotApps,WPctWins,LTotApps,LPctWins
0,1998,137,3104,94,3422,46,H,0,X02,X15,...,Alabama,UNC Greensboro,3,0.0,2,1,5,0.6,1.0,0.0
1,1998,137,3112,75,3365,63,H,0,W03,W14,...,Arizona,Santa Clara,5,0.0,6,4,11,0.454545,4.0,0.0
2,1998,137,3163,93,3193,52,H,0,W02,W15,...,Connecticut,Fairfield,93,0.0,10,2,103,0.902913,2.0,0.0
3,1998,137,3198,59,3266,45,H,0,Y07,Y10,...,Florida Intl,Marquette,2,3.0,3,7,5,0.4,10.0,0.3
4,1998,137,3203,74,3208,72,A,0,W10,W07,...,G Washington,Georgia,9,27.0,11,18,20,0.45,45.0,0.6


In [137]:
df_wins = pd.DataFrame()
df_wins[['SeedDiff','WProb']] = total_apps[['SeedDiff','WPctWins']]
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['SeedDiff'] = -total_apps['SeedDiff']
df_losses['WProb'] = total_apps['LPctWins']
df_losses['Result'] = 0

df_predictions = pd.concat((df_wins, df_losses))
df_predictions.tail()

Unnamed: 0,SeedDiff,WProb,Result
1255,-9,0.454545,0
1256,-2,0.617647,0
1257,1,0.902913,0
1258,-1,0.722222,0
1259,-1,0.608696,0


In [138]:
## Create training and test data sets; may create a dev dataset later
num_test = len(df_predictions)

# TODO: Add dev data!
mask = np.random.rand(len(df_predictions)) < 0.7 ## 80% data for training and 20% for testing
train_data = df_predictions[mask]
test_data = df_predictions[~mask]
mask = np.random.rand(len(test_data)) < 0.5
dev_data = test_data[mask]
test_data = test_data[~mask]


In [139]:
## Convert to arrays
#X_train = train_data.SeedDiff.values.reshape(-1,1)
#y_train = train_data.Result.values
X_train = train_data.drop(['Result'],axis=1).as_matrix()
y_train = train_data['Result'].values

X_dev = dev_data.drop(['Result'],axis=1).as_matrix()
y_dev = dev_data['Result'].values
X_test = test_data.SeedDiff.values.reshape(-1,1)
y_test = test_data.Result.values
print X_train[:10]

[[ 13.           0.6       ]
 [ 11.           0.45454545]
 [ 13.           0.90291262]
 [ -3.           0.45      ]
 [  9.           0.38095238]
 [ -1.           0.41176471]
 [ -1.           0.72857143]
 [ 15.           0.5       ]
 [ 15.           0.61538462]
 [  3.           0.54545455]]


In [140]:
## Fit a logistic regression model through grid search
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='f1', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

clf.score(X_dev,y_dev)
#clf.score(X_train,y_train)

Best log_loss: 0.7855, with best C: 1.0


0.75000000000000011

In [114]:
## Plot to confirm intuition that greater positive seed difference indicates higher win probability
X = np.arange(-10, 10).reshape(-1, 1)
preds = clf.predict_proba(X)[:,1]

plt.plot(X, preds)
plt.xlabel('Team1 seed - Team2 seed')
plt.ylabel('P(Team1 will win)')

[[-10]
 [ -9]
 [ -8]
 [ -7]
 [ -6]
 [ -5]
 [ -4]
 [ -3]
 [ -2]
 [ -1]
 [  0]
 [  1]
 [  2]
 [  3]
 [  4]
 [  5]
 [  6]
 [  7]
 [  8]
 [  9]]


ValueError: X has 1 features per sample; expecting 2

In [115]:
df_sample_sub = pd.read_csv('WSampleSubmissionStage1.csv')
n_test_games = len(df_sample_sub)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in ID.split('_'))


In [116]:
def seed_to_int(seed):
    #Get just the digits from the seeding. Return as int
    s_int = int(seed[1:3])
    return s_int
df_seeds['seed_int'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(labels=['Seed'], inplace=True, axis=1) # This is the string label


In [117]:
## Unpack the ID field for team IDs
## Look up seeds for the team
## Calculate seed difference

X_test = np.zeros(shape=(n_test_games, 1))
for ii, row in df_sample_sub.iterrows():
    year, t1, t2 = get_year_t1_t2(row.ID)
    t1_seed = df_seeds[(df_seeds.TeamID == t1) & (df_seeds.Season == year)].seed_int.values[0]
    t2_seed = df_seeds[(df_seeds.TeamID == t2) & (df_seeds.Season == year)].seed_int.values[0]
    diff_seed = t1_seed - t2_seed
    X_test[ii, 0] = diff_seed

In [36]:
## Make predictions

preds = clf.predict_proba(X_test)[:,1]

clipped_preds = np.clip(preds, 0.05, 0.95)
df_sample_sub.Pred = clipped_preds
df_sample_sub.head(10)

Unnamed: 0,ID,Pred
0,2014_3103_3107,0.37626
1,2014_3103_3113,0.739968
2,2014_3103_3119,0.502908
3,2014_3103_3124,0.945607
4,2014_3103_3140,0.567136
5,2014_3103_3143,0.826767
6,2014_3103_3151,0.629184
7,2014_3103_3163,0.95
8,2014_3103_3169,0.31778
9,2014_3103_3173,0.860737


In [38]:
df_sample_sub.to_csv('test-submission.csv', index=False)