In [117]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.metrics import brier_score_loss

import xgboost as xgb
from xgboost import XGBClassifier

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

LOAD MODEL DATA SETS

In [118]:
# Historical game-level data; supplies the 'Season', 'is_tourney', 'Hwin' and
# feature columns used to build every train / holdout split below.
all_games_df = pd.read_csv('all_games_df.csv')

In [119]:
# Matchups to predict, ordered by 'ID' with a fresh 0..n-1 index so that later
# positional (index-based) merges with the model output line up row for row.
test_combos_df = (
    pd.read_csv('test_combos_df_2021.csv')
    .sort_values(by=['ID'])
    .reset_index(drop=True)
)

In [120]:
# Peek at the first three matchup rows (sorted by ID).
test_combos_df.head(3)

Unnamed: 0,Season,ID,HTeamID,RTeamID,HSeed,RSeed,is_tourney,Hwins_top25,HPointMargin,HFG,HFG3,Rwins_top25,RPointMargin,RFG,RFG3,HRankPOM,RRankPOM,home,Hconf,Hadjem,Hadjo,Hadjd,Hluck,road,Rconf,Radjem,Radjo,Radjd,Rluck,Htourny20plus,Rtourny20plus,HBig4Conf,RBig4Conf
0,2021,2021_1101_1104,1104.0,1101.0,2.0,14.0,1,12,9.8,0.430893,0.348281,0,14.565217,0.459648,0.383192,8,93,alabama,SEC,24.99,111.7,86.7,0.031,abilene christian,Slnd,10.13,103.0,92.9,-0.027,0,0,1,0
1,2021,2021_1101_1111,1101.0,1111.0,14.0,16.0,1,0,14.565217,0.459648,0.383192,0,-0.083333,0.405138,0.320992,93,207,abilene christian,Slnd,10.13,103.0,92.9,-0.027,appalachian st.,SB,-4.34,99.4,103.7,0.06,0,0,0,0
2,2021,2021_1101_1116,1116.0,1101.0,3.0,14.0,1,4,11.714286,0.454653,0.331366,0,14.565217,0.459648,0.383192,18,93,arkansas,SEC,21.53,111.5,90.0,0.056,abilene christian,Slnd,10.13,103.0,92.9,-0.027,0,0,1,0


In [121]:
# Peek at the last five matchup rows (sorted by ID).
test_combos_df.tail()

Unnamed: 0,Season,ID,HTeamID,RTeamID,HSeed,RSeed,is_tourney,Hwins_top25,HPointMargin,HFG,HFG3,Rwins_top25,RPointMargin,RFG,RFG3,HRankPOM,RRankPOM,home,Hconf,Hadjem,Hadjo,Hadjd,Hluck,road,Rconf,Radjem,Radjo,Radjd,Rluck,Htourny20plus,Rtourny20plus,HBig4Conf,RBig4Conf
2273,2021,2021_1452_1457,1452.0,1457.0,3.0,12.0,1,12,5.185185,0.430284,0.362454,0,12.75,0.462443,0.350717,25,86,west virginia,B12,20.32,116.1,95.8,-0.04,winthrop,BSth,8.94,105.1,96.2,0.032,1,0,1,0
2274,2021,2021_1452_1458,1452.0,1458.0,3.0,9.0,1,12,5.185185,0.430284,0.362454,4,5.310345,0.422498,0.369371,25,11,west virginia,B12,20.32,116.1,95.8,-0.04,wisconsin,B10,22.65,112.5,89.9,-0.049,1,1,1,1
2275,2021,2021_1455_1457,1455.0,1457.0,11.0,12.0,1,4,2.631579,0.406527,0.34443,0,12.75,0.462443,0.350717,72,86,wichita st.,Amer,11.55,110.0,98.4,0.121,winthrop,BSth,8.94,105.1,96.2,0.032,0,0,0,0
2276,2021,2021_1455_1458,1458.0,1455.0,9.0,11.0,1,4,5.310345,0.422498,0.369371,4,2.631579,0.406527,0.34443,11,72,wisconsin,B10,22.65,112.5,89.9,-0.049,wichita st.,Amer,11.55,110.0,98.4,0.121,1,0,1,0
2277,2021,2021_1457_1458,1458.0,1457.0,9.0,12.0,1,4,5.310345,0.422498,0.369371,0,12.75,0.462443,0.350717,11,86,wisconsin,B10,22.65,112.5,89.9,-0.049,winthrop,BSth,8.94,105.1,96.2,0.032,1,0,1,0


In [122]:
# Feature columns pulled from all_games_df for every train / holdout split.
# 'line' is the only entry that is not selected from test_combos_df for the
# prediction set, so it is dropped before the final model is fit.
# H*/R* prefixes pair up with the HTeamID / RTeamID ("home" / "road") columns.
ind_var_selected = [
'is_tourney', 
'HRankPOM',
'RRankPOM',
'line',
'Hwins_top25',
'Rwins_top25',
'HPointMargin',
'RPointMargin',
'HFG',
'RFG',
'HFG3',
'RFG3',
'Hadjem',
'Hadjo',
'Hadjd',
'Hluck',
'Radjem',
'Radjo',
'Radjd',
'Rluck',
'Htourny20plus',
'Rtourny20plus',
'HBig4Conf',
'RBig4Conf', 
'HSeed',
'RSeed'
]

Note: "test" refers to the 2021 matchups we are predicting (outcomes not yet known); the holdout sets used to evaluate the models are referred to as "valid".

In [123]:
# Prediction set (2021 matchups).
test_ids = test_combos_df['ID'].reset_index(drop=True)

# Use the same features, in the same order, as ind_var_selected minus 'line'
# (the prediction set is built without a 'line' column). Deriving the list here
# instead of hard-coding it keeps X_test in sync with the training features.
X_test = test_combos_df[
    [col for col in ind_var_selected if col != 'line']
].reset_index(drop=True)

In [124]:
# Holdout ("valid") set: tournament games from seasons after 2016.
# NOTE(review): the original comment said "(2017, 2018)", but the filter is
# Season > 2016, so any later season present in all_games_df is included too.
temp_df = all_games_df[all_games_df['Season']>2016]
temp_df = temp_df[temp_df['is_tourney']==1]
X_valid = temp_df[ind_var_selected].reset_index(drop=True)
y_valid = temp_df['Hwin'].reset_index(drop=True)

# Train on everything else: non-tournament games after 2016, followed by every
# game from 2016 and earlier.
temp_df1 = all_games_df[all_games_df['Season']>2016]
temp_df1 = temp_df1[temp_df1['is_tourney']==0]
temp_df2 = all_games_df[all_games_df['Season']<2017]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same rows in
# the same order (temp_df1 first, then temp_df2), so model fits are unchanged.
combined_temp_df = pd.concat([temp_df1, temp_df2])

X_train = combined_temp_df[ind_var_selected].reset_index(drop=True)
y_train = combined_temp_df['Hwin'].reset_index(drop=True)

# For final predictions: every game, with nothing held out.
X_train_orig = all_games_df[ind_var_selected].reset_index(drop=True)
y_train_orig = all_games_df['Hwin'].reset_index(drop=True)


In [125]:
# Second holdout set (2016 tournament games) to double-check that the model is
# not overfit to the first holdout and to check model stability.
temp_df16 = all_games_df[all_games_df['Season']==2016]
temp_df16 = temp_df16[temp_df16['is_tourney']==1]
X_valid16 = temp_df16[ind_var_selected].reset_index(drop=True)
y_valid16 = temp_df16['Hwin'].reset_index(drop=True)

# Matching training set: 2016 non-tournament games, followed by every game
# from all other seasons.
temp_df1_16 = all_games_df[all_games_df['Season']==2016]
temp_df1_16 = temp_df1_16[temp_df1_16['is_tourney']==0]
temp_df2_16 = all_games_df[all_games_df['Season']!=2016]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same rows in
# the same order, so model fits are unchanged.
combined_temp_df_16 = pd.concat([temp_df1_16, temp_df2_16])

X_train16 = combined_temp_df_16[ind_var_selected].reset_index(drop=True)
y_train16 = combined_temp_df_16['Hwin'].reset_index(drop=True)

In [126]:
# Cast every feature matrix and target vector to float64 so all models and the
# scaler see a single numeric dtype.

# Primary split (holdout = tournament games after 2016)
X_train = X_train.astype("float64")
y_train = y_train.astype("float64")
X_valid = X_valid.astype("float64")
y_valid = y_valid.astype("float64")

# Secondary split (holdout = 2016 tournament games)
X_train16 = X_train16.astype("float64")
y_train16 = y_train16.astype("float64")
X_valid16 = X_valid16.astype("float64")
y_valid16 = y_valid16.astype("float64")

# Full training data and the prediction set
X_train_orig = X_train_orig.astype("float64")
y_train_orig = y_train_orig.astype("float64")
X_test = X_test.astype("float64")

Scoring rules and benchmarks

In [127]:
def LogLoss(predictions, realizations, eps=1e-15):
    """Mean binary log loss (cross-entropy) of predicted probabilities.

    Parameters
    ----------
    predictions : array-like of float
        Predicted probability of the positive outcome for each observation.
    realizations : array-like of {0, 1}
        Observed outcomes, in the same order as ``predictions``. Inputs are
        compared position by position (any pandas index is ignored).
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.
        Without an upper and lower bound, a prediction of exactly 0 or 1
        produces ``log(0)`` and the score becomes ``inf`` or ``nan``.

    Returns
    -------
    float
        The average negative log-likelihood; lower is better.
    """
    predictions_use = np.clip(np.asarray(predictions, dtype=float), eps, 1 - eps)
    # Negative realizations are floored at 0, as in the original implementation.
    realizations_use = np.clip(np.asarray(realizations, dtype=float), 0, None)
    return -np.mean(
        realizations_use * np.log(predictions_use)
        + (1 - realizations_use) * np.log(1 - predictions_use)
    )

If a model cannot beat the log loss of always predicting 50%, it is a poor model.

In [128]:
# Naive benchmark: a constant 50% prediction for every holdout game.
bench_5050 = np.full(len(y_valid), 0.5)

In [129]:
# Log loss of the constant-50% benchmark on the primary holdout.
LogLoss(bench_5050, y_valid)

0.6931471805599453

How does this compare to Lopez and Matthews (2014 winners)?

In [130]:
# Benchmark 1: logistic regression on 'line' alone (C=1e9 makes the
# regularisation penalty negligible), scored on the primary holdout.
Z1 = LogisticRegression(C=1e9, random_state=23).fit(X_train[['line']], y_train)
# Column 1 of predict_proba is the probability of the second class in
# Z1.classes_ (presumably Hwin == 1).
Z1_pred = pd.DataFrame(
    Z1.predict_proba(X_valid[['line']])
)[1]

In [131]:
# Log loss of the line-only logistic benchmark on the primary holdout.
LogLoss(Z1_pred, y_valid)

0.5099275904348796

In [132]:
# Benchmark 2: logistic regression on the four Hadjo/Hadjd/Radjo/Radjd columns,
# scored on the primary holdout.
efficiency_cols = ['Hadjo','Hadjd','Radjo','Radjd']
Z2 = LogisticRegression(C=1e9, random_state=23).fit(X_train[efficiency_cols], y_train)
Z2_pred = pd.DataFrame(
    Z2.predict_proba(X_valid[efficiency_cols])
)[1]

In [133]:
# Log loss of the four-column logistic benchmark on the primary holdout.
LogLoss(Z2_pred, y_valid)

0.5126291000204722

In [134]:
# Benchmark 1 again, refit and scored on the 2016 split (overwrites Z1 / Z1_pred).
Z1 = LogisticRegression(C=1e9, random_state=23).fit(X_train16[['line']], y_train16)
Z1_pred = pd.DataFrame(
    Z1.predict_proba(X_valid16[['line']])
)[1]

In [135]:
# Log loss of the line-only logistic benchmark on the 2016 holdout.
LogLoss(Z1_pred, y_valid16)

0.5381701864065767

In [136]:
# Benchmark 2 again, refit and scored on the 2016 split (overwrites Z2 / Z2_pred).
efficiency_cols = ['Hadjo','Hadjd','Radjo','Radjd']
Z2 = LogisticRegression(C=1e9, random_state=23).fit(X_train16[efficiency_cols], y_train16)
Z2_pred = pd.DataFrame(
    Z2.predict_proba(X_valid16[efficiency_cols])
)[1]

In [137]:
# Log loss of the four-column logistic benchmark on the 2016 holdout.
LogLoss(Z2_pred, y_valid16)

0.523375490320045

Fit a neural network (with and without line)

Normalize data (using z-scores) before neural network

In [138]:
# Z-score the primary split. The scaler learns its mean and standard deviation
# from the training rows only, and the same transform is applied to the holdout.
scaler = StandardScaler().fit(X_train)
scaled_X_train = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
scaled_X_valid = pd.DataFrame(
    scaler.transform(X_valid),
    index=X_valid.index,
    columns=X_valid.columns,
)

In [139]:
# Z-score the 2016 split with a fresh scaler fitted on its own training rows
# (this rebinds `scaler`).
scaler = StandardScaler().fit(X_train16)
scaled_X_train16 = pd.DataFrame(
    scaler.transform(X_train16),
    index=X_train16.index,
    columns=X_train16.columns,
)
scaled_X_valid16 = pd.DataFrame(
    scaler.transform(X_valid16),
    index=X_valid16.index,
    columns=X_valid16.columns,
)

In [140]:
# Drop 'line' from the full training set: it is not used for the final
# predictions, and the training and prediction frames need the same columns.
# errors='ignore' keeps this cell idempotent: re-running it after 'line' has
# already been removed no longer raises KeyError.
X_train_orig = X_train_orig.drop(columns=['line'], errors='ignore')

In [141]:
# Z-score the full training data (no holdout) and apply the same transform to
# the prediction set (this rebinds `scaler`).
scaler = StandardScaler().fit(X_train_orig)

scaled_X_train_orig = pd.DataFrame(
    scaler.transform(X_train_orig),
    index=X_train_orig.index,
    columns=X_train_orig.columns,
)
scaled_X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)


With line (note: we won't have the line for rounds after the first, but we could use this model for the first round only, as Lopez and Matthews did)

In [142]:
# Note: I tried logistic activation and different combinations of hidden layers/nodes.
# The hyperparameters below minimized the log loss in the holdout set.
# I also submitted a prediction with 10 nodes in the first layer, but this is the
# submission that placed 4th (with 8 nodes in the first layer).
nn = MLPClassifier(
    activation='relu',
    hidden_layer_sizes=(8, 5, 3),
    random_state=201,
    max_iter=1000,
)
nn.fit(scaled_X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in
# nn.classes_ (presumably Hwin == 1).
nn_pred = pd.DataFrame(nn.predict_proba(scaled_X_valid))[1]

In [143]:
# Log loss of the with-line neural network on the primary holdout.
LogLoss(nn_pred, y_valid)

0.46058352667105923

In [144]:
# Second holdout (2016): refit the same with-line network on the 2016 training
# split. It does worse than on the primary holdout, but still beats the
# line-only logistic benchmark for 2016 (log loss of about 0.54).
nn.fit(scaled_X_train16, y_train16)
proba_with_line16 = nn.predict_proba(scaled_X_valid16)
nn_pred16 = pd.DataFrame(proba_with_line16)[1]
LogLoss(nn_pred16, y_valid16)

0.46602394659108043

Without line

In [145]:
# Note: I tried logistic activation and different combinations of hidden layers/nodes.
# The hyperparameters below minimized the log loss in the holdout set.
# Feature list without 'line'. The column order here is the order fed to the
# network, so it is kept exactly as written.
ind_var_selected_no_line = [
    'is_tourney',
    'Hwins_top25', 'Rwins_top25',
    'HPointMargin', 'RPointMargin',
    'HFG', 'RFG', 'HFG3', 'RFG3',
    'HRankPOM', 'RRankPOM',
    'Hadjem', 'Hadjo', 'Hadjd', 'Hluck',
    'Radjem', 'Radjo', 'Radjd', 'Rluck',
    'Htourny20plus', 'Rtourny20plus',
    'HBig4Conf', 'RBig4Conf',
    'HSeed', 'RSeed',
]
nn = MLPClassifier(
    activation='relu',
    hidden_layer_sizes=(7, 5, 3),
    random_state=201,
    max_iter=1000,
)
nn.fit(scaled_X_train[ind_var_selected_no_line], y_train)
nn_pred_no_line = pd.DataFrame(
    nn.predict_proba(scaled_X_valid[ind_var_selected_no_line])
)[1]

In [146]:
# Log loss of the no-line neural network on the primary holdout.
LogLoss(nn_pred_no_line, y_valid)

0.44535847672906115

In [147]:
# Second holdout (2016): refit the same no-line network on the 2016 training split.
# NOTE(review): the original comment said "does better"; presumably that means
# better than the 2016 logistic benchmarks. Confirm against the scores above.
nn.fit(scaled_X_train16[ind_var_selected_no_line], y_train16)
proba_no_line16 = nn.predict_proba(scaled_X_valid16[ind_var_selected_no_line])
nn_pred_no_line16 = pd.DataFrame(proba_no_line16)[1]
LogLoss(nn_pred_no_line16, y_valid16)

0.48802297812915235

In [148]:
# Simple ensemble: average the with-line and no-line network probabilities
# and score the blend on the primary holdout.
avg = (nn_pred + nn_pred_no_line) / 2
LogLoss(avg, y_valid)

0.44832781692010787

In [149]:
# Same two-model average, scored on the 2016 holdout.
avg16 = (nn_pred16 + nn_pred_no_line16) / 2
LogLoss(avg16, y_valid16)

0.469656232815097

Create test predictions

In [150]:
# Different submissions differ only by the size of the first layer of the neural net.
# Feature list without 'line'. The column order here is the order fed to the
# network, so it is kept exactly as written.
ind_var_selected_no_line = [
    'is_tourney',
    'Hwins_top25', 'Rwins_top25',
    'HPointMargin', 'RPointMargin',
    'HFG', 'RFG', 'HFG3', 'RFG3',
    'HRankPOM', 'RRankPOM',
    'Hadjem', 'Hadjo', 'Hadjd', 'Hluck',
    'Radjem', 'Radjo', 'Radjd', 'Rluck',
    'Htourny20plus', 'Rtourny20plus',
    'HBig4Conf', 'RBig4Conf',
    'HSeed', 'RSeed',
]

# Train the model on all data (some tournaments were previously held out as a test set).
nn = MLPClassifier(
    activation='relu',
    hidden_layer_sizes=(7, 5, 3),
    random_state=201,
    max_iter=1000,
)
nn.fit(scaled_X_train_orig[ind_var_selected_no_line], y_train_orig)
# Keep both predict_proba columns (0 and 1); the next cells pick one per row.
second_rd_submission_all = pd.DataFrame(
    nn.predict_proba(scaled_X_test[ind_var_selected_no_line])
)

In [151]:
# Note: the model predicts the home (lower seed) team's win probability, which
# has to be converted to the output file's convention (lower team ID).
# Attach the probability columns (0 and 1) to the matchup rows by row index.
second_rd_submission = test_combos_df.merge(
    second_rd_submission_all,
    left_index=True,
    right_index=True,
)

In [152]:
# 'pred' is reported from the point of view of the lower team ID:
#  - home team has the lower ID  -> use column 1
#  - home team has the higher ID -> use column 0 (the complementary probability)
home_has_lower_id = second_rd_submission['HTeamID'] < second_rd_submission['RTeamID']
home_has_higher_id = second_rd_submission['HTeamID'] > second_rd_submission['RTeamID']

second_rd_submission.loc[home_has_lower_id, 'pred'] = second_rd_submission.loc[home_has_lower_id, 1]
second_rd_submission.loc[home_has_higher_id, 'pred'] = second_rd_submission.loc[home_has_higher_id, 0]

In [153]:
# Save the full merged frame (matchup columns, both probability columns and
# 'pred') before it is trimmed down to the two submission columns.
second_rd_submission.to_csv('Ismail_Munira_second_rd_submission_all.csv', index=False)

In [154]:
# Keep only the two columns written to the submission file.
second_rd_submission = second_rd_submission[['ID','pred']]

In [155]:
# Sanity check: first rows of the submission.
second_rd_submission.head()

Unnamed: 0,ID,pred
0,2021_1101_1104,0.108285
1,2021_1101_1111,0.828493
2,2021_1101_1116,0.064667
3,2021_1101_1124,0.068472
4,2021_1101_1140,0.071935


In [156]:
# Sanity check: last rows of the submission.
second_rd_submission.tail()

Unnamed: 0,ID,pred
2273,2021_1452_1457,0.812187
2274,2021_1452_1458,0.471668
2275,2021_1455_1457,0.845873
2276,2021_1455_1458,0.18678
2277,2021_1457_1458,0.078025


In [157]:
# Write the final two-column (ID, pred) submission file.
second_rd_submission.to_csv('Aboo_Munira_Neera2_submission.csv', index=False)