## Model Building, Evaluation, and Optimization

In [190]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [3]:
cfb_data = pd.read_csv('data/cfb_final.csv')
cfb_data

## Preprocessing

We'll do some final preprocessing by utilizing `sklearn`'s `OneHotEncoder` utility to transform our `winner` column into 1 if the home team won and 0 for away. The selection of 1 for home or away is arbitrary.

In [5]:
# Replace 'winner' column with binary 'home_win' column
cfb_data = pd.concat([cfb_data, pd.get_dummies(cfb_data['winner'], drop_first=True)], axis=1)
cfb_data = cfb_data.drop(columns = 'winner')
cfb_data = cfb_data.rename(columns={'H': 'home_win'})

In [6]:
cfb_data

Unnamed: 0,id,home_team,season,conference_home,firstDowns_home,fourthDownConversions_home,fourthDowns_home,fumblesLost_home,fumblesRecovered_home,games_home,...,rushingYards_away,sacks_away,tacklesForLoss_away,thirdDownConversions_away,thirdDowns_away,avgYards_away,turnovers_away,talent_away,totalPPA_away,home_win
0,400787117,Air Force,2015.0,Mountain West,21.428571,1.000000,1.642857,0.785714,0.500000,14.0,...,182.923077,0.000000,0.000000,6.769231,14.076923,404.615385,1.307692,379.60,251.8,1
1,400787254,Air Force,2015.0,Mountain West,21.428571,1.000000,1.642857,0.785714,0.500000,14.0,...,173.416667,0.000000,0.000000,4.333333,13.000000,360.666667,1.750000,314.72,110.4,1
2,400787266,Air Force,2015.0,Mountain West,21.428571,1.000000,1.642857,0.785714,0.500000,14.0,...,136.833333,0.000000,0.000000,5.750000,15.416667,315.166667,1.916667,402.58,111.7,1
3,400760497,Air Force,2015.0,Mountain West,21.428571,1.000000,1.642857,0.785714,0.500000,14.0,...,244.250000,0.000000,0.000000,5.833333,13.333333,338.250000,1.833333,186.45,53.6,1
4,400787280,Air Force,2015.0,Mountain West,21.428571,1.000000,1.642857,0.785714,0.500000,14.0,...,165.076923,0.000000,0.000000,5.076923,14.461538,370.615385,1.846154,331.44,127.3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4109,401269384,Wisconsin,2020.0,Big Ten,19.571429,0.714286,1.714286,0.714286,0.571429,7.0,...,192.857143,1.142857,2.142857,5.857143,12.285714,392.000000,1.000000,594.70,374.7,1
4110,401247338,Wisconsin,2020.0,Big Ten,19.571429,0.714286,1.714286,0.714286,0.571429,7.0,...,109.375000,3.125000,5.875000,5.500000,14.250000,360.750000,1.375000,611.17,249.4,0
4111,401247297,Wisconsin,2020.0,Big Ten,19.571429,0.714286,1.714286,0.714286,0.571429,7.0,...,194.875000,2.250000,5.625000,5.375000,14.250000,347.750000,1.125000,644.09,202.5,1
4112,401249035,Wyoming,2020.0,Mountain West,18.500000,1.000000,2.000000,0.666667,1.000000,6.0,...,105.857143,2.571429,7.285714,6.000000,13.714286,347.714286,1.000000,571.21,321.2,0


## Feature Selection

Because we have approximately 70 variables we're working with, we should do some feature selection/dimensionality reduction. We'll divide our data into an 80/20 train/test split and perform feature selection only on the training data so our model doesn't "cheat" and look at the test data during feature selection.

In [50]:
cfb_data['home_win'].astype('category')
# Because we'll be using ANOVA values to rank our features, we need to temporarily drop categorical features
# We'll calculate if a Chi-squared test is statistically significant in its place
X = cfb_data.drop(columns=['id', 'home_team', 'conference_home',
                    'away_team', 'conference_away', 'home_win'],axis=1)
# X = X.drop(columns=['games_home', 'games_away',
#                'away_team', 'conference_away'])
y = cfb_data['home_win']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

TODO: Get Chi-Squared of conference_home/away for potential addition

In `SelectKBest`, the default scoring function is `f_classif`, which is calculates an ANOVA F statistic between the target and features. From this, the features with the 20 highest F scores will be selected for our model. The number of features or the method of selection can certainly be hypertuned but we'll heuristically select the 20 best for now.

In [167]:
# Calculate ANOVA values
kbest = SelectKBest()
X_new = kbest.fit(X_train, y_train)

# DataFrame to easily list top 20 features and scores
scores = pd.DataFrame(X_new.scores_)
columns = pd.DataFrame(X_train.columns)
selected_features = pd.concat([columns, scores],axis=1)
selected_features.columns = ['Columns','Scores']
new_cols = list(selected_features['Columns'])

Here we list the 20 selected features with their corresponding ANOVA values:

In [168]:
selected_features.sort_values('Scores', ascending=False)[:20]

Unnamed: 0,Columns,Scores
0,rushingTDs_home,303.247926
1,avgYards_home,299.661179
2,avgYards_away,240.574946
3,firstDowns_home,222.20264
4,turnovers_home,194.926007
5,interceptions_home,193.520687
6,firstDowns_away,185.276193
7,totalPPA_home,184.371108
8,turnovers_away,168.870284
9,rushingYards_home,162.99204


In [187]:
# Subset the training data to selected features
new_X_train = X_train[new_cols]
new_X_test = X_test[new_cols]

## Estimating Test Error through Cross Validation

In [196]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

Training Logistic Regression, Random Forest, and K Nearest Neighbors models seperately:

In [213]:
# Adding StandardScaler to fix 'lbfgs failed to converge' error
log_reg = Pipeline([('scalar', StandardScaler()),
                    ('log_reg', LogisticRegression())])
cross_val_score(log_reg, new_X_train, y_train,
               cv=10, scoring='roc_auc').mean()

0.8267370165592519

In [201]:
rf = Pipeline([('scalar', StandardScaler()),
               ('random_forest', RandomForestClassifier(max_depth=10))])
cross_val_score(rf, new_X_train, y_train,
               cv=10, scoring='roc_auc').mean()

0.8175051295419987

In [203]:
knn = Pipeline([('scalar', StandardScaler()),
                ('knn', KNeighborsClassifier())])
cross_val_score(knn, new_X_train, y_train,
               cv=10, scoring='roc_auc').mean()

0.753170463353759

More neatly:

In [258]:
names = ['log_reg', 'rf', 'knn']
models = [LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier()]
evals = pd.DataFrame({'Models': names,
             'Scores': [0, 0, 0]}, index=None)

In [257]:
for (name, model) in zip(names, models):
    model_pipe = Pipeline([('scalar', StandardScaler()), (name, model)])
    score = cross_val_score(model_pipe, new_X_train, y_train,
                   cv=10, scoring='roc_auc').mean()
    evals.loc[evals['Models'] == name, ['Scores']] = score
evals

Unnamed: 0,Models,Scores
0,log_reg,0.826737
1,rf,0.809133
2,knn,0.75317
