In [1]:
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.model_selection import KFold, cross_val_score



## XGBoost

In [25]:
# Per-season evaluation of a majority-vote ensemble of XGBoost classifiers.
# For each season 2010-2017 the first ~2/3 of the rows train `num_models`
# differently-seeded models and the remaining rows are used for scoring.
params = {'gamma': 10,
 'learning_rate': 0.01,
 'max_depth': 6,
 'min_child_weight': 5,
 'n_estimators': 200,
 'subsample': 0.5}
years = [2010+i for i in range(8)]
baseline = [0.5588477366255145, 0.5253190613421161, 0.5329218106995884, 0.5376388317564789, 0.5300411522633744, 0.541786743515850,
            0.5300658978583196, 0.5395061728395062]

# Columns removed from the feature matrix (identifiers, the scores the label
# is derived from, and the homeLine/awayLine columns).
non_feature_cols = ['date', 'home_team', 'away_team', 'home_score', 'away_score',
                    'home_pitcher', 'away_pitcher', 'homeLine', 'awayLine']

accuracies = []
confusions = []
features = []
num_models = 10
# Dedicated, seeded generator for the per-model seeds so the reported numbers
# are identical on every Restart & Run All.
seed_rng = random.Random(0)
for year in years:
    # Read in dataset
    df = pd.read_csv('Rolling Average Stats/' + str(year) +'.csv')

    # Create observations and labels. `axis` is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally.
    X = df.drop(non_feature_cols, axis=1)
    y = df.home_score > df.away_score # True if the home team outscored the away team
    # No shuffling: the first 67% of rows train, the last 33% test
    # (assumes the CSV rows are in chronological order -- TODO confirm).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)

    # Integer vote count per test row. Counting votes instead of accumulating
    # 1/num_models floats keeps the tie rule independent of rounding error.
    votes = np.zeros(len(y_test), dtype=int)
    # Despite its name, `features` collects the fitted models of the current season.
    features = []

    # NOTE(review): `silent` and `seed` are reportedly deprecated in newer
    # xgboost releases (`verbosity` / `random_state`); kept as-is here because
    # the installed version is not visible from this notebook.
    for model_seed in seed_rng.sample(range(1, 1000000), num_models): #[30, 595, 123, 4849, 3, 1010, 234, 8267, 3736, 99999]:
        clfi =  XGBClassifier(**params, silent=False, seed=model_seed)
        clfi.fit(X_train, y_train.values.ravel())
        votes += np.asarray(clfi.predict(X_test)).astype(int)
        features.append(clfi)

    # Strict majority wins; an exact tie predicts 0, which is what
    # np.round(0.5) == 0.0 produced in the float-averaging version.
    pred_avg = (2 * votes > num_models).astype(float)
    # Report mean error rate
    accuracy = accuracy_score(y_test, pred_avg)
    error_rate = 1 - accuracy
    print('Year: ', year)
    print("Mean error rate: {}\nAccuracy: {}".format(error_rate, accuracy))
    accuracies.append(accuracy)

    # Report confusion matrix of the ensemble prediction
    confusion = confusion_matrix(y_test, pred_avg)
    print("Confusion matrix for XGBoost: \n{}\n".format(confusion))
    confusions.append(confusion)

print('Average accuracy: ', np.mean(accuracies))

Year:  2010
Mean error rate: 0.45299145299145294
Accuracy: 0.5470085470085471
Confusion matrix for XGBoost: 
[[ 53 264]
 [ 54 331]]

Year:  2011
Mean error rate: 0.4485714285714286
Accuracy: 0.5514285714285714
Confusion matrix for XGBoost: 
[[133 193]
 [121 253]]

Year:  2012
Mean error rate: 0.4387464387464387
Accuracy: 0.5612535612535613
Confusion matrix for XGBoost: 
[[ 96 221]
 [ 87 298]]

Year:  2013
Mean error rate: 0.449358059914408
Accuracy: 0.550641940085592
Confusion matrix for XGBoost: 
[[ 85 249]
 [ 66 301]]

Year:  2014
Mean error rate: 0.47
Accuracy: 0.53
Confusion matrix for XGBoost: 
[[140 170]
 [159 231]]

Year:  2015
Mean error rate: 0.48930099857346643
Accuracy: 0.5106990014265336
Confusion matrix for XGBoost: 
[[ 60 282]
 [ 61 298]]

Year:  2016
Mean error rate: 0.46494992846924177
Accuracy: 0.5350500715307582
Confusion matrix for XGBoost: 
[[ 83 252]
 [ 73 291]]

Year:  2017
Mean error rate: 0.41226818830242506
Accuracy: 0.5877318116975749
Confusion matrix for XGBo

## Random Forest

In [26]:
# Per-season evaluation of a random forest: for each season 2010-2017 the
# first ~2/3 of the rows train the model and the remaining rows score it.
rf_params = {
    'max_depth' : 100,
    'max_features' : 44, 
    'n_estimators' : 200
}

years = [2010+i for i in range(8)]
baseline = [0.5588477366255145, 0.5253190613421161, 0.5329218106995884, 0.5376388317564789, 0.5300411522633744, 0.541786743515850,
            0.5300658978583196, 0.5395061728395062]

# Columns removed from the feature matrix (identifiers, the scores the label
# is derived from, and the homeLine/awayLine columns).
non_feature_cols = ['date', 'home_team', 'away_team', 'home_score', 'away_score',
                    'home_pitcher', 'away_pitcher', 'homeLine', 'awayLine']

accuracies = []
confusions = []
features = []
for year in years:
    # Read in dataset
    df = pd.read_csv('Rolling Average Stats/' + str(year) +'.csv')

    # Create observations and labels. `axis` is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally.
    X = df.drop(non_feature_cols, axis=1)
    y = df.home_score > df.away_score # True if the home team outscored the away team
    # No shuffling: the first 67% of rows train, the last 33% test
    # (assumes the CSV rows are in chronological order -- TODO confirm).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)
    # Fixed random_state so the forest (bootstrap samples and feature
    # sub-sampling) is reproducible across runs.
    rf = RandomForestClassifier(**rf_params, random_state=0)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)

    # Report mean error rate
    accuracy = accuracy_score(y_test, pred)
    error_rate = 1 - accuracy
    print('Year: ', year)
    print("Mean error rate: {}\nAccuracy: {}".format(error_rate, accuracy))
    accuracies.append(accuracy)

    # Report confusion matrix (label previously said "XGBoost" by mistake)
    confusion = confusion_matrix(y_test, pred)
    print("Confusion matrix for Random Forest: \n{}\n".format(confusion))
    confusions.append(confusion)

print('Average accuracy: ', np.mean(accuracies))

Year:  2010
Mean error rate: 0.4757834757834758
Accuracy: 0.5242165242165242
Confusion matrix for XGBoost: 
[[ 80 237]
 [ 97 288]]

Year:  2011
Mean error rate: 0.4614285714285714
Accuracy: 0.5385714285714286
Confusion matrix for XGBoost: 
[[164 162]
 [161 213]]

Year:  2012
Mean error rate: 0.48005698005698005
Accuracy: 0.51994301994302
Confusion matrix for XGBoost: 
[[117 200]
 [137 248]]

Year:  2013
Mean error rate: 0.4693295292439372
Accuracy: 0.5306704707560628
Confusion matrix for XGBoost: 
[[125 209]
 [120 247]]

Year:  2014
Mean error rate: 0.4585714285714285
Accuracy: 0.5414285714285715
Confusion matrix for XGBoost: 
[[169 141]
 [180 210]]

Year:  2015
Mean error rate: 0.4821683309557775
Accuracy: 0.5178316690442225
Confusion matrix for XGBoost: 
[[104 238]
 [100 259]]

Year:  2016
Mean error rate: 0.49499284692417744
Accuracy: 0.5050071530758226
Confusion matrix for XGBoost: 
[[113 222]
 [124 240]]

Year:  2017
Mean error rate: 0.4536376604850214
Accuracy: 0.5463623395149786

## SVM

In [27]:
# Per-season evaluation of a support vector classifier: for each season
# 2010-2017 the first ~2/3 of the rows train the model and the rest score it.
years = [2010+i for i in range(8)]
baseline = [0.5588477366255145, 0.5253190613421161, 0.5329218106995884, 0.5376388317564789, 0.5300411522633744, 0.541786743515850,
            0.5300658978583196, 0.5395061728395062]

# Columns removed from the feature matrix (identifiers, the scores the label
# is derived from, and the homeLine/awayLine columns).
non_feature_cols = ['date', 'home_team', 'away_team', 'home_score', 'away_score',
                    'home_pitcher', 'away_pitcher', 'homeLine', 'awayLine']

accuracies = []
confusions = []
features = []
for year in years:
    # Read in dataset
    df = pd.read_csv('Rolling Average Stats/' + str(year) +'.csv')

    # Create observations and labels. `axis` is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally.
    X = df.drop(non_feature_cols, axis=1)
    y = df.home_score > df.away_score # True if the home team outscored the away team
    # No shuffling: the first 67% of rows train, the last 33% test
    # (assumes the CSV rows are in chronological order -- TODO confirm).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)
    svm = SVC(C=1)
    svm.fit(X_train, y_train)
    pred = svm.predict(X_test)

    # Report mean error rate
    accuracy = accuracy_score(y_test, pred)
    error_rate = 1 - accuracy
    print('Year: ', year)
    print("Mean error rate: {}\nAccuracy: {}".format(error_rate, accuracy))
    accuracies.append(accuracy)

    # Report confusion matrix (label previously said "XGBoost" by mistake)
    confusion = confusion_matrix(y_test, pred)
    print("Confusion matrix for SVM: \n{}\n".format(confusion))
    confusions.append(confusion)

print('Average accuracy: ', np.mean(accuracies))

Year:  2010
Mean error rate: 0.4658119658119658
Accuracy: 0.5341880341880342
Confusion matrix for XGBoost: 
[[ 57 260]
 [ 67 318]]

Year:  2011
Mean error rate: 0.4585714285714285
Accuracy: 0.5414285714285715
Confusion matrix for XGBoost: 
[[140 186]
 [135 239]]

Year:  2012
Mean error rate: 0.4686609686609686
Accuracy: 0.5313390313390314
Confusion matrix for XGBoost: 
[[104 213]
 [116 269]]

Year:  2013
Mean error rate: 0.4664764621968617
Accuracy: 0.5335235378031383
Confusion matrix for XGBoost: 
[[ 79 255]
 [ 72 295]]

Year:  2014
Mean error rate: 0.4642857142857143
Accuracy: 0.5357142857142857
Confusion matrix for XGBoost: 
[[149 161]
 [164 226]]

Year:  2015
Mean error rate: 0.4864479315263909
Accuracy: 0.5135520684736091
Confusion matrix for XGBoost: 
[[ 65 277]
 [ 64 295]]

Year:  2016
Mean error rate: 0.46494992846924177
Accuracy: 0.5350500715307582
Confusion matrix for XGBoost: 
[[100 235]
 [ 90 274]]

Year:  2017
Mean error rate: 0.45078459343794575
Accuracy: 0.54921540656205

## Logistic Regression

In [28]:
# Per-season evaluation of logistic regression: for each season 2010-2017 the
# first ~2/3 of the rows train the model and the remaining rows score it.
years = [2010+i for i in range(8)]
baseline = [0.5588477366255145, 0.5253190613421161, 0.5329218106995884, 0.5376388317564789, 0.5300411522633744, 0.541786743515850,
            0.5300658978583196, 0.5395061728395062]

# Columns removed from the feature matrix (identifiers, the scores the label
# is derived from, and the homeLine/awayLine columns).
non_feature_cols = ['date', 'home_team', 'away_team', 'home_score', 'away_score',
                    'home_pitcher', 'away_pitcher', 'homeLine', 'awayLine']

accuracies = []
confusions = []
features = []
for year in years:
    # Read in dataset
    df = pd.read_csv('Rolling Average Stats/' + str(year) +'.csv')

    # Create observations and labels. `axis` is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally.
    X = df.drop(non_feature_cols, axis=1)
    y = df.home_score > df.away_score # True if the home team outscored the away team
    # No shuffling: the first 67% of rows train, the last 33% test
    # (assumes the CSV rows are in chronological order -- TODO confirm).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)
    reg = LogisticRegression()
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)

    # Report mean error rate
    accuracy = accuracy_score(y_test, pred)
    error_rate = 1 - accuracy
    print('Year: ', year)
    print("Mean error rate: {}\nAccuracy: {}".format(error_rate, accuracy))
    accuracies.append(accuracy)

    # Report confusion matrix (label previously said "XGBoost" by mistake)
    confusion = confusion_matrix(y_test, pred)
    print("Confusion matrix for Logistic Regression: \n{}\n".format(confusion))
    confusions.append(confusion)

print('Average accuracy: ', np.mean(accuracies))

Year:  2010
Mean error rate: 0.4757834757834758
Accuracy: 0.5242165242165242
Confusion matrix for XGBoost: 
[[ 81 236]
 [ 98 287]]

Year:  2011
Mean error rate: 0.47
Accuracy: 0.53
Confusion matrix for XGBoost: 
[[132 194]
 [135 239]]

Year:  2012
Mean error rate: 0.45868945868945865
Accuracy: 0.5413105413105413
Confusion matrix for XGBoost: 
[[107 210]
 [112 273]]

Year:  2013
Mean error rate: 0.4436519258202568
Accuracy: 0.5563480741797432
Confusion matrix for XGBoost: 
[[127 207]
 [104 263]]

Year:  2014
Mean error rate: 0.4571428571428572
Accuracy: 0.5428571428571428
Confusion matrix for XGBoost: 
[[155 155]
 [165 225]]

Year:  2015
Mean error rate: 0.4864479315263909
Accuracy: 0.5135520684736091
Confusion matrix for XGBoost: 
[[ 73 269]
 [ 72 287]]

Year:  2016
Mean error rate: 0.4277539341917024
Accuracy: 0.5722460658082976
Confusion matrix for XGBoost: 
[[132 203]
 [ 96 268]]

Year:  2017
Mean error rate: 0.47075606276747506
Accuracy: 0.5292439372325249
Confusion matrix for XGBo

## MLP

In [29]:
# Per-season evaluation of a multi-layer perceptron: for each season 2010-2017
# the first ~2/3 of the rows train the model and the remaining rows score it.
years = [2010+i for i in range(8)]
baseline = [0.5588477366255145, 0.5253190613421161, 0.5329218106995884, 0.5376388317564789, 0.5300411522633744, 0.541786743515850,
            0.5300658978583196, 0.5395061728395062]

# Columns removed from the feature matrix (identifiers, the scores the label
# is derived from, and the homeLine/awayLine columns).
non_feature_cols = ['date', 'home_team', 'away_team', 'home_score', 'away_score',
                    'home_pitcher', 'away_pitcher', 'homeLine', 'awayLine']

accuracies = []
confusions = []
features = []
for year in years:
    # Read in dataset
    df = pd.read_csv('Rolling Average Stats/' + str(year) +'.csv')

    # Create observations and labels. `axis` is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally.
    X = df.drop(non_feature_cols, axis=1)
    y = df.home_score > df.away_score # True if the home team outscored the away team
    # No shuffling: the first 67% of rows train, the last 33% test
    # (assumes the CSV rows are in chronological order -- TODO confirm).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)
    # Fixed random_state so weight initialisation and SGD batch order are
    # reproducible across runs.
    mlp = MLPClassifier(solver='sgd', learning_rate='adaptive', hidden_layer_sizes=(30, 20),
                        random_state=0)
    mlp.fit(X_train, y_train)
    pred = mlp.predict(X_test)

    # Report mean error rate
    accuracy = accuracy_score(y_test, pred)
    error_rate = 1 - accuracy
    print('Year: ', year)
    print("Mean error rate: {}\nAccuracy: {}".format(error_rate, accuracy))
    accuracies.append(accuracy)

    # Report confusion matrix (label previously said "XGBoost" by mistake)
    confusion = confusion_matrix(y_test, pred)
    print("Confusion matrix for MLP: \n{}\n".format(confusion))
    confusions.append(confusion)

print('Average accuracy: ', np.mean(accuracies))

Year:  2010
Mean error rate: 0.47720797720797725
Accuracy: 0.5227920227920227
Confusion matrix for XGBoost: 
[[ 49 268]
 [ 67 318]]

Year:  2011
Mean error rate: 0.44571428571428573
Accuracy: 0.5542857142857143
Confusion matrix for XGBoost: 
[[110 216]
 [ 96 278]]

Year:  2012
Mean error rate: 0.4786324786324786
Accuracy: 0.5213675213675214
Confusion matrix for XGBoost: 
[[ 67 250]
 [ 86 299]]

Year:  2013
Mean error rate: 0.463623395149786
Accuracy: 0.536376604850214
Confusion matrix for XGBoost: 
[[ 76 258]
 [ 67 300]]

Year:  2014
Mean error rate: 0.4742857142857143
Accuracy: 0.5257142857142857
Confusion matrix for XGBoost: 
[[136 174]
 [158 232]]

Year:  2015
Mean error rate: 0.5064194008559202
Accuracy: 0.49358059914407987
Confusion matrix for XGBoost: 
[[ 60 282]
 [ 73 286]]

Year:  2016
Mean error rate: 0.45779685264663805
Accuracy: 0.542203147353362
Confusion matrix for XGBoost: 
[[103 232]
 [ 88 276]]

Year:  2017
Mean error rate: 0.449358059914408
Accuracy: 0.550641940085592
