In [1]:
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import time

# Read in dataset
df = pd.read_csv('Rolling Average Stats/2017.csv')

# Create observation and labels.
# These columns identify the game or contain its final score, so they are
# removed from the feature matrix (the scores would leak the label).
NON_FEATURE_COLUMNS = [
    'date',
    'home_team',
    'away_team',
    'home_score',
    'away_score',
    'home_pitcher',
    'away_pitcher',
]
# Use the `columns=` keyword: passing the axis positionally (`df.drop(cols, 1)`)
# is deprecated and raises a TypeError on pandas >= 2.0.
X = df.drop(columns=NON_FEATURE_COLUMNS)
y = df.home_score > df.away_score  # boolean label: True if home team wins, False otherwise

# Use 1st 2/3rds of season for training, test on last 1/3.
# shuffle=False keeps the rows in file order, so the last 33% of rows form the
# test set. NOTE(review): this is a chronological split only if the CSV is
# sorted by date -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)



In [2]:
# Specify XGBoost parameters: every list below is one axis of the search grid.
max_depth = [3, 6, 10]
min_child_weight = [0.1,0.5, 1, 2, 5]
gamma = [0, 1, 10, 100]
subsample = [0.5, 1]
#colsample_bytree = [0.5, 1] # Didn't help
eta = [0.01,0.3, 1] # if eta (step size) goes down, num_rounds must go up
# In the scikit-learn wrapper the number of boosting rounds (native `num_round`)
# is controlled by `n_estimators`, so no separate num_round entry is needed.
n_estimators = [100, 200, 800]

# Keys must be XGBClassifier constructor argument names; `eta` is exposed
# there as `learning_rate`.
parameters = {'max_depth':max_depth, 'gamma':gamma, 'learning_rate':eta, 'min_child_weight':min_child_weight, 'subsample':subsample, 'n_estimators':n_estimators}

# Create classifier: exhaustive search over the full grid
# (3 * 4 * 3 * 5 * 2 * 3 = 1080 combinations, each cross-validated).
# NOTE(review): the default `cv` splitter is used; if the rows are chronological,
# some folds validate on games that precede the training games -- consider
# sklearn.model_selection.TimeSeriesSplit. Confirm before changing, as it will
# alter the selected parameters.
clf = GridSearchCV(XGBClassifier(), parameters)

# Train classifier.
# perf_counter() is a monotonic clock, so the long-running measurement is not
# distorted if the system wall clock is adjusted while the search runs.
start_time = time.perf_counter()
clf.fit(X_train, y_train.values.ravel())
clf_fit_time = (time.perf_counter() - start_time)

# Report time of execution (value is in seconds)
print("XGBoost train time : {:.5f}".format(clf_fit_time))

XGBoost train time : 6400.15129


In [9]:
# Display the parameter combination selected by the grid search
# (requires the GridSearchCV object `clf` to have been fitted already).
clf.best_params_

{'gamma': 10,
 'learning_rate': 0.01,
 'max_depth': 6,
 'min_child_weight': 5,
 'n_estimators': 200,
 'subsample': 0.5}

In [3]:
# Test classifier: predict the held-out games with the refit best estimator.
# perf_counter() is used because the call takes only milliseconds and
# time.time() can have coarser resolution than that on some platforms.
start_time = time.perf_counter()
clf_pred = clf.predict(X_test)
clf_pred_time = (time.perf_counter() - start_time)

# Report time of execution
print("XGBoost predict time: {:.5f} seconds".format(clf_pred_time))

XGBoost predict time: 0.01049 seconds


In [4]:
# Score the grid-searched model on the held-out test set:
# accuracy is the fraction of games predicted correctly, error rate its complement.
accuracy = accuracy_score(y_test, clf_pred)
error_rate = 1 - accuracy
summary_template = "Mean error rate for XGBoost: \n{} \nAccuracy rate for XGBoost: \n{}"
print(summary_template.format(error_rate, accuracy))

# Confusion matrix of the same predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).
test_confusion = confusion_matrix(y_test, clf_pred)
print("Confusion matrix for XGBoost: \n{}".format(test_confusion))

Mean error rate for XGBoost: 
0.4322396576319544 
Accuracy rate for XGBoost: 
0.5677603423680456
Confusion matrix for XGBoost: 
[[ 91 235]
 [ 68 307]]


In [10]:
import numpy as np
import random

# Majority-vote ensemble: refit the best grid-search configuration with several
# different XGBoost seeds and let the models vote on every test game.
N_MODELS = 10       # ensemble size (previously implied by the hard-coded 0.1 weight)
ENSEMBLE_SEED = 0   # fixes which model seeds are drawn, so a re-run reproduces the result

# Draw the per-model seeds from a dedicated, seeded generator instead of the
# global `random` state; an unseeded draw gave different models on every run.
model_seeds = random.Random(ENSEMBLE_SEED).sample(range(1, 1000000), N_MODELS)

# Count "home win" votes as integers rather than accumulating 0.1 * prediction
# in floating point and rounding afterwards.
votes = np.zeros(len(y_test), dtype=int)
for i in model_seeds:
    clfi =  XGBClassifier(**clf.best_params_, silent=False, seed=i)
    clfi.fit(X_train, y_train.values.ravel())
    clfi_pred = clfi.predict(X_test)
    votes += np.asarray(clfi_pred).astype(int)

# Strict majority wins. An exact tie is predicted as 0 (home loss), which is
# what the previous np.round(0.5) == 0.0 (round-half-to-even) produced.
# Kept as a float array of 0.0 / 1.0, the same form the rounding step returned.
pred_avg = (2 * votes > N_MODELS).astype(float)

# Report mean error rate of the ensemble vote
accuracy = accuracy_score(y_test, pred_avg)
error_rate = 1 - accuracy
print("Mean error rate for XGBoost: \n{} \nAccuracy rate for XGBoost: \n{}".format(error_rate, accuracy))

# Report confusion matrix of the ensemble vote
print("Confusion matrix for XGBoost: \n{}".format(confusion_matrix(y_test, pred_avg)))

Mean error rate for XGBoost: 
0.4179743223965763 
Accuracy rate for XGBoost: 
0.5820256776034237
Confusion matrix for XGBoost: 
[[107 219]
 [ 74 301]]
