In [3]:
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import time

In [5]:
import numpy as np
import random

# XGBoost hyper-parameters shared by every model in the ensemble.
params = {
    'gamma': 10,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 5,
    'n_estimators': 200,
    'subsample': 0.5,
}

# Columns removed from the feature matrix. The two score columns define the
# label, so keeping them would leak the answer; the others are dropped as
# non-features (presumably identifiers -- confirm against the CSV schema).
NON_FEATURE_COLUMNS = ['date', 'home_team', 'away_team', 'home_score',
                       'away_score', 'home_pitcher', 'away_pitcher']

SEASONS = range(2010, 2017)   # 2010 through 2016 inclusive (7 seasons)
N_MODELS = 10                 # ensemble size per season
ENSEMBLE_SEED = 0             # fixes the per-model seeds so runs are repeatable

# Dedicated generator: reproducible model seeds without touching the global
# `random` state that other cells may depend on.
seed_rng = random.Random(ENSEMBLE_SEED)

for year in SEASONS:
    # Read in dataset
    df = pd.read_csv('Rolling Average Stats/' + str(year) + '.csv')

    # Create observations and labels.
    # `columns=` replaces the positional `axis` argument, which pandas 2.0 removed.
    X = df.drop(columns=NON_FEATURE_COLUMNS)
    y = (df.home_score > df.away_score).astype(int)  # 1 if home team wins, 0 otherwise

    # Train on the first ~2/3 of the rows and test on the last ~1/3.
    # shuffle=False keeps file order; this assumes rows are chronological -- TODO confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, shuffle=False)

    # Count, per test game, how many ensemble members predict a home win.
    votes = np.zeros(len(y_test), dtype=int)
    for model_seed in seed_rng.sample(range(1, 1000000), N_MODELS):
        # NOTE(review): recent xgboost releases deprecate `silent` and `seed` in
        # favour of `verbosity` and `random_state`; confirm the installed
        # version before switching.
        clfi = XGBClassifier(**params, silent=False, seed=model_seed)
        clfi.fit(X_train, y_train.values.ravel())
        votes += np.asarray(clfi.predict(X_test)).astype(int)

    # Strict majority vote: an exact tie resolves to 0 (home team does not win).
    # This matches the previous np.round(mean) behaviour (0.5 rounds to 0) but
    # no longer depends on floating-point accumulation.
    pred_avg = (2 * votes > N_MODELS).astype(int)

    # Report error rate and accuracy of the ensemble on the held-out games
    accuracy = accuracy_score(y_test, pred_avg)
    error_rate = 1 - accuracy
    print('Year: ', year)
    print("Mean error rate: {}\nAccuracy: {}".format(error_rate, accuracy))

    # Report confusion matrix of the ensemble prediction
    print("Confusion matrix for XGBoost: \n{}".format(confusion_matrix(y_test, pred_avg)))

Year:  2010
Mean error rate: 0.4686609686609686
Accuracy: 0.5313390313390314
Confusion matrix for XGBoost: 
[[ 79 238]
 [ 91 294]]
Year:  2011
Mean error rate: 0.4057142857142857
Accuracy: 0.5942857142857143
Confusion matrix for XGBoost: 
[[177 149]
 [135 239]]
Year:  2012
Mean error rate: 0.43162393162393164
Accuracy: 0.5683760683760684
Confusion matrix for XGBoost: 
[[120 197]
 [106 279]]
Year:  2013
Mean error rate: 0.41512125534950073
Accuracy: 0.5848787446504993
Confusion matrix for XGBoost: 
[[139 195]
 [ 96 271]]
Year:  2014
Mean error rate: 0.43999999999999995
Accuracy: 0.56
Confusion matrix for XGBoost: 
[[166 144]
 [164 226]]
Year:  2015
Mean error rate: 0.4322396576319544
Accuracy: 0.5677603423680456
Confusion matrix for XGBoost: 
[[ 99 243]
 [ 60 299]]
Year:  2016
Mean error rate: 0.4306151645207439
Accuracy: 0.5693848354792561
Confusion matrix for XGBoost: 
[[120 215]
 [ 86 278]]
