In [5]:
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import time

# Read in dataset
df = pd.read_csv('Rolling Average Stats/2017.csv')

# Create observation and labels
X = df.drop(['date', 'home_team', 'away_team', 'home_score', 'away_score', 'home_pitcher', 'away_pitcher'], 1)
y = df.home_score > df.away_score # 1 if home team wins, 0 otherwise

# Use 1st 2/3rds of season for training, test on last 1/3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)

<bound method NDFrame.head of 0        True
1        True
2        True
3       False
4       False
5       False
6        True
7        True
8        True
9        True
10       True
11       True
12      False
13      False
14       True
15       True
16       True
17       True
18      False
19      False
20       True
21      False
22      False
23       True
24       True
25      False
26       True
27      False
28      False
29       True
        ...  
2093     True
2094    False
2095     True
2096    False
2097    False
2098     True
2099    False
2100    False
2101     True
2102    False
2103     True
2104    False
2105     True
2106     True
2107     True
2108    False
2109    False
2110    False
2111     True
2112     True
2113    False
2114    False
2115     True
2116    False
2117     True
2118    False
2119     True
2120    False
2121     True
2122    False
Length: 2123, dtype: bool>

In [2]:
# Specify XGBoost parameters
max_depth = [1, 3, 7, 10]
min_child_weight = [0.1,0.5, 1, 2, 5]
gamma = [0, 1, 10, 100]
subsample = [0.5, 1]
learning_rate = [0.01, 0.1, 1]
#colsample_bytree = [0.5, 1] # Didn't help
eta = [0.01,0.3, 1] # if eta (step size) goes down, num_rounds must go up
#num_round = # Not sure what the default value is, so not sure what to try

parameters = {'max_depth':max_depth, 'gamma':gamma, 'learning_rate':eta, 'min_child_weight':min_child_weight, 
              'subsample':subsample, 'learning_rate':learning_rate}

# Create classifier
clf = GridSearchCV(XGBClassifier(), parameters)

# Train classifier
start_time = time.time()
clf.fit(X_train, y_train.values.ravel())
clf_fit_time = (time.time() - start_time)

# Report time of execution
print("XGBoost train time : {:.5f}".format(clf_fit_time))

XGBoost train time : 393.45840


In [3]:
# Test classifier
start_time = time.time()
clf_pred = clf.predict(X_test)
clf_pred_time = (time.time() - start_time)

# Report time of execution
print("XGBoost predict time: {:.5f} seconds".format(clf_pred_time))

XGBoost predict time: 0.00365 seconds


In [4]:
# Report mean error rate
accuracy = accuracy_score(y_test, clf_pred)
error_rate = 1 - accuracy
print("Mean error rate for XGBoost: \n{} \nAccuracy rate for XGBoost: \n{}".format(error_rate, accuracy))

# Report confusion matrix for each classifier
print("Confusion matrix for XGBoost: \n{}".format(confusion_matrix(y_test, clf_pred)))

Mean error rate for XGBoost: 
0.44650499286733236 
Accuracy rate for XGBoost: 
0.5534950071326676
Confusion matrix for XGBoost: 
[[125 201]
 [112 263]]


In [260]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(max_iter=200, solver='adam', learning_rate='adaptive', hidden_layer_sizes=(44,44,44,44))
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(44, 44, 44, 44), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [261]:
predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, predict)
print(accuracy)
print(confusion_matrix(y_test, predict))

0.549215406562
[[ 37 289]
 [ 27 348]]


In [265]:
from sklearn import svm
svm = svm.SVC(kernel='poly')
svm.fit(X_train, y_train)
predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, predict)
print(accuracy)
print(confusion_matrix(y_test, predict))

0.549215406562
[[ 37 289]
 [ 27 348]]
