In [1]:
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import time

# Read in dataset
df = pd.read_csv('Rolling Average Stats/2017.csv')

# Create observations and labels.
# Identifier columns and the final scores are removed from the features; the
# scores define the label, so leaving them in would leak the answer.
# `columns=` is used instead of a positional axis argument, which pandas 2.0+
# no longer accepts.
X = df.drop(columns=['date', 'home_team', 'away_team', 'home_score', 'away_score',
                     'home_pitcher', 'away_pitcher'])
y = df.home_score > df.away_score # True if home team wins, False otherwise

# Use the first 70% of rows for training and test on the last 30%.
# shuffle=False keeps the file's row order, so the test rows are the final rows
# of the file (presumably the end of the season -- confirm the CSV is date-sorted).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=False)



In [None]:
# Specify XGBoost parameters
max_depth = [1, 3, 7, 10]
min_child_weight = [0.1, 0.5, 1, 2, 5]
gamma = [0, 1, 10, 100]
subsample = [0.5, 1]
# `eta` is XGBoost's alias for `learning_rate` (step size), so only one list is
# needed. If the step size goes down, the number of boosting rounds must go up.
learning_rate = [0.01, 0.1, 1]
#colsample_bytree = [0.5, 1] # Didn't help
#eta = [0.01,0.3, 1] # Was passed under a duplicate 'learning_rate' key and silently overwritten
#num_round = # Not sure what the default value is, so not sure what to try

# One entry per hyper-parameter. A dict literal keeps only the last value for a
# repeated key, so each name must appear exactly once.
parameters = {
    'max_depth': max_depth,
    'gamma': gamma,
    'min_child_weight': min_child_weight,
    'subsample': subsample,
    'learning_rate': learning_rate,
}

# Create classifier: exhaustive search over every combination in `parameters`
clf = GridSearchCV(XGBClassifier(), parameters)

# Train classifier
start_time = time.time()
clf.fit(X_train, y_train.values.ravel())
clf_fit_time = (time.time() - start_time)

# Report time of execution
print("XGBoost train time : {:.5f}".format(clf_fit_time))

In [None]:
# Run `clf` over the held-out features and time the call
start_time = time.time()
clf_pred = clf.predict(X_test)
clf_pred_time = time.time() - start_time

# Print how long the prediction pass took
predict_time_msg = "XGBoost predict time: {:.5f} seconds".format(clf_pred_time)
print(predict_time_msg)

In [None]:
# Score the predictions: accuracy and its complement, the error rate
accuracy = accuracy_score(y_test, clf_pred)
error_rate = 1 - accuracy
metrics_summary = "Mean error rate for XGBoost: \n{} \nAccuracy rate for XGBoost: \n{}".format(error_rate, accuracy)
print(metrics_summary)

# Show the confusion matrix of true labels against predicted labels
xgb_confusion = confusion_matrix(y_test, clf_pred)
print("Confusion matrix for XGBoost: \n{}".format(xgb_confusion))

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
# Retrain the MLP from different random initialisations until one run clears
# the target accuracy, giving up after a fixed number of attempts so the cell
# always terminates. Each attempt is seeded with its index so a re-run
# reproduces the same sequence of models.
# NOTE(review): the model is selected by its score on X_test, so the reported
# accuracy is an optimistic estimate; a separate validation split would be
# needed for an unbiased number.
MAX_ATTEMPTS = 100
TARGET_ACCURACY = 0.59

clf = None
predict = None
accuracy = 0.0
for attempt in range(MAX_ATTEMPTS):
    candidate = MLPClassifier(max_iter=200, solver='sgd', learning_rate='adaptive',
                              hidden_layer_sizes=(25,), random_state=attempt)
    candidate.fit(X_train, y_train)
    candidate_predict = candidate.predict(X_test)
    candidate_accuracy = accuracy_score(y_test, candidate_predict)
    print(candidate_accuracy)
    # Keep the best run seen so far, so `clf`, `predict` and `accuracy` are
    # meaningful even when no run reaches the target.
    if clf is None or candidate_accuracy > accuracy:
        clf, predict, accuracy = candidate, candidate_predict, candidate_accuracy
    if candidate_accuracy > TARGET_ACCURACY:
        break
else:
    # Loop finished without a break: no attempt exceeded the target.
    print("No run exceeded {} in {} attempts; keeping best accuracy {}".format(
        TARGET_ACCURACY, MAX_ATTEMPTS, accuracy))

0.519623233909
0.5431711146
0.551020408163
0.532182103611
0.516483516484
0.50863422292
0.562009419152
0.518053375196
0.538461538462
0.533751962323
0.565149136578
0.580847723705
0.571428571429
0.555729984301
0.546310832025
0.551020408163
0.568288854003
0.5431711146
0.555729984301
0.562009419152
0.521193092622
0.576138147567
0.521193092622
0.568288854003
0.527472527473
0.533751962323
0.562009419152
0.533751962323
0.536891679749
0.547880690738
0.536891679749
0.507064364207
0.549450549451
0.511773940345
0.514913657771
0.540031397174
0.56043956044
0.514913657771
0.519623233909
0.503924646782
0.580847723705
0.541601255887
0.529042386185
0.557299843014
0.558869701727
0.582417582418
0.533751962323
0.538461538462
0.552590266876
0.529042386185
0.538461538462
0.5431711146
0.538461538462
0.529042386185
0.5431711146
0.536891679749
0.557299843014
0.519623233909
0.546310832025
0.569858712716
0.530612244898
0.5431711146
0.547880690738
0.527472527473
0.535321821036
0.551020408163
0.569858712716
0.54160

In [None]:
from sklearn import svm
# Fit a support-vector classifier with scikit-learn's default settings.
# Note: this rebinds the name `svm` from the sklearn module to the estimator
# instance; the `from sklearn import svm` line at the top of this cell restores
# the module binding each time the cell is re-run.
svm = svm.SVC()
svm.fit(X_train, y_train)

# Score the SVM's own predictions. `clf` is a different model from an earlier
# cell, so calling `clf.predict` here would report that model's results instead.
predict = svm.predict(X_test)
accuracy = accuracy_score(y_test, predict)
print(accuracy)
print(confusion_matrix(y_test, predict))