In [48]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [49]:
# Read the training and test tables from the data/cleaned directory.
train_df = pd.read_csv(filepath_or_buffer='data/cleaned/train.csv')
test_df = pd.read_csv(filepath_or_buffer='data/cleaned/test.csv')

In [50]:
# Seed NumPy's global RNG. train_test_split is called without `random_state`,
# so its shuffle draws from this global state and the split is repeatable.
np.random.seed(2018)

# Hold out 20% of the rows in train_df as a validation set.
train, validation = train_test_split(train_df, test_size=0.2)

# Separate the `home_win` target from the remaining feature columns and convert
# both to NumPy arrays. `.as_matrix()` was deprecated in pandas 0.23 and removed
# in pandas 1.0; `.to_numpy()` is the supported replacement.
y_train = train['home_win'].to_numpy()
X_train = train.drop(columns=['home_win']).to_numpy()
y_validation = validation['home_win'].to_numpy()
X_validation = validation.drop(columns=['home_win']).to_numpy()


In [51]:
# Fit a logistic regression whose regularisation strength is selected by
# 5-fold cross-validation on the training split.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
lr_model = (
    LogisticRegressionCV(
        solver='lbfgs',
        max_iter=5000,
        cv=5,
        multi_class='multinomial',
    )
    .fit(X_train, y_train)
)

In [52]:
# Logistic regression score (accuracy by default): training split on the
# first line, validation split on the second.
print(
    lr_model.score(X_train, y_train),
    lr_model.score(X_validation, y_validation),
    sep='\n',
)

0.5516884906960716
0.5619834710743802


In [53]:
# Fit linear discriminant analysis with default settings on the training split.
lda_model = (
    LinearDiscriminantAnalysis()
    .fit(X_train, y_train)
)

In [54]:
# LDA score (mean accuracy): training split on the first line, validation
# split on the second.
print(
    lda_model.score(X_train, y_train),
    lda_model.score(X_validation, y_validation),
    sep='\n',
)

0.562026188835286
0.5661157024793388


In [55]:
# Fit quadratic discriminant analysis with default settings on the training split.
qda_model = (
    QuadraticDiscriminantAnalysis()
    .fit(X_train, y_train)
)

In [56]:
# QDA score (mean accuracy): training split on the first line, validation
# split on the second.
print(
    qda_model.score(X_train, y_train),
    qda_model.score(X_validation, y_validation),
    sep='\n',
)

0.5503101309441765
0.5482093663911846


In [57]:
# Fit a 100-tree random forest; each leaf must hold at least 20 training samples.
# NOTE(review): no `random_state` is passed, so the fitted forest presumably
# depends on the state of NumPy's global RNG when this cell runs — confirm.
rf_model = (
    RandomForestClassifier(
        min_samples_leaf=20,
        n_estimators=100,
    )
    .fit(X_train, y_train)
)

In [58]:
# Random forest score (mean accuracy): training split on the first line,
# validation split on the second.
print(
    rf_model.score(X_train, y_train),
    rf_model.score(X_validation, y_validation),
    sep='\n',
)

0.6212956581667816
0.5523415977961432


In [59]:
# Baseline: accuracy obtained by predicting 1 for `home_win` on every row.
# accuracy_score's signature is (y_true, y_pred), so the observed labels are
# passed as y_true and the constant prediction as y_pred; the keywords make
# the roles explicit. Accuracy is symmetric, so the printed values are unchanged.
print(accuracy_score(y_true=y_train, y_pred=np.ones(len(y_train))))
print(accuracy_score(y_true=y_validation, y_pred=np.ones(len(y_validation))))

0.4727773949000689
0.47796143250688705


In [60]:
# Separate the `home_win` target from the feature columns of the test table and
# convert both to NumPy arrays. `.as_matrix()` was deprecated in pandas 0.23 and
# removed in pandas 1.0; `.to_numpy()` is the supported replacement.
y_test = test_df['home_win'].to_numpy()
X_test = test_df.drop(columns=['home_win']).to_numpy()

In [61]:
# Logistic regression score (accuracy by default) on the test set.
lr_model.score(X=X_test, y=y_test)

0.53125

In [62]:
# LDA score (mean accuracy) on the test set.
lda_model.score(X=X_test, y=y_test)

0.5

In [63]:
# QDA score (mean accuracy) on the test set.
qda_model.score(X=X_test, y=y_test)

0.546875

In [64]:
# Random forest score (mean accuracy) on the test set.
rf_model.score(X=X_test, y=y_test)

0.546875

In [65]:
# Baseline on the test set: accuracy of predicting 1 for `home_win` on every row.
# The observed labels are passed as y_true and the constant prediction as y_pred,
# matching accuracy_score's (y_true, y_pred) signature; the value is unchanged
# because accuracy is symmetric in its two arguments.
accuracy_score(y_true=y_test, y_pred=np.ones(len(y_test)))

0.421875