In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned train/test tables.
# The training CSV was written with its row index, which pandas reads back as an
# 'Unnamed: 0' column. Later cells use every column except 'home_win' as a feature,
# so that row number would be fed to the models. Drop it here for both frames;
# errors='ignore' keeps this a no-op for a file that lacks the column.
train_df = pd.read_csv('data/cleaned/train.csv').drop(columns=['Unnamed: 0'], errors='ignore')
test_df = pd.read_csv('data/cleaned/test.csv').drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Preview the first five rows to check which columns were loaded.
train_df.head(n=5)

Unnamed: 0,home_win,home_cur_year_avg_weighted,home_total_points,away_cur_year_avg_weighted,away_total_points,home_last_year_avg,away_last_year_avg,home_two_year_ago_avg,away_two_year_ago_avg,CAF,CONCACAF,CONMEBOL,OFC,UEFA,gdp_pp_diff,rank_diff
0,0,174.7,291.66,343.17,535.63,102.82,152.19,163.78,266.37,0,0,0,0,0,39359.0,45
1,1,101.66,249.49,122.76,393.28,166.27,343.17,106.47,152.19,0,0,0,0,0,39359.0,41
2,2,151.41,289.46,257.72,436.97,131.81,102.61,151.92,242.16,0,0,0,0,0,39359.0,47
3,2,111.05,256.68,90.43,298.5,151.41,257.72,131.81,102.61,0,0,0,0,0,39359.0,16
4,1,140.16,324.85,343.17,535.63,183.57,152.19,225.79,266.37,0,0,0,0,0,112440.0,38


In [31]:
# Hold out 20% of the training rows for validation. train_test_split is called
# without random_state, so it draws from NumPy's global RNG; seeding it in this
# same cell makes the split repeatable each time the cell runs.
np.random.seed(2018)
train, validation = train_test_split(train_df, test_size = 0.2)

# Separate each partition into the label vector ('home_win') and a feature
# matrix built from every remaining column.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; it returns the same NumPy array.
y_train = train['home_win'].values
X_train = train.drop(['home_win'], axis = 1).values
y_validation = validation['home_win'].values
X_validation = validation.drop(['home_win'], axis = 1).values


In [32]:
# Multinomial logistic regression; LogisticRegressionCV picks the regularisation
# strength with 5-fold cross-validation on the training rows.
lr_model = LogisticRegressionCV(
    cv = 5,
    solver = 'lbfgs',
    multi_class = 'multinomial',
    max_iter = 5000,
)
# fit() returns the estimator itself; the trailing semicolon keeps the cell silent.
lr_model.fit(X_train, y_train);

In [33]:
# Logistic regression accuracy: training rows first, then held-out validation rows.
print(
    lr_model.score(X_train, y_train),
    lr_model.score(X_validation, y_validation),
    sep = '\n',
)

0.5527929155313351
0.5354223433242506


In [34]:
# Linear discriminant analysis with scikit-learn's default settings.
lda_model = LinearDiscriminantAnalysis()
# fit() returns the estimator itself; the trailing semicolon keeps the cell silent.
lda_model.fit(X_train, y_train);

In [35]:
# LDA accuracy: training rows first, then held-out validation rows.
print(
    lda_model.score(X_train, y_train),
    lda_model.score(X_validation, y_validation),
    sep = '\n',
)

0.5630108991825613
0.553133514986376


In [36]:
# Quadratic discriminant analysis with scikit-learn's default settings.
qda_model = QuadraticDiscriminantAnalysis()
# fit() returns the estimator itself; the trailing semicolon keeps the cell silent.
qda_model.fit(X_train, y_train);

In [37]:
# QDA accuracy: training rows first, then held-out validation rows.
print(
    qda_model.score(X_train, y_train),
    qda_model.score(X_validation, y_validation),
    sep = '\n',
)

0.5480245231607629
0.5149863760217984


In [38]:
# Random forest of 100 trees; min_samples_leaf = 20 keeps every leaf at 20+ rows
# to limit overfitting.
# random_state is pinned so the forest is the same on every run. Without it the
# trees draw from NumPy's global RNG, so the result depends on which cells ran
# before this one and how many times.
rf_model = RandomForestClassifier(
    n_estimators = 100,
    min_samples_leaf = 20,
    random_state = 2018,
).fit(X_train, y_train)

In [39]:
# Random forest accuracy: training rows first, then held-out validation rows.
print(
    rf_model.score(X_train, y_train),
    rf_model.score(X_validation, y_validation),
    sep = '\n',
)

0.6304495912806539
0.5463215258855586


In [40]:
# Baseline for comparison: accuracy of predicting label 1 for every row,
# on the training rows and then on the validation rows.
# Arguments follow accuracy_score's (y_true, y_pred) order.
print(
    accuracy_score(y_train, np.ones(len(y_train))),
    accuracy_score(y_validation, np.ones(len(y_validation))),
    sep = '\n',
)

0.47445504087193463
0.4673024523160763


In [41]:
# Separate the held-out test table into its label vector ('home_win') and a
# feature matrix built from every remaining column.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; it returns the same NumPy array.
y_test = test_df['home_win'].values
X_test = test_df.drop(['home_win'], axis = 1).values

In [42]:
# Test-set accuracy of the logistic regression, written as predictions vs. true labels.
accuracy_score(y_test, lr_model.predict(X_test))

0.53125

In [43]:
# Test-set accuracy of the LDA model, written as predictions vs. true labels.
accuracy_score(y_test, lda_model.predict(X_test))

0.5

In [44]:
# Test-set accuracy of the QDA model, written as predictions vs. true labels.
accuracy_score(y_test, qda_model.predict(X_test))

0.53125

In [45]:
# Test-set accuracy of the random forest, written as predictions vs. true labels.
accuracy_score(y_test, rf_model.predict(X_test))

0.5

In [46]:
# Baseline on the test set: accuracy of predicting label 1 for every row.
# Arguments follow accuracy_score's (y_true, y_pred) order.
accuracy_score(y_test, np.ones(len(y_test)))

0.421875