In [43]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from predict_test_data import predict_test_data

In [44]:
train = pd.read_csv('data/train_team.csv')
test = pd.read_csv('data/test_team.csv')

In [45]:
train.columns

Index(['home_win', 'attack', 'bup_dribbling', 'bup_passing', 'bup_speed',
       'cc_crossing', 'cc_passing', 'cc_shooting', 'd_aggresion', 'd_pressure',
       'd_width', 'defence', 'full_age', 'goalkeeeper_overall', 'growth',
       'midfield', 'overall', 'prestige', 'start_age', 'value_euros_millions',
       'wage_euros_thousands', 'cur_year_avg_weighted', 'cur_year_avg',
       'last_year_avg_weighted', 'last_year_avg', 'previous_points',
       'rank_change', 'rank', 'three_year_ago_avg', 'three_year_ago_weighted',
       'total_points', 'two_year_ago_avg', 'two_year_ago_weighted'],
      dtype='object')

In [46]:
train = train[['rank', 'home_win', 'attack', 'defence', 'midfield', 'overall']]
test = test[['rank', 'home_win', 'attack', 'defence', 'midfield', 'overall', 'Group']]

In [47]:
np.random.seed(2018)
train, validation = train_test_split(train, test_size = 0.2)
y_train = train['home_win'].ravel()
X_train = train.drop('home_win', axis = 1)
y_validation = validation.loc[:,'home_win'].ravel()
X_validation = validation.drop(['home_win'], axis = 1)
y_test = test['home_win'].ravel()

In [48]:
lr_model = LogisticRegressionCV(solver = 'lbfgs', max_iter = 5000, cv = 5, multi_class='multinomial').fit(X_train, y_train)

In [49]:
print(lr_model.score(X_train, y_train))
print(lr_model.score(X_validation, y_validation))

0.5228095582910934
0.5028901734104047


In [50]:
lda_model = LinearDiscriminantAnalysis().fit(X_train, y_train)

In [51]:
print(lda_model.score(X_train, y_train))
print(lda_model.score(X_validation, y_validation))

0.5170166545981173
0.5144508670520231


In [52]:
qda_model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

In [53]:
print(qda_model.score(X_train, y_train))
print(qda_model.score(X_validation, y_validation))

0.5286024619840695
0.5028901734104047


In [54]:
rf_model = RandomForestClassifier(min_samples_leaf = 20, n_estimators=100).fit(X_train, y_train)

In [55]:
print(rf_model.score(X_train, y_train))
print(rf_model.score(X_validation, y_validation))

0.5619116582186822
0.5057803468208093


In [56]:
print(accuracy_score(np.ones(len(y_train)), y_train))
print(accuracy_score(np.ones(len(y_validation)), y_validation))

0.44750181028240404
0.45664739884393063


In [57]:
accuracy_score(test['home_win'], predict_test_data(test, X_train.columns, lr_model))

0.59375

In [58]:
accuracy_score(test['home_win'], predict_test_data(test, X_train.columns, lda_model))

0.59375

In [59]:
accuracy_score(test['home_win'], predict_test_data(test, X_train.columns, qda_model))

0.59375

In [60]:
accuracy_score(test['home_win'], predict_test_data(test, X_train.columns, rf_model))

0.625

In [61]:
accuracy_score(np.ones(len(y_test)), y_test)

0.421875