In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from predict_test_data import predict_test_data

In [3]:
# Read the team-level training and test tables (in that order) from the
# sibling data directory.
train, test = map(
    pd.read_csv,
    ('../data/train_team.csv', '../data/test_team.csv'),
)

In [4]:
# Display the column labels of the training frame so the available
# features can be reviewed before selecting a subset.
train.columns

Index(['home_win', 'attack_diff', 'bup_dribbling_diff', 'bup_passing_diff',
       'bup_speed_diff', 'cc_crossing_diff', 'cc_passing_diff',
       'cc_shooting_diff', 'd_aggresion_diff', 'd_pressure_diff',
       'd_width_diff', 'defence_diff', 'full_age_diff',
       'goalkeeeper_overall_diff', 'growth_diff', 'midfield_diff',
       'overall_diff', 'prestige_diff', 'start_age_diff',
       'value_euros_millions_diff', 'wage_euros_thousands_diff',
       'cur_year_avg_weighted_diff', 'cur_year_avg_diff',
       'last_year_avg_weighted_diff', 'last_year_avg_diff',
       'previous_points_diff', 'rank_change_diff', 'rank_diff',
       'three_year_ago_avg_diff', 'three_year_ago_weighted_diff',
       'total_points_diff', 'two_year_ago_avg_diff',
       'two_year_ago_weighted_diff'],
      dtype='object')

In [5]:
# Narrow both frames to the target (`home_win`) plus the five difference
# features used by the models below. The test frame additionally keeps
# its 'Group' column.
train = train.loc[:, [
    'rank_diff',
    'home_win',
    'attack_diff',
    'defence_diff',
    'midfield_diff',
    'overall_diff',
]]
test = test.loc[:, [
    'rank_diff',
    'home_win',
    'attack_diff',
    'defence_diff',
    'midfield_diff',
    'overall_diff',
    'Group',
]]

In [6]:
# Hold out 20% of the training rows as a validation set.
# train_test_split is given no random_state here, so the global NumPy seed
# set on the next line is what makes the shuffle repeatable.
# NOTE: `train` is rebound to the 80% portion, so re-running this cell
# without re-running the cells above re-splits the already-reduced frame.
np.random.seed(2018)
train, validation = train_test_split(train, test_size=0.2)

# `home_win` is the target; every other remaining column is a model input.
# Series.to_numpy() returns the 1-D label array directly; Series.ravel()
# is deprecated in recent pandas releases.
y_train = train['home_win'].to_numpy()
X_train = train.drop(columns='home_win')
y_validation = validation['home_win'].to_numpy()
X_validation = validation.drop(columns='home_win')

# Only the labels are extracted for the test frame; its features are
# handed to predict_test_data in the evaluation cells.
y_test = test['home_win'].to_numpy()

In [7]:
# Logistic regression with the regularisation strength chosen by 5-fold
# cross-validation on the training split.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument — confirm against the pinned version before upgrading.
lr_model = (
    LogisticRegressionCV(
        cv=5,
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=5000,
    )
    .fit(X_train, y_train)
)

In [8]:
# Logistic regression accuracy: training split first, validation split second.
print(
    lr_model.score(X_train, y_train),
    lr_model.score(X_validation, y_validation),
    sep='\n',
)

0.5228095582910934
0.5028901734104047


In [9]:
# Linear discriminant analysis with default settings, fitted on the
# training split.
lda_model = (
    LinearDiscriminantAnalysis()
    .fit(X_train, y_train)
)

In [10]:
# LDA accuracy: training split first, validation split second.
print(
    lda_model.score(X_train, y_train),
    lda_model.score(X_validation, y_validation),
    sep='\n',
)

0.5170166545981173
0.5144508670520231


In [11]:
# Quadratic discriminant analysis with default settings, fitted on the
# training split.
qda_model = (
    QuadraticDiscriminantAnalysis()
    .fit(X_train, y_train)
)

In [12]:
# QDA accuracy: training split first, validation split second.
print(
    qda_model.score(X_train, y_train),
    qda_model.score(X_validation, y_validation),
    sep='\n',
)

0.5286024619840695
0.5028901734104047


In [13]:
# Random forest of 100 trees; each leaf must hold at least 20 training rows.
# NOTE(review): no random_state is passed, so reproducibility appears to
# rely on the global NumPy seed set in the split cell and on running the
# cells in order — confirm.
rf_model = (
    RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=20,
    )
    .fit(X_train, y_train)
)

In [14]:
# Random forest accuracy: training split first, validation split second.
print(
    rf_model.score(X_train, y_train),
    rf_model.score(X_validation, y_validation),
    sep='\n',
)

0.5619116582186822
0.5057803468208093


In [15]:
# Naive baseline: accuracy obtained by predicting label 1 for every row
# (presumably "home team wins" — confirm the label encoding).
# accuracy_score expects (y_true, y_pred), so the observed labels go first
# and the constant prediction second, matching the test-set cells below.
print(accuracy_score(y_train, np.ones(len(y_train))))
print(accuracy_score(y_validation, np.ones(len(y_validation))))

0.44750181028240404
0.45664739884393063


In [16]:
# Test-set accuracy of the logistic regression.
# predict_test_data is a project helper not shown here; presumably it
# returns one predicted `home_win` label per test row — confirm.
accuracy_score(
    y_true=test['home_win'],
    y_pred=predict_test_data(test, X_train.columns, lr_model),
)

0.59375

In [17]:
# Test-set accuracy of the LDA model.
# predict_test_data is a project helper not shown here; presumably it
# returns one predicted `home_win` label per test row — confirm.
accuracy_score(
    y_true=test['home_win'],
    y_pred=predict_test_data(test, X_train.columns, lda_model),
)

0.59375

In [18]:
# Test-set accuracy of the QDA model.
# predict_test_data is a project helper not shown here; presumably it
# returns one predicted `home_win` label per test row — confirm.
accuracy_score(
    y_true=test['home_win'],
    y_pred=predict_test_data(test, X_train.columns, qda_model),
)

0.59375

In [19]:
# Test-set accuracy of the random forest.
# predict_test_data is a project helper not shown here; presumably it
# returns one predicted `home_win` label per test row — confirm.
accuracy_score(
    y_true=test['home_win'],
    y_pred=predict_test_data(test, X_train.columns, rf_model),
)

0.625

In [20]:
# Naive baseline on the test set: accuracy of predicting label 1 for every
# row. Arguments follow accuracy_score's (y_true, y_pred) order — observed
# labels first, constant prediction second — like the model cells above.
accuracy_score(y_test, np.ones(len(y_test)))

0.421875