In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from predict_test_data import predict_test_data
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../data/cleaned/train_final.csv')
test = pd.read_csv('../data/cleaned/test_final.csv')

In [3]:
columns = ['attack_away_defence_home_diff', 'attack_home_defence_away_diff', 
           'attack_diff', 'defence_diff', 'midfield_diff', 'prestige_diff', 'growth_diff', 
           'full_age_diff', 'start_age_diff', 'value_euros_millions_diff','wage_euros_thousands_diff', 
           'goalkeeper_overall_diff', 'bup_dribbling_diff', 'bup_passing_diff', 'bup_speed_diff',
           'cc_crossing_diff', 'cc_passing_diff', 'cc_shooting_diff', 'd_aggresion_diff', 'd_pressure_diff', 
           'd_width_diff', 'gdp_diff', 'is_home', 'raw_gdp_diff', 
          'score_past_5_games_diff', 'score_conceded_past_5_games_diff']

train = train[columns + ['home_win']]
test = test[columns + ['Group', 'home_win']]

In [4]:
np.random.seed(14)
X_train, X_valid = train_test_split(train, test_size = 0.2)
y_train = X_train['home_win'].ravel()
X_train = X_train.drop(['home_win'], axis = 1)
y_valid= X_valid['home_win'].ravel()
X_valid = X_valid.drop(['home_win'], axis = 1)
y_test = test['home_win'].ravel()

# collect group
grp = test['Group'].ravel()
grp_id = [0 if len(i) == 1 else 1 for i in grp]
X_test = test.drop(['home_win', 'Group'], axis = 1)

In [5]:
def make_models(X_train, y_train):

    lr_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                  max_iter = 5000, 
                                  cv = 5, 
                                  multi_class='multinomial').fit(X_train, y_train)
    lda_mod = LinearDiscriminantAnalysis().fit(X_train, y_train)
    qda_mod = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
    
    
    rf_params = {'bootstrap': [True, False],
             'max_depth': [3, 5, 10, 20, 30, 40, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4, 10, 20],
             'min_samples_split': [2, 5, 10],
             'n_estimators': [10, 50, 100, 200, 500]}

    rf_mod = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=rf_params,\
                                n_iter=50, scoring='accuracy', n_jobs=-1, cv=5, verbose=1).fit(X_train, y_train)
    
    
    xgb_params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
    xgb_model = RandomizedSearchCV(estimator=XGBClassifier(objective='multi:softmax', num_class = 3), param_distributions=xgb_params,\
                                       n_iter=50, scoring='accuracy', n_jobs=-1, cv=5, verbose=1).fit(X_train, y_train)
    
    
    return (lr_mod, lda_mod, qda_mod, rf_mod, xgb_model)

In [6]:
models = make_models(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   40.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.1min finished


In [7]:
def fit_stack_model(models, X_train, y_train): 
    predictions = []
    for model in models:
        predictions.append(model.predict(X_train))
        
    predictions = np.array(predictions).T
    logit = LogisticRegression(C=1000).fit(predictions, y_train)
    return logit
    
def stack_model_predict(models, stack_model, X, test = False):
    predictions = []
    for model in models:
        if not test:
            predictions.append(model.predict(X))        
        else:
            predictions.append(predict_test_data(X, model))
    predictions = np.array(predictions).T
    return stack_model.predict(predictions)

In [8]:
models_kept = models[:5]
stack_model = fit_stack_model(models_kept, X_valid, y_valid)
accuracy_score(y_valid, stack_model_predict(models_kept, stack_model, X_valid))

0.5631578947368421

In [9]:
accuracy_score(y_test, stack_model_predict(models_kept, stack_model, X_test))

0.625