In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from predict_test_data import predict_test_data
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import accuracy_score
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBClassifier


import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../data/cleaned/train_final.csv')
test = pd.read_csv('../data/cleaned/test_final.csv')

In [3]:
def full_models(train, test, seed=14):
    #Same seed as baseline to ensure same train and validation sets so that comparisons are valid.
    #10
    np.random.seed(seed)
    X_train, X_valid = train_test_split(train, test_size = 0.2)
    y_train = X_train['home_win'].ravel()
    X_train = X_train.drop(['home_win'], axis = 1)
    y_valid= X_valid['home_win'].ravel()
    X_valid = X_valid.drop(['home_win'], axis = 1)
    y_test = test['home_win'].ravel()


    # collect group
    grp = test['Group'].ravel()
    grp_id = [0 if len(i) == 1 else 1 for i in grp]
    X_test = test.drop(['home_win', 'Group'], axis = 1)

    # scale data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)
    X_test_scaled = scaler.transform(X_test)
    
    # stores the score of each model
    score = {}
    
    lr_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                  max_iter = 5000, 
                                  cv = 5, 
                                  multi_class='multinomial').fit(X_train, y_train)
    score["Logistic Regression"] = {}
    score["Logistic Regression"]["model"] = lr_mod
    score["Logistic Regression"]["Train Score"] = lr_mod.score(X_train, y_train)
    score["Logistic Regression"]["Validation Score"] = lr_mod.score(X_valid, y_valid)
    score["Logistic Regression"]["Test Score"] = lr_mod.score(X_test, y_test)

    lda_mod = LinearDiscriminantAnalysis().fit(X_train, y_train)
    
    score["LDA"] = {}
    score["LDA"]["model"] = lda_mod
    score["LDA"]["Train Score"] = lda_mod.score(X_train, y_train)
    score["LDA"]["Validation Score"] = lda_mod.score(X_valid, y_valid)
    score["LDA"]["Test Score"] = lda_mod.score(X_test, y_test)
    
    qda_mod = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
    
    score["QDA"] = {}
    score["QDA"]['model'] = qda_mod
    score["QDA"]["Train Score"] = qda_mod.score(X_train, y_train)
    score["QDA"]["Validation Score"] = qda_mod.score(X_valid, y_valid)
    score["QDA"]["Test Score"] = qda_mod.score(X_test, y_test)
    
    rf_params = {'bootstrap': [True, False],
             'max_depth': [3, 5, 10, 20, 30, 40, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4, 10, 20],
             'min_samples_split': [2, 5, 10],
             'n_estimators': [10, 50, 100, 200, 500]}

    rf_mod = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=rf_params,\
                                n_iter=50, scoring='accuracy', n_jobs=-1, cv=5, verbose=1).fit(X_train, y_train)
    
    score["Random Forest"] = {}
    score["Random Forest"]['model'] = rf_mod
    score["Random Forest"]["Train Score"] = rf_mod.score(X_train, y_train)
    score["Random Forest"]["Validation Score"] = rf_mod.score(X_valid, y_valid)
    score["Random Forest"]["Test Score"] = rf_mod.score(X_test, y_test)
    

    xgb_params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
    xgb_model = RandomizedSearchCV(estimator=XGBClassifier(objective='multi:softmax', num_class = 3), param_distributions=xgb_params,\
                                       n_iter=50, scoring='accuracy', n_jobs=-1, cv=5, verbose=1).fit(X_train, y_train)
    score["XGBoost"] = {}
    score["XGBoost"]['model'] = xgb_model
    score["XGBoost"]["Train Score"] = xgb_model.score(X_train, y_train)
    score["XGBoost"]["Validation Score"] = xgb_model.score(X_valid, y_valid)
    score["XGBoost"]["Test Score"] = xgb_model.score(X_test, y_test)


    
    # PCA on data
    pca = PCA().fit(X_train_scaled)
    X_train_pca = pca.transform(X_train_scaled)
    X_valid_pca = pca.transform(X_valid_scaled)

    # full components
    pcr_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                   max_iter = 5000, 
                                   cv = 5, 
                                   multi_class='multinomial').fit(X_train_pca, y_train)
    # test set
    X_test_pca = pca.transform(X_test_scaled)

    pcr_test_pred = [np.where(np.argsort(val) == 1)[0][0]-1 if (i==1) & (np.argmax(val) == 1)
                     else np.argmax(val)-1 for i, val in zip(grp_id, pcr_mod.predict_proba(X_test_pca))]
    
    
    score["pcr_full"] = {}
    score["pcr_full"]["model"] = pcr_mod
    score["pcr_full"]["Train Score"] = pcr_mod.score(X_train_pca, y_train)
    score["pcr_full"]["Validation Score"] = pcr_mod.score(X_valid_pca, y_valid)
    score["pcr_full"]["Test Score"] = accuracy_score(y_test, pcr_test_pred)
    
    pca_cumvar = np.cumsum(pca.explained_variance_ratio_) 
    
    pca80_com = np.argmax(pca_cumvar >= 0.8)+1
    pca90_com = np.argmax(pca_cumvar >= 0.9)+1
    
    # 80% variation
    pca80 = PCA(n_components=pca80_com).fit(X_train_scaled)
    X_train_pca80 = pca80.transform(X_train_scaled)
    X_valid_pca80 = pca80.transform(X_valid_scaled)

    # 90% variation
    pca90 = PCA(n_components=pca90_com).fit(X_train_scaled)
    X_train_pca90 = pca90.transform(X_train_scaled)
    X_valid_pca90 = pca90.transform(X_valid_scaled)

    # fit models
    pcr80_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                     max_iter = 5000, 
                                     cv = 5, 
                                     multi_class='multinomial').fit(X_train_pca80, y_train)
    pcr90_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                     max_iter = 5000, 
                                     cv = 5, 
                                     multi_class='multinomial').fit(X_train_pca90, y_train)
    
    # test set
    # 80% variation
    X_test_pca80 = pca80.transform(X_test_scaled)

    pcr80_test_pred = [np.where(np.argsort(val) == 1)[0][0]-1 if (i==1) & (np.argmax(val) == 1)
                       else np.argmax(val)-1 for i, val in zip(grp_id, pcr80_mod.predict_proba(X_test_pca80))]

    # 90% variation
    X_test_pca90 = pca90.transform(X_test_scaled)

    pcr90_test_pred = [np.where(np.argsort(val) == 1)[0][0]-1 if (i==1) & (np.argmax(val) == 1)
                       else np.argmax(val)-1 for i, val in zip(grp_id, pcr90_mod.predict_proba(X_test_pca90))]
    
    # 80% variation
    score["pcr_80%"] = {}
    score["pcr_80%"]["model"] = pcr80_mod
    score["pcr_80%"]["Train Score"] = pcr80_mod.score(X_train_pca80, y_train)
    score["pcr_80%"]["Validation Score"] = pcr80_mod.score(X_valid_pca80, y_valid)
    score["pcr_80%"]["Test Score"] = accuracy_score(y_test, pcr80_test_pred)
    
    # 90% variation
    score["pcr_90%"] = {}
    score["pcr_90%"]["model"] = pcr90_mod
    score["pcr_90%"]["Train Score"] = pcr90_mod.score(X_train_pca90, y_train)
    score["pcr_90%"]["Validation Score"] = pcr90_mod.score(X_valid_pca90, y_valid)
    score["pcr_90%"]["Test Score"] = accuracy_score(y_test, pcr90_test_pred)
    
    
    pcr_train_score = []
    pcr_valid_score = []
    pcr_test_score = []

    for i in np.arange(1, X_train.shape[1]):
        pca_cv = PCA(n_components=i).fit(X_train_scaled)
        X_train_pca_cv = pca_cv.transform(X_train_scaled)
        X_valid_pca_cv = pca_cv.transform(X_valid_scaled)
        X_test_pca_cv = pca_cv.transform(X_test_scaled)
    
        pcr_cv_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                          max_iter = 5000, 
                                          cv = 5, 
                                          multi_class='multinomial').fit(X_train_pca_cv, y_train)
    
        pcr_train_score.append(pcr_cv_mod.score(X_train_pca_cv, y_train))
        pcr_valid_score.append(pcr_cv_mod.score(X_valid_pca_cv, y_valid))
    
        pcr_test_score.append(accuracy_score(y_test,
                                             [np.where(np.argsort(val) == 1)[0][0]-1 if (i==1) & (np.argmax(val) == 1)
                                              else np.argmax(val)-1 for i, val in zip(grp_id, pcr_cv_mod.predict_proba(X_test_pca_cv))]))
        
    pca_best_com = np.argmax(pcr_valid_score)+1
        
    pca_best = PCA(n_components=pca_best_com).fit(X_train_scaled)
    X_train_pca_best = pca_best.transform(X_train_scaled)
    X_valid_pca_best = pca_best.transform(X_valid_scaled)

    # fit models
    pcr_best_mod = LogisticRegressionCV(solver = 'lbfgs', 
                                        max_iter = 5000, 
                                        cv = 5, 
                                        multi_class='multinomial').fit(X_train_pca_best, y_train)
    
    # test set
    X_test_pca_best = pca_best.transform(X_test_scaled)

    pcr_best_test_pred = [np.where(np.argsort(val) == 1)[0][0]-1 if (i==1) & (np.argmax(val) == 1)
                          else np.argmax(val)-1 for i, val in zip(grp_id, pcr_best_mod.predict_proba(X_test_pca_best))]

    score["pcr_best"] = {}
    score["pcr_best"]["model"] = pcr_best_mod
    score["pcr_best"]["Train Score"] = pcr_best_mod.score(X_train_pca_best, y_train)
    score["pcr_best"]["Validation Score"] = pcr_best_mod.score(X_valid_pca_best, y_valid)
    score["pcr_best"]["Test Score"] = accuracy_score(y_test, pcr_best_test_pred)
    
    lb = LabelBinarizer()
    y_train_lb = lb.fit_transform(y_train)
    
    plsda_train_score = []
    plsda_valid_score = []

    for i in np.arange(1, X_train.shape[1]):
        plsda_mod = PLSRegression(n_components=i, scale=False) 
        plsda_mod.fit(X_train_scaled, y_train_lb)

        plsda_train_score.append(accuracy_score(y_train, np.argmax(plsda_mod.predict(X_train_scaled), axis=1) - 1))
        plsda_valid_score.append(accuracy_score(y_valid, np.argmax(plsda_mod.predict(X_valid_scaled), axis=1) - 1))
    
    plsda_best_com = np.argmax(plsda_valid_score)+1
    
    # check test accuracy
    plsda_best_mod = PLSRegression(n_components=plsda_best_com, scale=False) 
    plsda_best_mod.fit(X_train_scaled, y_train_lb)

    plsda_best_test_pred = [np.where(np.argsort(val) == 1)[0][0]-1 if (i==1) & (np.argmax(val) == 1) 
                            else np.argmax(val)-1 for i, val in zip(grp_id, plsda_best_mod.predict(X_test_scaled))]
    
    score["plsda_best"] = {}
    score["plsda_best"]["model"] = plsda_best_mod
    score["plsda_best"]["Train Score"] = accuracy_score(y_train, np.argmax(plsda_mod.predict(X_train_scaled), axis=1) - 1)
    score["plsda_best"]["Validation Score"] = accuracy_score(y_valid, np.argmax(plsda_mod.predict(X_valid_scaled), axis=1) - 1)
    score["plsda_best"]["Test Score"] = accuracy_score(y_test, plsda_best_test_pred)
    
    df_result = pd.DataFrame(score).T
    
    best_mod = df_result['Validation Score'].astype(float).argmax()
    
    best_test_score = score[best_mod]['Test Score']
    
    return df_result, best_mod, best_test_score

In [4]:
columns = ['attack_away_defence_home_diff', 'attack_home_defence_away_diff', 
           'attack_diff', 'defence_diff', 'midfield_diff', 'prestige_diff', 'growth_diff', 
           'full_age_diff', 'start_age_diff', 'value_euros_millions_diff','wage_euros_thousands_diff', 
           'goalkeeper_overall_diff', 'bup_dribbling_diff', 'bup_passing_diff', 'bup_speed_diff',
           'cc_crossing_diff', 'cc_passing_diff', 'cc_shooting_diff', 'd_aggresion_diff', 'd_pressure_diff', 
           'd_width_diff', 'gdp_diff', 'is_home', 'raw_gdp_diff', 
          'score_past_5_games_diff', 'score_conceded_past_5_games_diff']
train = train[columns + ['home_win']]
test = test[columns + ['Group', 'home_win']]

In [5]:
# 85 seems like a good seed
df, best_mod, best_score = full_models(train, test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   40.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.3min finished


In [6]:
df

Unnamed: 0,Test Score,Train Score,Validation Score,model
Logistic Regression,0.5625,0.514832,0.55,"LogisticRegressionCV(Cs=10, class_weight=None,..."
LDA,0.609375,0.514173,0.542105,"LinearDiscriminantAnalysis(n_components=None, ..."
QDA,0.546875,0.526697,0.444737,"QuadraticDiscriminantAnalysis(priors=None, reg..."
Random Forest,0.640625,0.523401,0.536842,"RandomizedSearchCV(cv=5, error_score='raise-de..."
XGBoost,0.59375,0.591958,0.515789,"RandomizedSearchCV(cv=5, error_score='raise-de..."
pcr_full,0.671875,0.511536,0.534211,"LogisticRegressionCV(Cs=10, class_weight=None,..."
pcr_80%,0.65625,0.507581,0.552632,"LogisticRegressionCV(Cs=10, class_weight=None,..."
pcr_90%,0.703125,0.512195,0.528947,"LogisticRegressionCV(Cs=10, class_weight=None,..."
pcr_best,0.65625,0.507581,0.552632,"LogisticRegressionCV(Cs=10, class_weight=None,..."
plsda_best,0.65625,0.509558,0.536842,"PLSRegression(copy=True, max_iter=500, n_compo..."
