# 8. Stacking

In [19]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
# import tensorflow as tf
import pickle

## 8.1 Tree-based models

In [24]:
train_set = pd.read_csv("./preprocessed_data/processed_base_train_set.csv", index_col=0)
validation_set = pd.read_csv("./preprocessed_data/processed_base_validation_set.csv", index_col=0)
test_set = pd.read_csv("./preprocessed_data/processed_base_test_set.csv", index_col=0)

X_train_base, y_train_base = np.array(train_set.drop(columns='FTR')), np.array(train_set['FTR'])
X_val_base, y_val_base = np.array(validation_set.drop(columns='FTR')), np.array(validation_set['FTR'])
X_test_base, y_test_base = np.array(test_set.drop(columns='FTR')), np.array(test_set['FTR'])

# merge training and validation sets
X_all_base = np.concatenate([X_train_base, X_val_base], axis=0)
y_all_base = np.concatenate([y_train_base, y_val_base], axis=0)

# load the model
with open('./models/TreeModelsVotingClassifier.pickle', 'rb') as f:
    voting_clf_tree = pickle.load(f)

## 8.2 Linear models

In [25]:
# train_set = pd.read_csv("./preprocessed_data/processed_categorical_train_set.csv", index_col=0)
# validation_set = pd.read_csv("./preprocessed_data/processed_categorical_validation_set.csv", index_col=0)
# test_set = pd.read_csv("./preprocessed_data/processed_categorical_test_set.csv", index_col=0)

# X_train_cat, y_train_cat = np.array(train_set.drop(columns='FTR')), np.array(train_set['FTR'])
# X_val_cat, y_val_cat = np.array(validation_set.drop(columns='FTR')), np.array(validation_set['FTR'])
# X_test_cat, y_test_cat = np.array(test_set.drop(columns='FTR')), np.array(test_set['FTR'])

# # merge training and validation sets
# X_all_cat = np.concatenate([X_train_cat, X_val_cat], axis=0)
# y_all_cat = np.concatenate([y_train_cat, y_val_cat], axis=0)

# load the model
with open('./models/LinearModelsVotingClassifier.pickle', 'rb') as f:
    voting_clf_linear = pickle.load(f)

In [26]:
stack_clf = StackingClassifier(estimators=[*voting_clf_tree.estimators, *voting_clf_linear.estimators],
                               final_estimator = LogisticRegression(C=0.01),
                               cv=5,
                               n_jobs=1,
                               verbose=1)

In [27]:
stack_clf.fit(X_all_base, y_all_base)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   52.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

StackingClassifier(cv=5,
                   estimators=[('RandomForestClassifier',
                                RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=15,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                      

In [30]:
y_pred = stack_clf.predict(X_test_base)

print(metrics.accuracy_score(y_test_base, y_pred))  
print(metrics.precision_score(y_test_base, y_pred))
print(metrics.recall_score(y_test_base, y_pred))
print(metrics.f1_score(y_test_base, y_pred))
print(metrics.roc_auc_score(y_test_base, y_pred))

0.6842105263157895
0.6538461538461539
0.6071428571428571
0.6296296296296297
0.6762129380053908


---------------------------------------------------------------------------------------------------------------------------------

In [14]:
from sklearn.model_selection import StratifiedKFold, KFold

In [16]:
kfold = KFold(n_splits=5, shuffle=True)
splits = kfold.split(X_train_base, y_train_base)
for train_idx, test_idx in splits:
    print(train_idx,test_idx)

[   0    1    2 ... 5937 5938 5939] [   9   21   23 ... 5930 5933 5935]
[   1    2    5 ... 5937 5938 5939] [   0    3    4 ... 5924 5925 5936]
[   0    2    3 ... 5935 5936 5938] [   1    5    6 ... 5931 5937 5939]
[   0    1    3 ... 5937 5938 5939] [   2   20   24 ... 5920 5932 5934]
[   0    1    2 ... 5936 5937 5939] [  12   15   16 ... 5916 5926 5938]


In [None]:
def select_best_meta_model(base_estimators, X_trains, X_vals y_train, , y_val, n_fold=5, meta_models)
    


meta_features = np.zeros(len(y_train), len(base_estimators))
meta_labels = np.zeros(len(y_val), len(base_estimators))

for i, (clf, X_train) in enumerate(zip(base_estimators, datasets)):
    kfold = KFold(n_splits=n_fold, shuffle=True)
    splits = kfold.split(X_train_base, y_train_base)
    for train_idx, test_idx in splits:
        clf.fit(X_train[train_idx], y_train[train_idx])
        test_fold_pred = clf.pred_proba(X_train[test_idx])
        meta_features[test_idx;i] = test_fold_pred
acc_score
for meta_model in meta_models:
    meta.model.fit(meta_features, y_train)
    