# 8 Modeling - Advanced ensembling

<b> Purpose of the action </b> - create 3 advanced machine learning models:
- StackingClassifier consisting of:
    - base models - the best single model of each type from previous parts
    - meta model - logistic regression with high regularization
- VotingClassifier consisting of the best single models from previous parts
- VotingClassifier consisting of the best 5 single models of each type from previous parts

<b> </b>
<b> Action plan </b>:
- Create a pipeline for each single model and the StackingClassifier
- Create the first VotingClassifier
- Create the second VotingClassifier
- Compare prediction accuracy and other metrics on the test set and save the results for future use

## 8.1 Import necessary libraries and modules

In [1]:
import numpy as np
import pandas as pd
import pickle
from preprocessing_pipelines import tree_preprocess_pipeline, linear_preprocess_pipeline
from data_preprocessing import ImportantFeaturesSelector
from sklearn import metrics
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# from sklearn.pipeline import Pipeline, FeatureUnion
# import matplotlib.pyplot as plt
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score, log_loss
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import StratifiedKFold, KFold

## 8.2 Import raw data 

In [2]:
# Load the stage-2 train and test tables; the first CSV column becomes the index.
X_train, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './preprocessed_data/train_set_stage2.csv',
        './preprocessed_data/test_set_stage2.csv',
    )
)

# Targets are the 'FTR' column of each table, copied into NumPy arrays.
# NOTE(review): 'FTR' stays inside X_train / X_test - presumably the
# preprocessing pipelines drop it before modelling; confirm.
y_train, y_test = (np.array(frame['FTR']) for frame in (X_train, X_test))

## 8.3 Import all previously prepared VotingClassifiers

In [3]:
def _load_pickled_model(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE(review): pickle.load can execute arbitrary code - only load model
    # files produced by this project.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Voting classifiers saved by the earlier modelling notebooks.
linear_voting_clf = _load_pickled_model('./models/LinearModelsVotingClassifier.pickle')
linear_averaging_voting_clf = _load_pickled_model('./models/LinearModelsAveragingVotingClassifier.pickle')
tree_voting_clf = _load_pickled_model('./models/TreeModelsVotingClassifier.pickle')
tree_averaging_voting_clf = _load_pickled_model('./models/TreeModelsAveragingVotingClassifier.pickle')

## 8.4 Extract best single models from VotingClassifiers

In [4]:
# Split each voting classifier into its members. Every member is indexed
# below as a (name, estimator) pair.
# NOTE(review): the unpacking order must match the order in which the voting
# classifiers were assembled in the earlier notebooks - confirm.
clf_rf, clf_ada, clf_xgb, clf_cat = tree_voting_clf.estimators    # tree-based models
clf_lr, clf_svc, clf_rbf, clf_knn = linear_voting_clf.estimators  # linear models

# Cap the iterations of both SVC models (linear and rbf kernels) to save time
# during training.
for _svc_entry in (clf_svc, clf_rbf):
    _svc_entry[1].set_params(max_iter=10000)

# Last expression of the cell: show the rbf SVC with its updated parameters.
clf_rbf[1]

SVC(C=1000000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1e-05, kernel='rbf',
    max_iter=10000, probability=True, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## 8.5 Create empty list for future results

In [5]:
# One independent, initially empty list per reported metric, plus one for the
# model names; each evaluated model appends one entry to every list.
(accuracy_score,
 precision_score,
 recall_score,
 f1_score,
 roc_auc_score,
 models_name) = ([] for _ in range(6))

## 8.6 Create StackingClassifier

### 8.6.1 Create pipelines for each individual model of each type

I need to use cross-validation instead of time-based validation for this model due to the small amount of data. I had to prepare a full pre-processing pipeline for each model to avoid data leaks between folds.

#### 8.6.1.1 Create pipelines for tree-based models

In [6]:
# ImportantFeaturesSelector comes from data_preprocessing.py and
# tree_preprocess_pipeline from preprocessing_pipelines.py.


def _make_tree_pipeline(named_clf):
    """Chain tree preprocessing, feature selection and the classifier itself.

    named_clf is a (name, estimator) pair. The estimator is handed to
    ImportantFeaturesSelector and is also the final step, registered under
    its own name.
    """
    step_name, estimator = named_clf
    return Pipeline([
        ('preprocess_pipeline', tree_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(estimator)),
        (step_name, estimator),
    ])


pipe_rf = _make_tree_pipeline(clf_rf)    # RandomForestClassifier
pipe_xgb = _make_tree_pipeline(clf_xgb)  # XGBClassifier
pipe_ada = _make_tree_pipeline(clf_ada)  # AdaBoostClassifier
pipe_cat = _make_tree_pipeline(clf_cat)  # CatBoostClassifier

#### 8.6.1.2 Create pipelines for linear models

In [7]:
# ImportantFeaturesSelector comes from data_preprocessing.py and
# linear_preprocess_pipeline from preprocessing_pipelines.py.


def _make_linear_pipeline(named_clf):
    """Chain linear preprocessing, feature selection and the classifier itself.

    named_clf is a (name, estimator) pair. The estimator is handed to
    ImportantFeaturesSelector and is also the final step, registered under
    its own name.
    """
    step_name, estimator = named_clf
    return Pipeline([
        ('preprocess_pipeline', linear_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(estimator)),
        (step_name, estimator),
    ])


pipe_svc = _make_linear_pipeline(clf_svc)  # linear SVC
pipe_lr = _make_linear_pipeline(clf_lr)    # LogisticRegression
pipe_knn = _make_linear_pipeline(clf_knn)  # KNeighborsClassifier
pipe_rbf = _make_linear_pipeline(clf_rbf)  # SVC with rbf kernel

### 8.6.2 Initialize and train model

In [None]:
# Stack the eight single-model pipelines. Each base estimator is a complete
# preprocessing + feature-selection + model pipeline, so every step is refitted
# inside each of the 5 cross-validation folds.
stack_clf = StackingClassifier(
    estimators=[
        ('svc', pipe_svc),
        ('lr', pipe_lr),
        ('knn', pipe_knn),
        ('rbf', pipe_rbf),
        ('rf', pipe_rf),
        ('xgb', pipe_xgb),
        ('ada', pipe_ada),
        ('cat', pipe_cat),
    ],
    # The l1 penalty is not supported by the default 'lbfgs' solver (fit would
    # raise ValueError), so a solver that supports it is selected explicitly.
    final_estimator=LogisticRegression(C=1, penalty='l1', solver='liblinear'),
    cv=5,
    n_jobs=1,
    verbose=10,
)

stack_clf.fit(X_train, y_train)

### 8.6.3 Calculate metrics of prediction and add results to the lists

In [None]:
# Label used for this model in the results table: the classifier's class name.
model_name = type(stack_clf).__name__
model_name

In [None]:
# Predict once and reuse the result: every predict call runs all eight base
# pipelines, so recomputing it for each metric is wasted work.
stack_pred = stack_clf.predict(X_test)
# Second predict_proba column, used as the score for ROC AUC.
stack_proba = stack_clf.predict_proba(X_test)[:, 1]

# Append the stacking classifier's test-set metrics to the result lists.
accuracy_score.append(metrics.accuracy_score(y_test, stack_pred))
precision_score.append(metrics.precision_score(y_test, stack_pred))
recall_score.append(metrics.recall_score(y_test, stack_pred))
f1_score.append(metrics.f1_score(y_test, stack_pred))
roc_auc_score.append(metrics.roc_auc_score(y_test, stack_proba))

# Add the classifier's name to the list (needed for the results table).
models_name.append(model_name)

## 8.6 Create VotingClassifier

### 8.6.2 Initialize and train model

In [None]:
# Soft-voting ensemble over the same eight single-model pipelines.
_voting_members = [
    ('svc', pipe_svc),
    ('lr', pipe_lr),
    ('knn', pipe_knn),
    ('rbf', pipe_rbf),
    ('rf', pipe_rf),
    ('xgb', pipe_xgb),
    ('ada', pipe_ada),
    ('cat', pipe_cat),
]
voting_clf = VotingClassifier(estimators=_voting_members, voting='soft')

voting_clf.fit(X_train, y_train)



### 8.6.3 Calculate metrics of prediction and add results to the lists

In [None]:
# Label used for this model in the results table: 'AllModels' + class name.
model_name = 'AllModels' + type(voting_clf).__name__
model_name

In [None]:
# Predict once and reuse the result: every predict call runs all eight member
# pipelines, so recomputing it for each metric is wasted work.
voting_pred = voting_clf.predict(X_test)
# Second predict_proba column, used as the score for ROC AUC.
voting_proba = voting_clf.predict_proba(X_test)[:, 1]

# Append the voting classifier's test-set metrics to the result lists.
accuracy_score.append(metrics.accuracy_score(y_test, voting_pred))
precision_score.append(metrics.precision_score(y_test, voting_pred))
recall_score.append(metrics.recall_score(y_test, voting_pred))
f1_score.append(metrics.f1_score(y_test, voting_pred))
roc_auc_score.append(metrics.roc_auc_score(y_test, voting_proba))

# Add the classifier's name to the list (needed for the results table).
models_name.append(model_name)

## 8.7 Create Second VotingClassifier(averaging) - optional

### 8.7.1 Extract single VotingClassifiers from AveragingVotingClassifiers

In [19]:
# Split each averaging voting classifier into its members. Every member is a
# (name, VotingClassifier) pair, and each inner VotingClassifier in turn holds
# (name, estimator) pairs in `.estimators`.
# NOTE(review): the unpacking order must match the order in which the
# averaging classifiers were assembled in the earlier notebooks - confirm.
vot_clf_rf, vot_clf_ada, vot_clf_xgb, vot_clf_cat = tree_averaging_voting_clf.estimators
vot_clf_lr, vot_clf_svc, vot_clf_rbf, vot_clf_knn = linear_averaging_voting_clf.estimators

# Cap the iterations of every SVC (linear and rbf kernels) to save time during
# training. Iterating over the members directly avoids hard-coding how many
# models each inner voting classifier contains.
for _svc_voting_entry in (vot_clf_svc, vot_clf_rbf):
    for _member_entry in _svc_voting_entry[1].estimators:
        _member_entry[1].set_params(max_iter=10000)

# Last expression of the cell: show the first rbf SVC with its updated parameters.
vot_clf_rbf[1].estimators[0][1]

SVC(C=1000000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1e-05, kernel='rbf',
    max_iter=10000, probability=True, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

### 8.7.2 Create pipelines for each VotingClassifier

#### 8.7.2.1 Create pipelines for tree-based VotingClassifier

In [10]:
# ImportantFeaturesSelector comes from data_preprocessing.py and
# tree_preprocess_pipeline from preprocessing_pipelines.py.


def _make_tree_voting_pipeline(named_voting_clf):
    """Chain tree preprocessing, feature selection and a voting classifier.

    named_voting_clf is a (name, voting classifier) pair. The first member
    estimator of the voting classifier is handed to ImportantFeaturesSelector;
    the voting classifier itself is the final step, registered under its own
    name.
    """
    step_name, voting_estimator = named_voting_clf
    first_member = voting_estimator.estimators[0][1]
    return Pipeline([
        ('preprocess_pipeline', tree_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(first_member)),
        (step_name, voting_estimator),
    ])


vot_pipe_rf = _make_tree_voting_pipeline(vot_clf_rf)    # voting RandomForestClassifier
vot_pipe_xgb = _make_tree_voting_pipeline(vot_clf_xgb)  # voting XGBClassifier
vot_pipe_ada = _make_tree_voting_pipeline(vot_clf_ada)  # voting AdaBoostClassifier
vot_pipe_cat = _make_tree_voting_pipeline(vot_clf_cat)  # voting CatBoostClassifier

#### 8.7.2.2 Create pipelines for linear VotingClassifiers

In [11]:
# ImportantFeaturesSelector comes from data_preprocessing.py and
# linear_preprocess_pipeline from preprocessing_pipelines.py.


def _make_linear_voting_pipeline(named_voting_clf):
    """Chain linear preprocessing, feature selection and a voting classifier.

    named_voting_clf is a (name, voting classifier) pair. The first member
    estimator of the voting classifier is handed to ImportantFeaturesSelector;
    the voting classifier itself is the final step, registered under its own
    name.
    """
    step_name, voting_estimator = named_voting_clf
    first_member = voting_estimator.estimators[0][1]
    return Pipeline([
        ('preprocess_pipeline', linear_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(first_member)),
        (step_name, voting_estimator),
    ])


vot_pipe_svc = _make_linear_voting_pipeline(vot_clf_svc)  # voting linear SVC
vot_pipe_lr = _make_linear_voting_pipeline(vot_clf_lr)    # voting LogisticRegression
vot_pipe_knn = _make_linear_voting_pipeline(vot_clf_knn)  # voting KNeighborsClassifier
vot_pipe_rbf = _make_linear_voting_pipeline(vot_clf_rbf)  # voting SVC with rbf kernel

### 8.7.1 Initialize and train model

In [None]:
average_voting_clf = VotingClassifier(estimators=[('svc', vot_pipe_svc), 
                                                  ('lr', vot_pipe_lr), 
                                                  ('knn', vot_pipe_knn), 
                                                  ('rbf', vot_pipe_rbf),
                                                  ('rf',  vot_pipe_rf), 
                                                  ('xgb', vot_pipe_xgb), 
                                                  ('ada', vot_pipe_ada), 
                                                  ('cat', vot_pipe_cat)],
                                      voting='soft')

### 8.7.2 Calculate metrics of prediction and add results to the lists

In [None]:
# give model a name
model_name = f'AllModelsAveraging{voting_clf.__class__.__name__}'
model_name

In [None]:
# append metrics for classifier to the list 
accuracy_score.append(metrics.accuracy_score(y_test , voting_clf.predict(X_test)))  
precision_score.append(metrics.precision_score(y_test , voting_clf.predict(X_test)))
recall_score.append(metrics.recall_score(y_test , voting_clf.predict(X_test)))
f1_score.append( metrics.f1_score(y_test , voting_clf.predict(X_test)))
roc_auc_score.append(metrics.roc_auc_score(y_test , voting_clf.predict_proba(X_test)[:,1]))

# add claffiers name to the list (needed for created table with results)
models_name.append(model_name)

## 8.8 Show all results in one table and save them for future use

In [None]:
# Collect the metric lists into one mapping: metric name -> per-model values.
results_dict = dict(
    precision_score=precision_score,
    recall_score=recall_score,
    f1_score=f1_score,
    roc_auc_score=roc_auc_score,
    accuracy_score=accuracy_score,
)

# One row per model; the model names form the first column, 'Model'.
results_df = pd.DataFrame({'Model': models_name, **results_dict})
results_df

In [None]:
# Save the comparison table for later use; to_csv also writes the DataFrame
# index as the first column by default.
results_path = "./results/advance_ensembling_models_results.csv"
results_df.to_csv(results_path)