In [1]:
# TODO: fix all file-name suffixes of the files that are saved in notebooks 5, 6 and 7

# 5 Modeling -  selection of the best tree-based models

<b> Purpose of the action </b> - checking accuracy of prediction on test set using different types of tree-based models:
- RandomForestClassifier
- AdaBoostClassifier
- XGBClassifier
- CatBoostClassifier

<b> </b>
<b> Action plan </b>:
- Test 20 different models for each type
- Use ParameterSampler to generate different models with random hyperparameters
- Use training set for fitting model and use validation set for model evaluation 
- Select the best 5 models of each type and create one AveragingClassifier
- Train the best base model (top 1) of each type on all data (training and validation sets)
- Do the same with AveragingClassifiers
- Create one AveragingClassifier using the single best model (top 1) of each type
- Create a LargeAveragingClassifier from the previously created AveragingClassifiers (each of them contains the top 5 models of the same type)
- Save models for future use
- Compare prediction accuracy and other metrics on the test set and save the results for future use

## 5.1 Import necessary libraries and modules

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from classifiers import AveragingClassifier, LargeAveragingClassifier
from modeling import Metrics, show_best_models, select_best_classifiers
from preprocessing_pipelines import basic_preprocess_pipeline, ImportantFeaturesSelector

## 5.2 Import data sets dedicated for tree-based models

In [2]:
# train / validation split used only while searching for the best models of each type
train_set, validation_set = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("./preprocessed_data/processed_base_train_set.csv",
                     "./preprocessed_data/processed_base_validation_set.csv")
)

# full training data and hold-out test data used for the final fit and the final prediction
train_set_all, test_set = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./preprocessed_data/train_set_stage2.csv',
                     './preprocessed_data/test_set_stage2.csv')
)

## 5.3 Split datasets to feature and label sets

In [3]:
# --- model selection data: features and the 'FTR' label as plain numpy arrays ---
X_train = np.array(train_set.drop(columns='FTR'))
y_train = np.array(train_set['FTR'])

X_val = np.array(validation_set.drop(columns='FTR'))
y_val = np.array(validation_set['FTR'])

# --- final training / prediction data: features stay DataFrames, labels become arrays ---
X_train_all = train_set_all.drop(columns='FTR')
y_train_all = np.array(train_set_all['FTR'])

X_test = test_set.drop(columns='FTR')
y_test = np.array(test_set['FTR'])

## 5.4 Create placeholders to hold prediction results

In [4]:
# containers for the fitted model objects: one list for the single (top 1)
# pipelines and one for the per-type averaging classifiers
single_models, averaging_models = [], []

# collects the prediction metrics of every evaluated model
prediction_metrics = Metrics()

## 5.5 RandomForestClassifier

### 5.5.1  Select best models

Choose the best 5 models from 20 tested models using multiprocessing and <b> ParameterSampler </b> for generating random parameters. Use accuracy_score on validation set as metric for models evaluation.
Feature selection is made in the pipeline inside function for each model.

In [5]:
# candidate hyperparameter values for the random search
params_grid = {
    'n_estimators': list(range(600, 1601, 200)),  # 600, 800, ..., 1600
    'max_depth': list(range(7, 22, 2)),           # 7, 9, ..., 21
    'random_state': list(range(11)),              # 0 .. 10
}

# guard kept so that the multiprocessing-based search runs safely on Windows
if __name__ == '__main__':

    # test 20 sampled parameter sets and keep the 5 best models
    best_models, best_scoring = select_best_classifiers(
        estimator=RandomForestClassifier,
        params_grid=params_grid,
        n_iter=20,
        random_state=23,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        verbose=1,
        n_best_models=5,
    )

    # print the ranking of the selected models
    show_best_models(best_models, best_scoring)

Place: 1
RandomForestClassifier{'random_state': 5, 'n_estimators': 600, 'max_depth': 17}
Accuracy score on validation set: 0.6909
-------------------------------------------------------------------------------------------------------------------------------
Place: 2
RandomForestClassifier{'random_state': 5, 'n_estimators': 1200, 'max_depth': 13}
Accuracy score on validation set: 0.6909
-------------------------------------------------------------------------------------------------------------------------------
Place: 3
RandomForestClassifier{'random_state': 7, 'n_estimators': 1000, 'max_depth': 15}
Accuracy score on validation set: 0.6909
-------------------------------------------------------------------------------------------------------------------------------
Place: 4
RandomForestClassifier{'random_state': 2, 'n_estimators': 800, 'max_depth': 11}
Accuracy score on validation set: 0.6879
----------------------------------------------------------------------------------------------

### 5.5.2 Extract single models from list

In [6]:
# column 1 of best_models holds objects exposing `.steps` (presumably the search
# pipelines); steps[1] is a (name, estimator) pair whose estimator is the classifier
clf_1, clf_2, clf_3, clf_4, clf_5 = (
    best_models[:, 1][place].steps[1][1] for place in range(5)
)

### 5.5.3 Create completed pipelines (with scaling, encoding and feature selection) for each individual classifier

In [7]:
# basic_preprocess_pipeline and ImportantFeaturesSelector are imported from
# preprocessing_pipelines.py; every selected classifier gets the same three steps:
# preprocessing -> feature selection (driven by that classifier) -> classification
pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5 = [
    Pipeline([
        ('preprocess_pipeline', basic_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(clf, 'basic')),
        ('classification', clf),
    ])
    for clf in (clf_1, clf_2, clf_3, clf_4, clf_5)
]

### 5.5.4  Make AveragingClassifier from the best 5 selected models (pipelines)

In [8]:
# soft-voting ensemble built from the five best pipelines of this model type
top5_pipelines = [pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5]
avg_clf = AveragingClassifier(base_estimators=top5_pipelines, voting='soft')

### 5.5.5 Fit single and averaging models on the entire data set 

In [9]:
# guard kept so that multiprocessing runs safely on Windows
if __name__ == '__main__':

    # final training of the top 1 pipeline and of the averaging ensemble on all data
    pipe_clf_1.fit(X_train_all, y_train_all)
    avg_clf.fit(X_train_all, y_train_all)

    # model names are derived from the class name of the best classifier
    clf_1_name = type(clf_1).__name__
    avg_clf_name = 'Averaging' + type(clf_1).__name__
    print(clf_1_name, avg_clf_name)

RandomForestClassifier AveragingRandomForestClassifier


### 5.5.6 Calculate metrics of prediction and add results to the lists

In [10]:
# record test-set metrics: first the single classifier, then the averaging classifier
for scored_model, scored_name in ((pipe_clf_1, clf_1_name), (avg_clf, avg_clf_name)):
    prediction_metrics.add_metrics(scored_model, scored_name, X_test, y_test)

# keep both models; they become base estimators of the larger averaging classifiers
single_models.append(pipe_clf_1)
averaging_models.append(avg_clf)

## 5.6 AdaBoostClassifier

### 5.6.1 Select best models

Choose the best 5 models from 20 tested models using multiprocessing and <b> ParameterSampler </b> for generating random parameters. Use accuracy_score on validation set as metric for models evaluation.
Feature selection is made in the pipeline inside function for each model.

In [11]:
# candidate hyperparameter values for the random search
params_grid = {
    # decision trees of depth 1..5 as weak learners
    'base_estimator': [DecisionTreeClassifier(max_depth=depth) for depth in range(1, 6)],
    'n_estimators': [20, 30, 40, 50, 70, 80, 90, 100],
    'learning_rate': [0.4, 0.6, 0.8, 1.0, 1.2, 1.4],
    'random_state': list(range(11)),  # 0 .. 10
}

# guard kept so that the multiprocessing-based search runs safely on Windows
if __name__ == '__main__':

    # test 20 sampled parameter sets and keep the 5 best models
    best_models, best_scoring = select_best_classifiers(
        estimator=AdaBoostClassifier,
        params_grid=params_grid,
        n_iter=20,
        random_state=23,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        verbose=1,
        n_best_models=5,
    )

    # print the ranking of the selected models
    show_best_models(best_models, best_scoring)

Place: 1
AdaBoostClassifier{'random_state': 5, 'n_estimators': 70, 'learning_rate': 0.6, 'base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')}
Accuracy score on validation set: 0.7152
-------------------------------------------------------------------------------------------------------------------------------
Place: 2
AdaBoostClassifier{'random_state': 5, 'n_estimators': 50, 'learning_rate': 0.8, 'base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decre

### 5.6.2 Extract single models from list

In [12]:
# column 1 of best_models holds objects exposing `.steps` (presumably the search
# pipelines); steps[1] is a (name, estimator) pair whose estimator is the classifier
clf_1, clf_2, clf_3, clf_4, clf_5 = (
    best_models[:, 1][place].steps[1][1] for place in range(5)
)

### 5.6.3 Create completed pipelines (with scaling, encoding and feature selection) for each individual classifier

In [13]:
# basic_preprocess_pipeline and ImportantFeaturesSelector are imported from
# preprocessing_pipelines.py; every selected classifier gets the same three steps:
# preprocessing -> feature selection (driven by that classifier) -> classification
pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5 = [
    Pipeline([
        ('preprocess_pipeline', basic_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(clf, 'basic')),
        ('classification', clf),
    ])
    for clf in (clf_1, clf_2, clf_3, clf_4, clf_5)
]

### 5.6.4  Make AveragingClassifier from the best 5 selected models (pipelines)

In [14]:
# soft-voting ensemble built from the five best pipelines of this model type
top5_pipelines = [pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5]
avg_clf = AveragingClassifier(base_estimators=top5_pipelines, voting='soft')

### 5.6.5 Fit single and averaging models on the entire data set 

In [15]:
# guard kept so that multiprocessing runs safely on Windows
if __name__ == '__main__':

    # final training of the top 1 pipeline and of the averaging ensemble on all data
    pipe_clf_1.fit(X_train_all, y_train_all)
    avg_clf.fit(X_train_all, y_train_all)

    # model names are derived from the class name of the best classifier
    clf_1_name = type(clf_1).__name__
    avg_clf_name = 'Averaging' + type(clf_1).__name__
    print(clf_1_name, avg_clf_name)

AdaBoostClassifier AveragingAdaBoostClassifier


### 5.6.6 Calculate metrics of prediction and add results to the lists

In [16]:
# record test-set metrics: first the single classifier, then the averaging classifier
for scored_model, scored_name in ((pipe_clf_1, clf_1_name), (avg_clf, avg_clf_name)):
    prediction_metrics.add_metrics(scored_model, scored_name, X_test, y_test)

# keep both models; they become base estimators of the larger averaging classifiers
single_models.append(pipe_clf_1)
averaging_models.append(avg_clf)

## 5.7 XGBClassifier

### 5.7.1 Select best models

Choose the best 5 models from 20 tested models using multiprocessing and <b> ParameterSampler </b> for generating random parameters. Use accuracy_score on validation set as metric for models evaluation.
Feature selection is made in the pipeline inside function for each model.

In [17]:
# candidate hyperparameter values for the random search
params_grid = {
    'random_state': list(range(11)),              # 0 .. 10
    'n_estimators': list(range(300, 701, 100)),   # 300, 400, ..., 700
    'learning_rate': [0.005, 0.01, 0.02],
    'max_depth': list(range(3, 9)),               # 3 .. 8
    'min_child_weight': [2, 3, 4],
    'gamma': [0.2, 0.4, 0.6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'scale_pos_weight': [0.8, 1, 1.2],
    'reg_alpha': [1e-4, 1e-5, 1e-6],
}

# guard kept so that the multiprocessing-based search runs safely on Windows
if __name__ == '__main__':

    # test 20 sampled parameter sets and keep the 5 best models
    best_models, best_scoring = select_best_classifiers(
        estimator=XGBClassifier,
        params_grid=params_grid,
        n_iter=20,
        random_state=23,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        verbose=1,
        n_best_models=5,
    )

    # print the ranking of the selected models
    show_best_models(best_models, best_scoring)

Place: 1
XGBClassifier{'subsample': 0.7, 'scale_pos_weight': 0.8, 'reg_alpha': 0.0001, 'random_state': 1, 'n_estimators': 400, 'min_child_weight': 4, 'max_depth': 6, 'learning_rate': 0.02, 'gamma': 0.4, 'colsample_bytree': 0.9}
Accuracy score on validation set: 0.7333
-------------------------------------------------------------------------------------------------------------------------------
Place: 2
XGBClassifier{'subsample': 0.9, 'scale_pos_weight': 1, 'reg_alpha': 1e-05, 'random_state': 8, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.02, 'gamma': 0.6, 'colsample_bytree': 0.8}
Accuracy score on validation set: 0.7333
-------------------------------------------------------------------------------------------------------------------------------
Place: 3
XGBClassifier{'subsample': 0.9, 'scale_pos_weight': 1, 'reg_alpha': 1e-06, 'random_state': 2, 'n_estimators': 700, 'min_child_weight': 2, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.2, 'colsampl

### 5.7.2 Extract single models from list

In [18]:
# column 1 of best_models holds objects exposing `.steps` (presumably the search
# pipelines); steps[1] is a (name, estimator) pair whose estimator is the classifier
clf_1, clf_2, clf_3, clf_4, clf_5 = (
    best_models[:, 1][place].steps[1][1] for place in range(5)
)

### 5.7.3 Create completed pipelines (with scaling, encoding and feature selection) for each individual classifier

In [19]:
# basic_preprocess_pipeline and ImportantFeaturesSelector are imported from
# preprocessing_pipelines.py; every selected classifier gets the same three steps:
# preprocessing -> feature selection (driven by that classifier) -> classification
pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5 = [
    Pipeline([
        ('preprocess_pipeline', basic_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(clf, 'basic')),
        ('classification', clf),
    ])
    for clf in (clf_1, clf_2, clf_3, clf_4, clf_5)
]

### 5.7.4  Make AveragingClassifier from the best 5 selected models (pipelines)

In [20]:
# soft-voting ensemble built from the five best pipelines of this model type
top5_pipelines = [pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5]
avg_clf = AveragingClassifier(base_estimators=top5_pipelines, voting='soft')

### 5.7.5 Fit single and averaging models on the entire data set 

In [21]:
# guard kept so that multiprocessing runs safely on Windows
if __name__ == '__main__':

    # final training of the top 1 pipeline and of the averaging ensemble on all data
    pipe_clf_1.fit(X_train_all, y_train_all)
    avg_clf.fit(X_train_all, y_train_all)

    # model names are derived from the class name of the best classifier
    clf_1_name = type(clf_1).__name__
    avg_clf_name = 'Averaging' + type(clf_1).__name__
    print(clf_1_name, avg_clf_name)

XGBClassifier AveragingXGBClassifier


### 5.7.6 Calculate metrics of prediction and add results to the lists

In [22]:
# record test-set metrics: first the single classifier, then the averaging classifier
for scored_model, scored_name in ((pipe_clf_1, clf_1_name), (avg_clf, avg_clf_name)):
    prediction_metrics.add_metrics(scored_model, scored_name, X_test, y_test)

# keep both models; they become base estimators of the larger averaging classifiers
single_models.append(pipe_clf_1)
averaging_models.append(avg_clf)

## 5.8 CatBoostClassifier

### 5.8.1 Select best models

Choose the best 5 models from 20 tested models using multiprocessing and <b> ParameterSampler </b> for generating random parameters. Use accuracy_score on validation set as metric for models evaluation.
Feature selection is made in the pipeline inside function for each model.

In [23]:
# candidate hyperparameter values for the random search
# (None entries are passed to the classifier as-is)
params_grid = {
    'random_state': list(range(11)),                  # 0 .. 10
    'n_estimators': [None, *range(300, 701, 100)],    # None, 300, 400, ..., 700
    'max_depth': [None, *range(4, 11)],               # None, 4 .. 10
    'subsample': [None, 0.6, 0.7, 0.8, 0.9],
    'verbose': [0],
}

# guard kept so that the multiprocessing-based search runs safely on Windows
if __name__ == '__main__':

    # test 20 sampled parameter sets and keep the 5 best models
    best_models, best_scoring = select_best_classifiers(
        estimator=CatBoostClassifier,
        params_grid=params_grid,
        n_iter=20,
        random_state=23,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        verbose=1,
        n_best_models=5,
    )

    # print the ranking of the selected models
    show_best_models(best_models, best_scoring)

Place: 1
CatBoostClassifier{'verbose': 0, 'subsample': None, 'random_state': 6, 'n_estimators': 700, 'max_depth': 9}
Accuracy score on validation set: 0.7485
-------------------------------------------------------------------------------------------------------------------------------
Place: 2
CatBoostClassifier{'verbose': 0, 'subsample': 0.9, 'random_state': 8, 'n_estimators': 600, 'max_depth': 7}
Accuracy score on validation set: 0.7455
-------------------------------------------------------------------------------------------------------------------------------
Place: 3
CatBoostClassifier{'verbose': 0, 'subsample': 0.7, 'random_state': 2, 'n_estimators': 600, 'max_depth': 8}
Accuracy score on validation set: 0.7455
-------------------------------------------------------------------------------------------------------------------------------
Place: 4
CatBoostClassifier{'verbose': 0, 'subsample': 0.7, 'random_state': 6, 'n_estimators': 500, 'max_depth': 10}
Accuracy score on validatio

### 5.8.2 Extract single models from list

In [24]:
# column 1 of best_models holds objects exposing `.steps` (presumably the search
# pipelines); steps[1] is a (name, estimator) pair whose estimator is the classifier
clf_1, clf_2, clf_3, clf_4, clf_5 = (
    best_models[:, 1][place].steps[1][1] for place in range(5)
)

### 5.8.3 Create completed pipelines (with scaling, encoding and feature selection) for each individual classifier

In [25]:
# basic_preprocess_pipeline and ImportantFeaturesSelector are imported from
# preprocessing_pipelines.py; every selected classifier gets the same three steps:
# preprocessing -> feature selection (driven by that classifier) -> classification
pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5 = [
    Pipeline([
        ('preprocess_pipeline', basic_preprocess_pipeline),
        ('feature_seletion', ImportantFeaturesSelector(clf, 'basic')),
        ('classification', clf),
    ])
    for clf in (clf_1, clf_2, clf_3, clf_4, clf_5)
]

### 5.8.4  Make AveragingClassifier from the best 5 selected models (pipelines)

In [26]:
# soft-voting ensemble built from the five best pipelines of this model type
top5_pipelines = [pipe_clf_1, pipe_clf_2, pipe_clf_3, pipe_clf_4, pipe_clf_5]
avg_clf = AveragingClassifier(base_estimators=top5_pipelines, voting='soft')

### 5.8.5 Fit single and averaging models on the entire data set 

In [27]:
# guard kept so that multiprocessing runs safely on Windows
if __name__ == '__main__':

    # final training of the top 1 pipeline and of the averaging ensemble on all data
    pipe_clf_1.fit(X_train_all, y_train_all)
    avg_clf.fit(X_train_all, y_train_all)

    # model names are derived from the class name of the best classifier
    clf_1_name = type(clf_1).__name__
    avg_clf_name = 'Averaging' + type(clf_1).__name__
    print(clf_1_name, avg_clf_name)

CatBoostClassifier AveragingCatBoostClassifier


### 5.8.6 Calculate metrics of prediction and add results to the lists

In [28]:
# record test-set metrics: first the single classifier, then the averaging classifier
for scored_model, scored_name in ((pipe_clf_1, clf_1_name), (avg_clf, avg_clf_name)):
    prediction_metrics.add_metrics(scored_model, scored_name, X_test, y_test)

# keep both models; they become base estimators of the larger averaging classifiers
single_models.append(pipe_clf_1)
averaging_models.append(avg_clf)

## 5.9 Merge single and averaging models into larger averaging models

### 5.9.1 Create new larger averaging models

In [29]:
# names under which the two ensembles are reported and saved
average_tree_clf_name = 'TreeModelsAveragingClassifier'
large_average_tree_clf_name = 'LargeTreeModelsAveragingClassifier'

# ensemble of the top 1 pipeline of every model type
# (the base models were already fitted in the sections above)
average_tree_clf = AveragingClassifier(base_estimators=single_models, voting='soft')

# ensemble of the per-type averaging classifiers
large_average_tree_clf = LargeAveragingClassifier(base_estimators=averaging_models, voting='soft')

print(average_tree_clf_name, large_average_tree_clf_name)

TreeModelsAveragingClassifier LargeTreeModelsAveragingClassifier


### 5.9.2 Calculate metrics of prediction and add results to the lists

In [30]:
# record test-set metrics: first the averaging classifier, then the large averaging classifier
for scored_model, scored_name in ((average_tree_clf, average_tree_clf_name),
                                  (large_average_tree_clf, large_average_tree_clf_name)):
    prediction_metrics.add_metrics(scored_model, scored_name, X_test, y_test)

### 5.9.3 Save models for future use

In [31]:
# Persist both ensemble models with pickle so they can be reloaded later.
# open(..., 'wb') does not create missing folders, so make sure './models'
# exists first (e.g. on a fresh checkout).
os.makedirs('./models', exist_ok=True)

for saved_name, saved_model in ((average_tree_clf_name, average_tree_clf),
                                (large_average_tree_clf_name, large_average_tree_clf)):
    with open(f'./models/{saved_name}.pickle', 'wb') as f:
        # pickle the model using the highest protocol available
        pickle.dump(saved_model, f, pickle.HIGHEST_PROTOCOL)

## 5.10 Show all results in one table and save them for future use

In [32]:
# names of all evaluated models, in the order they were added to the placeholder
models_name = prediction_metrics.get_names()

# the placeholder returns one list per metric
(precision_score,
 recall_score,
 f1_score,
 roc_auc_score,
 accuracy_score) = prediction_metrics.get_metrics()

# one column per metric ...
results_dict = dict(
    precision_score=precision_score,
    recall_score=recall_score,
    f1_score=f1_score,
    roc_auc_score=roc_auc_score,
    accuracy_score=accuracy_score,
)
results_df = pd.DataFrame(data=results_dict)

# ... preceded by the model name as the first column
results_df.insert(loc=0, column='Model', value=models_name)
results_df

Unnamed: 0,Model,precision_score,recall_score,f1_score,roc_auc_score,accuracy_score
0,RandomForestClassifier,0.623377,0.571429,0.596273,0.703953,0.657895
1,AveragingRandomForestClassifier,0.616438,0.535714,0.573248,0.706312,0.647368
2,AdaBoostClassifier,0.646341,0.630952,0.638554,0.747529,0.684211
3,AveragingAdaBoostClassifier,0.632911,0.595238,0.613497,0.733491,0.668421
4,XGBClassifier,0.676471,0.547619,0.605263,0.752695,0.684211
5,AveragingXGBClassifier,0.632911,0.595238,0.613497,0.743711,0.668421
6,CatBoostClassifier,0.6375,0.607143,0.621951,0.740004,0.673684
7,AveragingCatBoostClassifier,0.628205,0.583333,0.604938,0.744272,0.663158
8,TreeModelsAveragingClassifier,0.64,0.571429,0.603774,0.738208,0.668421
9,LargeTreeModelsAveragingClassifier,0.644737,0.583333,0.6125,0.7354,0.673684


In [33]:
# to_csv does not create missing folders, so make sure './results' exists first
os.makedirs('./results', exist_ok=True)
results_df.to_csv("./results/tree_models_results.csv")