https://journals.lww.com/epidem/FullText/2019/07000/Can_Hyperparameter_Tuning_Improve_the_Performance.9.aspx

Tuning is only done for the base models, because the meta model needs to learn from the k-fold set and tuning it could cause data leakage.

The optimization is similar to the paper https://www.sciencedirect.com/science/article/pii/S2666827022000020, where the base models were optimized on the k-fold splits, and the tuned parameters were then aggregated and used for the refit base models.

# 2 steps
# 1) k-fold base models are tuned and trained for best parameters -> obtain training set to train meta model on tuned base models
# 2) retrained base models are fitted with their best parameters and combined with meta model -> final stacking model
# helps to prevent data leakage

The k-fold models find their best parameters on each k-fold training split, the meta model finds its best parameters on the meta training set, and the refit models find their best parameters on the whole training set.

sorting the whole column, not just rows?

import

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 
import sklearn
import shap 
import time
import math
import sys 
import pathlib

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

change paths

In [58]:
# Resolve the notebook's working directory and load the dataset stored in the
# `data` folder that sits next to it (one level up from the working directory).
path = os.path.abspath('')
os.chdir(path)

# pathlib joins with the platform separator, so the path no longer depends on
# Windows-style backslashes; it is kept as a string like before.
original_dataframe_path = str(pathlib.Path(path).parent / 'data' / 'Supercapacitor V2.csv')
original_dataframe = pd.read_csv(original_dataframe_path)

# name of the column every model is trained to predict
target_name = 'CAP'
original_dataframe

Unnamed: 0,SA,DG,%N,%O,%S,%P,%B,CD,CAP,CONC
0,343.5,0.84,2.30,,,4.5,,0.5,292.0,6.0
1,784.0,1.05,3.50,4.10,,,,0.5,98.0,1.0
2,784.0,1.05,3.50,4.10,,,,20.0,58.0,1.0
3,784.0,1.06,4.50,4.40,,,,0.5,104.0,1.0
4,784.0,1.06,4.50,4.40,,,,20.0,49.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1084,571.0,,9.50,11.81,,,,1.0,239.0,6.0
1085,544.0,,8.47,11.77,,,,1.0,250.0,6.0
1086,373.0,,6.36,15.91,,,,1.0,193.0,6.0
1087,460.0,,7.84,14.40,,,,1.0,222.0,6.0


main process

In [59]:
class Main_Stacking_Process:

    # --- MAIN PROCESS LOOP

    # initialize process
    def __init__(self,base_model_names,base_model_classes,stack_meta_model_name,stack_meta_model_class,method):
        """Store the stacking configuration and immediately run the whole study.

        Parameters
        ----------
        base_model_names : sequence of the base-model keys; its length is the stack depth
        base_model_classes : mapping from a base-model key to its estimator class
        stack_meta_model_name : key of the meta model (also used in output names)
        stack_meta_model_class : estimator class of the meta model
        method : label used as the top-level output folder

        Constructing an instance runs ``process`` once for each of the
        feature-importance techniques FPI, PDP and SHAP.
        """
        # private copy of the module-level dataset
        self.dataframe = original_dataframe.copy()
        self.method = method

        # base learners
        self.base_model_names = base_model_names
        self.base_model_classes = base_model_classes
        self.depth = len(self.base_model_names)

        # meta learner
        self.stack_meta_model_name = stack_meta_model_name
        self.stack_meta_model_class = stack_meta_model_class

        # hyperparameter grids, looked up by model key during tuning
        self.dt_parameters = {'criterion':['squared_error','friedman_mse','absolute_error','poisson']}
        self.knn_parameters = {'n_neighbors':[5,20],'weights':['uniform','distance']}
        self.lin_parameters = {} # doesn't need to be tuned
        self.parameters_dict = {
            'LIN':self.lin_parameters,
            'DT':self.dt_parameters,
            'KNN':self.knn_parameters,
        }

        print('Start')
        print(f'META: {self.stack_meta_model_name}')
        print(f'MODELS: {self.base_model_names}')

        # run the full pipeline once per feature-importance technique
        for feature_importance_technique in ('FPI','PDP','SHAP'):
            print(feature_importance_technique)
            self.process(feature_importance_technique)

    # find model order/combinations
    def find_model_combination(self,stack_model_tuple): 
        combination_tuple = []
        def recursion(depth,current_tuple=[]):
            if depth == 0:
                combination = current_tuple
                combination_tuple.append(combination)
            else:
                for model in stack_model_tuple: # for every model in the model tuple
                    recursion(depth-1,current_tuple+[model]) # recursion and add model to tuple
        recursion(self.depth)
        return combination_tuple

    # main training loop, each for SHAP, PDP, and FPI
    def process(self,feature_importance_technique):
        """Train every base-model ordering over 5 seeds and save the results.

        Parameters
        ----------
        feature_importance_technique : 'FPI', 'PDP' or 'SHAP'; selects which
            explanation routine runs after each fit and names the output files.

        For each ordering without repeated models and each seed 0-4 the full
        pipeline (fill, split, impute, scale, k-fold, fit, predict, explain)
        is run. Three CSV files (PREDICTION, RESULTS, ADDITIONAL) are written
        to ``<method>/<technique>CSVMETA<meta model name>/``.
        """
        # create folder to store dataframes; pathlib joins with the platform
        # separator instead of a hard-coded backslash
        output_directory = pathlib.Path(self.method) / f'{feature_importance_technique}CSVMETA{self.stack_meta_model_name}'
        output_directory.mkdir(parents=True,exist_ok=True)
        # get order of base models
        combination_df_columns = ['Model'+str(x) for x in range(self.depth)]
        combination_tuple = self.find_model_combination(self.base_model_names)
        combination_df = pd.DataFrame(combination_tuple,columns=combination_df_columns)
        combination_df = combination_df.drop_duplicates(keep='first',ignore_index=True)
        # discard orderings that use the same model more than once
        for row in combination_df.index:
            row_values = combination_df.loc[row].values
            if len(row_values) != len(set(row_values)):
                combination_df = combination_df.drop(row)
        combination_df = combination_df.reset_index(drop=True)
        # initialize feature importance technique dataframes
        # stores all the results of model prediction to compare
        self.all_prediction_results_df = pd.DataFrame()
        self.all_feature_importance_results_df = pd.DataFrame()
        self.all_additional_plotting_value_df = pd.DataFrame()
        # train models on each order
        for combination_df_index in combination_df.index:
            self.combination_name_tuple = tuple(combination_df.loc[combination_df_index].values)
            self.combination_class_tuple = tuple(
                self.base_model_classes[stack_model] for stack_model in self.combination_name_tuple
            )
            # e.g. 'LIN-DT-KNN-' (trailing dash kept, it is written to the CSVs)
            self.complete_order_name = ''.join(stack_model + '-' for stack_model in self.combination_name_tuple)
            # create dataframe to store all seed results
            self.seed_prediction_results_df = pd.DataFrame()
            self.seed_feature_importance_results_df = pd.DataFrame()
            self.seed_additional_plotting_value_df = pd.DataFrame()
            # train in loop
            for random_seed in range(5):
                # process
                self.random_seed = random_seed
                self.random_state = np.random.RandomState(self.random_seed)
                self.fill_missing_values()
                self.split_train_and_test()
                self.impute_data()
                self.scale_data()
                self.k_fold()
                self.fit()
                self.predict()
                if feature_importance_technique == 'FPI':
                    self.FPI_prediction_function()
                elif feature_importance_technique == 'PDP':
                    self.PDP_prediction_function()
                elif feature_importance_technique == 'SHAP':
                    self.SHAP_prediction_function()
                else:
                    print('error')
                # newest results are placed first, as the saved row order depends on it
                self.seed_prediction_results_df = pd.concat([self.temp_normal_prediction_results_df,self.seed_prediction_results_df],axis='rows')
                self.seed_feature_importance_results_df = pd.concat([self.temp_feature_importance_results_df,self.seed_feature_importance_results_df],axis='rows')
                self.seed_additional_plotting_value_df = pd.concat([self.temp_additional_plotting_value_df,self.seed_additional_plotting_value_df],axis='rows')
            # accumulate for all seeds
            self.all_prediction_results_df = pd.concat([self.seed_prediction_results_df,self.all_prediction_results_df],axis='rows')
            self.all_feature_importance_results_df = pd.concat([self.seed_feature_importance_results_df,self.all_feature_importance_results_df],axis='rows')
            self.all_additional_plotting_value_df = pd.concat([self.seed_additional_plotting_value_df,self.all_additional_plotting_value_df],axis='rows')
        # reset index
        self.all_prediction_results_df = self.all_prediction_results_df.reset_index(drop=True)
        self.all_feature_importance_results_df = self.all_feature_importance_results_df.reset_index(drop=True)
        self.all_additional_plotting_value_df = self.all_additional_plotting_value_df.reset_index(drop=True)
        # save results
        file_suffix = f'{feature_importance_technique}{self.stack_meta_model_name}.csv'
        self.all_prediction_results_df.to_csv(output_directory / f'PREDICTION{file_suffix}',index=False)
        self.all_feature_importance_results_df.to_csv(output_directory / f'RESULTS{file_suffix}',index=False)
        self.all_additional_plotting_value_df.to_csv(output_directory / f'ADDITIONAL{file_suffix}',index=False)

    # --- MODEL FUNCTIONS

    # fill missing values
    def fill_missing_values(self):
        self.dataframe = self.dataframe.fillna(0)

    # split into train/test
    def split_train_and_test(self):
        """Separate features from the target and split both 70/30 into train/test.

        The split is driven by ``self.random_state``; the four resulting
        frames are stored on the instance.
        """
        # everything except the target column is a feature
        self.dataframe_feature = self.dataframe.drop(columns=[target_name])
        # double brackets keep the target as a one-column DataFrame
        self.dataframe_target = self.dataframe[[target_name]]
        split_parts = train_test_split(
            self.dataframe_feature,
            self.dataframe_target,
            test_size=0.3,
            random_state=self.random_state,
        )
        (
            self.dataframe_train_feature,
            self.dataframe_test_feature,
            self.dataframe_train_target,
            self.dataframe_test_target,
        ) = split_parts

    # impute missing data, being careful of data leakage
    def impute_data(self):
        """Impute missing 'DG' values with a KNN regressor fitted on the train split only.

        A 'DG' value of 0 is treated as missing (missing entries were replaced
        by 0 earlier in the pipeline; a genuine 0 is indistinguishable from a
        missing value here). The regressor is trained on the train rows whose
        'DG' is known, using every feature except 'DG', 'CD' and 'CONC', and
        is then applied to the missing rows of both the train and the test
        split, so no test information reaches the imputer.

        Both feature frames are replaced by imputed versions whose columns are
        ordered 'DG', the predictor columns, then 'CD' and 'CONC'; row order
        is unchanged.
        """
        passthrough_columns = ['CD','CONC'] # not used by the imputer
        train_feature = self.dataframe_train_feature
        predictor_columns = [
            column for column in train_feature.columns
            if column != 'DG' and column not in passthrough_columns
        ]
        # train KNN imputor based on the train rows with a known DG
        known_rows = train_feature['DG'] != 0
        KNN_imputor = KNeighborsRegressor(
            n_neighbors=3,
            weights='distance'
        )
        KNN_imputor.fit(train_feature.loc[known_rows,predictor_columns],train_feature.loc[known_rows,'DG'])
        # the same fitted imputor fills both splits
        self.dataframe_train_feature = self._impute_DG(self.dataframe_train_feature,KNN_imputor,predictor_columns,passthrough_columns)
        self.dataframe_test_feature = self._impute_DG(self.dataframe_test_feature,KNN_imputor,predictor_columns,passthrough_columns)

    # helper: fill the missing DG entries of one feature frame
    def _impute_DG(self,feature_df,imputor,predictor_columns,passthrough_columns):
        """Return a copy of ``feature_df`` with DG == 0 rows replaced by predictions.

        Nothing is predicted when no row is missing, so a split without
        missing values no longer calls the regressor on an empty frame.
        """
        imputed_df = feature_df.copy()
        missing_rows = imputed_df['DG'] == 0
        if missing_rows.any():
            imputed_df.loc[missing_rows,'DG'] = imputor.predict(imputed_df.loc[missing_rows,predictor_columns])
        # same column order the concat-based version produced
        return imputed_df[['DG'] + predictor_columns + passthrough_columns]

    # scale data based on train set
    def scale_data(self):
        # keep scale value to rescale later
        self.feature_scale_dict = {}
        self.target_scale_dict = {}
        # scale based on train set for feature #
        for feature_column in self.dataframe_train_feature:
            self.feature_column_max = self.dataframe_train_feature[feature_column].max()
            self.feature_column_min = self.dataframe_train_feature[feature_column].min()
            self.feature_scale_dict[f'MIN_{feature_column}'] = self.feature_column_min
            self.feature_scale_dict[f'MAX_{feature_column}'] = self.feature_column_max            
            self.dataframe_train_feature[feature_column] = self.dataframe_train_feature[feature_column].apply(lambda x: (x-self.feature_column_min)/(self.feature_column_max-self.feature_column_min))
            self.dataframe_test_feature[feature_column] = self.dataframe_test_feature[feature_column].apply(lambda x: (x-self.feature_column_min)/(self.feature_column_max-self.feature_column_min))
        # scale based on train set for target #
        for target_column in self.dataframe_train_target:
            self.target_column_max = self.dataframe_train_target[target_column].max()
            self.target_column_min = self.dataframe_train_target[target_column].min()
            self.target_scale_dict[f'MIN_{target_column}'] = self.target_column_min
            self.target_scale_dict[f'MAX_{target_column}'] = self.target_column_max 
            self.dataframe_train_target[target_column] = self.dataframe_train_target[target_column].apply(lambda x: (x-self.target_column_min)/(self.target_column_max-self.target_column_min))
            self.dataframe_test_target[target_column] = self.dataframe_test_target[target_column].apply(lambda x: (x-self.target_column_min)/(self.target_column_max-self.target_column_min))
    
    # determine number of folds used when training meta model
    def k_fold(self):
        """Prepare the 5-fold split of the training features used to build the meta training set.

        The folds are contiguous (no shuffling); the resulting split iterator
        is stored in ``self.splits_for_training`` and is iterated by ``fit``.
        """
        self.number_of_folds = 5
        fold_splitter = KFold(
            n_splits=self.number_of_folds,
            shuffle=False,
            random_state=None,
        )
        self.splits_for_training = fold_splitter.split(self.dataframe_train_feature)

    # train the model, fit function
    def fit(self):
        """Train the stacking model: k-fold base models, meta model, refit base models.

        Steps
        -----
        1. For every fold, each base model is tuned and trained on the fold's
           training part and predicts the fold's held-out part. The held-out
           predictions of all folds, stacked in fold order, form the meta
           training set ``self.META_train_df`` (one 'MODEL <i>' column per
           base model).
        2. The meta model is tuned and trained on the meta training set
           against the training targets.
        3. Each base model is tuned and refitted on the whole training set;
           these refit models feed the meta model at prediction time
           (``self.refit_base_model_dict``).

        Changes from the earlier version: the refit models are now tuned on
        the real training features (they were tuned on the meta training set,
        i.e. on base-model predictions); models of all folds are kept in
        ``self.model_dict`` instead of only the last fold's; and whether an
        estimator takes ``random_state`` is checked explicitly instead of
        retrying after any exception.

        Wall-clock start/end times are stored in ``self.train_start_time`` and
        ``self.train_end_time``.
        """
        # keep track of time
        self.train_start_time = time.time()
        # per base model: {fold number: {'model': ..., 'meta_feature': ...}}
        self.model_dict = {model_name:{} for model_name in self.combination_name_tuple}
        # accumulate prediction set
        self.prediction_list = []
        self.META_hold_out_prediction_list = []
        # meta model training process
        for count_for_train,(train_index_for_validation,test_index_for_validation) in enumerate(self.splits_for_training):
            # splits
            kfold_train_feature_df = self.dataframe_train_feature.iloc[train_index_for_validation]
            kfold_train_target_df = self.dataframe_train_target.iloc[train_index_for_validation]
            kfold_test_feature_df = self.dataframe_train_feature.iloc[test_index_for_validation]
            # tune and train every base model on this fold
            for count,model_name in enumerate(self.combination_name_tuple):
                model_class = self.combination_class_tuple[count]
                best_parameters = self._tune_hyperparameters(model_class,model_name,kfold_train_feature_df,kfold_train_target_df[['CAP']])
                model = model_class(**best_parameters)
                model.fit(kfold_train_feature_df,kfold_train_target_df[['CAP']])
                # accumulate results of model
                self.model_dict[model_name][count_for_train] = {
                    'model':model,
                    'meta_feature':model.predict(kfold_test_feature_df),
                }
            # stacking: one column of held-out predictions per base model
            META_hold_out_prediction_df = pd.DataFrame()
            for count,key in enumerate(self.model_dict):
                meta_feature_df = pd.DataFrame(self.model_dict[key][count_for_train]['meta_feature'],columns=[f'MODEL {count}'])
                META_hold_out_prediction_df = pd.concat([META_hold_out_prediction_df,meta_feature_df],axis='columns')
            self.META_hold_out_prediction_list.append(META_hold_out_prediction_df)
        # meta model training set
        self.META_train_df = pd.concat(self.META_hold_out_prediction_list,axis='rows').reset_index(drop=True)
        train_target_values = self.dataframe_train_target.values.ravel()
        # meta model
        meta_parameters = self._tune_hyperparameters(self.stack_meta_model_class,self.stack_meta_model_name,self.META_train_df.values,train_target_values)
        META_model = self.stack_meta_model_class(**meta_parameters)
        META_model.fit(self.META_train_df.values,train_target_values)
        self.META_model = META_model
        # refit base models on the whole training set
        self.refit_base_model_dict = {}
        train_feature_values = self.dataframe_train_feature.values
        for count,model_name in enumerate(self.combination_name_tuple):
            model_class = self.combination_class_tuple[count]
            # tuned on the training features, the data the model is refitted on
            refit_parameters = self._tune_hyperparameters(model_class,model_name,train_feature_values,train_target_values)
            model = model_class(**refit_parameters)
            model.fit(train_feature_values,train_target_values)
            self.refit_base_model_dict[f'refit{model_name}'] = {
                'model':model,
                # code name matches the meta training column of this model
                'code_name':f'MODEL {count}',
            }
        self.train_end_time = time.time()

    # helper: grid search the best hyperparameters for one estimator
    def _tune_hyperparameters(self,estimator_class,parameter_key,features,target):
        """Return the best parameters found by a grid search over ``self.parameters_dict[parameter_key]``.

        A random state seeded with 0 is added to the grid when the estimator
        exposes a ``random_state`` parameter, so the returned parameters then
        include it as well.
        """
        estimator = estimator_class()
        param_grid = self.parameters_dict[parameter_key].copy()
        if 'random_state' in estimator.get_params():
            param_grid['random_state'] = [np.random.RandomState(0)]
        grid_search = sklearn.model_selection.GridSearchCV(estimator=estimator,param_grid=param_grid)
        grid_search.fit(features,target)
        return grid_search.best_params_

    # stacking model prediction function
    def model_prediction(self,features):
        META_prediction_df = pd.DataFrame()
        for count,refit_model in enumerate(self.refit_base_model_dict):
            try:
                refit_model_prediction = pd.DataFrame(self.refit_base_model_dict[refit_model]['model'].predict(features.values),columns=[self.refit_base_model_dict[refit_model]['code_name']])
            except:
                refit_model_prediction = pd.DataFrame(self.refit_base_model_dict[refit_model]['model'].predict(features),columns=[self.refit_base_model_dict[refit_model]['code_name']])
            META_prediction_df = pd.concat([META_prediction_df,refit_model_prediction],axis='columns')
        try:
            self.META_prediction = self.META_model.predict(META_prediction_df.values)
        except:
            self.META_prediction = self.META_model.predict(META_prediction_df)
        return self.META_prediction     

    # predict and compare to get error
    def predict(self):
        """Predict the test split, undo the target scaling and compute error metrics.

        Stores the rescaled predictions and true values in ``self.prediction``
        and ``self.real`` (both indexed like the test target), the metrics
        MAE, MSE, RMSE, R2 and MAPE as attributes, and a one-row summary
        (metrics, training time in minutes, seed, model order) in
        ``self.temp_normal_prediction_results_df``.
        """
        test_index = self.dataframe_test_target.index
        # scaled prediction, labelled like the test target so rows line up
        scaled_prediction = pd.DataFrame(self.model_prediction(self.dataframe_test_feature),columns=['CAP'])
        self.prediction = scaled_prediction.set_index(test_index)
        self.real = self.dataframe_test_target[[target_name]].loc[test_index]
        # rescale from model prediction
        cap_min = self.target_scale_dict['MIN_CAP']
        cap_span = self.target_scale_dict['MAX_CAP'] - cap_min
        self.prediction = (self.prediction * cap_span) + cap_min
        self.real = (self.real * cap_span) + cap_min
        # all metrics
        residual = self.real - self.prediction
        squared_residual = residual**2
        self.y_mean = self.real.mean().values[0]
        self.MAE = abs(residual).mean().values[0] # mean absolute error
        self.MSE = squared_residual.mean().values[0] # mean squared error
        self.RMSE = math.sqrt(self.MSE) # root mean squared error
        total_sum_of_squares = ((self.real - self.y_mean)**2).sum().values[0]
        self.R2 = 1 - (squared_residual.sum().values[0]/total_sum_of_squares) # R2
        self.MAPE = abs(residual/self.real).mean().values[0] # mean absolute percentage error
        # export results
        training_minutes = (self.train_end_time-self.train_start_time)/60
        summary_row = [self.MAE,self.MSE,self.RMSE,self.R2,self.MAPE,training_minutes,self.random_seed,self.complete_order_name]
        self.temp_normal_prediction_results_df = pd.DataFrame(
            [summary_row],
            columns=['MAE','MSE','RMSE','R2','MAPE','TIME','SEED','ORDER']
        )

    # shap
    def SHAP_prediction_function(self):
        """Compute SHAP values of the stacking model for the first 5 test rows.

        The first 5 training rows are handed to the kernel explainer as its
        data argument. The SHAP values (one column per feature) and the
        unscaled feature values of the explained rows are tagged with seed,
        model order and row index and exported through
        ``self.temp_feature_importance_results_df`` and
        ``self.temp_additional_plotting_value_df``.
        """
        # need 3 things 1) prediction function 2) training set 3) validation set
        background_rows = self.dataframe_train_feature.head(5)
        explained_rows = self.dataframe_test_feature.head(5)
        explainer = shap.KernelExplainer(self.model_prediction,background_rows)
        shap_values = explainer.shap_values(explained_rows,silent=True) # silent disables logging
        # shap results
        shap_df = pd.DataFrame(shap_values,columns=self.dataframe_train_feature.columns)
        shap_df['SEED'] = self.random_seed
        shap_df['ORDER'] = self.complete_order_name
        shap_df['INDEX'] = explained_rows.index
        self.shap_df = shap_df
        # original (unscaled) feature values for plot
        plotting_rows = self.dataframe_test_feature.head(5)
        plotting_df = pd.DataFrame(plotting_rows,columns=plotting_rows.columns)
        self.original_feature_value_df = plotting_df
        for feature_column in plotting_df:
            column_min = self.feature_scale_dict[f'MIN_{feature_column}']
            column_span = self.feature_scale_dict[f'MAX_{feature_column}'] - column_min
            # inverse of the min-max scaling
            plotting_df[feature_column] = (plotting_df[feature_column] * column_span) + column_min
        plotting_df['SEED'] = self.random_seed
        plotting_df['ORDER'] = self.complete_order_name
        plotting_df['INDEX'] = plotting_df.index
        # export results
        self.temp_feature_importance_results_df = self.shap_df
        self.temp_additional_plotting_value_df = self.original_feature_value_df

    # pdp
    # pdp
    def PDP_prediction_function(self):
        """Compute partial-dependence style predictions for every feature.

        For each feature a grid of values from the scaled minimum (0) to the
        scaled maximum (1) in steps of 10% is built while all other features
        are held at 0.5. The stacking model predicts each grid row and the
        predictions are rescaled to the original target units.

        Exports
        -------
        self.temp_feature_importance_results_df : one 'PDP_<feature>_CAP'
            column per feature plus SEED, ORDER, INDEX and PDPVALUES.
        self.temp_additional_plotting_value_df : the MEAN/MAX/MIN reference
            rows converted back to original feature units, plus SEED, TYPE
            and ORDER.
        """
        self.PDP_prediction_df = pd.DataFrame()
        # get the mean, max and min values of the training features to plot later
        # NOTE(review): only the index (feature names) of mean_value_series is
        # used below; max_value_series and min_value_series are never read
        mean_value_series = self.dataframe_train_feature.mean()
        max_value_series = self.dataframe_train_feature.max()
        min_value_series = self.dataframe_train_feature.min()
        mean_value_df = pd.DataFrame(columns=mean_value_series.index)
        # reference rows are fixed values in scaled space
        # NOTE(review): the 'MEAN' row is the constant 0.5, not the computed
        # feature mean - confirm this is intended
        mean_value_df.loc[0] = 0.5 # mean
        mean_value_df.loc[1] = 1 # max
        mean_value_df.loc[2] = 0 # min
        mean_value_df.index = ['MEAN','MAX','MIN']
        column_list = mean_value_df.columns
        export_mean_value_df = mean_value_df.copy()
        # the pdp process
        all_PDP_prediction_results = pd.DataFrame()
        for column in column_list:
            # get intervals of 10% from max to min to put in PDP
            # (the stop value is one step past MAX so that MAX itself is included)
            column_values_list = np.arange(mean_value_df[column]['MIN'],mean_value_df[column]['MAX']+(abs(mean_value_df[column]['MAX']-mean_value_df[column]['MIN'])/10),abs(mean_value_df[column]['MAX']-mean_value_df[column]['MIN'])/10)
            column_values_df = pd.DataFrame()
            for column_values in column_values_list:
                # start from the 'MEAN' row and overwrite only the studied feature
                temp_df = pd.DataFrame([mean_value_df.loc['MEAN']],columns=mean_value_df.columns)
                temp_df[column] = column_values
                column_values_df = pd.concat([temp_df,column_values_df],axis='rows')
            # rows were prepended, so sort them back into ascending grid order
            column_values_df = column_values_df.sort_values(column) 
            column_values_df = column_values_df.reset_index(drop=True)
            # predict for mean values and PDP
            PDP_prediction_for_column = pd.DataFrame(self.model_prediction(column_values_df),columns=[f'PDP_{column}_CAP']) # capacitance of that specific seed
            # undo the target scaling
            PDP_prediction_for_column = PDP_prediction_for_column.map(lambda x: x * (self.target_scale_dict['MAX_CAP'] - self.target_scale_dict['MIN_CAP']) + self.target_scale_dict['MIN_CAP'])
            # newest column is placed first
            all_PDP_prediction_results = pd.concat([PDP_prediction_for_column,all_PDP_prediction_results],axis='columns')
            # scale mean value df back to original feature units
            export_mean_value_df[column] = mean_value_df[column].map(lambda x: x * (self.feature_scale_dict[f'MAX_{column}'] - self.feature_scale_dict[f'MIN_{column}']) + self.feature_scale_dict[f'MIN_{column}'])
        # accumulate results
        all_PDP_prediction_results['SEED'] = self.random_seed
        all_PDP_prediction_results['ORDER'] = self.complete_order_name
        all_PDP_prediction_results['INDEX'] = all_PDP_prediction_results.index
        # the grid of the last feature is stored; every feature uses the same
        # scaled grid because MIN and MAX are 0 and 1 for all columns
        all_PDP_prediction_results['PDPVALUES'] = column_values_list
        self.all_PDP_prediction_results = all_PDP_prediction_results
        # accumulate results
        export_mean_value_df['SEED'] = self.random_seed
        export_mean_value_df['TYPE'] = export_mean_value_df.index
        export_mean_value_df['ORDER'] = self.complete_order_name
        export_mean_value_df = export_mean_value_df.reset_index(drop=True)
        self.PDP_mean_value_df = export_mean_value_df    
        # export results
        self.temp_feature_importance_results_df = self.all_PDP_prediction_results
        self.temp_additional_plotting_value_df = self.PDP_mean_value_df

    # fpi
    def FPI_prediction_function(self):
        self.FPI_prediction_df = pd.DataFrame()
        # get all features
        feature_column = self.dataframe_train_feature.columns
        self.dataframe_accumulated_FPI_prediction = pd.DataFrame()
        # apply FPI for all features
        for feature in feature_column:
            unshuffled_column = self.dataframe_test_feature[[feature]]
            shuffled_column = self.dataframe_test_feature[[feature]].sample(frac=1,random_state=0).set_index(unshuffled_column.index) # shuffle 
            dataframe_FPI_prediction = self.dataframe_test_feature.copy()
            dataframe_FPI_prediction[feature] = shuffled_column
            FPI_prediction = self.model_prediction(self.dataframe_test_feature)
            self.FPI_prediction = pd.DataFrame(self.model_prediction(dataframe_FPI_prediction),columns=['CAP'])
            self.FPI_prediction = self.FPI_prediction.map(lambda x: (x * (self.target_scale_dict['MAX_CAP'] - self.target_scale_dict['MIN_CAP'])) + self.target_scale_dict['MIN_CAP'])
            # results
            self.y_mean = self.real.mean().values[0]
            self.FPI_MAE = abs(self.real - self.FPI_prediction).mean().values[0] # mean absolute error
            self.FPI_MSE = ((self.real - self.FPI_prediction)**2).mean().values[0] # mean squared error
            self.FPI_RMSE = math.sqrt(((self.real - self.FPI_prediction)**2).mean().values[0]) # root mean squared error
            self.FPI_R2 = 1 - (((self.real - self.FPI_prediction)**2).sum().values[0]/((self.real - self.y_mean)**2).sum().values[0]) # R2
            self.FPI_MAPE = abs((self.real - self.FPI_prediction)/self.real).mean().values[0] # mean absolute percentage error
            # accumulate results
            self.dataframe_FPI_prediction = pd.DataFrame(
                [[self.FPI_MAE,self.FPI_MSE,self.FPI_RMSE,self.FPI_R2,self.FPI_MAPE,(self.train_end_time-self.train_start_time)/60,self.random_seed,self.complete_order_name,feature]],
                columns=['MAE','MSE','RMSE','R2','MAPE','TIME','SEED','ORDER','SHUFFLE']
            )
            self.dataframe_accumulated_FPI_prediction = pd.concat([self.dataframe_FPI_prediction,self.dataframe_accumulated_FPI_prediction],axis='rows')
        # export results
        self.temp_feature_importance_results_df = self.dataframe_accumulated_FPI_prediction
        self.temp_additional_plotting_value_df = pd.DataFrame()

# from here onwards, classes have overridden functions based on their use

novel method sort

In [60]:
# train the model, fit function, with added sorting
def _search_best_params(estimator_class, base_param_grid, features, target):
    """Grid-search ``estimator_class`` and return the best parameter dict.

    A fixed ``random_state`` is added to the grid only when the estimator
    actually exposes that parameter, instead of trying it blindly and
    swallowing every exception when it does not.
    """
    estimator = estimator_class()
    param_grid = base_param_grid.copy()
    # seed only estimators that accept it (e.g. trees); others (e.g. KNN, linear) are left alone
    if isinstance(param_grid, dict) and 'random_state' in estimator.get_params():
        param_grid['random_state'] = [np.random.RandomState(0)]
    grid_search = sklearn.model_selection.GridSearchCV(estimator=estimator,param_grid=param_grid)
    grid_search.fit(features,target)
    return grid_search.best_params_


def fit(self):
    """Train the sorted stacking model.

    Steps:
      1. For every split in ``self.splits_for_training`` each base model is
         tuned and fitted on the split's training part, and its predictions
         on the hold-out part are collected as meta-features.
      2. The hold-out predictions of all splits are stacked into
         ``self.META_train_df``; its columns are reordered by the values of
         its first row (direction given by ``self.ascending``) into
         ``self.sorted_META_train_df``, on which the meta model is tuned and
         fitted (``self.META_model``).
      3. Every base model is tuned and refitted on the full training set and
         stored in ``self.refit_base_model_dict`` together with the
         ``'MODEL <n>'`` column name it corresponds to.

    Reads ``combination_name_tuple``, ``combination_class_tuple``,
    ``parameters_dict``, ``stack_meta_model_name``, ``stack_meta_model_class``,
    ``dataframe_train_feature``, ``dataframe_train_target``,
    ``splits_for_training`` and ``ascending`` from ``self``.
    Also records ``train_start_time`` / ``train_end_time``.
    """
    # keep track of time
    self.train_start_time = time.time()
    # one entry per base model, filled per fold below (kept for all folds)
    self.model_dict = {model_name: {} for model_name in self.combination_name_tuple}
    # accumulate prediction set
    self.prediction_list = []
    self.META_hold_out_prediction_list = []
    # hold-out targets in the same order as the hold-out predictions
    hold_out_target_list = []
    # meta model training process
    for count_for_train, (train_index_for_validation,test_index_for_validation) in enumerate(self.splits_for_training):
        # splits
        kfold_train_feature_df = self.dataframe_train_feature.iloc[train_index_for_validation]
        kfold_train_target_df = self.dataframe_train_target.iloc[train_index_for_validation]
        kfold_test_feature_df = self.dataframe_train_feature.iloc[test_index_for_validation]
        kfold_test_target_df = self.dataframe_train_target.iloc[test_index_for_validation]
        hold_out_target_list.append(kfold_test_target_df)
        # tune and train every base model on this fold
        for count,model_name in enumerate(self.combination_name_tuple):
            estimator_class = self.combination_class_tuple[count]
            best_params = _search_best_params(
                estimator_class,
                self.parameters_dict[model_name],
                kfold_train_feature_df,
                kfold_train_target_df[['CAP']],
            )
            model = estimator_class(**best_params)
            model.fit(kfold_train_feature_df,kfold_train_target_df[['CAP']])
            # accumulate results of model
            self.model_dict[model_name][count_for_train] = {
                'model': model,
                'meta_feature': model.predict(kfold_test_feature_df),
            }
        # stacking: one 'MODEL <n>' column per base model for this fold
        fold_column_list = [
            pd.DataFrame(self.model_dict[key][count_for_train]['meta_feature'],columns=[f'MODEL {count}'])
            for count,key in enumerate(self.model_dict)
        ]
        if fold_column_list:
            META_hold_out_prediction_df = pd.concat(fold_column_list,axis='columns')
        else:
            META_hold_out_prediction_df = pd.DataFrame()
        self.META_hold_out_prediction_list.append(META_hold_out_prediction_df)
    # meta model training set
    self.META_train_df = pd.concat(self.META_hold_out_prediction_list,axis='rows').reset_index(drop=True)
    # META_train_df rows follow the hold-out order of the folds, so the target is
    # taken in that same order; this equals the plain training target when the
    # folds are contiguous and keeps rows aligned when the splits are shuffled
    meta_train_target = pd.concat(hold_out_target_list,axis='rows').values.ravel()
    # meta model
    # new, sort the values
    self.sorted_META_train_df = self.META_train_df.sort_values(by=self.META_train_df.first_valid_index(),axis='columns',ascending=self.ascending)
    meta_best_params = _search_best_params(
        self.stack_meta_model_class,
        self.parameters_dict[self.stack_meta_model_name],
        self.sorted_META_train_df.values,
        meta_train_target,
    )
    META_model = self.stack_meta_model_class(**meta_best_params)
    META_model.fit(self.sorted_META_train_df.values,meta_train_target)
    self.META_model = META_model
    # refit base models
    # the refit base model orders change
    full_train_feature = self.dataframe_train_feature.values
    full_train_target = self.dataframe_train_target.values.ravel()
    self.refit_base_model_dict = {}
    for count,model_name in enumerate(self.combination_name_tuple):
        estimator_class = self.combination_class_tuple[count]
        # tune on the real training features (the data the model is fitted on),
        # not on the meta-features
        refit_best_params = _search_best_params(
            estimator_class,
            self.parameters_dict[model_name],
            full_train_feature,
            full_train_target,
        )
        model = estimator_class(**refit_best_params)
        model.fit(full_train_feature,full_train_target)
        self.refit_base_model_dict[f'refit{model_name}'] = {
            'model': model,
            # add code name for sorting
            'code_name': f'MODEL {count}',
        }
    self.train_end_time = time.time()

# stacking model prediction function
def model_prediction(self,features):
    """Predict with the stacked model.

    Every refit base model in ``self.refit_base_model_dict`` predicts on
    ``features``; the predictions are arranged in the column order of
    ``self.sorted_META_train_df`` (the order the meta model was trained on)
    and passed to ``self.META_model``.

    Parameters
    ----------
    features : pandas.DataFrame or array-like
        Feature matrix. An object with a ``.values`` attribute is converted
        to its underlying array; anything else is passed to the models as is.

    Returns
    -------
    The meta model's prediction, also stored in ``self.META_prediction``.
    The ordered base predictions are kept in ``self.sorted_META_prediction_df``.
    """
    # use the raw array when a DataFrame is given; plain arrays pass through unchanged
    feature_matrix = getattr(features,'values',features)
    base_prediction_list = [
        pd.DataFrame(entry['model'].predict(feature_matrix),columns=[entry['code_name']])
        for entry in self.refit_base_model_dict.values()
    ]
    if base_prediction_list:
        META_prediction_df = pd.concat(base_prediction_list,axis='columns')
    else:
        META_prediction_df = pd.DataFrame()
    # new: present the base predictions in the same column order as the meta training set
    rearranged_column_list = list(self.sorted_META_train_df.columns)
    self.sorted_META_prediction_df = META_prediction_df.loc[:,rearranged_column_list].copy()
    # end new
    self.META_prediction = self.META_model.predict(self.sorted_META_prediction_df.values)
    return self.META_prediction

# class with overridden function
class Sort_Stacking_Process(Main_Stacking_Process):
    """Stacking process whose meta-model inputs are sorted.

    Takes the same arguments as ``Main_Stacking_Process`` plus ``ascending``,
    the sort direction used by the ``fit`` function attached below.
    """

    def __init__(
        self,
        base_model_names,
        base_model_classes,
        stack_meta_model_name,
        stack_meta_model_class,
        method,
        ascending,
    ):
        # stored before the parent constructor runs
        # NOTE(review): presumably the parent constructor already trains the model
        # and therefore needs ``ascending`` to exist - confirm
        self.ascending = ascending
        super().__init__(
            base_model_names,
            base_model_classes,
            stack_meta_model_name,
            stack_meta_model_class,
            method,
        )


# replace the inherited training / prediction routines with the sorting variants
Sort_Stacking_Process.fit = fit
Sort_Stacking_Process.model_prediction = model_prediction

# main loop

models used and process

In [61]:
# base models and meta models, keyed by their short names
stack_model_class_dict = {'DT':DecisionTreeRegressor,'KNN':KNeighborsRegressor,'LIN':LinearRegression}
stack_model_tuple = tuple(sorted(['KNN','DT','LIN'])) # number of choices (n)
stack_meta_model_class_dict = {'DT':DecisionTreeRegressor,'KNN':KNeighborsRegressor,'LIN':LinearRegression}
stack_meta_model_tuple = tuple(sorted(['KNN','DT','LIN'])) # number of choices (n)

true_depth = len(stack_model_tuple)

# every (process class, method name) pair that is evaluated
method_list = [
    (Sort_Stacking_Process,'Descending'),
    (Sort_Stacking_Process,'Ascending'),
    (Main_Stacking_Process,'Handcode'),
]
# sort direction per method name; methods missing here get no ``ascending`` argument
ascending_by_method = {'Ascending':True,'Descending':False}

# loop through all meta models and methods
for method in method_list:
    print(method)
    for count,stack_meta_model_name in enumerate(stack_meta_model_tuple):
        stack_meta_model_class = stack_meta_model_class_dict[stack_meta_model_name]
        process_kwargs = {
            'base_model_names': stack_model_tuple,
            'base_model_classes': stack_model_class_dict,
            'stack_meta_model_name': stack_meta_model_name,
            'stack_meta_model_class': stack_meta_model_class,
            'method': method[1],
        }
        if method[1] in ascending_by_method:
            process_kwargs['ascending'] = ascending_by_method[method[1]]
        process = method[0](**process_kwargs)

(<class '__main__.Sort_Stacking_Process'>, 'Descending')
Start
META: DT
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
Start
META: KNN
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
Start
META: LIN
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
(<class '__main__.Sort_Stacking_Process'>, 'Ascending')
Start
META: DT
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
Start
META: KNN
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
Start
META: LIN
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
(<class '__main__.Main_Stacking_Process'>, 'Handcode')
Start
META: DT
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
Start
META: KNN
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
Start
META: LIN
MODELS: ('DT', 'KNN', 'LIN')
FPI
PDP
SHAP
