# Stacking Method Function Definition

## Input : 

origin_df_X : The input(original) features of the data. It is in the dataframe format.

origin_df_Y : The input(original) labels of the data. It is in the dataframe format

kfold : The number of folds that is used to do the Stratified K-Fold cross validation.

is_debug : The flag that turns on verbose debug messages.

all_basic_classifiers : The dictionary format or "key=val" format of all the classifiers that you want to use to stack.

## Output : 

output_df_all : The origin_df_X appended with the prediction results of all classifiers in **all_basic_classifiers.

model_dict : The dictionary that stores all the models used in the Stratified Cross Validation. Every classifier in **all_basic_classifiers has 'kfold' classifiers with the same parameters.

In [1]:
import random
import sys
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from datetime import datetime

warnings.filterwarnings(action='ignore', category=DeprecationWarning)

def StackingMethod(origin_df_X, origin_df_Y, kfold=10, is_debug=0, **all_basic_classifiers):
    """Append out-of-fold predictions of several classifiers as new features.

    The data is split with a shuffled StratifiedKFold. For every fold, a fresh
    copy of each classifier is fitted on the training part and its predictions
    for the held-out part are written into that classifier's new column, so
    each row's new feature comes from a model that did not see that row.

    Args:
        origin_df_X: Dataframe holding the original features.
        origin_df_Y: Dataframe holding the labels; it is flattened with
            ``.values.ravel()``.
        kfold: Number of stratified folds.
        is_debug: Truthy value enables verbose printing.
        **all_basic_classifiers: ``name=estimator`` pairs. Each estimator must
            provide ``fit`` and ``predict``. Predictions are stored in a float
            array, so they have to be numeric.

    Returns:
        Tuple ``(output_df_all, model_dict)``. ``output_df_all`` is
        ``origin_df_X`` with one extra ``'Label_<name>'`` column per
        classifier. ``model_dict`` maps each name to the list of ``kfold``
        fitted models, one per fold, in fold order.

    Note:
        The estimators passed in are used as templates only; they are not
        fitted by this function.
    """
    # Local import: keeps the file's top-level import block unchanged.
    from sklearn.base import clone

    X_train = origin_df_X.values
    Y_train = origin_df_Y.values.ravel()
    # Reseed from OS entropy / current time. Passing a datetime object to
    # random.seed() raises TypeError on Python 3.11+.
    random.seed()
    skf = StratifiedKFold(n_splits=kfold, random_state=random.randint(0, 2**32 - 1), shuffle=True)
    len_y = 0
    new_feature_columns = ['Label_' + name for name in all_basic_classifiers]
    new_feature_arr = np.zeros([len(X_train), len(new_feature_columns)])
    # One list of per-fold models for every classifier name.
    model_dict = {name: [] for name in all_basic_classifiers}

    for iteration, (train_index, test_index) in enumerate(skf.split(X_train, Y_train)):
        X_cv_train = X_train[train_index]
        Y_cv_train = Y_train[train_index]
        X_cv_test = X_train[test_index]

        if is_debug:
            print(f"-----iteration {iteration}-----")
            print(f'test_index = {test_index}')
        for column_label_index, (k, v) in enumerate(all_basic_classifiers.items()):
            # Fit an independent copy per fold. Refitting `v` itself would make
            # every entry of model_dict[k] alias the same object, leaving only
            # the last fold's fit. safe=False falls back to a deep copy for
            # objects that are not scikit-learn estimators.
            classifier_cv = clone(v, safe=False)
            classifier_cv.fit(X_cv_train, Y_cv_train)
            model_dict[k].append(classifier_cv)
            Y_cv_test_result = classifier_cv.predict(X_cv_test)
            # Scatter the held-out predictions back to their original rows.
            new_feature_arr[test_index, column_label_index] = Y_cv_test_result

            if is_debug:
                len_y += len(Y_cv_test_result)
                print(f'key = {k}, val = {v}')
                print(f'Y_cv_test_result = {Y_cv_test_result}')
                print(f'len(Y_cv_test_result) = {len(Y_cv_test_result)}')
                print(type(Y_cv_test_result))
                print('-------')

    # Reuse the input index so the column-wise concat lines rows up by position
    # even when origin_df_X does not carry a default RangeIndex.
    new_feature_df = pd.DataFrame(data=new_feature_arr, columns=new_feature_columns, index=origin_df_X.index)
    output_df_all = pd.concat([origin_df_X, new_feature_df], axis=1, ignore_index=False)

    if is_debug:
        print(f'total len_y = {len_y}')
        print(f'new_feature_columns = {new_feature_columns}')
        for count_index_fin, x in enumerate(new_feature_arr):
            print(f'index = {count_index_fin}, label = {x}')

    return (output_df_all, model_dict)

# Cross Validation Grid Search Nested Enhanced Function

It is enhanced to support those classifiers without tuned_param.

In other words, you can pass an empty tuned_param.

In [2]:
def CrossValidationGridSearchNested(origin_df_X, origin_df_Y, num_trials, fold_num, est_classifcation, tuned_param, scoring):
    """Repeat grid search plus cross-validated scoring and keep the best trial.

    For each trial ``i`` two shuffled StratifiedKFold splitters are built
    (seeds ``i`` and ``i + 1``). When ``tuned_param`` is non-empty, a
    GridSearchCV is fitted on the full data with the inner splitter and its
    ``best_estimator_`` is scored with ``cross_val_score`` on the outer
    splitter. When ``tuned_param`` is empty, ``est_classifcation`` itself is
    scored. The trial with the highest mean score wins.

    Args:
        origin_df_X: Dataframe holding the features.
        origin_df_Y: Dataframe holding the labels; flattened with
            ``.values.ravel()``.
        num_trials: Number of trials to run.
        fold_num: Number of folds for both splitters.
        est_classifcation: Estimator to tune and/or score.
        tuned_param: Parameter grid for GridSearchCV. ``None``, ``[]`` or
            ``{}`` skips the grid search.
        scoring: Scoring argument forwarded to scikit-learn.

    Returns:
        Tuple ``(max_score, best_estimator)``. If no trial produced a
        comparable score (for example ``num_trials == 0``), the result is
        ``(-1, est_classifcation)``.

    Note:
        The grid search is fitted on all of the data before the winner is
        scored on that same data, so this is not a fully nested procedure and
        the reported score can be optimistic.
    """
    X_data = origin_df_X.values
    Y_data = origin_df_Y.values.ravel()
    # Start below every possible score so that scorers whose values are <= -1
    # (e.g. negated losses) can still register a best trial.
    max_score = float('-inf')
    best_estimator = est_classifcation
    # Truthiness covers None, an empty list and an empty dict.
    is_tuned_param_empty = not tuned_param

    for i in range(num_trials):
        outer_cv = StratifiedKFold(n_splits=fold_num, random_state=i+1, shuffle=True)

        if is_tuned_param_empty:
            candidate = est_classifcation
        else:
            # Non-nested parameter search: fitted on the full data set.
            inner_cv = StratifiedKFold(n_splits=fold_num, random_state=i, shuffle=True)
            clf = GridSearchCV(estimator=est_classifcation, param_grid=tuned_param, cv=inner_cv, scoring=scoring)
            clf.fit(X_data, Y_data)
            candidate = clf.best_estimator_

        # Score the trial's candidate on the outer splitter.
        param_score = cross_val_score(candidate, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()

        if param_score > max_score:
            max_score = param_score
            best_estimator = candidate

        progress = (i+1)/num_trials*100
        print(f'> progress = {progress}%')

    if max_score == float('-inf'):
        # Keep the historical sentinel when nothing was scored.
        max_score = -1

    return (max_score, best_estimator)

# Example
This example shows how to:
1. Use CrossValidationGridSearchNested to find the best parameters of both the SVM and NaiveBayes classifiers, which serve as the basic classifiers.
2. Use the best parameters of the SVM and NaiveBayes classifiers to stack the original feature space, or in other words, to compute and append two new features that come individually from the cross validation results of the SVM and NaiveBayes classifiers. The output is the new dataframe with the expanded features.

Load the breast cancer dataset

In [3]:
from sklearn.datasets import load_breast_cancer

# Fetch the bundled breast cancer dataset and pull out features and targets.
data = load_breast_cancer()
X, Y = data.data, data.target

# Report the array dimensions, one line per array.
print(f'shape of X = {X.shape}', f'shape of Y = {Y.shape}', sep='\n')

shape of X = (569, 30)
shape of Y = (569,)


In [4]:
# Display the raw feature matrix.
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [5]:
# Display the raw label vector.
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

#### Transform X and Y to the format of dataframe.

#### Use CrossValidationGridSearchNested to find out the best parameters of SVM and NaiveBayes classifiers.

#### The NaiveBayes part passes an empty tuned_param, which CrossValidationGridSearchNested now supports.

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# ---------------- Wrap the raw arrays in dataframes ---------------- #
org_columns = [f"col_org_{idx}" for idx in range(len(X[0]))]
original_df_X = pd.DataFrame(data=X, columns=org_columns)

# Y is reshaped into a single column, so exactly one name ("Label0") is built.
org_columns = [f"Label{idx}" for idx in range(len(Y.reshape(len(Y), 1)[0]))]
original_df_Y = pd.DataFrame(data=Y.reshape(len(Y), 1), columns=org_columns)

# ---------------- Tuned SVM ---------------- #
# Candidate grids: one for the RBF kernel and one for the linear kernel.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Number of random trials (shared by both searches below)
NUM_TRIALS = 10

# Base Support Vector Classifier; the kernel is chosen by the grid search.
svm = SVC()

max_score, svm_best_estimator = CrossValidationGridSearchNested(
    original_df_X, original_df_Y, NUM_TRIALS, 10, svm, tuned_parameters, 'roc_auc'
)
svm_best_parameter = svm_best_estimator.get_params()

print(f'\nmax_score = {max_score}\n')
print(f'\nbest_estimator = {svm_best_estimator}\n')
print(f'\nbest_parameter = {svm_best_parameter}\n')

# ---------------- Naive Bayes - Gaussian ---------------- #
# Nothing to tune here: an empty grid makes the helper score the estimator as-is.
tuned_parameters = []

clf = GaussianNB()

max_score, gau_nb_best_estimator = CrossValidationGridSearchNested(
    original_df_X, original_df_Y, NUM_TRIALS, 10, clf, tuned_parameters, 'roc_auc'
)
gau_nb_best_parameter = gau_nb_best_estimator.get_params()

print(f'\nmax_score = {max_score}\n')
print(f'\nbest_estimator = {gau_nb_best_estimator}\n')
print(f'\nbest_parameter = {gau_nb_best_parameter}\n')

> progress = 10.0%
> progress = 20.0%
> progress = 30.0%
> progress = 40.0%
> progress = 50.0%
> progress = 60.0%
> progress = 70.0%
> progress = 80.0%
> progress = 90.0%
> progress = 100.0%

max_score = 0.9928779289493574


best_estimator = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


best_parameter = {'C': 1000, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}

> progress = 10.0%
> progress = 20.0%
> progress = 30.0%
> progress = 40.0%
> progress = 50.0%
> progress = 60.0%
> progress = 70.0%
> progress = 80.0%
> progress = 90.0%
> progress = 100.0%

max_score = 0.989685975400261


best_estimator = Gaussi

#### Use svm_best_estimator and gau_nb_best_estimator, which carry the best parameters of the SVM and NaiveBayes classifiers found by CrossValidationGridSearchNested.

#### StackingMethod will output the new_df that has two new features generated from the Stratified Cross Validation by svm_best_estimator and gau_nb_best_estimator

In [7]:
# Each key becomes the suffix of a new 'Label_<key>' column in the output.
all_classifier = dict(
    svm_classifier=svm_best_estimator,
    naive_bayse_gaussian_classifier=gau_nb_best_estimator,
)
is_debug = 1
new_df, model_dict = StackingMethod(
    original_df_X, original_df_Y, kfold=10, is_debug=is_debug, **all_classifier
)

# Alternative usage: pass the classifiers directly as keyword arguments.
# StackingMethod returns a (dataframe, model_dict) pair, so unpack both:
# new_df, model_dict = StackingMethod(original_df_X, original_df_Y, 10, svm_classifier=svm, naive_bayse_gaussian_classifier=gau_nb_best_estimator)

-----iteration 0-----
test_index = [  2   8  11  19  24  26  28  40  51  59  72  81  86 102 122 138 139 151
 155 167 226 229 232 241 250 277 282 289 292 298 299 303 307 317 341 349
 362 378 381 394 399 402 418 420 426 433 436 438 439 442 470 474 488 501
 503 516 557 564]
key = svm_classifier, val = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Y_cv_test_result = [0 0 0 1 0 0 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 0]
len(Y_cv_test_result) = 58
<class 'numpy.ndarray'>
-------
key = naive_bayse_gaussian_classifier, val = GaussianNB(priors=None)
Y_cv_test_result = [0 0 0 1 0 0 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 0]
len(Y_cv_test_result) = 58
<class 'numpy.ndarray'>
-------
-----ite

key = svm_classifier, val = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Y_cv_test_result = [0 0 1 0 0 0 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0
 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0]
len(Y_cv_test_result) = 56
<class 'numpy.ndarray'>
-------
key = naive_bayse_gaussian_classifier, val = GaussianNB(priors=None)
Y_cv_test_result = [0 0 1 0 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0
 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
len(Y_cv_test_result) = 56
<class 'numpy.ndarray'>
-------
total len_y = 1138
new_feature_columns = ['Label_svm_classifier', 'Label_naive_bayse_gaussian_classifier']
index = 0, label = [0. 0.]
index = 1, label = [0. 0.]
index = 2, label = [0. 0.]
index = 3, label = [0. 0.]
index = 4, label = [0. 0.]
index = 5, label = [0. 0.]
index = 6, label = 

In [8]:
# Display the stacked dataframe returned by StackingMethod.
new_df

Unnamed: 0,col_org_0,col_org_1,col_org_2,col_org_3,col_org_4,col_org_5,col_org_6,col_org_7,col_org_8,col_org_9,...,col_org_22,col_org_23,col_org_24,col_org_25,col_org_26,col_org_27,col_org_28,col_org_29,Label_svm_classifier,Label_naive_bayse_gaussian_classifier
0,17.990,10.38,122.80,1001.0,0.11840,0.27760,0.300100,0.147100,0.2419,0.07871,...,184.60,2019.0,0.16220,0.66560,0.71190,0.26540,0.4601,0.11890,0.0,0.0
1,20.570,17.77,132.90,1326.0,0.08474,0.07864,0.086900,0.070170,0.1812,0.05667,...,158.80,1956.0,0.12380,0.18660,0.24160,0.18600,0.2750,0.08902,0.0,0.0
2,19.690,21.25,130.00,1203.0,0.10960,0.15990,0.197400,0.127900,0.2069,0.05999,...,152.50,1709.0,0.14440,0.42450,0.45040,0.24300,0.3613,0.08758,0.0,0.0
3,11.420,20.38,77.58,386.1,0.14250,0.28390,0.241400,0.105200,0.2597,0.09744,...,98.87,567.7,0.20980,0.86630,0.68690,0.25750,0.6638,0.17300,0.0,0.0
4,20.290,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.104300,0.1809,0.05883,...,152.20,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678,0.0,0.0
5,12.450,15.70,82.57,477.1,0.12780,0.17000,0.157800,0.080890,0.2087,0.07613,...,103.40,741.6,0.17910,0.52490,0.53550,0.17410,0.3985,0.12440,0.0,0.0
6,18.250,19.98,119.60,1040.0,0.09463,0.10900,0.112700,0.074000,0.1794,0.05742,...,153.20,1606.0,0.14420,0.25760,0.37840,0.19320,0.3063,0.08368,0.0,0.0
7,13.710,20.83,90.20,577.9,0.11890,0.16450,0.093660,0.059850,0.2196,0.07451,...,110.60,897.0,0.16540,0.36820,0.26780,0.15560,0.3196,0.11510,0.0,0.0
8,13.000,21.82,87.50,519.8,0.12730,0.19320,0.185900,0.093530,0.2350,0.07389,...,106.20,739.3,0.17030,0.54010,0.53900,0.20600,0.4378,0.10720,0.0,0.0
9,12.460,24.04,83.97,475.9,0.11860,0.23960,0.227300,0.085430,0.2030,0.08243,...,97.65,711.4,0.18530,1.05800,1.10500,0.22100,0.4366,0.20750,0.0,0.0


#### Combine with the original ground-truth Label (Y) to see the result.

In [9]:
# Attach the ground-truth labels (column 'Label0' of original_df_Y) as a new
# 'Label' column next to the per-classifier prediction columns.
new_df['Label'] = original_df_Y['Label0']
# Display the dataframe with the added column.
new_df

Unnamed: 0,col_org_0,col_org_1,col_org_2,col_org_3,col_org_4,col_org_5,col_org_6,col_org_7,col_org_8,col_org_9,...,col_org_23,col_org_24,col_org_25,col_org_26,col_org_27,col_org_28,col_org_29,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label
0,17.990,10.38,122.80,1001.0,0.11840,0.27760,0.300100,0.147100,0.2419,0.07871,...,2019.0,0.16220,0.66560,0.71190,0.26540,0.4601,0.11890,0.0,0.0,0
1,20.570,17.77,132.90,1326.0,0.08474,0.07864,0.086900,0.070170,0.1812,0.05667,...,1956.0,0.12380,0.18660,0.24160,0.18600,0.2750,0.08902,0.0,0.0,0
2,19.690,21.25,130.00,1203.0,0.10960,0.15990,0.197400,0.127900,0.2069,0.05999,...,1709.0,0.14440,0.42450,0.45040,0.24300,0.3613,0.08758,0.0,0.0,0
3,11.420,20.38,77.58,386.1,0.14250,0.28390,0.241400,0.105200,0.2597,0.09744,...,567.7,0.20980,0.86630,0.68690,0.25750,0.6638,0.17300,0.0,0.0,0
4,20.290,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.104300,0.1809,0.05883,...,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678,0.0,0.0,0
5,12.450,15.70,82.57,477.1,0.12780,0.17000,0.157800,0.080890,0.2087,0.07613,...,741.6,0.17910,0.52490,0.53550,0.17410,0.3985,0.12440,0.0,0.0,0
6,18.250,19.98,119.60,1040.0,0.09463,0.10900,0.112700,0.074000,0.1794,0.05742,...,1606.0,0.14420,0.25760,0.37840,0.19320,0.3063,0.08368,0.0,0.0,0
7,13.710,20.83,90.20,577.9,0.11890,0.16450,0.093660,0.059850,0.2196,0.07451,...,897.0,0.16540,0.36820,0.26780,0.15560,0.3196,0.11510,0.0,0.0,0
8,13.000,21.82,87.50,519.8,0.12730,0.19320,0.185900,0.093530,0.2350,0.07389,...,739.3,0.17030,0.54010,0.53900,0.20600,0.4378,0.10720,0.0,0.0,0
9,12.460,24.04,83.97,475.9,0.11860,0.23960,0.227300,0.085430,0.2030,0.08243,...,711.4,0.18530,1.05800,1.10500,0.22100,0.4366,0.20750,0.0,0.0,0
