# Building Automated Stacking Class

In this notebook I'm going to start building a stacking class that can automatically do a few things:
- Take in a dictionary of algorithms (hopefully eventually from different libraries like SKLearn, Keras, XGBoost, and LightGBM) and automatically grid search parameters and save the top performing models.
- Fit a cross validating stacking meta regressor or classifier

In [5]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold

In [140]:
class stacking_ensemble(object):
    
    def __init__(self, base_models, stacking_model, k_folds=3):
        
        # 'models' will be a list of base model.
        self.base_models = base_models
        self.stacking_model = stacking_model
        self.k_folds = k_folds
        self.top_models_gs = False
        self.top_models_non_gs = False
        
    def find_top_models(self, X, y, k_models=1, cv=None, subsample=None, scoring=None):
        """
        DOCSTRING
        
        This function will look at the basemodels in, and finds the best models and then saves the best
        model/parameter combinations to a new object which can be used for fitting.
        
        scoring: way of comparing performance of models.  Can take on all values from SKLearn model
        evaluation documentation
        
        subsample: downsample the training data to make model selection faster.  Values between 0 and 1.
        """

        X_data = np.array(X)
        y_data = np.array(y)

        if subsample:
            axes_choices = np.random.binomial(n=1, p=subsample, size=X.shape[0])
            X_data = X_data[axes_choices == 1]
            y_data = y_data[axes_choices == 1]

        best_models = []
        from sklearn.model_selection import cross_val_score

        model_scores = []
        for model in self.base_models.values():
            score = np.mean(cross_val_score(model, X_data, y_data, scoring=scoring, cv=cv))
            model_scores.append({'model': model, 'score': score})

        self.top_models = sorted(model_scores, key=lambda k: k['score'], reverse=True)[:k_models]
        self.top_models_non_gs = True
        
    def find_top_models_gs(self, X, y, k_models=1, cv=None, subsample=None, scoring=None, verbose=0):
        """
        This will find the top models by gridsearching.
        
        To use this feature, base_models must be initially passed in this form: 
        
        base_models = {'log_reg': {'model': lr,
                                   'param_grid': {'penalty': ['l1', 'l2'], 'C': [.5, 1]}},
                       'rand_for': {'model': rf, 
                                    'param_grid': {'max_depth': [2,4,5]}},
                        }
                
        """
        
        from sklearn.model_selection import GridSearchCV
        
        model_scores = []
        
        X_data = np.array(X)
        y_data = np.array(y)

        if subsample:
            axes_choices = np.random.binomial(n=1, p=subsample, size=X.shape[0])
            X_data = X_data[axes_choices == 1]
            y_data = y_data[axes_choices == 1]

        for model_name, data in self.base_models.items():
            
            model = data['model']
            parameters = data['param_grid']
            
            gscv = GridSearchCV(model, param_grid=parameters, cv=cv, verbose=verbose)
            gscv.fit(X_data, y_data)
            
            model_scores += ([{'model': model,
                   'score': score,
                   'parameters': params} for params, score in zip(gscv.cv_results_['params'],
                                                               gscv.cv_results_['mean_test_score'])])
        
        top_models = sorted(model_scores, key=lambda k: k['score'], reverse=True)[:k_models]
        self.top_models_gs = True
        self.top_models = top_models
        
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        df_meta = np.zeros(X.shape)
        fold_ids = np.zeros(df_meta.shape[0])  
        
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=self.k_folds, shuffle=True)
        
        for i, (train, test) in enumerate(kf.split(X, y)):
            fold_ids[test] = i
        
        
        if not self.top_models_gs and not self.top_models_non_gs:
            
            print 'fitting base models'    
            for k in set(fold_ids):
                print '---> Fitting fold ', k + 1
                train_fold = X[fold_ids != k, :]
                
                for i, model in enumerate(self.base_models.values()):
                    print '------> Fitting model ', str(i)
                    model.fit(train_fold, y[fold_ids != k])
                    df_meta[fold_ids == k, i] = model.predict(X[fold_ids == k, :]) 
                    
            # Fit each base model to the entire X training set once we've fit the meta columns
            for model in self.base_models.values():
                model.fit(X, y)
            
        else:
            # If we've grid searched best models, here we'll fit the parameters to all the models.
            if self.top_models_gs:
                for model_dict in self.top_models:
                    model_dict['model'].set_params(**model_dict['parameters'])
            
            for k in set(fold_ids):
                print '---> Fitting fold ', k + 1
                train_fold = X[fold_ids != k, :]
                
                for i, model_dict in enumerate(self.top_models):
                    print '------> Fitting model ', str(i)
                    model_dict['model'].fit(train_fold, y[fold_ids != k])
                    df_meta[fold_ids == k, i] = model_dict['model'].predict(X[fold_ids == k, :])
                    
            # Fit each base model to the entire X training set once we've fit the meta columns
            for model_dict in self.top_models:
                model_dict['model'].fit(X, y)                
         
        # FITTING THE FINAL META MODEL
        print 'fitting stacking model...'
        self.stacking_model.fit(df_meta, y)
        
    def predict(self, X):
        X = np.array(X)
        test_meta = np.zeros(X.shape)
        
        if not self.top_models_gs and not self.top_models_non_gs:
            for i, model in enumerate(self.base_models.values()):
                test_meta[:, i] = model.predict(X)
        else:
            for i, model_dict in enumerate(self.top_models):
                test_meta[:, i] = model_dict['model'].predict(X)
        predictions = self.stacking_model.predict(test_meta)
        return predictions
    
    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)  

In [143]:
from sklearn.datasets import load_iris

iris_data = load_iris()
X = iris_data.data
y = iris_data.target

lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=50, max_depth=2)

base_models = {'log_reg': {'model': lr, 'param_grid': {'penalty': ['l1', 'l2'], 'C': [.5, 1]}},
                'rand_for': {'model': rf, 'param_grid': {'max_depth': [2,4,5]}
                }}

stacker = stacking_ensemble(base_models, stacking_model=gbc)
stacker.find_top_models_gs(X, y, k_models=4, cv=5, subsample=.5, scoring='r2', verbose=1)
stacker.top_models

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    4.5s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished


[{'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=2, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False),
  'parameters': {'max_depth': 2},
  'score': 0.94285714285714284},
 {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=2, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False),
  'parameters': {'max_depth': 5},
  'score': 0.94285714285714284},
 {'model': LogisticRegression(C=1.0, class_weight=None, dual=F

In [144]:
stacker.fit(X, y)

fitting base models
---> Fitting fold  1.0
------> Fitting model  0
------> Fitting model  1
------> Fitting model  2
------> Fitting model  3
---> Fitting fold  2.0
------> Fitting model  0
------> Fitting model  1
------> Fitting model  2
------> Fitting model  3
---> Fitting fold  3.0
------> Fitting model  0
------> Fitting model  1
------> Fitting model  2
------> Fitting model  3
fitting stacking model


### Notes

1. base models can't be passed in as a dict because order won't be maintained
    1. maybe what we can do is allow you to pass in basemodels as a list of dictionaries, and then reformat so that it's the same as the way we're currently storing the top_models (this already is a list of dictionaries).
2. If there's no parameters you should just be able to pass the models in as a list.
3. For grid searching parameters, we could make it like gridsearching in a pipeline is currently, and then internally it parses the dictionary and creates a new parameters dictionary that can be used for models later.
    1. Could also just make it so that you pass in the models as a list, and then pass in the parameters as a list of dictionaries, and you just have to maintain the order so they can be zipped together.