## Import Libraries

In [13]:
import numpy as np 
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
import plotly.graph_objects as go
import datapane as dp
import chart_studio.plotly as py
import chart_studio
username = os.environ['PLOTLY_USERNAME']
api_key = os.environ['PLOTLY_API_KEY']
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
%matplotlib inline

## Generate Dataset

In [2]:
X, y = make_classification(n_samples=50000, 
                           n_features=20, 
                           n_informative=15, 
                           n_redundant=5,
                           n_clusters_per_class=5,
                           class_sep=0.7,
                           flip_y=0.03,
                           n_classes=2)

## Individual models

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [26]:
models_dict = {'random_forest': RandomForestClassifier(n_estimators=50),
               'svm': SVC(),
               'knn': KNeighborsClassifier(n_neighbors=11)}

def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, verbose=1, n_jobs=5, error_score='raise')
    return scores


In [5]:
from collections import defaultdict
model_scores = defaultdict()
model_names = []

for name, model in models_dict.items():
    print('Evaluating {}'.format(name))
    scores = evaluate_model(model, X, y)
    model_scores[name] = scores
    model_names.append(name)

Evaluating random_forest


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:   31.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Evaluating svm


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:  3.3min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Evaluating knn


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:   40.0s finished


In [33]:
def plot_results(model_scores, name):
    
    model_names = list(model_scores.keys())
    results = [model_scores[model] for model in model_names]
    fig = go.Figure()
    for model, result in zip(model_names, results):
        fig.add_trace(go.Box(
            y=result,
            name=model,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker_size=2,
            line_width=1)
        )
    
    fig.update_layout(
    title='Performance of Different Models Using 5-Fold Cross-Validation',
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    xaxis_title='Model',
    yaxis_title='Accuracy',
    showlegend=False)
    fig.show()
    
    report = dp.Report(dp.Plot(fig) ) #Create a report
    report.publish(name=name, open=True, visibility='PUBLIC') #Publish the report

In [16]:
model_scores

defaultdict(None,
            {'random_forest': array([0.8724, 0.8739, 0.871 , 0.8728, 0.8703, 0.8778, 0.8709, 0.8625,
                    0.8749, 0.8706]),
             'svm': array([0.891 , 0.8945, 0.8956, 0.8973, 0.8899, 0.8983, 0.8911, 0.8881,
                    0.8927, 0.8974]),
             'knn': array([0.8881, 0.8907, 0.8923, 0.8932, 0.8915, 0.8973, 0.891 , 0.8884,
                    0.8844, 0.8929])})

In [17]:
plot_results(model_scores, 'base_models_cv')

Publishing report and associated data - please wait..
Report successfully published at https://datapane.com/u/AmolMavuduru/reports/base-models/


## Stacking

In [29]:
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV

base_models = [('random_forest', RandomForestClassifier(n_estimators=50)),
               ('svm', SVC()),
               ('knn', KNeighborsClassifier(n_neighbors=11))]
meta_model = LogisticRegressionCV()
stacking_model = StackingClassifier(estimators=base_models, 
                                    final_estimator=meta_model, 
                                    passthrough=True, 
                                    cv=5,
                                    verbose=2)

In [30]:
stacking_scores = evaluate_model(stacking_model, X, y)
model_scores['stacking'] = stacking_scores

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 14.4min finished


In [34]:
plot_results(model_scores, name='stacking_model_cv')

Publishing report and associated data - please wait..
Report successfully published at https://datapane.com/u/AmolMavuduru/reports/stacking-model-cv/
