# Labs Data Science Workshop: Super Ensemble Classifier

## Workshop Outline
1. Introduction & Motivation
2. Classifier Model
3. Ensemble Model
4. Super Ensemble Model

## Step 1 - Introduction & Motivation

## Step 2 - Classifier Model Review

In [2]:
import os
from time import perf_counter

import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

#### Load Data

In [3]:
# Read the workshop dataset from the data/ directory, relative to the notebook.
df = pandas.read_csv(os.path.join("data", "dataset.csv"))
# Preview ten random rows. The fixed seed keeps the preview identical on every
# Restart & Run All, in line with the random_state=42 used for the split and models.
df.sample(10, random_state=42)

Unnamed: 0,A,B,C,D,E,Target
3056,-0.720754,-0.037906,-1.376162,2.194837,-0.703752,3
108,-0.912421,0.487331,-0.534908,-0.541271,-1.588584,4
1129,-2.522669,-2.624756,2.517419,0.496563,3.207005,3
121,0.787462,2.035988,0.173846,1.066891,-1.916809,1
2133,-2.368517,1.633614,-1.775822,-3.083628,2.114997,2
4057,-0.691973,1.349705,1.152929,2.101712,0.818434,3
772,-1.227851,-1.62979,-0.073806,-1.26818,-0.445382,3
2285,-1.770167,3.11581,-0.427442,3.209732,3.390275,0
412,-1.411169,-1.837803,-1.346955,-1.9602,-0.431751,4
4175,-2.815265,0.509811,1.433668,-0.828211,-1.739501,4


#### Shape (row, col)

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(5000, 6)

#### Descriptive Statistics

In [5]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D,E,Target
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-0.587581,0.614006,0.005137,-0.215878,0.006497,2.0014
std,1.493213,1.498148,1.72805,1.652978,1.632831,1.413576
min,-6.69598,-5.024347,-6.114165,-6.489088,-6.87751,0.0
25%,-1.575304,-0.346215,-1.198611,-1.386536,-1.075645,1.0
50%,-0.691885,0.703356,-0.068541,-0.253239,0.05598,2.0
75%,0.326658,1.63805,1.169151,0.945687,1.154812,3.0
max,5.128072,6.357553,6.419446,4.993319,4.938408,4.0


#### Correlation Matrix

Question: which of these correlations are negative?

In [6]:
# Pairwise correlation between every pair of columns, Target included.
# NOTE(review): Target appears to be a class label (values 0-4), so its
# correlation with the features is only a rough signal -- confirm intent.
df.corr()

Unnamed: 0,A,B,C,D,E,Target
A,1.0,0.187545,0.135122,0.341305,-0.110537,-0.181537
B,0.187545,1.0,0.037219,0.265218,-0.088616,-0.28738
C,0.135122,0.037219,1.0,-0.029098,0.14531,-0.15968
D,0.341305,0.265218,-0.029098,1.0,-0.057682,-0.252799
E,-0.110537,-0.088616,0.14531,-0.057682,1.0,-0.354342
Target,-0.181537,-0.28738,-0.15968,-0.252799,-0.354342,1.0


### Train/Test Split

In [7]:
# The last column is the label; all remaining columns are the features.
target = df.columns[-1]
features = df.columns.drop(target)

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=42
)

### Naive Baseline Score

In [8]:
# Expected accuracy of guessing uniformly at random among the distinct labels.
n_classes = len(df["Target"].unique())
print(f"{1 / n_classes:.2%}")

20.00%


### Baseline Model - Logistic Regression

In [9]:
# Baseline model that the classifiers in the following cells are measured against.
base_model = LogisticRegression(max_iter=1024, random_state=42)

# Time only the fit call, not the scoring.
start = perf_counter()
base_model.fit(X_train, y_train)
duration = perf_counter() - start

print(
    f"Algorithm: {base_model}",
    f"Train Time: {duration:.2f}s",
    f"Accuracy Score: {base_model.score(X_test, y_test):.2%}",
    sep="\n",
)

Algorithm: LogisticRegression(max_iter=1024, random_state=42)
Train Time: 0.03s
Accuracy Score: 54.80%


In [10]:
# Six candidate classifiers, all with default hyperparameters apart from the
# fixed random_state passed to four of them.
models = [
    KNeighborsClassifier(),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
]
for model in models:
    # Only the fit call is timed; scoring on the test split happens afterwards.
    start = perf_counter()
    model.fit(X_train, y_train)
    duration = perf_counter() - start
    print(
        f"Algorithm: {model}",
        f"Train Time: {duration:.2f}s",
        f"Test Score: {model.score(X_test, y_test):.2%}\n",
        sep="\n",
    )

Algorithm: KNeighborsClassifier()
Train Time: 0.01s
Test Score: 75.30%

Algorithm: SVC(random_state=42)
Train Time: 0.32s
Test Score: 74.70%

Algorithm: DecisionTreeClassifier(random_state=42)
Train Time: 0.02s
Test Score: 62.20%

Algorithm: RandomForestClassifier(random_state=42)
Train Time: 0.63s
Test Score: 73.80%

Algorithm: AdaBoostClassifier(random_state=42)
Train Time: 0.17s
Test Score: 46.80%

Algorithm: GaussianNB()
Train Time: 0.00s
Test Score: 57.10%



## Super Ensemble Classifier

In [11]:
from sklearn.ensemble import StackingClassifier

Which worker makes the best executive?

In [12]:
# Candidate final estimators ("executives") to place on top of the stack.
executives = [
    KNeighborsClassifier(),
    SVC(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
]
# The worker line-up is the same for every executive, so it is declared once.
stack_workers = [
    ("KNC", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("RFC", RandomForestClassifier(random_state=42)),
    ("ABC", AdaBoostClassifier(random_state=42)),
    ("GNB", GaussianNB()),
]
# NOTE(review): the executives are compared on the test split, so the best
# score reported here is an optimistic estimate -- consider comparing them
# with cross-validation on the training data instead.
for executive in executives:
    model = StackingClassifier(
        estimators=stack_workers,
        final_estimator=executive,
    )
    # Only the fit call is timed.
    start = perf_counter()
    model.fit(X_train, y_train)
    duration = perf_counter() - start
    print(
        f"Workers: {', '.join(model.named_estimators)}",
        f"Executive: {model.final_estimator}",
        f"Train Time: {duration:.2f}s",
        f"Test Score: {model.score(X_test, y_test):.2%}\n",
        sep="\n",
    )

Workers: KNC, SVC, RFC, ABC, GNB
Executive: KNeighborsClassifier()
Train Time: 5.98s
Test Score: 72.90%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: SVC(random_state=42)
Train Time: 6.30s
Test Score: 75.10%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 6.87s
Test Score: 77.10%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: AdaBoostClassifier(random_state=42)
Train Time: 6.34s
Test Score: 66.90%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: GaussianNB()
Train Time: 6.39s
Test Score: 74.10%



Validate that every worker is contributing by leaving each one out in turn and re-scoring the ensemble.

In [13]:
# Full set of base learners; each pass below drops exactly one of them.
workers = [
    ("KNC", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("RFC", RandomForestClassifier(random_state=42)),
    ("ABC", AdaBoostClassifier(random_state=42)),
    ("GNB", GaussianNB()),
]
for skipped in range(len(workers)):
    # Keep every worker except the one at position `skipped`, preserving order.
    worker_set = [pair for pos, pair in enumerate(workers) if pos != skipped]
    model = StackingClassifier(
        estimators=worker_set,
        final_estimator=RandomForestClassifier(random_state=42),
    )
    # Only the fit call is timed.
    start = perf_counter()
    model.fit(X_train, y_train)
    duration = perf_counter() - start
    print(
        f"Workers: {', '.join(model.named_estimators)}",
        f"Executive: {model.final_estimator}",
        f"Train Time: {duration:.2f}s",
        f"Test Score: {model.score(X_test, y_test):.2%}\n",
        sep="\n",
    )

Workers: SVC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 6.94s
Test Score: 75.90%

Workers: KNC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 4.79s
Test Score: 75.40%

Workers: KNC, SVC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 3.78s
Test Score: 76.20%

Workers: KNC, SVC, RFC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 5.68s
Test Score: 76.20%

Workers: KNC, SVC, RFC, ABC
Executive: RandomForestClassifier(random_state=42)
Train Time: 6.66s
Test Score: 76.10%



### Best Stacking Classifier

In [14]:
# Rebuild the full five-worker stack with the random-forest executive and
# report its fit time and test-split accuracy.
best_workers = [
    ("KNC", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("RFC", RandomForestClassifier(random_state=42)),
    ("ABC", AdaBoostClassifier(random_state=42)),
    ("GNB", GaussianNB()),
]
model = StackingClassifier(
    estimators=best_workers,
    final_estimator=RandomForestClassifier(random_state=42),
)

# Only the fit call is timed.
start = perf_counter()
model.fit(X_train, y_train)
duration = perf_counter() - start

print(
    f"Workers: {', '.join(model.named_estimators)}",
    f"Executive: {model.final_estimator}",
    f"Train Time: {duration:.2f}s",
    f"Test Score: {model.score(X_test, y_test):.2%}\n",
    sep="\n",
)


Workers: KNC, SVC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 6.87s
Test Score: 77.10%



### Voting Classifier

In [15]:
from sklearn.ensemble import VotingClassifier

In [17]:
# Combine the same five workers with a VotingClassifier instead of a stack.
# No voting= argument is passed, so the library's default voting mode applies.
voters = [
    ("KNC", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("RFC", RandomForestClassifier(random_state=42)),
    ("ABC", AdaBoostClassifier(random_state=42)),
    ("GNB", GaussianNB()),
]
model = VotingClassifier(estimators=voters)

# Only the fit call is timed.
start = perf_counter()
model.fit(X_train, y_train)
duration = perf_counter() - start

print(
    f"Workers: {', '.join(model.named_estimators)}",
    f"Train Time: {duration:.2f}s",
    f"Test Score: {model.score(X_test, y_test):.2%}\n",
    sep="\n",
)


Workers: KNC, SVC, RFC, ABC, GNB
Train Time: 1.13s
Test Score: 73.70%

