# Combining the efforts of ML models

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
#from dtreeviz.trees import dtreeviz

Use `make_blobs` to generate some synthetic data for classification:
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html

In [None]:
# Draw 300 labelled sample points from Gaussian blobs; the fixed seed keeps the data reproducible.
x, y = make_blobs(n_samples=300, cluster_std=2, random_state=0)

In [None]:
# Put both feature columns and the class label in one frame so they can be plotted by column name.
df = pd.DataFrame({
    'x0': x[:, 0],
    'x1': x[:, 1],
    'y': y,
})

In [None]:
# Scatter the two features, one colour per class label.
sns.scatterplot(
    data=df, x='x0', y='x1',
    hue='y', palette=['red', 'green', 'blue'],
)

## Train/test split

In [None]:
# Hold out a test set using train_test_split's default proportions;
# the seed fixes which rows land in each split.
x_train, x_test, y_train, y_test = train_test_split(
    df[['x0', 'x1']],
    df['y'],
    random_state=0,
)

## First, Logistic Regression

In [None]:
# Create a logistic regression classifier with default settings and fit it on the training split.
logreg_clf = LogisticRegression()
logreg_clf.fit(x_train, y_train)

In [None]:
# Shade the feature plane by the class the fitted logistic regression predicts there.
DecisionBoundaryDisplay.from_estimator(
    logreg_clf,
    df[['x0', 'x1']],
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(data=df, x='x0', y='x1', hue='y', palette=['red', 'green', 'blue'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Score the logistic regression on the held-out test split and report it to two decimals.
test_score = logreg_clf.score(x_test, y_test)
print(f"Accuracy of Logistic Regression: {test_score:.2f}")

## Decision Tree

In [None]:
# Cap the depth at 4 to keep the tree small (omit max_depth for an unrestricted tree).
# random_state pins the tie-breaking between equally good splits, so re-running the
# notebook yields the same tree.
tree_clf = DecisionTreeClassifier(max_depth=4, random_state=0)

In [None]:
# Fit the decision tree on the training split.
tree_clf.fit(x_train, y_train)

In [None]:
# Shade the feature plane by the class the fitted decision tree predicts there.
DecisionBoundaryDisplay.from_estimator(
    tree_clf,
    df[['x0', 'x1']],
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(data=df, x='x0', y='x1', hue='y', palette=['red', 'green', 'blue'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Render the fitted tree's split rules as indented plain text and print them.
text_representation = sklearn.tree.export_text(tree_clf)
print(text_representation)

In [None]:
# Show the class labels the fitted tree knows about (used for class_names in the plots below).
tree_clf.classes_

In [None]:
# Draw the fitted tree as a diagram; class labels are converted to strings for display.
plt.figure(figsize=(12, 8))
class_labels = [str(label) for label in tree_clf.classes_]
sklearn.tree.plot_tree(
    tree_clf,
    feature_names=['x0', 'x1'],
    class_names=class_labels,
    filled=True,
);

In [None]:
# Optional richer tree visualisation. dtreeviz is a third-party package that is not
# imported at the top of this notebook, so import it here and skip the cell
# gracefully when it is unavailable instead of failing with a NameError.
try:
    from dtreeviz.trees import dtreeviz
except ImportError:
    dtreeviz = None

tree_viz = None
if dtreeviz is None:
    print("dtreeviz is not available; skipping the dtreeviz visualization.")
else:
    # NOTE(review): dtreeviz appears to render its own graphic, so this figure may be
    # unnecessary -- confirm before removing.
    plt.figure(figsize=(12,8))
    tree_viz = dtreeviz(tree_clf,
                        df[['x0','x1']],
                        df['y'],
                        feature_names=['x0','x1'],
                        class_names=list(tree_clf.classes_),
                        target_name="y")

# Leave the result as the cell's last expression so the notebook renders it.
tree_viz

In [None]:
# Score the decision tree on the held-out test split and report it to two decimals.
test_score = tree_clf.score(x_test, y_test)
print(f"Accuracy of Decision Tree: {test_score:.2f}")

In [None]:
# Build a single hand-made point (x0=0, x1=1) with the training column names and classify it.
sample = pd.DataFrame({"x0": [0.], "x1": [1]})
print('Predicted Class:',tree_clf.predict(sample))

In [None]:
# Per-class probabilities for the same sample; columns follow the order of tree_clf.classes_.
y_pred_proba = tree_clf.predict_proba(sample)
print(y_pred_proba)

## K-Nearest Neighbors

In [None]:
# initialize the knn classification model

from sklearn.neighbors import KNeighborsClassifier

# Classify each point from its 10 nearest training points
# (KNeighborsClassifier() with no arguments is the alternative to compare against).
knn_clf = KNeighborsClassifier(n_neighbors=10)

In [None]:
# train the knn classification model

knn_clf.fit(x_train, y_train)

In [None]:
# Shade the feature plane by the class the fitted KNN model predicts there.
DecisionBoundaryDisplay.from_estimator(
    knn_clf,
    df[['x0', 'x1']],
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(data=df, x='x0', y='x1', hue='y', palette=['red', 'green', 'blue'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the knn model on the held-out test split

test_score = knn_clf.score(x_test, y_test)
print(f"Accuracy of KNN: {test_score:.2f}")

# VotingClassifier
## Use the majority to determine the prediction

In [None]:
# print the test accuracy of each individual model

classifiers = [knn_clf, logreg_clf, tree_clf]

for clf in classifiers:
    test_accuracy = clf.score(x_test, y_test)
    print("Accuracy = {:.3f}".format(test_accuracy) + " for " + str(clf))

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Hard voting: the ensemble predicts the class chosen by the majority of its members.
voting_members = [('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')

In [None]:
# Fit the hard-voting ensemble on the training split.
voting_clf.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# The three individual models plus the hard-voting ensemble, for a side-by-side comparison.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf]

In [None]:
# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Soft voting combines the members' predicted class probabilities instead of counting votes.
votingsoft_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='soft')

votingsoft_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf, votingsoft_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

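Voting won't necessarily give you a better model.  Consider a trivially bad case where 2 of 3 classifiers are both wrong in the same region of feature space: under hard voting they outvote the third classifier there, even when it is the one that is correct.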

Voting will at least be a representation of several of your models combined together.

Voting also tends to improve when the ensemble combines a wide variety of models whose errors are not correlated with each other.

In [None]:
from sklearn.svm import SVC

In [None]:
# probability=True makes the SVM expose predict_proba, which soft voting needs.
# The probability estimates rely on internally shuffled data, so pin random_state
# to get the same numbers on every run.
svm_clf = SVC(probability=True, random_state=0)
svm_clf.fit(x_train, y_train)

In [None]:
# Compare the four individual models, now including the SVM.
classifiers = [knn_clf, logreg_clf, tree_clf, svm_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Hard-voting ensemble over all four models (majority of predicted labels).
voting_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf), ('svm', svm_clf)],
    voting='hard')

voting_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, svm_clf, voting_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Soft-voting ensemble over all four models (combines predicted class probabilities).
votingsoft_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf), ('svm', svm_clf)],
    voting='soft')

# Fit the soft-voting ensemble that was just created (not the hard-voting one).
votingsoft_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, svm_clf, voting_clf, votingsoft_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

## Let's apply this Ensembling to last week's data

Adapt the above to do Logistic Regression and Decision Tree Classification on last week's classification dataset on breast cancer.
* Use the feature variables "mean radius" and "mean texture"
* Predict the 0 or 1 cancer target variable

In [None]:
import sklearn.datasets
import sklearn.model_selection

# Load the breast cancer dataset as pandas objects (features and target returned separately).
# NOTE: this rebinds x and y, replacing the blob arrays created earlier in the notebook.
x,y = sklearn.datasets.load_breast_cancer(return_X_y=True,
                                          as_frame=True)

In [None]:
# Attach the label as an extra column so it can be correlated and plotted with the features.
# NOTE: this mutates x in place; the cells below select feature columns by name.
x['target'] = y

In [None]:
# Correlation of every column with the target, to see which features relate to it most strongly.
x.corr()['target']

In [None]:
# Scatter the two chosen features, coloured by the target label.
sns.scatterplot(
    data=x,
    x='mean radius', y='mean texture',
    hue='target',
)

In [None]:
# split into training/test sets, using only the two chosen feature columns

x_train, x_test, y_train, y_test = train_test_split(
    x[['mean radius', 'mean texture']],
    x['target'],
    random_state=0,
)

In [None]:
# initialize the logistic regression model with default settings
# (rebinds logreg_clf, replacing the model fitted on the blob data)

logreg_clf = LogisticRegression()

In [None]:
# train the logistic regression model on the training split

logreg_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Size the boundary grid from the same rows that are scattered below (train + test);
# a grid sized from x_train alone can leave some plotted points outside the shaded area.
DecisionBoundaryDisplay.from_estimator(logreg_clf,
                                       x[['mean radius','mean texture']],
                                       response_method="predict",
                                       cmap="RdBu",
                                       alpha=0.5
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(data=x,
                x='mean radius',
                y='mean texture',
                hue='target',
                palette=['red','green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the logistic regression on the held-out test split

test_score = logreg_clf.score(x_test, y_test)
print(f"Accuracy of Logistic Regression: {test_score:.2f}")

## Decision Tree

In [None]:
# initialize the decision tree classification model

# max_depth=3 keeps the tree small; random_state pins the tie-breaking between
# equally good splits, so re-running the notebook yields the same tree.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=0)

In [None]:
# train the decision tree classification model on the training split

tree_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Size the boundary grid from the same rows that are scattered below (train + test);
# a grid sized from x_train alone can leave some plotted points outside the shaded area.
DecisionBoundaryDisplay.from_estimator(tree_clf,
                                       x[['mean radius','mean texture']],
                                       response_method="predict",
                                       cmap="RdBu",
                                       alpha=0.5
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(data=x,
                x='mean radius',
                y='mean texture',
                hue='target',
                palette=['red','green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print out a text representation of the tree (its split rules as indented plain text)

text_representation = sklearn.tree.export_text(tree_clf)
print(text_representation)

In [None]:
# plot a visualization of the tree; class labels are converted to strings for display

plt.figure(figsize=(12, 8))
class_labels = [str(label) for label in tree_clf.classes_]
sklearn.tree.plot_tree(
    tree_clf,
    feature_names=['mean radius', 'mean texture'],
    class_names=class_labels,
    filled=True,
);

In [None]:
# print the accuracy of the decision tree on the held-out test split

test_score = tree_clf.score(x_test, y_test)
print(f"Accuracy of Decision Tree: {test_score:.2f}")

In [None]:
# make a sample data point
# predict its class
# and print the calculated prediction probabilities for each class

sample = pd.DataFrame({"mean radius": [16.], "mean texture": [40]})
print('Predicted Class:',tree_clf.predict(sample))

y_pred_proba = tree_clf.predict_proba(sample)
print('Predicted Probabilities of Class Membership: ',y_pred_proba)

# One bar per class, labelled with the class value from tree_clf.classes_.
pd.Series(y_pred_proba[0], index=tree_clf.classes_).plot.bar()

Create one more classifier still.  Let's use K-Nearest Neighbors.

In [None]:
# initialize the knn classification model (classifies from the 10 nearest training points)

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=10)

In [None]:
# train the knn classification model

knn_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Size the boundary grid from the same rows that are scattered below (train + test);
# a grid sized from x_train alone can leave some plotted points outside the shaded area.
DecisionBoundaryDisplay.from_estimator(knn_clf,
                                       x[['mean radius','mean texture']],
                                       response_method="predict",
                                       cmap="RdBu",
                                       alpha=0.5
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(data=x,
                x='mean radius',
                y='mean texture',
                hue='target',
                palette=['red','green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the knn model on the held-out test split

test_score = knn_clf.score(x_test, y_test)
print(f"Accuracy of KNN: {test_score:.2f}")

In [None]:
# print the test accuracy of each individual model

classifiers = [knn_clf, logreg_clf, tree_clf]

# Refit every model on the training split, then score its predictions on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Hard-voting ensemble of the three models (majority of predicted labels).
voting_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='hard')

voting_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Soft-voting ensemble of the three models (combines predicted class probabilities).
votingsoft_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='soft')

# Fit the soft-voting ensemble that was just created (not the hard-voting one).
votingsoft_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf, votingsoft_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
from sklearn.svm import SVC

In [None]:
# probability=True makes the SVM expose predict_proba, which soft voting needs.
# The probability estimates rely on internally shuffled data, so pin random_state
# to get the same numbers on every run.
svm_clf = SVC(probability=True, random_state=0)
svm_clf.fit(x_train, y_train)

In [None]:
# Compare the four individual models, now including the SVM.
classifiers = [knn_clf, logreg_clf, tree_clf, svm_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Hard-voting ensemble over all four models. The SVM trained just above is included
# so this ensemble differs from the three-model one built earlier, matching the
# four-model ensemble used for the blob data.
voting_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf), ('svm', svm_clf)],
    voting='hard')

voting_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, svm_clf, voting_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Soft-voting ensemble over all four models. The SVM trained above is included so
# this ensemble differs from the three-model one built earlier.
votingsoft_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf), ('svm', svm_clf)],
    voting='soft')

# Fit the soft-voting ensemble that was just created (not the hard-voting one).
votingsoft_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, svm_clf, voting_clf, votingsoft_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

# Moons

In [None]:
from sklearn.datasets import make_moons

In [None]:
# Generate 100 "moons" samples with light noise and a fixed seed.
# NOTE: this rebinds x and y, replacing the breast cancer frame and target.
x, y = make_moons(n_samples=100, noise=0.1, random_state=42)

In [None]:
# Scatter the two feature columns, coloured by class label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)

In [None]:
# split into training/test sets (the seed fixes which rows land in each split)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [None]:
# initialize the knn classification model

from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=1: each prediction uses only the single closest training point
# (n_neighbors=3 is an alternative to compare against).
knn_clf = KNeighborsClassifier(n_neighbors=1)

In [None]:
# train the knn classification model

knn_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Shade the feature plane by the class the fitted KNN model predicts there.
DecisionBoundaryDisplay.from_estimator(
    knn_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette=['red', 'green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the knn model on the held-out test split

test_score = knn_clf.score(x_test, y_test)
print(f"Accuracy of KNN: {test_score:.2f}")

In [None]:
# Regenerate the moons data with more samples (400) and more noise (0.3), same seed.
# NOTE: this rebinds x and y again; the cells below all use this noisier dataset.
x, y = make_moons(n_samples=400, noise=0.3, random_state=42)

In [None]:
# Scatter the two feature columns of the noisier dataset, coloured by class label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)

In [None]:
# split into training/test sets (the seed fixes which rows land in each split)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [None]:
# initialize the knn classification model

from sklearn.neighbors import KNeighborsClassifier

# Library defaults are used here (scikit-learn documents the default n_neighbors as 5).
knn_clf = KNeighborsClassifier()

In [None]:
# train the knn classification model

knn_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Shade the feature plane by the class the fitted KNN model predicts there.
DecisionBoundaryDisplay.from_estimator(
    knn_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette=['red', 'green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the knn model on the held-out test split

test_score = knn_clf.score(x_test, y_test)
print(f"Accuracy of KNN: {test_score:.2f}")

In [None]:
# initialize the logistic regression model with default settings
# (rebinds logreg_clf, replacing the model fitted on the breast cancer data)

logreg_clf = LogisticRegression()

In [None]:
# train the logistic regression model on the training split

logreg_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Shade the feature plane by the class the fitted logistic regression predicts there.
DecisionBoundaryDisplay.from_estimator(
    logreg_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette=['red', 'green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the logistic regression on the held-out test split

test_score = logreg_clf.score(x_test, y_test)
print(f"Accuracy of Logistic Regression: {test_score:.2f}")

In [None]:
# initialize the decision tree classification model

# No depth limit here; random_state pins the tie-breaking between equally good
# splits, so re-running the notebook yields the same tree.
tree_clf = DecisionTreeClassifier(random_state=0)

In [None]:
# train the decision tree classification model

tree_clf.fit(x_train, y_train)

In [None]:
# plot the decision boundary with the data

# Shade the feature plane by the class the fitted decision tree predicts there.
DecisionBoundaryDisplay.from_estimator(
    tree_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette=['red', 'green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# print the accuracy of the decision tree on the held-out test split

test_score = tree_clf.score(x_test, y_test)
print(f"Accuracy of Decision Tree: {test_score:.2f}")

In [None]:
# print the test accuracy of each individual model

classifiers = [knn_clf, logreg_clf, tree_clf]

# Refit every model on the training split, then score its predictions on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# Hard-voting ensemble of the three models (majority of predicted labels).
voting_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='hard')

voting_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# plot the decision boundary with the data

# Shade the feature plane by the class the hard-voting ensemble predicts there.
DecisionBoundaryDisplay.from_estimator(
    voting_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette=['red', 'green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Soft-voting ensemble of the three models (combines predicted class probabilities).
votingsoft_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='soft')

# Fit the soft-voting ensemble that was just created (not the hard-voting one).
votingsoft_clf.fit(x_train, y_train)

# Build the comparison list from scratch rather than appending to the list created in
# an earlier cell, so re-running this cell never adds a duplicate entry.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf, votingsoft_clf]

# Refit every model on the training split, then report its accuracy on the test split.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)) + " for " + str(clf))

In [None]:
# plot the decision boundary with the data

# Shade the feature plane by the class the soft-voting ensemble predicts there.
DecisionBoundaryDisplay.from_estimator(
    votingsoft_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Overlay every sample, coloured by its true label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette=['red', 'green'])

# Anchor the legend outside the axes so it does not cover any points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')