<a href="https://colab.research.google.com/github/DeenReinierCrusem/CPEN65-1/blob/main/Laboratory_Exercise_5_Ensemble_Learning_and_Random_Forests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Laboratory Exercise 5: Ensemble Learning and Random Forests

In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# The version is compared as a (major, minor) tuple of integers: comparing the
# raw strings is lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass.
import sklearn
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version is not None, "Unrecognized Scikit-Learn version: " + sklearn.__version__
assert tuple(map(int, _sk_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures: larger axis labels and tick labels
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Where to save the figures: <project root>/images/<chapter id>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory up front; an already existing one is not an error.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML as arrays rather than a DataFrame (as_frame=False).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()  # not the last expression of the cell, so its value is discarded

X = mnist.data
y = mnist.target

# Hold out 10,000 samples for testing, then 10,000 of the remainder for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000)

# The four base classifiers, all seeded with the same random_state.
random_forest_clf = RandomForestClassifier(random_state=42, n_estimators=100)
extra_trees_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
svm_clf = LinearSVC(random_state=42, max_iter=100, tol=20)
mlp_clf = MLPClassifier(random_state=42)

# Fit each classifier on the training set, announcing which one is running.
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", type(estimator).__name__)
    estimator.fit(X_train, y_train)

# Validation accuracy of each classifier, in the same order as `estimators`.
[clf.score(X_val, y_val) for clf in estimators]

  warn(


Training the RandomForestClassifier
Training the ExtraTreesClassifier
Training the LinearSVC
Training the MLPClassifier


[0.9692, 0.9715, 0.859, 0.9614]

In [None]:
# To plot pretty figures: larger axis labels and tick labels
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})


In [None]:
# Fit each base classifier on the training set, announcing which one is running.
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training", type(estimator).__name__)
    estimator.fit(X_train, y_train)


In [None]:
# (name, estimator) pairs handed to the voting ensemble.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]
voting_clf = VotingClassifier(estimators=named_estimators)
voting_clf.fit(X_train, y_train)

# VotingClassifier fits clones of the estimators on label-encoded targets
# (class indices), so the clones must be scored against labels encoded with the
# same LabelEncoder; scoring them against the raw labels reports a wrong accuracy.
y_val_encoded = voting_clf.le_.transform(y_val)
[voting_clf.named_estimators_[name].score(X_val, y_val_encoded) for name, _ in named_estimators]


In [None]:
# Build a fresh voting ensemble, fit it (fit returns the estimator itself),
# and report its accuracy on the validation set.
voting_clf = VotingClassifier(estimators=named_estimators).fit(X_train, y_train)
voting_clf.score(X_val, y_val)

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way before scoring them; iterate over the estimators directly
# instead of indexing by position.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [None]:
# Recreate the train/validation/test split straight from the loaded dataset:
# 10,000 samples for testing, then 10,000 of the remainder for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist["data"], mnist["target"], random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000)

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way before scoring them; iterate over the estimators directly
# instead of indexing by position.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

In [None]:
# Accuracy of the voting ensemble as a whole on the validation set.
voting_clf.score(X_val, y_val)

0.974

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way; scoring against the raw labels reports 0.0 for each one.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

[0.0, 0.0, 0.0]

In [None]:
# Mark the SVM for removal from the ensemble. "drop" is the supported marker;
# passing None is no longer accepted by current Scikit-Learn releases when the
# ensemble is fitted again.
voting_clf.set_params(svm_clf="drop")

In [None]:
# The configured (name, estimator) pairs; changes made through set_params()
# are visible here.
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [None]:
# The fitted estimators. The recorded output still lists the LinearSVC, i.e.
# set_params() alone did not change this list.
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(max_iter=100, random_state=42, tol=20),
 MLPClassifier(random_state=42)]

In [None]:
# Remove the fitted LinearSVC from the list of trained estimators. Filtering by
# type (rather than deleting index 2) keeps this cell idempotent: running it a
# second time does not remove another estimator.
voting_clf.estimators_ = [
    estimator for estimator in voting_clf.estimators_
    if not isinstance(estimator, LinearSVC)
]

In [None]:
# Re-score the ensemble on the validation set now that `estimators_` has been modified.
voting_clf.score(X_val, y_val)

0.974

In [None]:
# Switch the already-fitted ensemble to soft voting and re-score it on the
# validation set (no refit happens here).
# NOTE(review): soft voting presumably needs predict_proba from every entry of
# estimators_, which would explain removing the LinearSVC beforehand — confirm.
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9699

In [None]:
# Restore hard voting and evaluate the ensemble on the held-out test set.
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9703

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_test the same way; scoring against the raw labels reports 0.0 for each one.
y_test_encoded = voting_clf.le_.transform(y_test)
[estimator.score(X_test, y_test_encoded) for estimator in voting_clf.estimators_]

[0.0, 0.0, 0.0]

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load MNIST dataset as arrays (as_frame=False), matching how the dataset is
# loaded in the setup cell of this notebook
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

# Split data into features (X) and labels (y)
X, y = mnist["data"], mnist["target"]

# Split data into training, validation, and test sets (10,000 samples each for
# test and validation). The intermediate split keeps its own names so that
# X_train / y_train are bound only once.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

# Initialize individual classifiers; SVC needs probability=True so that it can
# take part in soft voting
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(probability=True, random_state=42)

# Create a voting classifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('random_forest', random_forest_clf), ('extra_trees', extra_trees_clf), ('svm', svm_clf)],
    voting='soft'
)

# Train the voting classifier. The individual classifiers are deliberately not
# fitted beforehand: VotingClassifier.fit() trains its own clones of them, so a
# separate fit would only repeat the (expensive) training without being used.
voting_clf.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = voting_clf.predict(X_val)

# Calculate accuracy of the ensemble classifier on the validation set
ensemble_accuracy = accuracy_score(y_val, y_val_pred)
print("Ensemble accuracy on validation set:", ensemble_accuracy)

  warn(


Ensemble accuracy on validation set: 0.9791


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Predict the held-out test set with the fitted ensemble...
y_test_pred = voting_clf.predict(X_test)

# ...and report the fraction of test samples it labels correctly.
ensemble_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Ensemble accuracy on test set:", ensemble_test_accuracy)


In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# The version is compared as a (major, minor) tuple of integers: comparing the
# raw strings is lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass.
import sklearn
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version is not None, "Unrecognized Scikit-Learn version: " + sklearn.__version__
assert tuple(map(int, _sk_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures: larger axis labels and tick labels
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Where to save the figures: <project root>/images/<chapter id>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory up front; an already existing one is not an error.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML as arrays rather than a DataFrame (as_frame=False).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()  # not the last expression of the cell, so its value is discarded

X = mnist.data
y = mnist.target

# Hold out 10,000 samples for testing, then 10,000 of the remainder for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000)

# The four base classifiers, all seeded with the same random_state.
random_forest_clf = RandomForestClassifier(random_state=42, n_estimators=100)
extra_trees_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
svm_clf = LinearSVC(random_state=42, max_iter=100, tol=20)
mlp_clf = MLPClassifier(random_state=42)

# Fit each classifier on the training set, announcing which one is running.
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", type(estimator).__name__)
    estimator.fit(X_train, y_train)

# Validation accuracy of each classifier, in the same order as `estimators`.
[clf.score(X_val, y_val) for clf in estimators]

  warn(


Training the RandomForestClassifier
Training the ExtraTreesClassifier
Training the LinearSVC
Training the MLPClassifier


[0.9692, 0.9715, 0.859, 0.9614]

In [None]:
# To plot pretty figures: larger axis labels and tick labels
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})


In [None]:
# Fit each base classifier on the training set, announcing which one is running.
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training", type(estimator).__name__)
    estimator.fit(X_train, y_train)


In [None]:
# (name, estimator) pairs handed to the voting ensemble.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]
voting_clf = VotingClassifier(estimators=named_estimators)
voting_clf.fit(X_train, y_train)

# VotingClassifier fits clones of the estimators on label-encoded targets
# (class indices), so the clones must be scored against labels encoded with the
# same LabelEncoder; scoring them against the raw labels reports a wrong accuracy.
y_val_encoded = voting_clf.le_.transform(y_val)
[voting_clf.named_estimators_[name].score(X_val, y_val_encoded) for name, _ in named_estimators]


In [None]:
# Build a fresh voting ensemble, fit it (fit returns the estimator itself),
# and report its accuracy on the validation set.
voting_clf = VotingClassifier(estimators=named_estimators).fit(X_train, y_train)
voting_clf.score(X_val, y_val)

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way before scoring them; iterate over the estimators directly
# instead of indexing by position.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [None]:
# Recreate the train/validation/test split straight from the loaded dataset:
# 10,000 samples for testing, then 10,000 of the remainder for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist["data"], mnist["target"], random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000)

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way before scoring them; iterate over the estimators directly
# instead of indexing by position.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

In [None]:
# Accuracy of the voting ensemble as a whole on the validation set.
voting_clf.score(X_val, y_val)

0.974

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way; scoring against the raw labels reports 0.0 for each one.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

[0.0, 0.0, 0.0]

In [None]:
# Mark the SVM for removal from the ensemble. "drop" is the supported marker;
# passing None is no longer accepted by current Scikit-Learn releases when the
# ensemble is fitted again.
voting_clf.set_params(svm_clf="drop")

In [None]:
# The configured (name, estimator) pairs; changes made through set_params()
# are visible here.
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [None]:
# The fitted estimators. The recorded output still lists the LinearSVC, i.e.
# set_params() alone did not change this list.
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(max_iter=100, random_state=42, tol=20),
 MLPClassifier(random_state=42)]

In [None]:
# Remove the fitted LinearSVC from the list of trained estimators. Filtering by
# type (rather than deleting index 2) keeps this cell idempotent: running it a
# second time does not remove another estimator.
voting_clf.estimators_ = [
    estimator for estimator in voting_clf.estimators_
    if not isinstance(estimator, LinearSVC)
]

In [None]:
# Re-score the ensemble on the validation set now that `estimators_` has been modified.
voting_clf.score(X_val, y_val)

0.974

In [None]:
# Switch the already-fitted ensemble to soft voting and re-score it on the
# validation set (no refit happens here).
# NOTE(review): soft voting presumably needs predict_proba from every entry of
# estimators_, which would explain removing the LinearSVC beforehand — confirm.
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9699

In [None]:
# Restore hard voting and evaluate the ensemble on the held-out test set.
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9703

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_test the same way; scoring against the raw labels reports 0.0 for each one.
y_test_encoded = voting_clf.le_.transform(y_test)
[estimator.score(X_test, y_test_encoded) for estimator in voting_clf.estimators_]

[0.0, 0.0, 0.0]

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load MNIST dataset as arrays (as_frame=False), matching how the dataset is
# loaded in the setup cell of this notebook
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

# Split data into features (X) and labels (y)
X, y = mnist["data"], mnist["target"]

# Split data into training, validation, and test sets (10,000 samples each for
# test and validation). The intermediate split keeps its own names so that
# X_train / y_train are bound only once.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

# Initialize individual classifiers; SVC needs probability=True so that it can
# take part in soft voting
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(probability=True, random_state=42)

# Create a voting classifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('random_forest', random_forest_clf), ('extra_trees', extra_trees_clf), ('svm', svm_clf)],
    voting='soft'
)

# Train the voting classifier. The individual classifiers are deliberately not
# fitted beforehand: VotingClassifier.fit() trains its own clones of them, so a
# separate fit would only repeat the (expensive) training without being used.
voting_clf.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = voting_clf.predict(X_val)

# Calculate accuracy of the ensemble classifier on the validation set
ensemble_accuracy = accuracy_score(y_val, y_val_pred)
print("Ensemble accuracy on validation set:", ensemble_accuracy)

  warn(


Ensemble accuracy on validation set: 0.9791


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Predict the held-out test set with the fitted ensemble...
y_test_pred = voting_clf.predict(X_test)

# ...and report the fraction of test samples it labels correctly.
ensemble_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Ensemble accuracy on test set:", ensemble_test_accuracy)


In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# The version is compared as a (major, minor) tuple of integers: comparing the
# raw strings is lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass.
import sklearn
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version is not None, "Unrecognized Scikit-Learn version: " + sklearn.__version__
assert tuple(map(int, _sk_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures: larger axis labels and tick labels
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Where to save the figures: <project root>/images/<chapter id>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory up front; an already existing one is not an error.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML as arrays rather than a DataFrame (as_frame=False).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()  # not the last expression of the cell, so its value is discarded

X = mnist.data
y = mnist.target

# Hold out 10,000 samples for testing, then 10,000 of the remainder for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000)

# The four base classifiers, all seeded with the same random_state.
random_forest_clf = RandomForestClassifier(random_state=42, n_estimators=100)
extra_trees_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
svm_clf = LinearSVC(random_state=42, max_iter=100, tol=20)
mlp_clf = MLPClassifier(random_state=42)

# Fit each classifier on the training set, announcing which one is running.
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", type(estimator).__name__)
    estimator.fit(X_train, y_train)

# Validation accuracy of each classifier, in the same order as `estimators`.
[clf.score(X_val, y_val) for clf in estimators]

  warn(


Training the RandomForestClassifier
Training the ExtraTreesClassifier
Training the LinearSVC
Training the MLPClassifier


[0.9692, 0.9715, 0.859, 0.9614]

In [None]:
# To plot pretty figures: larger axis labels and tick labels
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})


In [None]:
# Fit each base classifier on the training set, announcing which one is running.
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training", type(estimator).__name__)
    estimator.fit(X_train, y_train)


In [None]:
# (name, estimator) pairs handed to the voting ensemble.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]
voting_clf = VotingClassifier(estimators=named_estimators)
voting_clf.fit(X_train, y_train)

# VotingClassifier fits clones of the estimators on label-encoded targets
# (class indices), so the clones must be scored against labels encoded with the
# same LabelEncoder; scoring them against the raw labels reports a wrong accuracy.
y_val_encoded = voting_clf.le_.transform(y_val)
[voting_clf.named_estimators_[name].score(X_val, y_val_encoded) for name, _ in named_estimators]


In [None]:
# Build a fresh voting ensemble, fit it (fit returns the estimator itself),
# and report its accuracy on the validation set.
voting_clf = VotingClassifier(estimators=named_estimators).fit(X_train, y_train)
voting_clf.score(X_val, y_val)

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way before scoring them; iterate over the estimators directly
# instead of indexing by position.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [None]:
# Recreate the train/validation/test split straight from the loaded dataset:
# 10,000 samples for testing, then 10,000 of the remainder for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist["data"], mnist["target"], random_state=42, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000)

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way before scoring them; iterate over the estimators directly
# instead of indexing by position.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

In [None]:
# Accuracy of the voting ensemble as a whole on the validation set.
voting_clf.score(X_val, y_val)

0.974

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_val the same way; scoring against the raw labels reports 0.0 for each one.
y_val_encoded = voting_clf.le_.transform(y_val)
[estimator.score(X_val, y_val_encoded) for estimator in voting_clf.estimators_]

[0.0, 0.0, 0.0]

In [None]:
# Mark the SVM for removal from the ensemble. "drop" is the supported marker;
# passing None is no longer accepted by current Scikit-Learn releases when the
# ensemble is fitted again.
voting_clf.set_params(svm_clf="drop")

In [None]:
# The configured (name, estimator) pairs; changes made through set_params()
# are visible here.
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [None]:
# The fitted estimators. The recorded output still lists the LinearSVC, i.e.
# set_params() alone did not change this list.
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(max_iter=100, random_state=42, tol=20),
 MLPClassifier(random_state=42)]

In [None]:
# Remove the fitted LinearSVC from the list of trained estimators. Filtering by
# type (rather than deleting index 2) keeps this cell idempotent: running it a
# second time does not remove another estimator.
voting_clf.estimators_ = [
    estimator for estimator in voting_clf.estimators_
    if not isinstance(estimator, LinearSVC)
]

In [None]:
# Re-score the ensemble on the validation set now that `estimators_` has been modified.
voting_clf.score(X_val, y_val)

0.974

In [None]:
# Switch the already-fitted ensemble to soft voting and re-score it on the
# validation set (no refit happens here).
# NOTE(review): soft voting presumably needs predict_proba from every entry of
# estimators_, which would explain removing the LinearSVC beforehand — confirm.
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9699

In [None]:
# Restore hard voting and evaluate the ensemble on the held-out test set.
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9703

In [None]:
# The fitted sub-estimators were trained on label-encoded targets, so encode
# y_test the same way; scoring against the raw labels reports 0.0 for each one.
y_test_encoded = voting_clf.le_.transform(y_test)
[estimator.score(X_test, y_test_encoded) for estimator in voting_clf.estimators_]

[0.0, 0.0, 0.0]

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load MNIST dataset as arrays (as_frame=False), matching how the dataset is
# loaded in the setup cell of this notebook
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

# Split data into features (X) and labels (y)
X, y = mnist["data"], mnist["target"]

# Split data into training, validation, and test sets (10,000 samples each for
# test and validation). The intermediate split keeps its own names so that
# X_train / y_train are bound only once.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

# Initialize individual classifiers; SVC needs probability=True so that it can
# take part in soft voting
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(probability=True, random_state=42)

# Create a voting classifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('random_forest', random_forest_clf), ('extra_trees', extra_trees_clf), ('svm', svm_clf)],
    voting='soft'
)

# Train the voting classifier. The individual classifiers are deliberately not
# fitted beforehand: VotingClassifier.fit() trains its own clones of them, so a
# separate fit would only repeat the (expensive) training without being used.
voting_clf.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = voting_clf.predict(X_val)

# Calculate accuracy of the ensemble classifier on the validation set
ensemble_accuracy = accuracy_score(y_val, y_val_pred)
print("Ensemble accuracy on validation set:", ensemble_accuracy)

  warn(


Ensemble accuracy on validation set: 0.9791


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Predict the held-out test set with the fitted ensemble...
y_test_pred = voting_clf.predict(X_test)

# ...and report the fraction of test samples it labels correctly.
ensemble_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Ensemble accuracy on test set:", ensemble_test_accuracy)
