In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    VotingClassifier, BaggingClassifier, RandomForestClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    confusion_matrix, f1_score, roc_auc_score
)

In [2]:
# Shared random seed: passed as `random_state` to the data splits and estimators
# in the following cells so a fresh "Run All" reproduces the same numbers.
seed = 42

### **Ensemble Learning**

In [3]:
# Load the breast cancer dataset object and print its bundled description text.
cancer = load_breast_cancer()
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
# Feature matrix and class labels, taken straight from the loaded dataset object.
X, y = cancer.data, cancer.target

In [5]:
# Stage 1: keep 75% of the samples for training and hold out the remaining 25%,
# stratified on the labels.
X_train, X_aux, y_train, y_aux = train_test_split(
    X, y, test_size=0.25, random_state=seed, stratify=y
)

# Stage 2: divide the held-out part into validation (25% of it) and test (75% of it),
# again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_aux, y_aux, test_size=0.75, random_state=seed, stratify=y_aux
)

#### **Voting**

##### - **Hard Voting**

In [6]:
# Three gini decision trees that differ only in their maximum depth (1, 2 and 3).
tree1, tree2, tree3 = (
    DecisionTreeClassifier(criterion="gini", max_depth=depth, random_state=seed)
    for depth in (1, 2, 3)
)

# Fit every tree on its own and report its accuracy on the validation split.
for idx, clf in enumerate((tree1, tree2, tree3)):
    clf.fit(X_train, y_train)
    print(f"Validation Acurracy: {clf.score(X_val, y_val):.2f} (tree-{idx + 1})")

# Combine the three trees in a hard-voting ensemble with no per-estimator weights.
hard_voting = VotingClassifier(
    estimators=[("tree-1", tree1), ("tree-2", tree2), ("tree-3", tree3)],
    voting="hard",
    weights=None,
)
hard_voting.fit(X_train, y_train)

print(f"\nTest Acurracy: {hard_voting.score(X_test, y_test):.2f} (Hard Voting)")

Validation Acurracy: 0.94 (tree-1)
Validation Acurracy: 0.94 (tree-2)
Validation Acurracy: 0.97 (tree-3)

Test Acurracy: 0.93 (Hard Voting)


##### - **Soft Voting**

In [7]:
# Three gini decision trees that differ only in their maximum depth (1, 2 and 3).
tree1, tree2, tree3 = (
    DecisionTreeClassifier(criterion="gini", max_depth=depth, random_state=seed)
    for depth in (1, 2, 3)
)

# Fit every tree on its own and report its accuracy on the validation split.
for idx, clf in enumerate((tree1, tree2, tree3)):
    clf.fit(X_train, y_train)
    print(f"Validation Acurracy: {clf.score(X_val, y_val):.2f} (tree-{idx + 1})")

# Combine the three trees in a soft-voting ensemble with no per-estimator weights.
soft_voting = VotingClassifier(
    estimators=[("tree-1", tree1), ("tree-2", tree2), ("tree-3", tree3)],
    voting="soft",
    weights=None,
)
soft_voting.fit(X_train, y_train)

print(f"\nTest Acurracy: {soft_voting.score(X_test, y_test):.2f} (Soft Voting)")

Validation Acurracy: 0.94 (tree-1)
Validation Acurracy: 0.94 (tree-2)
Validation Acurracy: 0.97 (tree-3)

Test Acurracy: 0.93 (Soft Voting)


#### **Bagging**

In [8]:
# Base learner: an entropy decision tree with no depth limit.
tree = DecisionTreeClassifier(criterion="entropy", max_depth=None, random_state=seed)

# 100 copies of the base tree trained with bootstrap sampling; oob_score=True makes
# the out-of-bag estimate available as `oob_score_` after fitting.
bagging = BaggingClassifier(
    estimator=tree,
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=seed,
)
bagging.fit(X_train, y_train)

print(f" OOB Accuracy: {bagging.oob_score_:.2f} (Bagging)")
print(f"Test Accuracy: {bagging.score(X_test, y_test):.2f} (Bagging)")

 OOB Accuracy: 0.96 (Bagging)
Test Accuracy: 0.94 (Bagging)


##### - **Random Forests**

In [9]:
# Random forest of 100 gini trees with no depth limit.
# `random_state` uses the shared `seed` (previously a hard-coded 42) so that changing
# the seed in the configuration cell affects this model like every other estimator.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=seed
)
rf.fit(X_train, y_train)

print(f"Test Accuracy: {rf.score(X_test, y_test):.2f} (Random Forest)")

Test Accuracy: 0.94 (Random Forest)


#### **Boosting**

##### - **AdaBoost**

In [10]:
# Weak learner: a depth-1 entropy decision tree.
tree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=1,
    random_state=seed
)

# `algorithm` is deliberately left unset. The explicit "SAMME.R" value was deprecated
# in scikit-learn 1.4 and is rejected by later releases, so passing it makes this cell
# warn or fail depending on the installed version. Releases whose default is still
# "SAMME.R" behave exactly as before; newer releases fall back to their only
# remaining algorithm, so the reported score may differ slightly there.
adaboost = AdaBoostClassifier(
    estimator=tree,
    n_estimators=50,
    random_state=seed
)
adaboost.fit(X_train, y_train)

print(f"Test Accuracy: {adaboost.score(X_test, y_test):.2f} (AdaBoost)")

Test Accuracy: 0.94 (AdaBoost)


##### - **Gradient Boosting**

In [11]:
# Gradient boosting: 100 stages of depth-3 trees with a learning rate of 0.1.
gb = GradientBoostingClassifier(
    learning_rate=0.1, n_estimators=100, max_depth=3, random_state=seed
)
gb.fit(X_train, y_train)

print(f"Test Accuracy: {gb.score(X_test, y_test):.2f} (Gradient Boosting)")

Test Accuracy: 0.94 (Gradient Boosting)


#### **Summary**

In [12]:
def summary(models: dict, X=None, y=None) -> pd.DataFrame:
    """Build a table of classification metrics with one row per fitted model.

    Parameters
    ----------
    models : dict
        Maps a display name to a fitted classifier that exposes ``predict`` and,
        optionally, ``predict_proba``.
    X, y : array-like, optional
        Evaluation features and true labels. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used, so the one-argument call keeps
        evaluating on the test split.

    Returns
    -------
    pd.DataFrame
        Columns ``Model``, ``Accuracy``, ``Precision``, ``Recall``, ``F1`` and
        ``AUC`` (always present, even for an empty ``models``). ``AUC`` is NaN
        for models that do not offer ``predict_proba``.
    """
    # Fall back to the notebook's test split only when no data is passed explicitly,
    # so the function no longer depends on hidden global state by necessity.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    columns = ["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"]
    rows = []

    for name, model in models.items():
        y_pred = model.predict(X)

        if hasattr(model, "predict_proba"):
            # Column 1 is used as the score for AUC; this assumes a binary problem
            # in which the second class is the positive one.
            auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
        else:
            # No probability estimates (e.g. the hard-voting ensemble): mark the AUC
            # as missing with NaN so the column stays numeric.
            auc = float("nan")

        rows.append({
            "Model": name,
            "Accuracy": accuracy_score(y, y_pred),
            "Precision": precision_score(y, y_pred),
            "Recall": recall_score(y, y_pred),
            "F1": f1_score(y, y_pred),
            "AUC": auc,
        })

    # Passing `columns` keeps the header stable even when `models` is empty.
    return pd.DataFrame(rows, columns=columns)

In [13]:
# Fitted ensembles keyed by the label that appears in the report's "Model" column.
models = {
    "Hard Voting": hard_voting,
    "Soft Voting": soft_voting,
    "Bagging": bagging,
    "Random Forest": rf,
    "AdaBoost": adaboost,
    "Gradient Boosting": gb,
}

# Compute the metrics once, then display them rounded to four decimals.
report = summary(models)
report.round(4)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUC
0,Hard Voting,0.9259,0.9412,0.9412,0.9412,
1,Soft Voting,0.9259,0.9412,0.9412,0.9412,0.9509
2,Bagging,0.9444,0.9559,0.9559,0.9559,0.9932
3,Random Forest,0.9444,0.9429,0.9706,0.9565,0.9915
4,AdaBoost,0.9444,0.9429,0.9706,0.9565,0.9728
5,Gradient Boosting,0.9444,0.9306,0.9853,0.9571,0.9882
