# Ensembles Assignment

## Read data

In [1]:
from sklearn.datasets import load_breast_cancer

# Fetch the breast cancer dataset bundled with scikit-learn
breast_cancer = load_breast_cancer()

# Summarize the dataset: the first axis of `data` holds the instances,
# the second axis the features, and the distinct target values are the classes
n_instances, n_features = breast_cancer.data.shape
n_classes = len(set(breast_cancer.target))

print("Number of instances:", n_instances)
print("Number of features:", n_features)
print("Number of classes:", n_classes)

Number of instances: 569
Number of features: 30
Number of classes: 2


## Description of the data:
- This code loads the breast cancer dataset from the scikit-learn library, which contains data on breast cancer tumors. 
- The number of classes is 2. 
- The classes are binary, indicating whether a breast mass is benign or malignant.
<br>
<br>

## Splitting

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix and label vector taken from the loaded dataset
X = breast_cancer.data
y = breast_cancer.target

# Hold out 30% of the rows for testing; stratify on y so both splits keep
# the class proportions, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [5]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Base learner 1: perceptron fitted on standardized features
pipe1 = make_pipeline(
    StandardScaler(),
    Perceptron(eta0=1.0, random_state=1),
)

# Base learner 2: entropy-based decision tree capped at depth 6 (no scaling step)
pipe2 = make_pipeline(
    DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=0),
)

# Base learner 3: 20-nearest-neighbours with Minkowski distance (p=3)
# on standardized features
pipe3 = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=20, metric='minkowski', p=3),
)

clf_labels = ['Perceptron', 'Decision tree', 'KNN']

# Report mean and standard deviation of the 10-fold CV accuracy
# for each base learner, using the training split only
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, pipe2, pipe3], clf_labels):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    mean_accuracy = str(round(scores.mean(), 2))
    accuracy_stdev = str(round(scores.std(), 3))
    print("Accuracy: " + mean_accuracy + " Stdev: " + accuracy_stdev + " [" + label + "]")

10-fold cross validation:

Accuracy: 0.96 Stdev: 0.023 [Perceptron]
Accuracy: 0.91 Stdev: 0.043 [Decision tree]
Accuracy: 0.95 Stdev: 0.029 [KNN]


In [6]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble over the three base pipelines.
# voting='hard' is scikit-learn's default; it is written out so the voting
# rule (predicted class labels, majority wins) is visible to the reader.
mv_clf = VotingClassifier(
    estimators=[('p', pipe1), ('dt', pipe2), ('kn', pipe3)],
    voting='hard',
)

# `clf_labels` was created in an earlier cell. Appending unconditionally made
# this cell non-idempotent: every re-run added another 'Majority voting'
# entry. The guard keeps the list at one entry per classifier.
if 'Majority voting' not in clf_labels:
    clf_labels += ['Majority voting']
all_clf = [pipe1, pipe2, pipe3, mv_clf]

# 10-fold CV accuracy (mean and standard deviation) on the training split
# for each base learner and for the ensemble
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    mean_accuracy = str(round(scores.mean(), 2))
    accuracy_stdev = str(round(scores.std(), 3))
    print("Accuracy: " + mean_accuracy + " Stdev: " + accuracy_stdev + " [" + label + "]")

Accuracy: 0.96 Stdev: 0.023 [Perceptron]
Accuracy: 0.91 Stdev: 0.043 [Decision tree]
Accuracy: 0.95 Stdev: 0.029 [KNN]
Accuracy: 0.96 Stdev: 0.028 [Majority voting]


In [7]:
# Fit the perceptron pipeline on the training split
pipe1.fit(X_train, y_train)

# Predict the held-out test split and collect the summary numbers
y_pred = pipe1.predict(X_test)
n_misclassified = (y_test != y_pred).sum()
n_test_examples = y_test.shape[0]
test_accuracy = pipe1.score(X_test, y_test)

print('Misclassified test set examples:', n_misclassified)
print('Out of a total of:', n_test_examples)
print('Accuracy:', test_accuracy)

Misclassified test set examples: 6
Out of a total of: 171
Accuracy: 0.9649122807017544


In [8]:
# Fit the decision tree pipeline on the training split
pipe2.fit(X_train, y_train)

# Predict the held-out test split and collect the summary numbers
y_pred = pipe2.predict(X_test)
n_misclassified = (y_test != y_pred).sum()
n_test_examples = y_test.shape[0]
test_accuracy = pipe2.score(X_test, y_test)

print('Misclassified test set examples:', n_misclassified)
print('Out of a total of:', n_test_examples)
print('Accuracy:', test_accuracy)

Misclassified test set examples: 9
Out of a total of: 171
Accuracy: 0.9473684210526315


In [9]:
# Fit the k-nearest-neighbours pipeline on the training split
pipe3.fit(X_train, y_train)

# Predict the held-out test split and collect the summary numbers
y_pred = pipe3.predict(X_test)
n_misclassified = (y_test != y_pred).sum()
n_test_examples = y_test.shape[0]
test_accuracy = pipe3.score(X_test, y_test)

print('Misclassified test set examples:', n_misclassified)
print('Out of a total of:', n_test_examples)
print('Accuracy:', test_accuracy)

Misclassified test set examples: 8
Out of a total of: 171
Accuracy: 0.9532163742690059


In [10]:
# Fit the majority-voting ensemble on the training split
mv_clf.fit(X_train, y_train)

# Predict the held-out test split and collect the summary numbers
y_pred = mv_clf.predict(X_test)
n_misclassified = (y_test != y_pred).sum()
n_test_examples = y_test.shape[0]
test_accuracy = mv_clf.score(X_test, y_test)

print('Misclassified test set examples:', n_misclassified)
print('Out of a total of:', n_test_examples)
print('Accuracy:', test_accuracy)

Misclassified test set examples: 5
Out of a total of: 171
Accuracy: 0.9707602339181286
