# TP_Bagging vs. Boost 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so repeated runs of the notebook start
# from the same random state.
# NOTE(review): presumably this is what stabilises estimators created without
# an explicit random_state -- confirm against the scikit-learn documentation.
np.random.seed(42)

## I. Import data

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn
cancer = load_breast_cancer()

# Names of the input features
print("Features: ", cancer["feature_names"])

# Names of the target classes ('malignant' 'benign')
print("Labels: ", cancer["target_names"])

# Shape of the feature matrix; kept as the last expression so the notebook displays it
cancer["data"].shape

In [None]:
# Show the first five records of the feature matrix
print(cancer["data"][:5])

# Show the whole label vector (0: malignant, 1: benign)
print(cancer["target"])

In [None]:
# Feature matrix and label vector
x, y = cancer.data, cancer.target

# Hold out 30% of the samples for testing and keep 70% for training
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


### <font color='red'>*Q1 : Décrire la base de données de l'étude.*</font>

# II. Comparer : DT, RF, Bagging, Boosting

In [None]:
from sklearn import tree
from sklearn.ensemble import BaggingClassifier

# Baseline: one decision tree used on its own (an "ensemble" of a single model)
clf = tree.DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
accuracy = clf.score(x_test, y_test)  # score on the held-out split
y_pred = clf.predict(x_test)
print("Accuracy value:", "%.3f" % (accuracy * 100), "%")


In [None]:
# Bagging: 300 decision trees combined by a BaggingClassifier
clf = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    n_estimators=300,
    random_state=42,
).fit(x_train, y_train)

accuracy = clf.score(x_test, y_test)  # score on the held-out split
y_pred = clf.predict(x_test)
print("Accuracy value:", "%.3f" % (accuracy * 100), "%")


In [None]:
# Random forest with the same number of trees as the bagging model
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
).fit(x_train, y_train)

accuracy = clf.score(x_test, y_test)  # score on the held-out split
y_pred = clf.predict(x_test)
print("Accuracy value:", "%.3f" % (accuracy * 100), "%")

In [None]:
# AdaBoost built on decision stumps.
weak_learner = DecisionTreeClassifier(max_depth=1)  # one-level decision tree (decision stump)

# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional argument is accepted under both names.
# random_state is pinned, as in the later AdaBoost cells, so the score does not
# depend on the state of the global NumPy generator.
model_ada_tree = AdaBoostClassifier(weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")


### <font color='red'>*Q2 : Comparer les résultats de ces quatre modèles. Expliquer.*</font>

# III. Analyse des hyperparamètres d'AdaBoost

### III.1. base_estimator

### It defines which algorithm will be used as the weak learner in our boosting procedure. We can use any type of classifier available in sklearn.

In [None]:
# Estimator: decision stump (decision tree limited to one level)
weak_learner = DecisionTreeClassifier(max_depth=1)

# Passed positionally because the `base_estimator` keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4; the positional form works
# with both. random_state is pinned, as in the later AdaBoost cells, so the
# result does not depend on the global NumPy generator.
model_ada_tree = AdaBoostClassifier(weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Estimator: deep decision tree (up to 10 levels) instead of a stump
weak_learner = DecisionTreeClassifier(max_depth=10)

# Passed positionally because the `base_estimator` keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4; the positional form works
# with both. random_state is pinned, as in the later AdaBoost cells, so the
# result does not depend on the global NumPy generator.
model_ada_tree = AdaBoostClassifier(weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Estimator: linear-kernel SVM
# NOTE(review): probability=True is presumably there so the SVC can output
# class probabilities for the boosting algorithm -- confirm it is still needed
# with the installed scikit-learn version.
weak_learner = SVC(probability=True, kernel='linear')

# Passed positionally because the `base_estimator` keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4; the positional form works
# with both. random_state is pinned, as in the later AdaBoost cells.
# The variable keeps the name `model_ada_rl` used by the original cell even
# though the weak learner here is an SVM, not a logistic regression.
model_ada_rl = AdaBoostClassifier(weak_learner, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(x_train)

x_train_scale = scaler.transform(x_train)
model_ada_rl.fit(x_train_scale, y_train)

x_test_scale = scaler.transform(x_test)
y_pred = model_ada_rl.predict(x_test_scale)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Estimator: logistic regression
weak_learner = LogisticRegression()

# Passed positionally because the `base_estimator` keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4; the positional form works
# with both. random_state is pinned, as in the later AdaBoost cells.
model_ada_rl = AdaBoostClassifier(weak_learner, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(x_train)

x_train_scale = scaler.transform(x_train)
model_ada_rl.fit(x_train_scale, y_train)

x_test_scale = scaler.transform(x_test)
y_pred = model_ada_rl.predict(x_test_scale)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

### <font color='red'>*Q3 : Comparer les résultats de ces quatre modèles. Expliquer.*</font>

### III.2. n_estimators
### It defines the number of estimators used for the ensemble design.

In [None]:
# Small ensemble: only 5 boosted decision stumps
weak_learner = DecisionTreeClassifier(max_depth=1)

# Passed positionally because the `base_estimator` keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4; the positional form works
# with both. random_state is pinned, as in the later AdaBoost cells, so the
# comparison between ensemble sizes is not affected by the global NumPy generator.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=5, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Larger ensemble: 100 boosted decision stumps
weak_learner = DecisionTreeClassifier(max_depth=1)

# Passed positionally because the `base_estimator` keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4; the positional form works
# with both. random_state is pinned, as in the later AdaBoost cells, so the
# comparison between ensemble sizes is not affected by the global NumPy generator.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=100, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

### <font color='red'>*Q4 : Analyser l'impact du nombre d'estimateurs.*</font>

### III.3. learning_rate
### It controls the rate or the speed at which the weights change per iteration.

In [None]:
weak_learner = DecisionTreeClassifier(max_depth=1)
# Try with n_estimators=100 and learning_rate=1
# Then, with n_estimators=100 and learning_rate=2
# The weak learner is passed positionally because the `base_estimator` keyword
# was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4; the
# positional form works with both.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=100, learning_rate=1, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
weak_learner = DecisionTreeClassifier(max_depth=1)
# Try with learning_rate=0.5 and n_estimators=100
# Then, with learning_rate=0.5 and n_estimators=500
# The weak learner is passed positionally because the `base_estimator` keyword
# was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4; the
# positional form works with both.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=100, learning_rate=0.5, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

### <font color='red'>*Q5 : Analyser l'impact du learning rate et étudier sa relation avec le nombre d'estimateurs.*</font>

### III.4. estimator_weights_
### To know the weight (the contribution) of each estimator :

In [None]:
weak_learner = DecisionTreeClassifier(max_depth=1)
# The discrete SAMME variant is selected explicitly here.
# NOTE(review): SAMME.R (a variant for classifiers that can output prediction
# probabilities) was the default only in older scikit-learn releases, and the
# `algorithm` parameter itself is deprecated in recent ones -- confirm against
# the installed version.
# The weak learner is passed positionally because the `base_estimator` keyword
# was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4; the
# positional form works with both.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=100, algorithm="SAMME", learning_rate=1, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = share of test samples whose prediction equals the true label
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

# Weight (contribution) of each fitted estimator in the ensemble
wj=model_ada_tree.estimator_weights_
print(wj)

In [None]:
# Plot the estimator weights in the order they are stored in `wj`
# (x axis = position of the estimator in the array)
plt.plot(wj)
plt.xlabel('Number of estimators')  # fixed typo in the axis label ('Nmber')
plt.ylabel('Estimator weights')
plt.show()

### <font color='red'>*Q6 : Interpréter la courbe obtenue.*</font>

### III.5. feature_importances_

In [None]:
# Display the feature importances of the fitted AdaBoost model
# (bare expression so the notebook shows the array)
model_ada_tree.feature_importances_

In [None]:
# Same importances as a one-row table whose columns are the feature names
pd.DataFrame(
    data=[model_ada_tree.feature_importances_],
    columns=cancer.feature_names,
)

### <font color='red'>*Q7 : Quelles sont les variables qui semblent pertinentes?*</font>