# **Bagging implementation on breast_cancer Dataset**

* **Hyperparameters of Decision tree:** 

class_weight=None, criterion='entropy', max_depth=24,max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best'
                    
* **Hyperparameters of Bagging classifier:**

base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False
                  
* **Hyperparameters of Random forest classifier:**

bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False

### **Bagging Implementation**

In [None]:
# import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,roc_auc_score,precision_score,recall_score,f1_score,classification_report

from sklearn.model_selection import GridSearchCV

import warnings
from warnings import filterwarnings
# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn deprecation and
# convergence warnings raised by later cells — confirm that is intended.
filterwarnings("ignore")

# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset bundled with scikit-learn, then pull out the
# feature matrix (X) and the class labels (y) in one step.
dataset = load_breast_cancer()
X, y = dataset.data, dataset.target

In [None]:
# Display the feature matrix.
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [None]:
# Display the class labels.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### **Bagging Implementation**

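**n_estimators=10:** each `BaggingClassifier` below builds an ensemble of 10 copies of its base model (10 SVC models in the first cell). Each copy is trained on its own random sample of the training data, and the ensemble combines their predictions to classify new samples.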

In [None]:
# Bag 10 SVC models into a single ensemble and fit it on the training split.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works on both old and new releases.
clf1 = BaggingClassifier(SVC(), n_estimators=10, random_state=0)
clf1.fit(X_train, y_train)

In [None]:
# Bag 10 k-nearest-neighbours models and fit the ensemble on the training split.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works on both old and new releases.
clf2 = BaggingClassifier(KNeighborsClassifier(), n_estimators=10, random_state=0)
clf2.fit(X_train, y_train)

In [None]:
# Bag 10 logistic-regression models and fit the ensemble on the training split.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works on both old and new releases.
clf3 = BaggingClassifier(LogisticRegression(), n_estimators=10, random_state=10)
clf3.fit(X_train, y_train)

In [None]:
# Bag 10 decision trees and fit the ensemble on the training split.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works on both old and new releases.
clf4 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=10)
clf4.fit(X_train, y_train)

In [None]:
# Bag 10 random forests (an ensemble of ensembles) and fit it on the training
# split.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works on both old and new releases.
clf5 = BaggingClassifier(RandomForestClassifier(), n_estimators=10, random_state=10)
clf5.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the bagged SVC ensemble
svc_bag_score = clf1.score(X_test, y_test)
svc_bag_score

0.9298245614035088

In [None]:
# Test-set accuracy of the bagged KNeighborsClassifier ensemble
knn_bag_score = clf2.score(X_test, y_test)
knn_bag_score

0.9385964912280702

In [None]:
# Test-set accuracy of the bagged LogisticRegression ensemble
logreg_bag_score = clf3.score(X_test, y_test)
logreg_bag_score

0.956140350877193

In [None]:
# Test-set accuracy of the bagged DecisionTreeClassifier ensemble
tree_bag_score = clf4.score(X_test, y_test)
tree_bag_score

0.9649122807017544

In [None]:
# Test-set accuracy of the bagged RandomForestClassifier ensemble
forest_bag_score = clf5.score(X_test, y_test)
forest_bag_score

0.9473684210526315

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline for comparison: a standalone random forest of 10 trees.
# No random_state is given here, so the fitted forest (and its score) can
# differ from run to run.
rfc = RandomForestClassifier(
    n_estimators=10,
)
rfc.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the standalone random forest
rfc_score = rfc.score(X_test, y_test)
rfc_score

0.956140350877193

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline for comparison: a single k-nearest-neighbours model with k = 5.
knn = KNeighborsClassifier(
    n_neighbors=5,
)
knn.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the single KNN model
knn_score = knn.score(X_test, y_test)
knn_score

0.9385964912280702

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline for comparison: a single decision tree with default settings.
# No random_state is given here.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [None]:
# NOTE: this scores the tree on the *training* split (X_train, y_train), whereas
# the other models in this notebook are scored on X_test. The value is therefore
# not comparable with the test accuracies reported for the other models.
dtc.score(X_train,y_train)

1.0

**Let's use bagging over our KNN classifier and see if our score improves:**

In [None]:
# Bagged KNN: 50 five-neighbour models, each trained on a bootstrap sample
# half the size of the training set. oob_score=True additionally requests an
# out-of-bag accuracy estimate.
knn_base = KNeighborsClassifier(n_neighbors=5)
bag_knn = BaggingClassifier(
    knn_base,
    n_estimators=50,
    max_samples=0.5,
    bootstrap=True,
    random_state=0,
    oob_score=True,
)

In [None]:
# Fit the bagged KNN ensemble, then report its test-set accuracy
# (fit() returns the fitted estimator, so the two calls can be chained).
bag_knn.fit(X_train, y_train).score(X_test, y_test)

0.9649122807017544

In [None]:
# Out-of-bag accuracy estimate of the bagged KNN ensemble
# (available because the ensemble was created with oob_score=True).
bag_knn.oob_score_

0.9274725274725275

Great! Our score significantly improves with the use of bagging: test accuracy rises from 0.9386 for the single KNN model to 0.9649 for the bagged ensemble.

Now let's turn off bootstrap and check the model accuracy. Remember: drawing the samples without replacement is called "Pasting".

In [None]:
# Pasting KNN: 10 five-neighbour models, each trained on half of the training
# set drawn without replacement (bootstrap=False).
pasting_base = KNeighborsClassifier(n_neighbors=5)
pasting_knn = BaggingClassifier(
    pasting_base,
    n_estimators=10,
    max_samples=0.5,
    bootstrap=False,
    random_state=3,
)

In [None]:
# Fit the pasting ensemble, then report its test-set accuracy
# (fit() returns the fitted estimator, so the two calls can be chained).
pasting_knn.fit(X_train, y_train).score(X_test, y_test)

0.956140350877193