In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.datasets import make_classification

X,y = make_classification(n_samples=10000,n_features=10,n_informative=3,random_state=42)

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [6]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred1 = dt.predict(X_test)
accuracy_score(y_pred1,y_test)

0.928

In [7]:
svc = SVC()
svc.fit(X_train,y_train)
ypred2 = svc.predict(X_test)
accuracy_score(ypred2,y_test)

0.9255

In [8]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
ypred3 = knn.predict(X_test)
accuracy_score(ypred3,y_test)

0.91

**Bagging with Bootsrap Aggregation**


In [10]:
bc = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
  n_estimators = 500,
    bootstrap = True,
    max_samples = 0.25 #that is out of 8000 training data rows we'll be having 2000 here

)
bc.fit(X_train,y_train)
ypred = bc.predict(X_test)
accuracy_score(ypred,y_test)

0.9445

**Pasting**

In [14]:
bc = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
  n_estimators = 500,
    bootstrap = False,
    max_samples = 0.25,
    verbose = 1,
    n_jobs = -1

)
bc.fit(X_train,y_train)
ypred = bc.predict(X_test)
accuracy_score(ypred,y_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


0.9475

In [15]:
bc.estimators_samples_[0].shape

(2000,)

**Random Subspaces**

In [16]:
bc = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
  n_estimators = 500,
    bootstrap = False,
    max_samples = 1,
    verbose = 1,
    n_jobs = -1 ,
    bootstrap_features = True,
    max_features = 0.5

)
bc.fit(X_train,y_train)
ypred = bc.predict(X_test)
accuracy_score(ypred,y_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


0.5015

**Random Patches**

In [17]:
bc = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
  n_estimators = 500,
    bootstrap = True,
    max_samples = 0.25,
    n_jobs = -1 ,
    bootstrap_features = True,
    max_features = 0.5

)
bc.fit(X_train,y_train)
ypred = bc.predict(X_test)
accuracy_score(ypred,y_test)

0.9355

**Out Of Bagging Score (OOB)**

In [18]:
bc = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
  n_estimators = 500,
    bootstrap = True,
    oob_score = True,
    max_samples = 0.25

)
bc.fit(X_train,y_train)
ypred = bc.predict(X_test)
accuracy_score(ypred,y_test)

0.9455

**GridSearchCV**

In [20]:
# from sklearn.model_selection import GridSearchCV
# params = {
#     'n_estimators' : [100, 200, 300, 400, 500],
#     'max_samples' : [0.1, 0.25, 0.5, 0.75, 1],
#     'bootstrap' : [True, False],
#     'max_features' : [0.1, 0.25, 0.5, 0.75, 1],
#     'bootstrap_features' : [True, False]
# }
# search = GridSearchCV(BaggingClassifier(), params, cv=5)
# search.fit(X_train, y_train)