In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [3]:
X, y = make_classification(n_samples = 10000, n_features = 10, n_informative = 3)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [5]:
dt = DecisionTreeClassifier(random_state = 42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print("Decision tree accuracy:", accuracy_score(y_test, y_pred))

Decision tree accuracy: 0.9525


## Bagging Classifier 

In [6]:
bag = BaggingClassifier( 
      base_estimator  = DecisionTreeClassifier(),
      n_estimators = 500, max_samples = 0.25, bootstrap = True, random_state = 42)

In [7]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, random_state=42)

In [8]:
y_pred = bag.predict(X_test)

In [9]:
print("Bagging Accuracy Score:", accuracy_score(y_test, y_pred))

Bagging Accuracy Score: 0.9695


# Bagging Using SVM

In [10]:
bag = BaggingClassifier(base_estimator= SVC(),
                       n_estimators = 500,
                       max_samples = 0.25,
                       bootstrap = True,
                       random_state = 42)

In [11]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=SVC(), max_samples=0.25, n_estimators=500,
                  random_state=42)

In [12]:
y_pred = bag.predict(X_test)

In [13]:
print("Accuraacy Score:", accuracy_score(y_test, y_pred))

Accuraacy Score: 0.9515


## Random Subspaces 

In [14]:
bag = BaggingClassifier( 
      base_estimator  = DecisionTreeClassifier(),
      n_estimators = 500, max_samples = 1, bootstrap = True, random_state = 42, max_features = 0.5)

In [15]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5,
                  max_samples=1, n_estimators=500, random_state=42)

In [16]:
y_pred = bag.predict(X_test)

In [17]:
print("Accuraacy Score:", accuracy_score(y_test, y_pred))

Accuraacy Score: 0.4965


## OOB Score 

It helps in adding the out of bag samples to check the performance of model using oob_score

In [18]:
bag = BaggingClassifier( 
      base_estimator  = DecisionTreeClassifier(),
      n_estimators = 500, max_samples = 0.25, bootstrap = True, random_state = 42, oob_score = True)

In [19]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, oob_score=True, random_state=42)

In [20]:
bag.oob_score_

0.967875

## Tips for Bagging 

It generally gives better results than Pasting 
Good results come around the 25% and 50% for sampling work

Random Patches and subspaces should be used only with high dimensional data

to find the correct hyperparameters values we do grid search cv or random search cv

## Applying Grid Search

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
parameters = {"n_estimators": [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
             'max_samples':[0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.67],
             'bootstrap': [True, False],
             'max_features': [1, 0.9, 0.8, 0.7, 0.6, 0.5]}

search = GridSearchCV(BaggingClassifier(), parameters , cv = 5, n_jobs=-1)

In [None]:
search.fit(X_train, y_train)