In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Synthetic classification data: 10,000 rows, 10 features, 3 of them informative.
# random_state pins the generated dataset so a fresh "Restart & Run All"
# reproduces the same data (and therefore comparable scores) every time.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)

In [3]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: a single decision tree, scored on the held-out split.
dt = DecisionTreeClassifier(random_state=42)
y_pred = dt.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator itself

print("Decision Tree accuracy", accuracy_score(y_test, y_pred))

Decision Tree accuracy 0.961


# Bagging

In [5]:
# Bagging: 500 decision trees, each fitted on a bootstrap sample (drawn with
# replacement) containing 25% of the training rows.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was removed in 1.4); the positional form works on both sides of the rename.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [6]:
# Train the bagging ensemble; as the last expression, the fitted estimator is displayed.
bag.fit(X=X_train, y=y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, random_state=42)

In [7]:
# Predicted labels for the held-out rows.
y_pred = bag.predict(X=X_test)

In [8]:
# Test-set accuracy of the bagged trees.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.972

In [10]:
# Row indices drawn for the first base estimator; the shape shows how many
# training rows each estimator was given.
first_estimator_rows = bag.estimators_samples_[0]
first_estimator_rows.shape

(2000,)

# Bagging using SVM

In [11]:
# Bagging with support-vector classifiers as the base model: 500 SVCs, each
# fitted on a bootstrap sample containing 25% of the training rows.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was removed in 1.4); the positional form works on both sides of the rename.
bag = BaggingClassifier(
    SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [12]:
# Train the SVC ensemble and report its accuracy on the held-out split.
y_pred = bag.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator itself
print("Bagging using SVM", accuracy_score(y_test, y_pred))

Bagging using SVM 0.9605


# Pasting

In [13]:
# Pasting: like bagging, but each tree's 25% row sample is drawn WITHOUT
# replacement (bootstrap=False). verbose=1 turns on progress logging and
# n_jobs=1 keeps the fit on a single process.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was removed in 1.4); the positional form works on both sides of the rename.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=1,
    n_jobs=1
)

In [20]:
# Train the pasting ensemble and report its accuracy on the held-out split.
# NOTE(review): the saved execution count of this cell (20) comes after the
# Random Subspaces cells (18-19) and the saved score equals theirs, so the
# recorded output may have been produced by a different `bag` - re-run the
# notebook top to bottom to confirm the pasting score.
y_pred = bag.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator itself
print("Pasting classifier Bagging", accuracy_score(y_test, y_pred))

Pasting classifier Bagging 0.9675


# Random Subspaces

In [18]:
# Random Subspaces: every tree sees all training rows (max_samples=1.0, no row
# bootstrap) but only a sample of the features (max_features=0.5, drawn with
# replacement because bootstrap_features=True).
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was removed in 1.4); the positional form works on both sides of the rename.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

In [19]:
# Train the random-subspaces ensemble and report its held-out accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator itself
print("Random Spaces Classifier", accuracy_score(y_test, y_pred))

Random Spaces Classifier 0.9675


# Random Patches

In [21]:
# Random Patches: sample both rows and features for every tree - 25% of the
# rows without replacement (bootstrap=False) and a feature sample of size
# max_features=0.5 drawn with replacement (bootstrap_features=True).
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was removed in 1.4); the positional form works on both sides of the rename.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

In [22]:
# Train the random-patches ensemble and report its held-out accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator itself
print("Random Patches Classifier", accuracy_score(y_test, y_pred))

Random Patches Classifier 0.968


# OOB Score
## Out of Bag Samples

### Some rows are never drawn by the bagging procedure: when sampling with replacement, each model sees on average only about 63% of the distinct rows, and the remaining ~37% (the out-of-bag samples) stay untouched by that model, so they can be used to evaluate it.

In [23]:
# Bagging with out-of-bag scoring: oob_score=True makes fit() also evaluate the
# ensemble on the rows each tree did not draw, exposed afterwards as `oob_score_`.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was removed in 1.4); the positional form works on both sides of the rename.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42
)

In [24]:
# Train the OOB-scored ensemble; as the last expression, the fitted estimator is displayed.
bag.fit(X=X_train, y=y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, oob_score=True, random_state=42)

In [25]:
# Out-of-bag score computed during fit (available because oob_score=True was
# set on the classifier); compare it with the held-out test accuracy.
bag.oob_score_

0.97225

In [26]:
# Held-out accuracy of the same ensemble, for comparison with the OOB score.
y_pred = bag.predict(X=X_test)
print("Accuracy", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy 0.972


# Bagging Tips
### Bagging generally gives better results than pasting.
### Good results usually come at around the 25% to 50% mark for row sampling.
### Random patches and subspaces should be used while dealing with high dimensional data.
### To find the correct hyperparameter values we can use GridSearchCV/RandomizedSearchCV

# Applying GridSearchCV

In [27]:
# Hyperparameter grid for BaggingClassifier: 3 x 4 x 2 x 4 = 96 combinations.
parameters = dict(
    n_estimators=[50, 100, 500],
    max_samples=[0.1, 0.4, 0.7, 1.0],
    bootstrap=[True, False],
    max_features=[0.1, 0.4, 0.7, 1.0],
)

In [28]:
# Exhaustive 5-fold grid search over the `parameters` grid.
# random_state makes each candidate's sampling repeatable between runs, and
# n_jobs=-1 spreads the 96 candidates x 5 folds over all CPU cores - a
# single-process run of this grid is very slow.
search = GridSearchCV(
    BaggingClassifier(random_state=42),
    parameters,
    cv=5,
    n_jobs=-1,
)

<IPython.core.display.Javascript object>

In [29]:
# Run the grid search on the training split (cross-validation happens inside).
search.fit(X=X_train, y=y_train)

KeyboardInterrupt: 

In [None]:
# Show both results together: a notebook cell only displays its LAST expression,
# so listing them on separate lines would silently drop `best_params_`.
search.best_params_, search.best_score_

In [None]:
# The estimator refitted on the whole training split with the best parameters
# (`search.best_` alone is not an attribute and raises AttributeError).
search.best_estimator_