In [34]:
import pandas as pd

In [35]:
bank_data = pd.read_csv('datasets/bank_data_processed.csv')

bank_data.head()

Unnamed: 0,Age,Income,Family,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,34,180,1,3,0,0,0,0,0
1,38,130,4,3,134,0,0,0,0
2,46,193,2,3,0,0,0,0,0
3,38,119,1,2,0,0,1,1,1
4,42,141,3,3,0,1,1,1,0


In [36]:
X = bank_data.drop('CreditCard', axis=1)

Y = bank_data['CreditCard']

In [37]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [38]:
x_train.shape, y_train.shape

((384, 8), (384,))

In [39]:
x_test.shape, y_test.shape

((96, 8), (96,))

### Bagging Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

### Pasting

- Here we are perform sampling without replacement (bootstrap=False), this method is called pasting

In [41]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators = 400,
                            bootstrap = False,
                            max_samples = 1.0,
                            n_jobs = -1)

In [42]:
bag_clf.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [43]:
y_pred = bag_clf.predict(x_test)

In [44]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.78125

### Bagging
- Here we are perform sampling with replacement (bootstrap=True), this method is called bagging. 

In [45]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators = 400,
                            bootstrap = True,
                            max_samples = 0.7,
                            n_jobs = -1,
                            oob_score = True)

bag_clf.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [46]:
bag_clf.oob_score_

0.765625

In [47]:
y_pred = bag_clf.predict(x_test)

accuracy_score(y_test, y_pred)

0.8541666666666666

#### Random Patches

- Use a subset of training instances (bootstrap=False and max_samples=100) 
- But sample features (bootstrap_features=True and/or max_features=smaller than 1) 

In [48]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators = 400,
                            bootstrap = False,
                            max_samples = 100,
                            bootstrap_features = True,
                            max_features = 0.8,
                            n_jobs = -1)

bag_clf.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [49]:
y_pred = bag_clf.predict(x_test)

accuracy_score(y_test, y_pred)

0.8645833333333334

In [50]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92        74
           1       0.91      0.45      0.61        22

    accuracy                           0.86        96
   macro avg       0.88      0.72      0.76        96
weighted avg       0.87      0.86      0.85        96

