# Bagging

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [3]:
df = sns.load_dataset('titanic')

In [4]:
df.shape

(891, 15)

In [5]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [6]:
df.dropna(inplace=True)

## Data Preprocessing

In [7]:
X = df[['pclass', 'sex', 'age']]

In [8]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [9]:
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
X.head()

Unnamed: 0,pclass,sex,age
1,1,0,38.0
3,1,0,35.0
6,1,1,54.0
10,3,0,4.0
11,1,0,58.0


In [12]:
X.shape

(182, 3)

In [13]:
X.describe()

Unnamed: 0,pclass,sex,age
count,182.0,182.0,182.0
mean,1.192308,0.516484,35.623187
std,0.516411,0.501107,15.671615
min,1.0,0.0,0.92
25%,1.0,0.0,24.0
50%,1.0,1.0,36.0
75%,1.0,1.0,47.75
max,3.0,1.0,80.0


In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 3 columns):
pclass    182 non-null int64
sex       182 non-null int32
age       182 non-null float64
dtypes: float64(1), int32(1), int64(1)
memory usage: 5.0 KB


In [15]:
y = df['survived']

In [16]:
y.value_counts()

1    123
0     59
Name: survived, dtype: int64

## Fit Model

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.33, random_state=42)

In [19]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [33]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    '''
    print accuracy score and confusion matrix
    '''
    
    if train:
        '''
        train performance
        '''
        print('Train Result: \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Average SD: \t\t {0:.4f}'.format(np.std(res)))
        
    elif train == False:
        '''
        test performance
        '''
        print('Test Result \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))

## Decision Tree

In [34]:
clf = DecisionTreeClassifier(random_state = 42)

In [35]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [36]:
print_score(clf, X_train, X_test, y_train, y_test, train=True)

Train Result: 

Accuracy Score: 0.9669

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      1.00      0.95        35
           1       1.00      0.95      0.98        86

   micro avg       0.97      0.97      0.97       121
   macro avg       0.95      0.98      0.96       121
weighted avg       0.97      0.97      0.97       121
 

Confusion Matrix: 
 [[35  0]
 [ 4 82]] 

Average Accuracy: 	 0.8198
Average SD: 		 0.1217


In [37]:
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Test Result 

Accuracy Score: 0.7213

Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.58      0.62        24
           1       0.75      0.81      0.78        37

   micro avg       0.72      0.72      0.72        61
   macro avg       0.71      0.70      0.70        61
weighted avg       0.72      0.72      0.72        61
 

Confusion Matrix: 
 [[14 10]
 [ 7 30]] 



In [38]:
df2 = sns.load_dataset('titanic')

In [40]:
X = df2[['pclass', 'sex', 'age']]
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [41]:
y = df2['survived']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.33, random_state=42)

In [47]:
clf = DecisionTreeClassifier(random_state = 42)

In [48]:
##clf.fit(X_train, y_train) #should fail, can't accept NaN

## Bagging (OOB = False)

In [49]:
X = df[['pclass', 'sex', 'age']]
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [50]:
y = df['survived']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.33, random_state=42)

In [53]:
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [54]:
bag_clf = BaggingClassifier(base_estimator = clf, n_estimators = 1000,
                           bootstrap = True, n_jobs = 1, random_state = 42)

In [55]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=1000, n_jobs=1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [58]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)

Train Result: 

Accuracy Score: 0.9669

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.98      0.98      0.98        86

   micro avg       0.97      0.97      0.97       121
   macro avg       0.96      0.96      0.96       121
weighted avg       0.97      0.97      0.97       121
 

Confusion Matrix: 
 [[33  2]
 [ 2 84]] 

Average Accuracy: 	 0.8044
Average SD: 		 0.1360


In [59]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Test Result 

Accuracy Score: 0.7541

Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.58      0.65        24
           1       0.76      0.86      0.81        37

   micro avg       0.75      0.75      0.75        61
   macro avg       0.75      0.72      0.73        61
weighted avg       0.75      0.75      0.75        61
 

Confusion Matrix: 
 [[14 10]
 [ 5 32]] 



## Bagging (OOB = True)

In [60]:
bag_clf = BaggingClassifier(base_estimator = clf, n_estimators = 1000,
                           bootstrap = True, oob_score = True,
                            n_jobs = -1, random_state = 42)

In [61]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=1000, n_jobs=-1, oob_score=True,
         random_state=42, verbose=0, warm_start=False)

In [62]:
bag_clf.oob_score_

0.8016528925619835

In [63]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)

Train Result: 

Accuracy Score: 0.9669

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.98      0.98      0.98        86

   micro avg       0.97      0.97      0.97       121
   macro avg       0.96      0.96      0.96       121
weighted avg       0.97      0.97      0.97       121
 

Confusion Matrix: 
 [[33  2]
 [ 2 84]] 

Average Accuracy: 	 0.8044
Average SD: 		 0.1360


In [64]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Test Result 

Accuracy Score: 0.7541

Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.58      0.65        24
           1       0.76      0.86      0.81        37

   micro avg       0.75      0.75      0.75        61
   macro avg       0.75      0.72      0.73        61
weighted avg       0.75      0.75      0.75        61
 

Confusion Matrix: 
 [[14 10]
 [ 5 32]] 



In [65]:
bag_clf = BaggingClassifier(base_estimator = clf, n_estimators = 5000,
                           bootstrap = True, n_jobs = 1, random_state = 42)

In [66]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=5000, n_jobs=1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [67]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)

Train Result: 

Accuracy Score: 0.9669

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.98      0.98      0.98        86

   micro avg       0.97      0.97      0.97       121
   macro avg       0.96      0.96      0.96       121
weighted avg       0.97      0.97      0.97       121
 

Confusion Matrix: 
 [[33  2]
 [ 2 84]] 

Average Accuracy: 	 0.8044
Average SD: 		 0.1360


In [68]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Test Result 

Accuracy Score: 0.7541

Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.58      0.65        24
           1       0.76      0.86      0.81        37

   micro avg       0.75      0.75      0.75        61
   macro avg       0.75      0.72      0.73        61
weighted avg       0.75      0.75      0.75        61
 

Confusion Matrix: 
 [[14 10]
 [ 5 32]] 

