## Getting data ready

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The classifiers
from sklearn.ensemble import BaggingClassifier

# Metrics and Validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Custom imports
from util.pickler import pickle_out

In [None]:
# Load processed data (indexed by PassengerId)
train = pd.read_csv('data/train_processed_1.csv', index_col='PassengerId')
test = pd.read_csv('data/test_processed_1.csv', index_col='PassengerId')

# Feature engineering: one-hot encode the non-numeric columns of the training
# set, dropping the first level of each one to avoid redundant dummy columns.
train = pd.get_dummies(data=train, drop_first=True)

# Split datasets: pop() removes the target from `train`, leaving only features.
train_y = train.pop('Survived')
train_x = train

# Encode the test set with *every* level kept, then project it onto the exact
# columns (and column order) of the training features. Encoding the two frames
# independently with drop_first=True can yield different columns -- or a
# different dropped baseline level -- whenever a category is present in only
# one of the files, which would break or silently corrupt predict(test_x).
# Dummy columns the test set lacks are filled with 0; levels unseen during
# training are discarded.
test = pd.get_dummies(data=test).reindex(columns=train_x.columns, fill_value=0)
test_x = test # Nothing to split! Test-set has no target columns.

## Ensemble Method : Bagging (aka Bootstrap Aggregator) with setting OOB=False
* With Bagging, each base estimator is trained on only a subset of the training data rather than on all of it.
* The remaining training data that a base estimator did not see can be used to validate it internally; the resulting score is called the OOB (Out-Of-Bag) score.

In [None]:
# Unpickle the classifier we want to use as the base estimator of the ensemble.
# NOTE(review): pickle_out is a project helper; presumably it unpickles the
# given file -- only load pickle files from a trusted source. Confirm.
odtclf = pickle_out('pickle/optimized_dtree_clf.pkl')

# The base estimator is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works with both.
bagclf = BaggingClassifier(odtclf,
                           n_estimators=500, # Number of base estimators
                           bootstrap=True, # Sampling with replacement
                           n_jobs=-1, # Use all cores
                           random_state=42)
bagclf.fit(train_x, train_y)

# Metrics on the training set itself (optimistic: the model has seen this data).
train_y_pred = bagclf.predict(train_x)
print('accuracy_score = ',accuracy_score(train_y, train_y_pred))
print('\n confusion_matrix :\n', confusion_matrix(train_y, train_y_pred))
print('\n classification_report: \n', classification_report(train_y, train_y_pred))

# 3-fold cross-validated accuracy: each fold refits a clone of the ensemble.
cv_scores = cross_val_score(bagclf, train_x, train_y, cv=3, scoring='accuracy')
print('\n cross_val_score : \n', cv_scores)
# Recorded results from earlier runs:
# [0.7979798  0.81144781 0.81144781] with n_estimators=10
# [0.8047138 0.8047138 0.7979798] with n_estimators=100
# [0.8013468 0.8013468 0.8013468] with n_estimators=500

# Make Predictions
# test_y_pred = bagclf.predict(test_x)