# Attempting Ensemble Methods

## Getting data ready

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The classifiers
from sklearn.ensemble import BaggingClassifier
from sklearn import tree # for Decision Tree

# Pre-processing
from sklearn.preprocessing import StandardScaler

# Metrics and Validation
from sklearn.model_selection import GridSearchCV, ShuffleSplit, StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Custom imports
from util.pickler import pickle_out
from util.author import results2csv
from util.fe import transform

In [None]:
# Load the pre-processed train/test CSVs (indexed by PassengerId) and one-hot
# encode each frame with get_dummies, dropping the first level of every dummy group.
train, test = (
    pd.get_dummies(data=pd.read_csv(csv_path, index_col='PassengerId'), drop_first=True)
    for csv_path in ('data/train_processed_1.csv', 'data/test_processed_1.csv')
)

# Separate the target from the features. pop() removes 'Survived' from `train`
# in place, so `train` holds only feature columns from here on.
train_y = train.pop('Survived')

# Feature columns handed to the models.
# Earlier selection: ['Pclass', 'RoundedFare', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
select_colns = ['Pclass', 'Age', 'GroupCount', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# The test set has no target column, so there is nothing to split off there.
# NOTE(review): `transform` comes from util.fe; presumably it restricts the frame
# to `select_colns` (and may rescale) - confirm against that module.
train_x = transform(train, select_colns)
test_x = transform(test, select_colns)

## Ensemble Method : Bagging (aka Bootstrap Aggregator) with setting oob_score=False
* With Bagging, each base estimator is trained on only a subset of the training data rather than on all of it.
* The remaining, unused training samples can be used for validation internally; the resulting score is called the OOB (Out Of Bag) score.

In [None]:
# Grid-search a BaggingClassifier built on a previously tuned decision tree,
# report its fit on the training data, and write test-set predictions to CSV.

# Unpickle the classifier we want to use as the base estimator of the ensemble.
# Not strictly required: a freshly constructed classifier of the desired class works too.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# produced by this project.
odtclf = pickle_out('pickle/optimized_dtree_clf.pkl')
# odtclf = tree.DecisionTreeClassifier(random_state=42) # You can't tune the params of this estimator

# The base estimator is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` (old name removed in scikit-learn 1.4),
# while the first positional slot is the same in both old and new releases.
bagclf = BaggingClassifier(odtclf,
                           bootstrap=True, # Sampling with replacement
                           n_jobs=-1, # Use all cores
                           verbose=1,
                           random_state=42)

# Hyper-parameters of the BaggingClassifier itself that the grid search explores.
params = {
    'n_estimators': [10,15,20,25,50],
    'max_samples': [0.4,0.6,0.8,0.9,1.0]
}

gscv = GridSearchCV(estimator=bagclf,
                    cv=3,
                    param_grid=params,
                    n_jobs=-1,
                    verbose=1)
gscv.fit(train_x, train_y)

# `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` carries the same
# per-candidate information.
# Debugging: look for the highest mean score with the least standard deviation.
cv_results = pd.DataFrame(gscv.cv_results_)
print('Grid Scores:\n', cv_results[['params', 'mean_test_score', 'std_test_score']].to_string())

best_score = gscv.best_score_
print('Best Score : ', best_score)
# Recorded output from an earlier run:
#   Best Score :  0.8148148148148148
#       max_samples      0.4
#       n_estimators     20
best_params = gscv.best_params_
for k in sorted(params.keys()):
    print('\t{0} \t {1}'.format(k, best_params[k]))

# NOTE: these metrics are computed on the data the model was fitted on, so they
# are optimistic; the cross-validation scores below are the fairer estimate.
train_y_pred = gscv.predict(train_x)
print('accuracy_score = ',accuracy_score(train_y, train_y_pred))
print('\n confusion_matrix :\n', confusion_matrix(train_y, train_y_pred))
print('\n classification_report: \n', classification_report(train_y, train_y_pred))

# Nested validation: the whole grid search is re-run inside each of the 3 shuffle splits.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(gscv, train_x, train_y, cv=cv, scoring='accuracy')
print('\n cross_val_score : \n', cv_scores)
# [0.7979798  0.81144781 0.81144781] with n_estimators=10
# [0.8047138 0.8047138 0.7979798] with n_estimators=100
# [0.8013468 0.8013468 0.8013468] with n_estimators=500
# [0.82089552 0.8358209  0.81343284] with gridsearch
# [0.80223881 0.83208955 0.79104478] with all columns selected
# [0.77985075 0.83208955 0.80597015] with columns - 'RoundedFare', 'SibSp', 'Parch' - removed
# [0.81343284 0.82462687 0.80970149] with column 'GroupCount' replacing 'SibSp', 'Parch'. and removing 'RoundedFare'

# Make Predictions
test_y_pred = gscv.predict(test_x)

# Persist Data to CSV file for submission
fname = "data/predictions/bagging_gridsearch.csv"
results2csv(test_x.index, test_y_pred, fname)

## Ensemble Method : Bagging (aka Bootstrap Aggregator) with setting oob_score=True
* With Bagging, each base estimator is trained on only a subset of the training data rather than on all of it.
* The remaining, unused training samples can be used for validation internally; the resulting score is called the OOB (Out Of Bag) score. To get it, set `oob_score=True` and read the `oob_score_` attribute after fitting, as shown below.

In [None]:
from sklearn import tree # for Decision Tree

# Fit a fixed-size BaggingClassifier with out-of-bag scoring enabled, report its
# fit on the training data, and write test-set predictions to CSV.

# Base estimator for the ensemble: a plain decision tree.
# Alternative: odtclf = pickle_out('pickle/optimized_dtree_clf.pkl') to reuse the tuned tree.
odtclf = tree.DecisionTreeClassifier(random_state=42)

# The base estimator is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` (old name removed in scikit-learn 1.4),
# while the first positional slot is the same in both old and new releases.
bagclf = BaggingClassifier(odtclf,
                           n_estimators=20, # Number of base estimators
                           bootstrap=True, # Sampling with replacement
                           n_jobs=-1, # Use all cores
                           oob_score=True, # Score each sample with the estimators that did not train on it
                           random_state=42)
bagclf.fit(train_x, train_y)

# Check OOB Score
print('OOB Score : ', bagclf.oob_score_)

# NOTE: these metrics are computed on the data the model was fitted on, so they
# are optimistic; the OOB score above and the CV scores below are fairer estimates.
train_y_pred = bagclf.predict(train_x)
print('accuracy_score = ',accuracy_score(train_y, train_y_pred))
print('\n confusion_matrix :\n', confusion_matrix(train_y, train_y_pred))
print('\n classification_report: \n', classification_report(train_y, train_y_pred))

cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(bagclf, train_x, train_y, cv=cv, scoring='accuracy')
print('\n cross_val_score : \n', cv_scores)
# [0.7979798  0.81144781 0.81144781] with n_estimators=10
# [0.8047138 0.8047138 0.7979798] with n_estimators=100
# [0.8013468 0.8013468 0.8013468] with n_estimators=500
# [0.79104478 0.79477612 0.80597015] with all columns selected
# [0.75746269 0.8358209  0.79850746] with columns - 'RoundedFare', 'SibSp', 'Parch' - removed

# Make Predictions
test_y_pred = bagclf.predict(test_x)

# Persist Data to CSV file for submission
fname = "data/predictions/bagging.csv"
results2csv(test_x.index, test_y_pred, fname)

### Visualize Survival - Dead/Alive
Just wanted to visualize survival in a scatterplot, to build some intuition.

In [None]:
from sklearn.decomposition import PCA

# Project the training features onto their first two principal components and
# scatter-plot the passengers, coloured by survival.
pca = PCA(n_components=2)
# pca = PCA(n_components=2, svd_solver='full')
pca.fit(train_x)
print('explained_variance_ratio_ : ',pca.explained_variance_ratio_)
print('singular_values_ : ', pca.singular_values_)

# Keep the original row index so the projected rows can be matched to the labels.
X = pd.DataFrame(pca.transform(train_x),
                 index=train_x.index)

# Labels as floats, indexed like train_y. `np.float` was removed in NumPy 1.24,
# and np.choose(train_y, [0, 1]) was an identity mapping that only returned a
# Series via NumPy's array-wrap fallback, so cast the Series directly instead.
y = train_y.astype(float)

fig = plt.figure(1, figsize=(6, 6))
plt.clf() # clear current figure to clean memory footprint
ax = plt.gca() # gca = get current axes

colors = ['red','green']
labels = ['Dead', 'Alive']
for k in [0,1]:
    # Index labels of the passengers in class k, used to pick their projected rows.
    rows = y.index[y == k]
    ax.scatter(X.loc[rows, 0], X.loc[rows, 1], c=colors[k], label=labels[k], alpha=0.25)
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.legend(loc='best')
plt.show()
# Observation: PCA with just 2 dimensions isn't helping see distinction. The survival is so mixed-up.
# Observation: PCA with just 2 dimensions isn't helping see distinction. The survival is so mixed-up.

In [None]:
# Fit a PCA that keeps as many components as there are features and show the
# loading of every original column on each principal component.
n_features = train_x.shape[1]

# Row labels for the table: PC_1 ... PC_<n_features>
lst = ["PC_{0}".format(i) for i in range(1, n_features + 1)]

pca = PCA(n_components=n_features)
pca.fit(train_x)

# Rows are principal components, columns are the original features.
pd.DataFrame(data=pca.components_, columns=train_x.columns, index=lst)