# AdaBoost (Adaptive Boosting)
Similar to human learning, AdaBoost learns from its past mistakes: each new weak learner focuses more on the difficult examples that the earlier learners got wrong.

## Getting data ready

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm # for SVM classifier

# Metrics and Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Custom Functions
from util.author import results2csv
from util.fe import transform

In [None]:
# Load the processed train/test CSVs (indexed by PassengerId) and one-hot
# encode each frame, dropping the first level of every encoded column.
train, test = (
    pd.get_dummies(data=pd.read_csv(csv_path, index_col='PassengerId'), drop_first=True)
    for csv_path in ('data/train_processed_1.csv', 'data/test_processed_1.csv')
)

# Pull the target out of the training frame (pop removes the column in place).
# The test set has no target column, so there is nothing to split there.
train_y = train.pop('Survived')

# Alternative feature set, kept for reference:
# select_colns = ['Pclass', 'RoundedFare', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
select_colns = ['Pclass', 'Age', 'GroupCount', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# `transform` is a project helper (util.fe); presumably it restricts each frame
# to the selected columns -- verify against its implementation.
train_x = transform(train, select_colns)
test_x = transform(test, select_colns)

## AdaBoost with DecisionTree classifier (default)

In [None]:
# AdaBoost with its default base estimator (a DecisionTreeClassifier),
# fitted on the full training set; fit() returns the estimator itself.
abclf = AdaBoostClassifier(random_state=42).fit(train_x, train_y)

# Estimate accuracy with shuffle-split cross-validation:
# 3 random splits, each holding out 30% of the training data.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=abclf, X=train_x, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Recorded output:
# CV Scores : [0.80597015 0.79850746 0.81716418]

# Predict the test set and persist the predictions to CSV for submission
fname = "data/predictions/adaboost.csv"
test_y_pred = abclf.predict(test_x)
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data; as the last expression it is the cell output
confusion_matrix(train_y, abclf.predict(train_x))

## AdaBoost and GridSearchCV

In [None]:
# Unfitted AdaBoost template that the grid search clones for every candidate
abclf = AdaBoostClassifier(random_state=42)

# Candidate values for the two tuned hyper-parameters
params = dict(
    n_estimators=[10, 15, 20, 25, 50, 100],
    learning_rate=[0.25, 0.5, 0.6, 0.75, 1.0],
)

# Exhaustive search over the grid, scored by accuracy with 3-fold CV
gscv = GridSearchCV(
    estimator=abclf,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gscv.fit(train_x, train_y)

# Report the best mean CV score and the winning value of each tuned parameter
print('Best ', gscv.best_score_)
best_params = gscv.best_estimator_.get_params()
for param_name in sorted(params):
    print('\t{0} \t {1}'.format(param_name, best_params[param_name]))
# Recorded output:
# Best  0.8148148148148148
# 	learning_rate 	 0.6
# 	n_estimators 	 15

# Shuffle-split cross-validation of the whole search object. cross_val_score
# works on clones, so the grid search is re-run inside every split (nested CV)
# and the `gscv` fitted above is left untouched for the predictions below.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=gscv, X=train_x, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Recorded output:
# CV Scores : [0.82089552 0.80970149 0.83208955]

# Predict the test set with the refitted best estimator and persist to CSV for submission
fname = "data/predictions/adaboost_gscv.csv"
test_y_pred = gscv.predict(test_x)
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data; as the last expression it is the cell output
confusion_matrix(train_y, gscv.predict(train_x))

## AdaBoost with RandomForest classifier

In [None]:
# Params borrowed from RandomForest classifier's best params in earlier notebook
rfclf = RandomForestClassifier(random_state=42, max_depth=6, min_samples_leaf=3, min_samples_split=2, n_estimators=20)

# The base learner is passed positionally on purpose: scikit-learn 1.2 renamed
# the keyword `base_estimator` to `estimator` and 1.4 removed the old name, so
# `base_estimator=` raises TypeError on current releases. The first positional
# argument is the base learner in both old and new versions.
abclf = AdaBoostClassifier(rfclf, random_state=42)
abclf.fit(train_x, train_y)

# Measuring accuracy with shuffle-split cross-validation
# (3 random splits, 30% of the training data held out in each)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(abclf, train_x, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Recorded output:
# CV Scores : [0.80223881 0.82089552 0.80597015]

# Make Predictions
test_y_pred = abclf.predict(test_x)

# Persist Data to CSV file for submission
fname = "data/predictions/adaboost_randomforest.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data; as the last expression it is the cell output
confusion_matrix(train_y, abclf.predict(train_x))

## AdaBoost with SVM classifier
* NOTE: This predicts terribly. See the confusion matrix: every sample is predicted as class 0 (the negative class), so not a single positive is identified. Meh!

In [None]:
# Params borrowed from earlier notebook (see Task 10 in DataModelling)
svmclf = svm.SVC(kernel='rbf', gamma=.25, C=.5, random_state=42)

# The base learner is passed positionally on purpose: scikit-learn 1.2 renamed
# the keyword `base_estimator` to `estimator` and 1.4 removed the old name, so
# `base_estimator=` raises TypeError on current releases. The first positional
# argument is the base learner in both old and new versions.
abclf = AdaBoostClassifier(svmclf,
                           algorithm='SAMME', # Reqd. to work with SVM classifier
                           random_state=42)
abclf.fit(train_x, train_y)

# Measuring accuracy with shuffle-split cross-validation
# (3 random splits, 30% of the training data held out in each)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(abclf, train_x, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Recorded output:
# CV Scores : [0.5858209  0.61567164 0.65671642]

# Make Predictions
test_y_pred = abclf.predict(test_x)

# Persist Data to CSV file for submission.
# Uses its own file name: this cell previously wrote to
# "adaboost_randomforest.csv" and overwrote the RandomForest submission.
fname = "data/predictions/adaboost_svm.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data (rows = actual, columns = predicted).
# Kept as the last expression so the matrix itself is the cell output.
confusion_matrix(train_y, abclf.predict(train_x))
# Recorded output -- every sample lands in the predicted-0 column:
# array([[549,   0],
#        [342,   0]], dtype=int64)