# Gradient Boosting Machine (GBM) & eXtreme Gradient Boosting (XGB)

## Getting data ready

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The classifiers
from sklearn.ensemble import GradientBoostingClassifier

# Metrics and Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Custom Functions
from util.author import results2csv
from util.fe import transform

In [None]:
# Install if xgboost is not already installed
# !conda install py-xgboost

import xgboost as xgb

In [None]:
# Load the processed train/test CSVs (indexed by PassengerId) and one-hot
# encode each frame, dropping the first level of every encoded column.
train, test = (
    pd.get_dummies(data=pd.read_csv(csv_path, index_col='PassengerId'), drop_first=True)
    for csv_path in ('data/train_processed_1.csv', 'data/test_processed_1.csv')
)

# Separate the target from the training features.
# NOTE: pop() removes 'Survived' from `train` in place; the test frame is
# used as-is (the original notes the test set has no target column).
train_y = train.pop('Survived')

# Features fed to the models.
# Alternative feature set (kept for reference):
#   ['Pclass', 'RoundedFare', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
select_colns = ['Pclass', 'Age', 'GroupCount', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# Project helper from util.fe; presumably restricts/prepares the selected
# columns -- TODO confirm against util/fe.py.
train_x = transform(train, select_colns)
test_x = transform(test, select_colns)

## GBM with default params

In [None]:
# Baseline: gradient boosting with scikit-learn's default hyper-parameters.
# random_state is pinned so that Restart & Run All reproduces the same model
# (the estimator's tree building involves randomness, e.g. feature permutation
# at each split).
gbmclf = GradientBoostingClassifier(random_state=42)
gbmclf.fit(train_x, train_y)

# Measuring accuracy with 3 random 70/30 shuffle splits of the training data.
# (ShuffleSplit draws independent random splits; it is not K-fold.)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(gbmclf, train_x, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Recorded from an earlier (unseeded) run:
# CV Scores : [0.82089552 0.83208955 0.8619403 ]

# Confusion matrix on the training data itself, so it is an optimistic view
# of the fit rather than a held-out estimate.
print('Confusion Matrix :\n', confusion_matrix(train_y, gbmclf.predict(train_x)))
# Recorded from an earlier (unseeded) run:
#  [[522  27]
#  [ 79 263]]

# Make Predictions
test_y_pred = gbmclf.predict(test_x)

# Persist Data to CSV file for submission
fname = "data/predictions/gbm_default.csv"
results2csv(test_x.index, test_y_pred, fname)

## XGB with default params, manual parameter tuning, and GridSearchCV tuning
* Ref.: https://xgboost.readthedocs.io/en/latest/tutorials/model.html
* XGB = GB with Regularization

In [None]:
# Baseline XGBoost model: library defaults, with only the seed fixed.
xgbclf = xgb.XGBClassifier(random_state=42)
xgbclf.fit(train_x, train_y)

# Accuracy over 3 random 70/30 shuffle splits of the training data
cv = ShuffleSplit(test_size=0.3, n_splits=3, random_state=42)
cv_scores = cross_val_score(xgbclf, train_x, train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Recorded output -> CV Scores : [0.82089552 0.8358209  0.87313433]

# Confusion matrix of the fitted model on the training data
train_y_pred = xgbclf.predict(train_x)
print('Confusion Matrix :\n', confusion_matrix(train_y, train_y_pred))
# Recorded output ->
#  [[518  31]
#  [ 87 255]]

# Predict the test set and hand the result to results2csv for the submission CSV
test_y_pred = xgbclf.predict(test_x)
fname = "data/predictions/xgb_default.csv"
results2csv(test_x.index, test_y_pred, fname)

In [None]:
# Bare last expression: the notebook displays the classifier's repr so its
# parameters can be inspected.
xgbclf

In [None]:
# Manual Tuning of Parameters
# `silent` is deprecated in XGBoost (newer releases ignore it with a warning);
# `verbosity=1` (warnings only) is the supported way to express "not silent".
xgbclf = xgb.XGBClassifier(random_state=42, n_estimators=77, n_jobs=-1, verbosity=1,
                           learning_rate=0.15, gamma=0.5, max_depth=3, subsample=0.3)
xgbclf.fit(train_x, train_y)

# Measuring accuracy with 3 random 70/30 shuffle splits of the training data.
# (ShuffleSplit draws independent random splits; it is not K-fold.)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(xgbclf, train_x, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Recorded from an earlier run:
# CV Scores : [0.82089552 0.8358209  0.87313433] # with default params
# CV Scores : [0.82462687 0.84701493 0.85820896] # with manual tuning of params

# Confusion matrix on the training data itself (not a held-out estimate).
print('Confusion Matrix :\n', confusion_matrix(train_y, xgbclf.predict(train_x)))
# Recorded from an earlier run:
#  [[514  35]
#  [ 90 252]]

# Make Predictions
test_y_pred = xgbclf.predict(test_x)

# Persist Data to CSV file for submission
fname = "data/predictions/xgb_manual_tuning.csv"
results2csv(test_x.index, test_y_pred, fname)

In [None]:
# Base estimator for the grid search.
# * n_jobs=1: GridSearchCV below already parallelises over candidates/folds
#   with n_jobs=-1; letting every XGBoost fit also grab all cores causes
#   thread contention and slows both down.
# * verbosity=1 replaces the deprecated `silent=False`.
xgbclf = xgb.XGBClassifier(random_state=42, n_jobs=1, verbosity=1)

# Hyper-parameter grid: 6 * 4 * 4 = 96 candidate combinations.
params = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.5],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.3, 0.5, 0.7, 1]
}
gsclf = GridSearchCV(xgbclf,
                     n_jobs=-1,  # Use all cores of the machine
                     param_grid=params,
                     cv=3,
                     verbose=1,
                     scoring='accuracy')
gsclf.fit(train_x, train_y)

best_score = gsclf.best_score_
print('Best Score : ', best_score)
# Recorded from an earlier run:
# Best Score :  0.8428731762065096

# Report the winning value for each tuned hyper-parameter.
best_params = gsclf.best_estimator_.get_params()
for k in sorted(params.keys()):
    print('\t{0} \t {1}'.format(k, best_params[k]))
# Recorded from an earlier run:
# 	learning_rate 	 0.1
# 	max_depth 	 3
# 	subsample 	 1

# Measuring accuracy with 3 random 70/30 shuffle splits of the training data.
# NOTE: passing the GridSearchCV object itself makes this a nested
# cross-validation -- the whole grid search is re-run on every split, so this
# step is considerably more expensive than the earlier cells.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(gsclf, train_x, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Recorded from an earlier run:
# CV Scores : [0.82089552 0.8358209  0.84701493]

# Confusion matrix on the training data, using the refitted best estimator.
print('Confusion Matrix :\n', confusion_matrix(train_y, gsclf.predict(train_x)))
# Recorded from an earlier run:
#  [[518  31]
#  [ 87 255]]

# Make Predictions
test_y_pred = gsclf.predict(test_x)

# Persist Data to CSV file for submission
fname = "data/predictions/xgb_gscv_tuning.csv"
results2csv(test_x.index, test_y_pred, fname)