# OBJECTIVE : Beat the baseline accuracy of ~78.57% (See A_*.ipynb)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm # for SVM classifier
from sklearn import tree # for Decision Tree

# Pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Metrics and Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Custom Utility Packages used in this file
from util.math import round_single, round_double, range_simple
from util.plot_helper import make_meshgrid, plot_contours
from util.pickler import pickle_in
from util.author import results2csv
from util.fe import transform

## Task 1 : Feature Engineering Train and Test Data

In [None]:
# Load the processed training set, using the passenger id as the row index
train = pd.read_csv(
    'data/train_processed_1.csv',
    index_col='PassengerId',
)
train.head()

In [None]:
# How many passengers fall into each class of the target column
train['Survived'].value_counts()

In [None]:
# Load the processed test set, using the passenger id as the row index
test = pd.read_csv(
    'data/test_processed_1.csv',
    index_col='PassengerId',
)
test.head()

In [None]:
# Report (rows, columns) of both frames before encoding
for caption, frame in (('Train Shape :', train), ('Test Shape :', test)):
    print(caption, frame.shape)

In [None]:
# One-hot encode the categorical columns of the training set;
# drop_first=True keeps k-1 indicator columns for a k-level category.
train = pd.get_dummies(train, drop_first=True)
print('Train Shape :', train.shape)
train.head()

In [None]:
# One-hot encode the categorical columns of the test set the same way as the training set.
# NOTE(review): the two frames are encoded independently - confirm they end up with
# the same indicator columns, otherwise the fitted models cannot score the test set.
test = pd.get_dummies(test, drop_first=True)
print('Test Shape :', test.shape)
test.head()

## Task 2 : Split Datasets as x and y

In [None]:
# Separate the target from the features WITHOUT mutating `train`.
# (`train.pop('Survived')` removes the column in place, so running this cell a
# second time raises KeyError; selecting/dropping keeps the cell re-runnable.)
train_y = train['Survived']
train_x = train.drop('Survived', axis=1)
print('train_x shape :', train_x.shape)
print('train_y shape :', train_y.shape)

In [None]:
# The test set has no target column, so the whole frame serves as the feature matrix.
# `test_x` is an alias of `test` (same object, not a copy).
test_x = test # Test-set has no target columns
print('test_x shape :', test_x.shape)

## Task 2.1 : Pre-processing Datasets

In [None]:
# Feature columns handed to the pre-processing helper.
# Removing 'RoundedFare' yields better accuracy
select_colns = [
    'Pclass', 'RoundedFare', 'Age', 'SibSp', 'Parch',
    'Sex_male', 'Embarked_Q', 'Embarked_S',
]

# Apply the same helper to both frames (train first, then test).
# NOTE(review): `transform` is a project helper - confirm whether it fits its
# pre-processing on each frame separately or re-uses the training statistics.
train_xx, test_xx = (transform(frame, select_colns) for frame in (train_x, test_x))

## Task 3 : Data Modelling with Logistic Regression Classifier (default params)

In [None]:
# Baseline: logistic regression with default hyper-parameters on the un-transformed features
lrclf = LogisticRegression(random_state=42).fit(train_x, train_y)

# 3-fold cross-validated accuracy on the training data
cv_scores = cross_val_score(estimator=lrclf, X=train_x, y=train_y, scoring='accuracy', cv=3)
print('CV Scores :', cv_scores) # [0.79124579 0.8047138  0.79124579]

# Predict the test set and write the predictions to a CSV file for submission
results = lrclf.predict(test_x)
results2csv(test_x.index, results, 'data/predictions/logistic_regression.csv')

## Task 4 : Data Modelling with SGDClassifier (default params)

In [None]:
# SGDClassifier defaults to loss='hinge', which makes the fitted model a linear SVM.
sgdclf = SGDClassifier(max_iter=100, random_state=42).fit(train_x, train_y)

# 3-fold cross-validated accuracy on the training data
cv_scores = cross_val_score(estimator=sgdclf, X=train_x, y=train_y, scoring='accuracy', cv=3)
print('CV Scores :', cv_scores) # [0.76767677 0.71043771 0.79124579]

# Predict the test set and write the predictions to a CSV file for submission
sgd_results = sgdclf.predict(test_x)
results2csv(test_x.index, sgd_results, 'data/predictions/sgd.csv')

## Task 5 : Data Modelling with Logistic Regression Classifier (custom params)

In [None]:
# Validation strategy: 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)

# Logistic regression with custom parameters, fitted on the pre-processed features
lrclf = LogisticRegression(
    C=0.3,  # C=0.3 made the real difference here
    solver='sag',
    max_iter=300,
    n_jobs=3,
    random_state=42,
).fit(train_xx, train_y)

cv_scores = cross_val_score(estimator=lrclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Scores recorded for earlier configurations:
# [0.79124579 0.8047138  0.79124579] # Default params and without StandardScaler preprocessing
# [0.78787879 0.79124579 0.8047138 ] # With just StandardScaler preprocessing
# [0.79124579 0.8047138  0.8013468 ] # With StandardScaler preprocessing and Custom Params
# [0.80970149 0.79850746 0.80223881] # With StandardScaler preprocessing, Custom Params and ShuffleSplit cv-strategy

# Predict the test set and write the predictions to a CSV file for submission
results = lrclf.predict(test_xx)
results2csv(test_x.index, results, 'data/predictions/logistic_regression_tuned.csv')

In [None]:
# Pair every fitted coefficient of the logistic regression with its feature name
coeffs = pd.Series(index=select_colns, data=lrclf.coef_.flatten())
coeffs
# Observation: Pclass and Sex seem to have got least importance and RoundedFare got highest importance :facepalm:

## Task 6 : Data Modelling with SGDClassifier (custom params) giving Linear SVM

In [None]:
# By default the param loss='hinge'. When the loss function is 'hinge', it gives linear SVM.
sgdclf = SGDClassifier(random_state=42, max_iter=1000, alpha=0.7)
sgdclf.fit(train_xx,train_y)

# Measuring accuracy over 3 random 70/30 train/validation splits (ShuffleSplit, not K-fold)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(sgdclf, train_xx, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# [0.76767677 0.71043771 0.79124579] # with default params
# [0.79104478 0.79850746 0.82089552] # With SS preprocessing, 1k iterations
# [0.82462687 0.82089552 0.79104478] # With SS preprocessing, 1k iterations, alpha=0.7, default loss=hinge

# Make Predictions
sgd_results = sgdclf.predict(test_xx)

# Persist Data to CSV file for submission.
# The file is named after the configured `loss` parameter instead of the class name of
# the fitted `loss_function_` attribute: newer scikit-learn releases deprecate and then
# remove that attribute, while `loss` is always available. For the default loss this
# is the string 'hinge', i.e. the same file name as before.
loss_function_name = sgdclf.loss
fname = "data/predictions/sgd_tuned_with_{0}.csv".format(loss_function_name)
results2csv(test_x.index, sgd_results, fname)

## Task 7 : Data Modelling with SGDClassifier (custom params) giving Logistic Regression

In [None]:
# By default the param loss='hinge'. When the loss function is 'hinge', it gives linear SVM.
# When the loss function is set to 'log', it gives Logistic Regression
# NOTE: scikit-learn >= 1.1 spells this loss 'log_loss'; the plain 'log' alias was removed in 1.3.
# For other loss functions see http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# sgdclf = SGDClassifier(random_state=42, max_iter=1000, alpha=0.7)
sgdclf = SGDClassifier(random_state=42, max_iter=5000, alpha=0.25, loss='log')
sgdclf.fit(train_xx,train_y)

# Measuring accuracy over 3 random 70/30 train/validation splits (ShuffleSplit, not K-fold)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(sgdclf, train_xx, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# [0.76767677 0.71043771 0.79124579] # with default params
# [0.79104478 0.79850746 0.82089552] # With SS preprocessing, 1k iterations
# [0.82462687 0.82089552 0.79104478] # With SS preprocessing, 1k iterations, alpha=0.7, default loss=hinge
# [0.80970149 0.82835821 0.82462687] # With SS preprocessing, 1k iterations, alpha=0.7, loss=log

# Make Predictions
sgd_results = sgdclf.predict(test_xx)

# Persist Data to CSV file for submission.
# The file is named after the configured `loss` parameter instead of the class name of
# the fitted `loss_function_` attribute: newer scikit-learn releases deprecate and then
# remove that attribute, while `loss` is always available. With loss='log' this is the
# string 'log', i.e. the same file name as before.
loss_function_name = sgdclf.loss
fname = "data/predictions/sgd_tuned_with_{0}.csv".format(loss_function_name)
results2csv(test_x.index, sgd_results, fname)

## Task 8 : Data Modelling with SVM Classifier - Linear

In [None]:
# Penalty parameter of the SVC error term (0.1 was also tried)
C = 1.0
# Variant tried with class weights:
# svmclf = svm.SVC(kernel='linear', C=C, random_state=42, class_weight={1:2})
svmclf = svm.SVC(C=C, kernel='linear', random_state=42).fit(train_xx, train_y)

# Accuracy over 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# [0.79104478 0.79477612 0.79104478]

# Predict the test set
svm_results = svmclf.predict(test_xx)

# Write the predictions to a CSV file named after the kernel in use
kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_{0}_kernel.csv".format(kernel_name)
results2csv(test_x.index, svm_results, fname)

## Task 9 : Data Modelling with SVM Classifier - Polynomial

In [None]:
# Penalty parameter of the SVC error term (1.0 was also tried)
C = 0.75
# Variant tried with class weights:
# svmclf = svm.SVC(kernel='poly', degree=3, C=C, random_state=42, class_weight={0:3, 1:5})
svmclf = svm.SVC(C=C, kernel='poly', degree=3, random_state=42).fit(train_xx, train_y)

# Accuracy over 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Scores recorded for the configurations tried:
# [0.81716418 0.82462687 0.81716418] # When degree=2, class_weight is default  and C=0.75
# [0.80223881 0.81343284 0.8358209 ] # When degree=3, class_weight is default  and C=0.75
# [0.80597015 0.80597015 0.80970149] # When degree=3, class_weight={0:3, 1:5} and C=0.75

# Predict the test set
svm_results = svmclf.predict(test_xx)

# Write the predictions to a CSV file named after the kernel in use
kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_{0}_kernel.csv".format(kernel_name)
results2csv(test_x.index, svm_results, fname)

## Task 10 : Data Modelling with SVM Classifier - Gaussian Radial Basis Function (RBF)

In [None]:
# Feature subsets tried for this model (the active one is the smallest):
# select_colns = ['Pclass','RoundedFare', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
# select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
select_colns = ['Pclass', 'Age', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# Re-run the pre-processing helper for the reduced feature set (train first, then test)
train_xx, test_xx = (transform(frame, select_colns) for frame in (train_x, test_x))

svmclf = svm.SVC(C=.5, kernel='rbf', gamma=.25, random_state=42).fit(train_xx, train_y)

# Accuracy over 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# [0.81343284 0.8358209  0.82462687] # when C=0.75, kernel='rbf', gamma=0.7

# Predict the test set
svm_results = svmclf.predict(test_xx)

# Write the predictions to a CSV file named after the kernel in use
kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_{0}_kernel.csv".format(kernel_name)
results2csv(test_x.index, svm_results, fname)

# Confusion matrix on the training data itself (cell output)
confusion_matrix(y_true=train_y, y_pred=svmclf.predict(train_xx))

### OBJECTIVE : Tune SVC with RBF Kernel for better results - An attempt.
**Notes**
* C and Gamma are the parameters for a nonlinear support vector machine (SVM) with a Gaussian radial basis function kernel.
* C is the parameter for the soft margin cost function, which controls the influence of each individual support vector; this process involves trading error penalty for stability.
* C controls the cost of misclassification on the training data.
* Small C makes the cost of misclassification low ("soft margin"), thus allowing more of them for the sake of wider "cushion".
* Large C makes the cost of misclassification high ("hard margin"), thus forcing the algorithm to explain the input data stricter and potentially overfit.
* The goal is to find the balance between "not too strict" and "not too loose". Cross-validation and resampling, along with grid search, are good ways to finding the best C.
* Gamma is the free parameter of the Gaussian radial basis function.
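* Large gamma leads to low bias and high variance models (each sample's influence is very local, so the model can overfit), and vice-versa: small gamma gives smoother, higher-bias and lower-variance models.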
* Intuitively, the gamma parameter defines how far the influence of a single training example reaches, with low values meaning ‘far’ and high values meaning ‘close’. 
* The gamma parameters can be seen as the inverse of the radius of influence of samples selected by the model as support vectors.

In [None]:
# Back to the full feature set for the tuning attempt
select_colns = ['Pclass','RoundedFare', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
train_xx, test_xx = (transform(frame, select_colns) for frame in (train_x, test_x))

# C: cost of mis-classification; gamma: RBF kernel coefficient;
# class_weight gives class 1 a weight of 1.25.
C = 2.5
svmclf = svm.SVC(
    C=C,
    kernel='rbf',
    gamma=0.05,
    class_weight={1:1.25},
    random_state=42,
).fit(train_xx, train_y)

# Accuracy over 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# [0.81343284 0.8358209  0.82462687] # when C=0.75, kernel='rbf', gamma=0.7
# [0.8358209  0.83955224 0.84701493] # when C=2.5, kernel='rbf', gamma=0.05 (improved my ranking in Kaggle by 2722 places)

# Predict the test set
svm_results = svmclf.predict(test_xx)

# Write the predictions to a CSV file named after the kernel in use
kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_{0}_kernel_tuned.csv".format(kernel_name)
results2csv(test_x.index, svm_results, fname)

# Confusion matrix on the training data itself (cell output)
confusion_matrix(y_true=train_y, y_pred=svmclf.predict(train_xx))

In [None]:
# C: cost of mis-classification; gamma: RBF kernel coefficient
C = 1.0
svmclf = svm.SVC(C=C, kernel='rbf', gamma=0.1, random_state=42).fit(train_xx, train_y)

# Accuracy over 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Scores recorded for the configurations tried:
# [0.81343284 0.8358209  0.82462687] # when C=0.75, gamma=0.7
# [0.8358209  0.83955224 0.84701493] # when C=2.5, gamma=0.05, class_weight={1:1.25} (improved my ranking in Kaggle by 2722 places)
# [0.80246914 0.80246914 0.77777778] # when C=100, gamma=0.3 # Scored 0.76076, equivalent to LR in Kaggle. Bad!
# [0.82462687 0.8358209  0.85074627] # when C=1.0, gamma=0.1

# Predict the test set
svm_results = svmclf.predict(test_xx)

# Write the predictions to a CSV file named after the kernel in use
kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_{0}_kernel_tuned2.csv".format(kernel_name)
results2csv(test_x.index, svm_results, fname)

# Confusion matrix on the training data itself (cell output)
confusion_matrix(y_true=train_y, y_pred=svmclf.predict(train_xx))

## Task 11 : Data Modelling with SVM using RBF Kernel and Grid Search for tuning params

In [None]:
# Candidate value ranges for C and gamma, built with the project helper `range_simple`.
# NOTE(review): `range_simple` presumably returns a list (it is concatenated with one
# below), so `tpl_c` is a list despite its name - confirm against util.math.
tpl_c = range_simple(0.1,2.1,0.1) + [3,5]
tpl_gamma = tuple(range_simple(0.01,1.01,0.01,decimal=2))
# Cell output: how many gamma candidates were generated
len(tpl_gamma)

In [None]:
# Single-step pipeline so the grid keys use the 'clf__<param>' naming scheme
pipeline = Pipeline([
    ('clf',svm.SVC(kernel='rbf', random_state=42))
])

# Alternative, coarser grid (kept for reference):
# params = {
#     'clf__C':(0.1,0.5,1,2,3,5,10),
#     'clf__gamma':(0.01,0.1,0.2,0.3,0.5,0.7,0.9,1.0)
# }

# The gamma grid starts at 0.05 instead of 0.00: gamma=0 is not a usable RBF setting
# (older scikit-learn releases raise ValueError for it, and mathematically it turns the
# kernel into a constant), so it only wasted fits or broke the search.
# NOTE(review): assumes `range_simple(start, stop, step)` begins at `start` - confirm.
params = {
    'clf__C':tuple(range_simple(0.1,2.1,0.1) + [3,5]),
    'clf__gamma':tuple(range_simple(0.05,1.01,0.05,decimal=2))
}

grid_svm_rbf = GridSearchCV(pipeline,
                           params,
                           n_jobs=-1, # Use all cores of the machine
                           cv=3,
                           verbose=1,
                           scoring='accuracy')

# Feature subsets tried for the search (the active one drops 'RoundedFare'):
# select_colns = ['Pclass','RoundedFare', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
train_xx = transform(train_x, select_colns)
test_xx = transform(test_x, select_colns)

grid_svm_rbf.fit(train_xx, train_y)
best_score = grid_svm_rbf.best_score_
print('Best Score : ', best_score)

# Print the winning value of every searched parameter
best_params = grid_svm_rbf.best_estimator_.get_params()
for k in sorted(params.keys()):
    print('\t{0} \t {1:.2f}'.format(k, best_params[k]))

# Accuracy over 3 random 70/30 splits. Passing the GridSearchCV object itself means the
# whole grid search is re-run inside every split (nested cross-validation), so this
# step is considerably more expensive than the single fit above.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(grid_svm_rbf, train_xx, train_y, cv=cv, scoring='accuracy')
print('cv_scores :',cv_scores)
# cv_scores : [0.80970149, 0.8358209 , 0.85074627] # when all cols selected
# cv_scores : [0.80223881 0.83208955 0.84701493] # When cols - RoundedFare, SibSp, Parch are removed
# cv_scores : [0.80223881 0.83208955 0.84701493] # when col - RoundedFare is removed
# cv_scores : [0.79850746 0.83208955 0.85074627]

# Make Predictions (GridSearchCV delegates to the refitted best estimator)
svm_results = grid_svm_rbf.predict(test_xx)

# Persist Data to CSV file for submission
fname = "data/predictions/svm_with_svm_rbf_optimized_by_grid_search.csv"
results2csv(test_x.index, svm_results, fname)

# Confusion matrix on the training data itself (cell output)
confusion_matrix(train_y, grid_svm_rbf.predict(train_xx))

## Task 12 : Visualizing SVM Classifier with Plotting

In [None]:
# Only two features can be drawn on a 2-d plane: keep 'Pclass' and 'Age' from the
# already pre-processed training frame.
select_colns = ['Pclass', 'Age']
train_xx = train_xx.loc[:,select_colns]

C= 1.0 # Cost of mis-classification
svmclf = svm.SVC(kernel='rbf', gamma=0.1, C=C, random_state=42) # gamma: RBF kernel coefficient
svmclf.fit(train_xx,train_y)

# The mesh grid and the scatter points must live in the SAME feature space the model
# was fitted on (`train_xx`). Building them from the raw `train_x` columns would
# evaluate the classifier on differently-scaled inputs whenever `transform` rescales
# the features, producing a meaningless decision surface.
xx, yy = make_meshgrid(train_xx.Pclass, train_xx.Age)

fig, ax = plt.subplots(1, 1, figsize=(6, 10)) # ax = subplot
plot_contours(ax, svmclf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(train_xx.Pclass, train_xx.Age, c=train_y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
# Axis values are in the units produced by `transform`, not necessarily the raw ones
ax.set_xlabel('P-Class')
ax.set_ylabel('Age')
# The plotted model is the hand-set SVC above, not a grid-search result
ax.set_title("SVM RBF Classifier : P-Class vs Age")

plt.show()

## Task 13 : Data Modelling with Decision Tree Classifier

In [None]:
# Decision tree with default hyper-parameters, fitted on the un-transformed features
dtclf = tree.DecisionTreeClassifier(random_state=42).fit(train_x, train_y)

# Accuracy over 3 random 70/30 train/validation splits
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=dtclf, X=train_x, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# [0.76492537 0.7761194  0.81716418] # With default params

# Predict the test set
dt_results = dtclf.predict(test_x)

# Write the predictions to a CSV file for submission
fname = "data/predictions/decision_tree.csv"
results2csv(test_x.index, dt_results, fname)

# Confusion matrix on the training data itself (cell output)
confusion_matrix(y_true=train_y, y_pred=dtclf.predict(train_x))

In [None]:
# Visualizing Decision Tree graph
# import sys
# sys.path.append('C:\graphviz-2.38\bin')
# !conda install graphviz

import graphviz

# Write the fitted tree to 'images/tree.dot'. Because `out_file` is a path,
# export_graphviz writes to disk and returns None, so `dot_data` is None here.
dot_data = tree.export_graphviz(dtclf,
                                out_file='images/tree.dot',
                                feature_names=train_x.columns.tolist(),
                                class_names=['Dead','Survived'],
                                rounded=True,
                                filled=True)

# Uncomment line below to write the output to the file in the disk
# ! dot -Tpng images/tree.dot -o images/tree.png

# Alternative: render the tree inline instead of writing a file.
# (Kept as comments rather than a bare triple-quoted string, which - being the last
# expression of the cell - was echoed verbatim as the cell's output.)
# With out_file=None the DOT source is returned as a string and referenced by
# dot_data, which is then used to show the tree graph.
# dot_data = tree.export_graphviz(dtclf,
#                                 out_file=None,
#                                 feature_names=train_x.columns.tolist(),
#                                 class_names=['Dead','Survived'],
# #                                 proportion=True,
#                                 rounded=True,
#                                 filled=True)
# graph = graphviz.Source(dot_data)
# graph.format = 'png'
# graph

## Task 14 : Data Modelling with Optimized Decision Tree Classifier tuned using Grid Search

In [None]:
# Template estimator for the search; GridSearchCV fits clones of it, so `odtclf`
# itself stays unfitted.
odtclf = tree.DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid to search exhaustively
params = dict(
    max_leaf_nodes=list(range(2,50)),
    min_samples_split=[2,3,4],
    min_samples_leaf=list(range(1,22,2)),
    criterion=['gini', 'entropy'],
)

# 3 random 70/30 splits, used both inside the search and for the outer scoring
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
gsclf = GridSearchCV(
    odtclf,
    params,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gsclf.fit(train_x,train_y)

# Scoring the search object re-runs the whole grid search inside every split
cv_scores = cross_val_score(estimator=gsclf, X=train_x, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# CV Scores : [0.79850746 0.80970149 0.84701493]

print('Best Score : ', gsclf.best_score_)
print(gsclf.best_estimator_)

# Show the winning value of every searched parameter
best_params = gsclf.best_estimator_.get_params()
for k in sorted(params):
    print('\t{0} \t {1}'.format(k, best_params[k]))

# Predict the test set with the refitted best estimator
test_y_pred = gsclf.predict(test_x)

# Write the predictions to a CSV file for submission
fname = "data/predictions/decision_tree_optimized_by_grid_search.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data itself (cell output)
confusion_matrix(y_true=train_y, y_pred=gsclf.predict(train_x))

In [None]:
# Persist Python Object : The Optimized Decision Tree Classifier, for use in advanced modelling.
# `odtclf` is only the unfitted, default-parameter template handed to GridSearchCV
# (the search fits clones of it); the tuned and fitted tree is `gsclf.best_estimator_`,
# so that is the object worth persisting.
pickle_in(gsclf.best_estimator_, 'pickle/optimized_dtree_clf.pkl')