# OBJECTIVE : Beat the baseline accuracy of ~78.57% (See A_*.ipynb)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn later in the notebook.
sns.set()

## Task 1 : Feature Engineering Train and Test Data

In [None]:
# Read the processed training CSV, using the passenger id column as the index,
# then preview the first rows.
train = pd.read_csv(
    filepath_or_buffer='data/train_processed_1.csv',
    index_col='PassengerId',
)
train.head(n=5)

In [None]:
# How many rows fall into each value of the target column.
train['Survived'].value_counts()

In [None]:
# Read the processed test CSV, using the passenger id column as the index,
# then preview the first rows.
test = pd.read_csv(
    filepath_or_buffer='data/test_processed_1.csv',
    index_col='PassengerId',
)
test.head(n=5)

In [None]:
# Report the (rows, columns) shape of both frames.
print('Train Shape : {}'.format(train.shape))
print('Test Shape : {}'.format(test.shape))

In [None]:
# Dummy-encode the training frame; drop_first=True drops the first level of each
# encoded column.
train = pd.get_dummies(train, drop_first=True)
print('Train Shape : {}'.format(train.shape))
train.head(n=5)

In [None]:
# Dummy-encode the test frame with the same settings as the training frame.
# NOTE(review): train and test are encoded independently — confirm both end up
# with the same dummy columns before a model fitted on one is applied to the other.
test = pd.get_dummies(test, drop_first=True)
print('Test Shape : {}'.format(test.shape))
test.head(n=5)

## Task 2 : Split Datasets as x and y

In [None]:
# Separate the target from the features WITHOUT mutating `train`.
# The previous `train.pop('Survived')` removed the column in place, so running
# this cell a second time raised KeyError and `train_x` was just an alias of the
# mutated frame. Selecting/dropping builds new objects, so the cell is safe to re-run.
train_y = train['Survived']
train_x = train.drop('Survived', axis=1)
print('train_x shape :', train_x.shape)
print('train_y shape :', train_y.shape)

In [None]:
# The test set has no target column, so the whole frame is the feature matrix.
test_x = test
print('test_x shape : {}'.format(test_x.shape))

## Task 3 : Data Modelling with Logistic Regression Classifier (default params)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline model: logistic regression with default hyper-parameters, fitted on
# every column of train_x.
lrclf = LogisticRegression(random_state=42).fit(train_x, train_y)

# Accuracy over 3 cross-validation folds.
cv_scores = cross_val_score(estimator=lrclf, X=train_x, y=train_y, scoring='accuracy', cv=3)
print('CV Scores :', cv_scores)  # [0.79124579 0.8047138  0.79124579]

# Predictions for the test passengers.
results = lrclf.predict(test_x)

In [None]:
# Build the submission frame: one 'Survived' prediction per test passenger,
# indexed by the same id column that indexes test_x.
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': results,
}).set_index(test_x.index.name)
df.head(n=5)

In [None]:
# Persist the logistic-regression predictions.
df.to_csv(path_or_buf='data/predictions/logistic_regression.csv')

## Task 4 : Data Modelling with SGDClassifier (default params)

In [None]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier is left on its default loss ('hinge'), which yields a linear SVM
# trained by stochastic gradient descent.
sgdclf = SGDClassifier(max_iter=100, random_state=42).fit(train_x, train_y)

# Accuracy over 3 cross-validation folds.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(estimator=sgdclf, X=train_x, y=train_y, scoring='accuracy', cv=3)
print('CV Scores :', cv_scores)  # [0.76767677 0.71043771 0.79124579]

# Predictions for the test passengers.
sgd_results = sgdclf.predict(test_x)

In [None]:
# Submission frame for the default SGD model, written to disk and previewed.
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': sgd_results,
}).set_index(test_x.index.name)
df.to_csv(path_or_buf='data/predictions/sgd.csv')
df.head(n=5)

## Task 5 : Data Modelling with Logistic Regression Classifier (custom params)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit

# Standardise the selected columns: the scaler is fitted on the training rows and
# the same fitted scaler is then applied to the test rows.
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']
sc = StandardScaler()
train_xx = sc.fit_transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# C=0.3 made the real difference here.
lrclf = LogisticRegression(C=0.3, solver='sag', max_iter=300, n_jobs=3, random_state=42)
lrclf.fit(train_xx, train_y)

# Accuracy over 3 shuffled splits, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=lrclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Score history:
# [0.79124579 0.8047138  0.79124579] # Default params and without StandardScaler preprocessing
# [0.78787879 0.79124579 0.8047138 ] # With just StandardScaler preprocessing
# [0.79124579 0.8047138  0.8013468 ] # With StandardScaler preprocessing and Custom Params
# [0.80970149 0.79850746 0.80223881] # With StandardScaler preprocessing, Custom Params and ShuffleSplit cv-strategy
results = lrclf.predict(test_xx)

# Submission frame keyed by test_x's index name (which is 'PassengerId') and its values.
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': results,
}).set_index(test_x.index.name)
df.to_csv(path_or_buf='data/predictions/logistic_regression_tuned.csv')

In [None]:
# Fitted coefficients of the tuned logistic regression, labelled by feature name.
# NOTE(review): the earlier remark here said Pclass and Sex got the least
# importance and 'RoundedFare' the highest, but 'RoundedFare' is not in
# select_colns and so cannot appear in this Series — that remark presumably
# predates the current feature list; re-read the values below before relying on it.
coeffs = pd.Series(lrclf.coef_.flatten(), index=select_colns)
coeffs

## Task 6 : Data Modelling with SGDClassifier (custom params) giving Linear SVM

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns: fit the scaler on the training rows, then
# apply the same fitted scaler to both train and test.
sc = StandardScaler()
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S'] # Removing 'RoundedFare' yields better accuracy
sc.fit(train_x[select_colns])
train_xx = sc.transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# By default the param loss='hinge'. When the loss function is 'hinge', it gives linear SVM.
sgdclf = SGDClassifier(random_state=42, max_iter=1000, alpha=0.7)
sgdclf.fit(train_xx,train_y)

# Measuring accuracy over 3 shuffled splits, each holding out 30% of the rows
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(sgdclf, train_xx, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# [0.76767677 0.71043771 0.79124579] # with default params
# [0.79104478 0.79850746 0.82089552] # With SS preprocessing, 1k iterations
# [0.82462687 0.82089552 0.79104478] # With SS preprocessing, 1k iterations, alpha=0.7, default loss=hinge
sgd_results = sgdclf.predict(test_xx)

df = pd.DataFrame({
    test_x.index.name : test_x.index,
    'Survived' : sgd_results
    })
df.set_index(test_x.index.name, inplace=True)
# Name the output file after the configured loss. `sgdclf.loss` is the
# constructor parameter ('hinge' here) and produces the same file name as the
# former `sgdclf.loss_function_.__class__.__name__.lower()` lookup, without
# relying on the `loss_function_` attribute, which scikit-learn deprecated in
# 1.4 and removed in 1.6.
loss_function_name = sgdclf.loss
fname = "data/predictions/sgd_tuned_with_{0}.csv".format(loss_function_name)
df.to_csv(fname)
df.head()

## Task 7 : Data Modelling with SGDClassifier (custom params) giving Logistic Regression

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns: fit the scaler on the training rows, then
# apply the same fitted scaler to both train and test.
sc = StandardScaler()
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S'] # Removing 'RoundedFare' yields better accuracy
sc.fit(train_x[select_colns])
train_xx = sc.transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# By default the param loss='hinge'. When the loss function is 'hinge', it gives linear SVM.
# When the loss function is set to 'log', it gives Logistic Regression
# For other loss functions see http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed
# the 'log' spelling; change the literal below when running on a newer release.
# sgdclf = SGDClassifier(random_state=42, max_iter=1000, alpha=0.7)
sgdclf = SGDClassifier(random_state=42, max_iter=5000, alpha=0.25, loss='log')
sgdclf.fit(train_xx,train_y)

# Measuring accuracy over 3 shuffled splits, each holding out 30% of the rows
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(sgdclf, train_xx, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# [0.76767677 0.71043771 0.79124579] # with default params
# [0.79104478 0.79850746 0.82089552] # With SS preprocessing, 1k iterations
# [0.82462687 0.82089552 0.79104478] # With SS preprocessing, 1k iterations, alpha=0.7, default loss=hinge
# [0.80970149 0.82835821 0.82462687] # With SS preprocessing, 1k iterations, alpha=0.7, loss=log

sgd_results = sgdclf.predict(test_xx)

df = pd.DataFrame({
    test_x.index.name : test_x.index,
    'Survived' : sgd_results
    })
df.set_index(test_x.index.name, inplace=True)
# Name the output file after the configured loss. `sgdclf.loss` is the
# constructor parameter ('log' here) and produces the same file name as the
# former `sgdclf.loss_function_.__class__.__name__.lower()` lookup, without
# relying on the `loss_function_` attribute, which scikit-learn deprecated in
# 1.4 and removed in 1.6.
loss_function_name = sgdclf.loss
fname = "data/predictions/sgd_tuned_with_{0}.csv".format(loss_function_name)
df.to_csv(fname)
df.head()

## Task 8 : Data Modelling with SVM Classifier - Linear

In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns (scaler fitted on the training rows only).
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy
sc = StandardScaler()
train_xx = sc.fit_transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# Linear-kernel SVC. C=0.1 was also tried; so was class_weight={1:2}.
C = 1.0
svmclf = svm.SVC(C=C, kernel='linear', random_state=42)
svmclf.fit(train_xx, train_y)

# Accuracy over 3 shuffled splits, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# [0.79104478 0.79477612 0.79104478]

svm_results = svmclf.predict(test_xx)

# Submission frame, saved under a file name that embeds the kernel.
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': svm_results,
}).set_index(test_x.index.name)

kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_%s_kernel.csv" % kernel_name
df.to_csv(fname)
df.head(n=5)

## Task 9 : Data Modelling with SVM Classifier - Polynomial

In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns (scaler fitted on the training rows only).
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy
sc = StandardScaler()
train_xx = sc.fit_transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# Degree-3 polynomial-kernel SVC. C=1.0 was also tried; so was class_weight={0:3, 1:5}.
C = 0.75
svmclf = svm.SVC(C=C, kernel='poly', degree=3, random_state=42)
svmclf.fit(train_xx, train_y)

# Accuracy over 3 shuffled splits, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Score history:
# [0.81716418 0.82462687 0.81716418] # When degree=2, class_weight is default  and C=0.75
# [0.80223881 0.81343284 0.8358209 ] # When degree=3, class_weight is default  and C=0.75
# [0.80597015 0.80597015 0.80970149] # When degree=3, class_weight={0:3, 1:5} and C=0.75
svm_results = svmclf.predict(test_xx)

# Submission frame, saved under a file name that embeds the kernel.
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': svm_results,
}).set_index(test_x.index.name)

kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_%s_kernel.csv" % kernel_name
df.to_csv(fname)
df.head(n=5)

## Task 10 : Data Modelling with SVM Classifier - Gaussian Radial Basis Function (RBF)

In [181]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns (scaler fitted on the training rows only).
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy
sc = StandardScaler()
train_xx = sc.fit_transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# RBF-kernel SVC.
C = 0.75
svmclf = svm.SVC(C=C, kernel='rbf', gamma=0.7, random_state=42)
svmclf.fit(train_xx, train_y)

# Accuracy over 3 shuffled splits, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# [0.81343284 0.8358209  0.82462687] # when C=0.75, kernel='rbf', gamma=0.7

# Submission frame, saved under a file name that embeds the kernel.
svm_results = svmclf.predict(test_xx)
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': svm_results,
}).set_index(test_x.index.name)

kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_%s_kernel.csv" % kernel_name
df.to_csv(fname)

# Confusion matrix of the fitted model on the rows it was trained on.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=train_y, y_pred=svmclf.predict(train_xx))

CV Scores : [0.81343284 0.8358209  0.82462687]


array([[525,  24],
       [104, 238]], dtype=int64)

In [209]:
#
# OBJECTIVE : Tune SVC with RBF Kernel for better results - An attempt.
# Notes: 
# C and Gamma are the parameters for a nonlinear support vector machine (SVM) with a Gaussian radial basis function kernel.
# C is the parameter for the soft margin cost function, which controls the influence of each individual support vector; this process involves trading error penalty for stability.
# C controls the cost of misclassification on the training data.
# Small C makes the cost of misclassification low ("soft margin"), thus allowing more misclassifications for the sake of a wider "cushion".
# Large C makes the cost of misclassification high ("hard margin"), thus forcing the algorithm to fit the input data more strictly and potentially overfit.
# The goal is to find the balance between "not too strict" and "not too loose". Cross-validation and resampling, along with grid search, are good ways to finding the best C.
# Gamma is the free parameter of the Gaussian radial basis function.
# Large gamma leads to low-bias, high-variance models (each training example's influence is very local, so the model can overfit); small gamma does the opposite.
# Intuitively, the gamma parameter defines how far the influence of a single training example reaches, 
# with low values meaning ‘far’ and high values meaning ‘close’. 
# The gamma parameters can be seen as the inverse of the radius of influence of samples selected by the model as support vectors.
#
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns (scaler fitted on the training rows only).
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy
sc = StandardScaler()
train_xx = sc.fit_transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# RBF-kernel SVC: C is the cost of mis-classification, gamma is the RBF kernel
# coefficient, and class 1 is given a weight of 1.25.
C = 2.5
svmclf = svm.SVC(C=C, kernel='rbf', gamma=0.05, class_weight={1: 1.25}, random_state=42)
svmclf.fit(train_xx, train_y)

# Accuracy over 3 shuffled splits, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Score history:
# [0.81343284 0.8358209  0.82462687] # when C=0.75, kernel='rbf', gamma=0.7
# [0.8358209  0.83955224 0.84701493] # when C=2.5, kernel='rbf', gamma=0.05 (improved my ranking in Kaggle by 2722 places)

# Submission frame, saved under a file name that embeds the kernel.
svm_results = svmclf.predict(test_xx)
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': svm_results,
}).set_index(test_x.index.name)

kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_%s_kernel_tuned.csv" % kernel_name
df.to_csv(fname)

# Confusion matrix of the fitted model on the rows it was trained on.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=train_y, y_pred=svmclf.predict(train_xx))

CV Scores : [0.8358209  0.83955224 0.84701493]


array([[493,  56],
       [ 92, 250]], dtype=int64)

In [265]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Standardise the selected columns (scaler fitted on the training rows only).
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy
sc = StandardScaler()
train_xx = sc.fit_transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# RBF-kernel SVC: C is the cost of mis-classification, gamma is the RBF kernel coefficient.
C = 1.0
svmclf = svm.SVC(C=C, kernel='rbf', gamma=0.1, random_state=42)
svmclf.fit(train_xx, train_y)

# Accuracy over 3 shuffled splits, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(estimator=svmclf, X=train_xx, y=train_y, scoring='accuracy', cv=cv)
print('CV Scores :', cv_scores)
# Score history:
# [0.81343284 0.8358209  0.82462687] # when C=0.75, gamma=0.7
# [0.8358209  0.83955224 0.84701493] # when C=2.5, gamma=0.05, class_weight={1:1.25} (improved my ranking in Kaggle by 2722 places)
# [0.80246914 0.80246914 0.77777778] # when C=100, gamma=0.3 # Scored 0.76076, equivalent to LR in Kaggle. Bad!
# [0.82462687 0.8358209  0.85074627] # when C=1.0, gamma=0.1

# Submission frame, saved under a file name that embeds the kernel.
svm_results = svmclf.predict(test_xx)
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': svm_results,
}).set_index(test_x.index.name)

kernel_name = svmclf.kernel
fname = "data/predictions/svm_with_%s_kernel_tuned2.csv" % kernel_name
df.to_csv(fname)

# Confusion matrix of the fitted model on the rows it was trained on.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=train_y, y_pred=svmclf.predict(train_xx))

CV Scores : [0.82462687 0.8358209  0.85074627]


array([[507,  42],
       [100, 242]], dtype=int64)

## Task 11 : Data Modelling with Grid Search

In [284]:
def round_single(n):
    """Round ``n`` to one decimal place with ``np.round``."""
    return np.round(n, decimals=1)

def round_double(n):
    """Round ``n`` to two decimal places with ``np.round``."""
    return np.round(n, decimals=2)

def range_simple(start, end, step, decimal=1):
    """Return ``np.arange(start, end, step)`` as a list of rounded values.

    Rounding removes the floating-point noise that ``np.arange`` accumulates
    with fractional steps (so the values print as e.g. 0.3 rather than
    0.30000000000000004).

    Parameters
    ----------
    start, end, step
        Passed straight to ``np.arange``; ``end`` is exclusive.
    decimal : int, default 1
        Number of decimal places to round to. Previously any value other than
        1 was silently treated as 2; the requested precision is now honoured.
        Results for ``decimal=1`` and ``decimal=2`` are unchanged.

    Returns
    -------
    list
        The rounded values as NumPy scalars, in ascending ``arange`` order.
    """
    # Round the whole array in one call; list() keeps NumPy scalar elements,
    # matching what the element-by-element version produced.
    return list(np.round(np.arange(start, end, step), decimals=decimal))

# Candidate hyper-parameter values built with range_simple:
# C values from range_simple(0.1, 2.1, 0.1) plus the extra values 3 and 5,
# gamma values from range_simple(0.01, 1.01, 0.01) rounded to two decimals.
tpl_c = range_simple(0.1, 2.1, 0.1, decimal=1) + [3, 5]
tpl_gamma = tuple(range_simple(0.01, 1.01, 0.01, decimal=2))
len(tpl_gamma)

100

In [285]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
# StandardScaler is used below; import it here (as the other modelling cells do)
# so this cell does not rely on an earlier cell having imported it.
from sklearn.preprocessing import StandardScaler

# Standardise the selected columns: fit the scaler on the training rows, then
# apply the same fitted scaler to both train and test.
# NOTE(review): the scaler is fitted on all training rows before GridSearchCV
# makes its internal folds, so the validation folds contribute to the scaling
# statistics. Later cells pass the pre-scaled train_xx/test_xx to this grid, so
# any move of the scaler into the pipeline must be coordinated with them.
sc = StandardScaler()
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S'] # Removing 'RoundedFare' yields better accuracy
sc.fit(train_x[select_colns])
train_xx = sc.transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# Single-step pipeline; the 'clf' step name is what the 'clf__*' grid keys refer to.
pipeline = Pipeline([
    ('clf',svm.SVC(kernel='rbf', gamma=0.1, C=1.0, random_state=42))
])

# Coarser grid tried earlier (kept for reference):
#   'clf__C':     (0.1,0.5,1,2,3,5,10)
#   'clf__gamma': (0.01,0.1,0.2,0.3,0.5,0.7,0.9,1.0)

# Fine grid built with range_simple, plus C values 3 and 5.
params = {
    'clf__C':tuple(range_simple(0.1,2.1,0.1) + [3,5]),
    'clf__gamma':tuple(range_simple(0.01,1.01,0.01,decimal=2))
}

# Exhaustive search over the grid, scored by 3-fold accuracy, using all cores.
grid_svm_rbf = GridSearchCV(pipeline,
                           params,
                           n_jobs=-1,
                           cv=3,
                           verbose=1,
                           scoring='accuracy')

grid_svm_rbf.fit(train_xx, train_y)
best_score = grid_svm_rbf.best_score_
print('Best Score : ', best_score)

# Print the winning value of every searched parameter.
best_params = grid_svm_rbf.best_estimator_.get_params()
for k in sorted(params.keys()):
    print('\t{0} \t {1:.2f}'.format(k, best_params[k]))

Fitting 3 folds for each of 2200 candidates, totalling 6600 fits


[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 935 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 2435 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 4535 tasks      | elapsed:  1.0min


Best Score :  0.8361391694725028
	clf__C 	 5.00
	clf__gamma 	 0.07


[Parallel(n_jobs=-1)]: Done 6600 out of 6600 | elapsed:  1.5min finished


In [288]:
# Predict the test passengers with the fitted grid search and build the
# submission frame.
svm_results = grid_svm_rbf.predict(test_xx)
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': svm_results,
}).set_index(test_x.index.name)

fname = "data/predictions/svm_with_svm_rbf_optimized_by_grid_search.csv"
df.to_csv(path_or_buf=fname)

# Confusion matrix of the grid-searched model on the rows it was trained on.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=train_y, y_pred=grid_svm_rbf.predict(train_xx))

Fitting 3 folds for each of 2200 candidates, totalling 6600 fits


[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 2494 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 6494 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done 6600 out of 6600 | elapsed:   53.8s finished


Fitting 3 folds for each of 2200 candidates, totalling 6600 fits


[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 1559 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 4059 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 6600 out of 6600 | elapsed:  1.0min finished


Fitting 3 folds for each of 2200 candidates, totalling 6600 fits


[Parallel(n_jobs=-1)]: Done  63 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 2163 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 5663 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 6600 out of 6600 | elapsed:  1.0min finished


CV Scores : [0.80223881 0.83208955 0.84701493]


array([[524,  25],
       [115, 227]], dtype=int64)