# OBJECTIVE : Beat the baseline accuracy of ~78.57% (See A_*.ipynb)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's default theme to the plots drawn from here on.
sns.set()

## Task 1 : Feature Engineering Train and Test Data

In [None]:
# Load the processed training CSV, using the passenger id as the row index.
train_csv = 'data/train_processed_1.csv'
train = pd.read_csv(train_csv, index_col='PassengerId')
train.head()

In [None]:
# Load the processed test CSV, using the passenger id as the row index.
test_csv = 'data/test_processed_1.csv'
test = pd.read_csv(test_csv, index_col='PassengerId')
test.head()

In [None]:
# Report (rows, columns) of both frames before any encoding is applied.
for label, frame in (('Train Shape :', train), ('Test Shape :', test)):
    print(label, frame.shape)

In [None]:
# One-hot encode the categorical columns of the training frame.
# drop_first=True drops the first level of each encoded column.
train = pd.get_dummies(train, drop_first=True)
print('Train Shape :', train.shape)
train.head()

In [None]:
# One-hot encode the categorical columns of the test frame.
test = pd.get_dummies(data=test, drop_first=True)

# Train and test are encoded independently, so a category level that is missing
# from one of them (or a different "first" level being dropped) would leave the
# two frames with different columns. Align the test frame to the training
# feature columns (everything except the target), in the same order; a dummy
# column that is absent from the test data is filled with 0.
# Assumes `train` has already been one-hot encoded by the previous cell.
feature_colns = [col for col in train.columns if col != 'Survived']
test = test.reindex(columns=feature_colns, fill_value=0)

print('Test Shape :', test.shape)
test.head()

## Task 2 : Split Datasets as x and y

In [None]:
# `train_x` is the same object as `train`; popping the target column removes it
# from the frame in place and returns it as the label vector.
train_x = train
train_y = train_x.pop('Survived')
for label, part in (('train_x shape :', train_x), ('train_y shape :', train_y)):
    print(label, part.shape)

In [None]:
# The test set has no target column, so the whole frame is used as the
# feature matrix (test_x refers to the same object as test).
test_x = test # Test-set has no target columns
print('test_x shape :', test_x.shape)

## Task 3 : Data Modelling with Logistic Regression Classifier (default params)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline logistic regression with default hyper-parameters, trained on every
# available feature column. fit() returns the estimator itself.
lrclf = LogisticRegression(random_state=42).fit(train_x, train_y)

# 3-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(estimator=lrclf, X=train_x, y=train_y, scoring='accuracy', cv=3)
print('CV Scores :', cv_scores)  # [0.79124579 0.8047138  0.79124579]

# Predictions for the test set, consumed by the submission cell below.
results = lrclf.predict(test_x)

In [None]:
# Submission frame: a single 'Survived' column, indexed by the test set's own
# index (which carries the index name along).
df = pd.DataFrame({'Survived': results}, index=test_x.index)
df.head()

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the first prediction file is written.
os.makedirs('data/predictions', exist_ok=True)
df.to_csv('data/predictions/logistic_regression.csv')

## Task 4 : Data Modelling with SGDClassifier (default params)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# SGDClassifier defaults to loss='hinge', which makes this a linear SVM
# trained with stochastic gradient descent. fit() returns the estimator itself.
sgdclf = SGDClassifier(random_state=42, max_iter=100).fit(train_x, train_y)

# 3-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(estimator=sgdclf, X=train_x, y=train_y, scoring='accuracy', cv=3)
print('CV Scores :', cv_scores)  # [0.76767677 0.71043771 0.79124579]

# Predictions for the test set, consumed by the submission cell below.
sgd_results = sgdclf.predict(test_x)

In [None]:
# Submission frame: a single 'Survived' column, indexed by the test set's own
# index (which carries the index name along), written out as CSV.
df = pd.DataFrame({'Survived': sgd_results}, index=test_x.index)
df.to_csv('data/predictions/sgd.csv')
df.head()

## Task 5 : Data Modelling with Logistic Regression Classifier (custom params)

In [None]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature subset used by the tuned model.
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# Final model: the scaler is fitted on the full training set and the same
# statistics are then applied to the test set.
sc = StandardScaler()
sc.fit(train_x[select_colns])
train_xx = sc.transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

lrclf = LogisticRegression(random_state=42, max_iter=300, C=0.3, solver='sag', n_jobs=3)  # C=0.3 made the real difference here
lrclf.fit(train_xx, train_y)

# Validation: cross-validate a scaler+model pipeline on the *unscaled* features
# so that the scaler is re-fitted on each training split only. Scoring the
# pre-scaled `train_xx` would leak the held-out rows' mean/variance into every
# split. clone() gives an unfitted copy with the same hyper-parameters, so the
# fitted `lrclf` above is left untouched.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_model = make_pipeline(StandardScaler(), clone(lrclf))
cv_scores = cross_val_score(cv_model, train_x[select_colns], train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Scores recorded from earlier runs (all taken with the scaler fitted on the
# full training set before cross-validation, so expect small differences now):
# [0.79124579 0.8047138  0.79124579] # Default params and without StandardScaler preprocessing
# [0.78787879 0.79124579 0.8047138 ] # With just StandardScaler preprocessing
# [0.79124579 0.8047138  0.8013468 ] # With StandardScaler preprocessing and Custom Params
# [0.80970149 0.79850746 0.80223881] # With StandardScaler preprocessing, Custom Params and ShuffleSplit cv-strategy
results = lrclf.predict(test_xx)

# Submission frame: predictions indexed by the test set's index
# (its name and values, i.e. 'PassengerId').
df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': results,
})
df.set_index(test_x.index.name, inplace=True)
df.to_csv('data/predictions/logistic_regression_tuned.csv')

In [None]:
# Pair each learned coefficient of the tuned logistic regression with the name
# of the feature column it belongs to.
coeffs = pd.Series(data=lrclf.coef_.flatten(),index=select_colns)
coeffs
# NOTE(review): the remark below mentions 'RoundedFare', which is not part of
# select_colns, so it presumably dates from an earlier run with a different
# feature list -- re-check it against the current output.
# Gosh, Pclass and Sex seem to have got least importance and RoundedFare got highest importance :facepalm:

## Task 6 : Data Modelling with SGDClassifier (custom params) giving Linear SVM

In [None]:
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature subset used by the tuned model.
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy

# Final model: the scaler is fitted on the full training set and the same
# statistics are then applied to the test set.
sc = StandardScaler()
sc.fit(train_x[select_colns])
train_xx = sc.transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# By default the param loss='hinge'. When the loss function is 'hinge', it gives linear SVM.
sgdclf = SGDClassifier(random_state=42, max_iter=1000, alpha=0.7)
sgdclf.fit(train_xx, train_y)

# Measuring accuracy with ShuffleSplit cross-validation (3 random 70/30 splits).
# The scaler is part of the cross-validated pipeline so that it is re-fitted on
# each training split only; scoring the pre-scaled `train_xx` would leak the
# held-out rows' statistics into every split. clone() gives an unfitted copy
# with the same hyper-parameters, leaving the fitted `sgdclf` untouched.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_model = make_pipeline(StandardScaler(), clone(sgdclf))
cv_scores = cross_val_score(cv_model, train_x[select_colns], train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Scores recorded from earlier runs (taken with the scaler fitted on the full
# training set before cross-validation, so expect small differences now):
# [0.76767677 0.71043771 0.79124579] # with default params
# [0.79104478 0.79850746 0.82089552] # With SS preprocessing, 1k iterations
# [0.82462687 0.82089552 0.79104478] # With SS preprocessing, 1k iterations, alpha=0.7, default loss=hinge
sgd_results = sgdclf.predict(test_xx)

df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': sgd_results,
})
df.set_index(test_x.index.name, inplace=True)

# Name the output file after the configured loss. The fitted `loss_function_`
# attribute used previously was deprecated in scikit-learn 1.4 and removed in
# 1.6; the `loss` parameter yields the same 'hinge' suffix on every version.
loss_function_name = sgdclf.loss
fname = "data/predictions/sgd_tuned_with_{0}.csv".format(loss_function_name)
df.to_csv(fname)
df.head()

## Task 7 : Data Modelling with SGDClassifier (custom params) giving Logistic Regression

In [None]:
import sklearn
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature subset used by the tuned model.
select_colns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_male', 'Embarked_Q', 'Embarked_S']  # Removing 'RoundedFare' yields better accuracy

# Final model: the scaler is fitted on the full training set and the same
# statistics are then applied to the test set.
sc = StandardScaler()
sc.fit(train_x[select_colns])
train_xx = sc.transform(train_x[select_colns])
test_xx = sc.transform(test_x[select_colns])

# By default the param loss='hinge'. When the loss function is 'hinge', it gives linear SVM.
# With the logistic loss it gives Logistic Regression. That loss was spelled
# 'log' originally, renamed to 'log_loss' in scikit-learn 1.1, and the old
# spelling was removed in 1.3 -- pick whichever the installed version accepts.
# For other loss functions see http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
log_loss_name = 'log_loss' if sk_version >= (1, 1) else 'log'
sgdclf = SGDClassifier(random_state=42, max_iter=5000, alpha=0.25, loss=log_loss_name)
sgdclf.fit(train_xx, train_y)

# Measuring accuracy with ShuffleSplit cross-validation (3 random 70/30 splits).
# The scaler is part of the cross-validated pipeline so that it is re-fitted on
# each training split only; scoring the pre-scaled `train_xx` would leak the
# held-out rows' statistics into every split. clone() gives an unfitted copy
# with the same hyper-parameters, leaving the fitted `sgdclf` untouched.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_model = make_pipeline(StandardScaler(), clone(sgdclf))
cv_scores = cross_val_score(cv_model, train_x[select_colns], train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# Scores recorded from earlier runs (taken with the scaler fitted on the full
# training set before cross-validation, so expect small differences now):
# [0.76767677 0.71043771 0.79124579] # with default params
# [0.79104478 0.79850746 0.82089552] # With SS preprocessing, 1k iterations
# [0.82462687 0.82089552 0.79104478] # With SS preprocessing, 1k iterations, alpha=0.7, default loss=hinge
# [0.80970149 0.82835821 0.82462687] # With SS preprocessing, 1k iterations, alpha=0.7, loss=log
# NOTE(review): the last entry describes 1k iterations / alpha=0.7, which does
# not match the max_iter=5000 / alpha=0.25 configured above -- confirm which
# settings produced it.

sgd_results = sgdclf.predict(test_xx)

df = pd.DataFrame({
    test_x.index.name: test_x.index,
    'Survived': sgd_results,
})
df.set_index(test_x.index.name, inplace=True)

# Keep the 'log' file-name suffix that this cell has always produced. It used
# to be derived from the fitted `loss_function_` attribute, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
loss_function_name = 'log'
fname = "data/predictions/sgd_tuned_with_{0}.csv".format(loss_function_name)
df.to_csv(fname)
df.head()