# OBJECTIVE : Beat the baseline accuracy of ~78.57% (See A_*.ipynb)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's default theme so every matplotlib figure drawn below uses it.
sns.set()

## Task 1 : Feature Engineering Train and Test Data

In [None]:
# Load the pre-processed training split; PassengerId becomes the row index
# rather than a feature column.
train = pd.read_csv(
    filepath_or_buffer='data/train_processed_1.csv',
    index_col='PassengerId',
)
train.head()

In [None]:
# Load the pre-processed test split, indexed by PassengerId like the
# training split.
test = pd.read_csv(
    filepath_or_buffer='data/test_processed_1.csv',
    index_col='PassengerId',
)
test.head()

In [None]:
# Report (rows, columns) of both splits before any encoding is applied.
shape_report = (
    ('Train Shape :', train.shape),
    ('Test Shape :', test.shape),
)
for caption, dims in shape_report:
    print(caption, dims)

In [None]:
# One-hot encode the categorical columns of the training split.
# drop_first=True drops the first level of each encoded column, so k levels
# become k-1 indicator columns.
train = pd.get_dummies(train, drop_first=True)
encoded_train_shape = train.shape
print('Train Shape :', encoded_train_shape)
train.head()

In [None]:
# One-hot encode the categorical columns of the test split with the same
# settings used for the training split (first level of each column dropped).
# NOTE(review): this encoding only sees the test data, so the resulting
# columns follow the category levels present in `test` -- verify they agree
# with the training columns.
test = pd.get_dummies(test, drop_first=True)
encoded_test_shape = test.shape
print('Test Shape :', encoded_test_shape)
test.head()

## Task 2 : Split Datasets into x and y

In [None]:
# Separate the target from the features. pop() removes 'Survived' from
# `train` in place, so `train_x` (the same object as `train`) is left
# holding only the feature columns.
train_x, train_y = train, train.pop('Survived')
for caption, dims in (
    ('train_x shape :', train_x.shape),
    ('train_y shape :', train_y.shape),
):
    print(caption, dims)

In [None]:
# The test set has no target column, so every column is a feature.
# Train and test were one-hot encoded independently, so a category level that
# occurs in only one of them produces different dummy columns. Re-index the
# test frame onto the training feature columns (same names, same order):
# dummy columns absent from the test data are added as all-zero, and columns
# the model was never trained on are dropped. Without this, predict() can fail
# or silently use misaligned features.
test_x = test.reindex(columns=train_x.columns, fill_value=0)
print('test_x shape :', test_x.shape)

## Task 3 : Data Modelling with Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Fit a logistic-regression classifier on the full training set; the seed is
# fixed so reruns are repeatable. fit() returns the estimator itself.
lrclf = LogisticRegression(random_state=42).fit(train_x, train_y)

# 3-fold cross-validated accuracy on the training data. cross_val_score fits
# fresh clones of the estimator per fold, so the fit above does not leak in.
cv_scores = cross_val_score(
    estimator=lrclf,
    X=train_x,
    y=train_y,
    scoring='accuracy',
    cv=3,
)
print('CV Scores :', cv_scores)  # previously recorded: [0.79124579 0.8047138  0.79124579]

# Predict survival for the test passengers with the fully fitted model.
results = lrclf.predict(test_x)

In [None]:
# Assemble the submission table: one 'Survived' prediction per test
# passenger, indexed under the same index name as test_x.
id_label = test_x.index.name
df = pd.DataFrame({id_label: test_x.index, 'Survived': results}).set_index(id_label)
df.head()

In [None]:
from pathlib import Path

# Write the logistic-regression predictions to disk. to_csv() does not create
# missing parent directories, so make sure the output folder exists first;
# otherwise a fresh checkout without data/predictions/ fails with OSError.
predictions_path = Path('data/predictions/logistic_regression.csv')
predictions_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(predictions_path)

## Task 4 : Data Modelling with SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Fit a linear classifier trained with stochastic gradient descent, capped at
# 100 passes over the data and seeded for repeatability.
# NOTE(review): no feature scaling is applied in this notebook section. SGD is
# sensitive to feature scale, so if the processed CSVs are not already
# standardised, scaling may stabilise the fold scores -- confirm.
sgdclf = SGDClassifier(max_iter=100, random_state=42).fit(train_x, train_y)

# Measuring accuracy with 3-fold cross-validation on the training data.
cv_scores = cross_val_score(
    estimator=sgdclf,
    X=train_x,
    y=train_y,
    scoring='accuracy',
    cv=3,
)
print('CV Scores :', cv_scores)  # previously recorded: [0.76767677 0.71043771 0.79124579]

# Predict survival for the test passengers with the fully fitted model.
sgd_results = sgdclf.predict(test_x)

In [None]:
from pathlib import Path

# Assemble the SGD submission table: one 'Survived' prediction per test
# passenger, indexed under the same index name as test_x.
id_label = test_x.index.name
df = pd.DataFrame({id_label: test_x.index, 'Survived': sgd_results}).set_index(id_label)

# to_csv() does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout without
# data/predictions/ fails with OSError.
sgd_predictions_path = Path('data/predictions/sgd.csv')
sgd_predictions_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(sgd_predictions_path)
df.head()