# Objective : Get baseline Prediction Accuracy

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier

In [None]:
# Select seaborn's 'darkgrid' style for the plots in this notebook.
sns.set_style('darkgrid')

In [None]:
# Load the training data, using the PassengerId column as the row index.
x = pd.read_csv('data/train.csv', index_col='PassengerId')
# Preview the first rows as the cell output.
x.head()

In [None]:
from IPython.display import display, HTML

# Detach the target: pop() removes the 'Survived' column from x and returns it.
y = x.pop('Survived')
y.name = 'Survived'

# Show rendered previews of the features and of the target (as a one-column frame).
# display() renders frames as HTML tables directly; the longer equivalent is
# display(HTML(df.head().to_html())) -- printing to_html() alone shows raw markup.
display(x.head(), y.to_frame().head())


In [None]:
# Frequency of each distinct Ticket value.
x.Ticket.value_counts()

In [None]:
# Frequency of each distinct Cabin value (value_counts() skips missing entries by default).
x.Cabin.value_counts()

In [None]:
# Frequency of each distinct Fare value.
x.Fare.value_counts()

In [None]:
# Discard the columns assumed to carry no usable signal for this baseline.
x.drop(labels=['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
# Preview the reduced feature frame.
x.head()

In [None]:
# One-hot encode the remaining categorical columns. drop_first=True omits the
# first level of each encoded column so the indicators are not redundant.
# NOTE(review): which columns get encoded depends on the CSV's dtypes -- confirm
# against the preview below.
x = pd.get_dummies(x, drop_first=True)
x.head()

In [None]:
# Number of missing values in each column.
x.isnull().sum()

In [None]:
# Inspect the rows whose Age is missing.
x[x['Age'].isna()]

In [None]:
# Split first, then impute. Filling NaNs with means computed on the full frame
# would let the held-out test rows influence the training features (data leakage).
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.1, stratify=y, random_state=42)

# Column means are taken from the training rows only and reused for the test
# rows, the same way an unseen sample would have to be treated.
# Note: `x` itself is left un-imputed; use x_train / x_test from here on.
train_means = x_train.mean()
x_train = x_train.fillna(train_means)
x_test = x_test.fillna(train_means)
print(x_train.shape,'\n',x_test.shape)

## Dummy Classification

In [None]:
# Majority-class baseline: always predicts the most frequent label seen in fit().
# The strategy is spelled out because DummyClassifier's default changed between
# scikit-learn releases ('stratified' before 0.24, 'prior' since), so relying on
# it gives version-dependent and, for 'stratified', randomly varying scores.
clf = DummyClassifier(strategy='most_frequent')
clf.fit(x_train, y_train)

# Measuring Accuracy with K-fold Cross-Validation
cv_scores = cross_val_score(clf, x_train, y_train, cv=3, scoring='accuracy')
print('CV  Scores :',cv_scores)
# NOTE: scores recorded from an earlier run were [0.58955224 0.53183521 0.47744361];
# they presumably came from a randomised strategy and will not match this output.

# Predicting on the training rows is harmless here: the dummy model ignores the
# input features, so its predictions are the same for any rows.
y_pred = clf.predict(x_train)
print('Confusion Matrix:\n',confusion_matrix(y_train,y_pred))
print('Classification Report:\n',classification_report(y_train, y_pred))
# Report the mean across folds; the maximum would present the luckiest fold as
# the expected accuracy.
print('Quick Accuracy :', round(cv_scores.mean(), 2))

## Quick Classification

In [None]:
# Modelling with SGDClassifier (Stochastic Gradient Descent Classifier)
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SGD is sensitive to feature scale, so the features are standardised first.
# Keeping the scaler inside the pipeline means it is re-fit on the training
# part of every CV fold instead of seeing the validation rows.
clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(random_state=42, max_iter=100),
)
clf.fit(x_train,y_train)

# Measuring Accuracy with K-fold Cross-Validation
cv_scores = cross_val_score(clf, x_train, y_train, cv=3, scoring='accuracy')
print('CV  Scores :',cv_scores)
# NOTE: an earlier run without scaling recorded [0.64552239, 0.41573034, 0.78571429];
# re-run to refresh the numbers.

# Out-of-fold predictions: every row is predicted by a model that did not see it
# during fitting, so the matrix/report below are not measured on training data.
y_pred = cross_val_predict(clf, x_train, y_train, cv=3)
print('Confusion Matrix:\n',confusion_matrix(y_train,y_pred))
print('Classification Report:\n',classification_report(y_train, y_pred))
# Report the mean across folds; the maximum would present the best single fold
# as the expected accuracy.
print('Quick Accuracy :', round(cv_scores.mean(), 2))

## Next Mission
Can we do better than this baseline accuracy score of 78.57% (the best single-fold CV score recorded above)? Let's try!