<h1 style='text-align:center'>Titanic - Machine Learning from Disaster.</h1>
<br>

![](http://media.giphy.com/media/1Nk9bIidJVTy0/giphy.gif)

Objective: To predict whether a passenger survived or not, based on the other features.

In [None]:
### Importing Packages

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training data only; the separate hold-out file is read further down the notebook.
train=pd.read_csv('data/train.csv')
train.head()

In [None]:
train.info()

In [None]:
# Share of missing values per column, as a whole-number percentage
missing_counts = train.isnull().sum()
(missing_counts / len(train) * 100).round(0)

## Quiz 1) - Mean/Median/Mode for Missing Value Imputation - Cabin???

In [None]:
# Keep only the columns used for modelling. Cabin goes as well: more than 75% of its values are missing.
unused_columns = ['Ticket', 'Name', 'Embarked', 'SibSp', 'Parch', 'Cabin']
train = train.drop(unused_columns, axis=1).set_index('PassengerId')
train.head()

In [None]:
train.info()

In [None]:
# Class balance of the target, as whole-number percentages
survived_share = train['Survived'].value_counts(normalize=True)
(survived_share * 100).round(0)

In [None]:
train.describe()

## Quiz 2) - Mean or Median or Mode for Missing Value Imputation - Age ??

In [None]:
# Median of the Age column, kept in a variable for later use
age_median = train['Age'].median()
age_median

In [None]:
### Missing Value Imputation
# NOTE(review): `sklearn.preprocessing.Imputer` only exists in older scikit-learn
# releases (newer ones ship `sklearn.impute.SimpleImputer` instead) — confirm the
# version this notebook is pinned to.
from sklearn.preprocessing import Imputer
# axis=1 imputes along rows, which is why Age is reshaped into a single row below.
impute=Imputer(missing_values='NaN',strategy='median',axis=1)
new=impute.fit_transform(train.Age.values.reshape(1,-1))
# Transpose the single-row result back into a column before writing it to the frame.
train['Age']=new.T

In [None]:
train.info()

In [None]:
train.Fare.describe()

## Quiz 3) - Does Fare have any missing values?

## Quiz 4) - Then how do we fill the missing data for Fare?

In [None]:
# Median fare within each passenger class
fare_by_class = train.groupby('Pclass')['Fare']
median_fare = fare_by_class.median()
median_fare

In [None]:
# A fare of 0 is treated as "not recorded": turn it into NaN so it can be imputed.
# The result is assigned back to the column instead of calling an in-place method
# on `train.Fare`; with pandas Copy-on-Write that chained in-place form operates on
# a temporary object and leaves `train` unchanged.
train['Fare'] = train['Fare'].replace(0, np.nan)
train.info()

In [None]:
# Rows whose fare is now missing, together with their passenger class
fare_missing = train['Fare'].isnull()
train.loc[fare_missing, ['Pclass', 'Fare']]

In [None]:
# Keep the pre-imputation fares so the filled values can be compared afterwards
train['old_fare'] = train['Fare']

In [None]:
# Fill each missing fare with the median fare of the passenger's own class
class_median_fare = train.groupby('Pclass')['Fare'].transform('median')
train['Fare'] = train['Fare'].fillna(class_median_fare)

In [None]:
# Compare the rows that had no fare with the values that were filled in
was_missing = train['old_fare'].isnull()
train.loc[was_missing, ['Pclass', 'old_fare', 'Fare']]

In [None]:
pd.crosstab(train.Sex, train.Survived)

In [None]:
# Anyone not aged 18 or over is relabelled 'Child'; adults keep their recorded sex,
# so the Sex column ends up with three levels.
is_adult = train['Age'] >= 18
train['Sex'] = train['Sex'].where(is_adult, 'Child')
train[['Age', 'Sex']]

In [None]:
pd.crosstab(train.Sex, train.Survived)

In [None]:
# Box plot of Fare to eyeball its spread and any extreme values
train.boxplot(column=['Fare'])
plt.show()

#### Detecting Outliers

In [None]:
# Whisker limits for Fare: 1.5 * IQR beyond the lower and upper quartiles
q25, q75 = np.percentile(train.Fare, [25, 75])
iqr = q75 - q25
whisker_reach = 1.5 * iqr
upper_whisker = q75 + whisker_reach
lower_whisker = q25 - whisker_reach

In [None]:
def check(x, ul, ll):
    """Return `x` when it lies within [ll, ul] (inclusive), otherwise None.

    x  : value to test
    ul : upper limit
    ll : lower limit
    """
    within_limits = ll <= x <= ul
    return x if within_limits else None

In [None]:
# `check` yields None for fares outside the whiskers, so the nulls mark the outliers
fare_in_range = train.Fare.apply(check, args=(upper_whisker, lower_whisker))
outlier_fares = train.loc[fare_in_range.isnull(), 'Fare']
print("Percentage of Outliers in Fare:", len(outlier_fares) / len(train) * 100)

# Quiz 5) How do we treat the outliers here?

In [None]:
train.Fare.describe()

In [None]:
train.groupby('Pclass')['Fare'].mean()

In [None]:
def transform_fare(x, cap=None):
    """Cap a single fare value at an upper limit.

    Parameters
    ----------
    x : number
        The fare to cap.
    cap : number, optional
        Upper limit. When omitted, the notebook-level `upper_whisker`
        (computed in the outlier-detection cell) is used, which is the
        original behaviour.

    Returns
    -------
    `cap` when `x` is greater than it, otherwise `x` unchanged. A NaN
    compares as not-greater and is therefore passed through.
    """
    if cap is None:
        # Fall back to the whisker computed earlier in the notebook.
        cap = upper_whisker
    if x > cap:
        return cap
    return x

# Cap every fare at the upper whisker, then look at the resulting distribution.
# The function is passed directly (no lambda wrapper needed), and the title now
# describes what is actually plotted.
train['Fare'] = train['Fare'].apply(transform_fare)
train['Fare'].plot.hist(bins=100, title='Frequency distribution of Fare after capping at the upper whisker');

In [None]:
train.Fare.describe()

In [None]:
# The comparison column has served its purpose; remove it before modelling
train = train.drop('old_fare', axis=1)

#### Heat Map

In [None]:
# Correlate numeric columns only: `Sex` still holds strings at this point, and
# recent pandas versions raise on non-numeric columns instead of skipping them.
fig, ax = plt.subplots(figsize = (15,8))
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap="PiYG", ax=ax)
ax.set_title("Correlations Among Features", fontsize = 20);

In [None]:
# One-hot encode the text column(s) (here: Sex) into indicator columns
train = pd.get_dummies(train)

In [None]:
train.head()

In [None]:
# Survived sits in the first column, so it is split off positionally as the target;
# every column after it is a feature.
target_pos = 0
y = train.iloc[:, target_pos].values
X = train.iloc[:, target_pos + 1:].values

In [None]:
# Hold out part of the data for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 45)

In [None]:
# Report the dimensions of each split
for caption, split in (("Shape of X_train", X_train),
                       ("Shape of X_test", X_test),
                       ("Shape of y_train", y_train),
                       ("Shape of y_test", y_test)):
    print(caption, split.shape)

In [None]:
# Fit a logistic regression with default settings, then predict the held-out rows
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

In [None]:
print("Accuracy of the model:", accuracy_score(y_test, y_pred))

In [None]:
### Classification report 
print ("Classification Report : \n\n", classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

# Quiz 6) If we want to improve the recall score, what do we have to do?

In [None]:
y_prob = log_reg.predict_proba(X_test)

In [None]:
y_prob

In [None]:
# Probability of the positive class (column 1 of predict_proba).
# It is taken straight from the model rather than by slicing the previous `y_prob`,
# so re-running this cell does not fail on an array that is already 1-D.
y_prob = log_reg.predict_proba(X_test)[:, 1].copy()
y_prob

In [None]:
# Predict class 1 only when its probability reaches the 0.8 cut-off, otherwise class 0
y_pred_new = (y_prob >= 0.8).astype(int)

In [None]:
y_pred_new

In [None]:
y_pred.shape

In [None]:
# Do the thresholded predictions match the default ones?
# `np.alltrue` was removed in NumPy 2.0; `np.array_equal` performs the same
# all-elements-equal comparison.
np.array_equal(y_pred, y_pred_new)

In [None]:
print("Accuracy of the model:", accuracy_score(y_test, y_pred_new))

In [None]:
confusion_matrix(y_test, y_pred_new)

In [None]:
### Classification report 
print ("Classification Report : \n\n", classification_report(y_test, y_pred_new))

## Applying the Model to the Real-World Test Data

In [None]:
# Load the labelled hold-out file, indexed by passenger id like the training frame
real = pd.read_csv("data/test_real.csv", index_col='PassengerId')
real.head()

In [None]:
real.info()

In [None]:
real.Survived.value_counts(normalize=True)

In [None]:
train.Survived.value_counts(normalize=True)

### Preparing the 'real data' as per our model

In [None]:
real.columns

In [None]:
# Drop the same columns that were removed from the training frame.
dropped_columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
real = real.drop(dropped_columns, axis=1)
# Preview a few rows instead of rendering the whole frame.
real.head()

In [None]:
# Fill missing ages with the median learned from the TRAINING data (`age_median`).
# The row-wise imputer recomputes its statistic from whatever row it is handed, so
# it would have filled the hold-out ages with the hold-out set's own median.
real['Age'] = real['Age'].fillna(age_median)

In [None]:
real.info()

In [None]:
real.Fare.max()

In [None]:
# Mirror the training-time Fare treatment so the model sees consistently prepared data:
# zero fares count as missing, gaps are filled with the training per-class median
# (`median_fare`), and only then is the outlier cap applied.
# NOTE(review): `median_fare` was computed before the zero fares in train were blanked,
# so it is a close stand-in for the medians used to fill train, not an exact match.
real['Fare'] = real['Fare'].replace(0, np.nan)
real['Fare'] = real['Fare'].fillna(real['Pclass'].map(median_fare))
real['Fare'] = real['Fare'].apply(transform_fare)

In [None]:
real.Fare.max()

In [None]:
# Same relabelling as for the training frame: anyone not aged 18 or over becomes 'Child'
is_adult_real = real['Age'] >= 18
real['Sex'] = real['Sex'].where(is_adult_real, 'Child')
real.head()

In [None]:
real.Sex.value_counts()

In [None]:
# One-hot encode the text column(s) of the hold-out frame, as was done for train
real = pd.get_dummies(real)
real.head()

In [None]:
real.columns

In [None]:
# Arrange the hold-out columns in the order the model was trained on
model_columns = ['Survived', 'Pclass', 'Age', 'Fare',
                 'Sex_Child', 'Sex_female', 'Sex_male']
real = real.loc[:, model_columns]

In [None]:
# Compare the column names as plain lists: element-wise `==` on two Index objects
# raises when their lengths differ, and `np.alltrue` no longer exists in NumPy 2.0.
assert list(train.columns) == list(real.columns), "Hooray!!! Big Issue, check the column names"

In [None]:
# Same positional split as for train: first column is the target, the rest are features
real_target_pos = 0
y_real = real.iloc[:, real_target_pos].values
X_real = real.iloc[:, real_target_pos + 1:].values

## Quiz-7 How to build the model now???

In [None]:
y_hat = log_reg.predict(X_real)

In [None]:
accuracy_score(y_real, y_hat)

In [None]:
confusion_matrix(y_real, y_hat)

In [None]:
print(classification_report(y_real, y_hat))