# Predicting whether a passenger survived the Titanic disaster

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### Importing datasets

In [None]:
# Read the raw training and test CSV files into DataFrames
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

### Merging Train and test dataset

In [None]:
# shift the Survived column to the last
train1 = train.drop('Survived',1)
train1['Survived'] = train.Survived
train = train1
# Add a feature isTrainSet to differentiate between train data and test data
train['isTrainSet'] = True
test['Survived'] = np.nan
test['isTrainSet'] = False
# Merging the two datasets
df = pd.concat([train,test],ignore_index=True)

In [None]:
df.info()

## Dealing with Missing data

### Embarked Column

In [None]:
# Selecting the missing rows
df[df['Embarked'].isnull()]      # Both the rows traveled in 1st class and their fare was 80

In [None]:
# Checking median Fare for each embarkment point
df.groupby(['Pclass','Embarked']).Fare.median()   
# Since the Median Fare of people who traveled in 1st class and embarked from C is close to 80,
# Replacing the missing values by C

In [None]:
df['Embarked'].fillna('C',inplace=True)

In [None]:
df.info()  # All missing values of Embarked column are filled

### Fare Column

In [None]:
# Selecting the row with missing Fare value
df[df['Fare'].isnull()]

In [None]:
# Checking the median Fare of people who traveled from 3rd class and Embarked from S
df.loc[(df['Pclass'] == 3) & (df['Embarked'] == 'S')]['Fare'].median()

In [None]:
# Replacing the missing value with the median i.e 8.05
df['Fare'].fillna(8.05,inplace=True)

In [None]:
df.info()

## Age Column

In [None]:
df[df['Age'].isnull()]

In [None]:
# Checking the age distribution
df['Age'].hist()   # Age has some extreme values which may affect the mean

In [None]:
# Checking median age of different Gender
df.groupby(['Sex']).Age.median()    # Very Similar median

In [None]:
# Creating a title attribute and replacing the missing values by median age of each title
def getTitle(x):
    """Return the lower-cased title found in a passenger name.

    The title is read from the second comma-separated field of ``x``, cut at
    that field's first period and stripped of surrounding whitespace, e.g.
    ``'Braund, Mr. Owen Harris'`` gives ``'mr'``.
    """
    return x.split(',', 2)[1].split('.', 1)[0].strip().lower()

In [None]:
df['Name'].map(lambda x: getTitle(x))

In [None]:
df['Name'].map(lambda x: getTitle(x)).unique()

In [None]:
# Lower-cased raw title -> grouped title category. Built once at definition
# time instead of on every call.
_TITLE_GROUPS = {
    'mr': 'Mr', 'mrs': 'Mrs', 'miss': 'Miss',
    'master': 'Master', 'don': 'Sir', 'rev': 'Sir',
    'dr': 'Officer', 'mme': 'Mrs', 'ms': 'Mrs',
    'major': 'Officer', 'lady': 'Lady', 'sir': 'Sir',
    'mlle': 'Miss', 'col': 'Officer',
    'capt': 'Officer', 'the countess': 'Lady',
    'jonkheer': 'Sir', 'dona': 'Lady',
}


def getTitle(x, default='Other'):
    """Return the grouped title category for a passenger name.

    The raw title is read from the second comma-separated field of ``x``, cut
    at that field's first period, stripped and lower-cased, then mapped to one
    of the categories in ``_TITLE_GROUPS``.

    Parameters
    ----------
    x : str
        Passenger name such as ``'Braund, Mr. Owen Harris'``.
    default : str, optional
        Category returned when the name contains no comma or its title is
        not in the mapping. Previously these cases raised IndexError and
        KeyError respectively.

    Returns
    -------
    str
        The grouped title, or ``default`` when it cannot be determined.
    """
    fields = x.split(',')
    if len(fields) < 2:
        # No "surname, title. given names" structure to read a title from
        return default
    title = fields[1].split('.')[0].strip().lower()
    return _TITLE_GROUPS.get(title, default)

In [None]:
df['Title'] = df['Name'].map(lambda x: getTitle(x))

In [None]:
df

In [None]:
df[df['Age'].notnull()].boxplot('Age','Title')

In [None]:
median_age = df.groupby('Title').Age.transform('median')
df['Age'].fillna(median_age,inplace=True)

In [None]:
df.info()

### Feature Engineering

In [None]:
df['Fare_bin'] = pd.qcut(df['Fare'],4,labels=['very_low','low','high','very_high'])

In [None]:
pd.crosstab(df[df['Survived'].notnull()]['Survived'],df[df['Survived'].notnull()]['Fare_bin'])
# People who paid more Survived

In [None]:
df['AgeState'] = np.where(df['Age'] >=18, 'Adult','Child')

In [None]:
pd.crosstab(df[df['Survived'].notnull()]['Survived'],df[df['Survived'].notnull()]['AgeState'])
# More Children Survived

In [None]:
df['Family_size'] = df['Parch'] + df['SibSp'] + 1

In [None]:
pd.crosstab(df[df['Survived'].notnull()]['Survived'],df[df['Survived'].notnull()]['Family_size'])
# Family with less number of people survived more

In [None]:
df['isMother'] = np.where(((df['Sex'] == 'female') & (df['Parch'] > 0) & (df['Age'] > 18) & 
                                    (df['Title'] != 'Miss')),1,0)

In [None]:
pd.crosstab(df[df['Survived'].notnull()]['Survived'],df[df['Survived'].notnull()]['isMother'])
# Women with children survived first

In [None]:
df.Cabin.unique()

In [None]:
df.loc[df['Cabin'] == 'T','Cabin'] = np.nan

In [None]:
def getDeck(x):
    """Return the deck letter of a cabin value, or ``'Z'`` when it is unknown.

    Parameters
    ----------
    x : object
        A cabin value such as ``'C85'``; may be ``None``/NaN.

    Returns
    -------
    str
        The upper-cased first character of ``str(x)``, or ``'Z'`` when ``x``
        is null or an empty string.

    Notes
    -----
    A plain conditional is used instead of ``np.where``: on scalar input
    ``np.where`` returns a 0-d ndarray rather than a ``str`` and evaluates
    ``str(x)[0]`` even for values that end up being discarded.
    """
    if pd.isnull(x):
        return 'Z'
    cabin = str(x)
    # An empty cabin string carries no deck letter
    return cabin[0].upper() if cabin else 'Z'
df['Deck'] = df['Cabin'].map(lambda x: getDeck(x))

In [None]:
pd.crosstab(df[df['Survived'].notnull()]['Survived'],df[df['Survived'].notnull()]['Deck'])
# People in Deck/Cabin B,C,D,E,F

### Feature Encoding

In [None]:
df['isMale'] = np.where(df['Sex'] == 'male',1,0)

In [None]:
df['isAdult'] = np.where(df['AgeState'] == 'Adult',1,0)

In [None]:
df = pd.get_dummies(df,columns=['Deck','Pclass','Title','Fare_bin','Embarked'])

In [None]:
df.info()

## Dropping Raw and Redundant Columns

In [None]:
df.drop(['Name','Cabin','Ticket','Sex','SibSp','Parch'],axis=1,inplace=True)

In [None]:
# Reordering the columns
columns = [i for i in df.columns if i != 'Survived']
columns = columns + ['Survived']
df = df[columns]

In [None]:
df.drop(['AgeState'],axis=1,inplace=True)
df.info()

## Saving the processed data into a csv file

In [None]:
# Get the training data
train = df.loc[df.isTrainSet == True]
# remove the isTrainSet Feature
train.drop('isTrainSet',axis=1,inplace=True)

In [None]:
# Save the processed train data to a csv file
train.to_csv('clean_train_data.csv',index=False)

In [None]:
# Get the test data
test = df.loc[df.isTrainSet == False]
# Remove the isTrainSet feature
test.drop(['Survived','isTrainSet'],axis=1,inplace=True)
# Save the processed test data to a csv file
test.to_csv('clean_test_data.csv',index=False)

## Machine Learning Models

### Importing the processed dataset

In [None]:
train = pd.read_csv('clean_train_data.csv')
test = pd.read_csv('clean_test_data.csv')

In [None]:
#train.drop('Unnamed: 0',axis=1,inplace=True)
#test.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
train.info()

In [None]:
test.info()

##### Creating Dependent and Independent Variables

In [None]:
X = train.loc[:,:'Embarked_S']

In [None]:
y = train.loc[:,'Survived':]

#### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state fixes the shuffle
split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split

### Logistic Regression

#### Training The Model

In [None]:
from sklearn.linear_model import LogisticRegression

LogisticRegressor = LogisticRegression(C=1.1,penalty='l2',random_state=1)

In [None]:
LogisticRegressor.fit(X_train,y_train)

In [None]:
y_pred = LogisticRegressor.predict(X_test)

#### Checking Accuracy of Logistic Regression model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute all three metrics for the held-out split, then print them
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Confusion Matrix :-\n')
print(matrix)
print('Classification Report :-\n')
print(report)
print('Accuracy Score: {}'.format(accuracy))

## Naive Bayes Model

#### Training the model

In [None]:
from sklearn.naive_bayes import BernoulliNB

NaiveBayesClassifier = BernoulliNB(alpha=1.0)

In [None]:
NaiveBayesClassifier.fit(X_train,y_train)

In [None]:
y_pred = NaiveBayesClassifier.predict(X_test)

#### Checking Accuracy of Naive Bayes model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Print each tabular metric under its heading, then the overall accuracy
sections = (
    ('Confusion Matrix :-\n', confusion_matrix(y_test, y_pred)),
    ('Classification Report :-\n', classification_report(y_test, y_pred)),
)
for heading, result in sections:
    print(heading)
    print(result)
print('Accuracy Score: {}'.format(accuracy_score(y_test, y_pred)))

### Support Vector Model

In [None]:
from sklearn.svm import SVC

SupportVectorClassifier = SVC(kernel='linear',C=4.0,degree=3)

In [None]:
SupportVectorClassifier.fit(X_train,y_train)

In [None]:
y_pred = SupportVectorClassifier.predict(X_test)

#### Checking Accuracy of Support Vector Machine

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the held-out predictions, then print the results in order
svc_matrix = confusion_matrix(y_test, y_pred)
svc_report = classification_report(y_test, y_pred)
svc_accuracy = accuracy_score(y_test, y_pred)

print('Confusion Matrix :-\n')
print(svc_matrix)
print('Classification Report :-\n')
print(svc_report)
print('Accuracy Score: {}'.format(svc_accuracy))

## Saving the Predictions in a CSV File

In [None]:
# Using the most accurate Model for prediction
final_pred = LogisticRegressor.predict(test)

In [None]:
test

In [None]:
submission_df = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':final_pred})
submission_df['Survived'] = submission_df.Survived.astype('int64')
submission_df.to_csv('Submission.csv',index=False)

In [None]:
df = pd.read_csv('Submission.csv')

In [None]:
submission_df