# Titanic - Machine Learning from Disaster 

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt 

## Loading data 

In [2]:
raw_data = pd.read_csv("train.csv")
raw_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data = raw_data.drop(['PassengerId','Name','Ticket'], axis = 1)


In [4]:
data.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,891.0,891.0,891,714.0,891.0,891.0,891.0,204,889
unique,,,2,,,,,147,3
top,,,male,,,,,G6,S
freq,,,577,,,,,4,644
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,


In [5]:
# Survived is cateorical Varibale - We will go for losistic Regression

In [6]:
data['Pclass'].unique() # P-class is categorical

array([3, 1, 2], dtype=int64)

In [7]:
# Checking Null values 
data.isnull().sum()
# age, Cabin and embarked have null values

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [8]:
# Cabin have too much missing values - probably Cabin info is not captured for all of the customers
data = data.drop(['Cabin'], axis = 1)

In [9]:
# Check those null observation 
data[data['Age'].isnull()]

# Any idea why could possibly be age missing?


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,0,3,male,,0,0,8.4583,Q
17,1,2,male,,0,0,13.0000,S
19,1,3,female,,0,0,7.2250,C
26,0,3,male,,0,0,7.2250,C
28,1,3,female,,0,0,7.8792,Q
...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C
863,0,3,female,,8,2,69.5500,S
868,0,3,male,,0,0,9.5000,S
878,0,3,male,,0,0,7.8958,S


In [10]:
# only removing the records where age was null
data_wo_age_missing_values = data.dropna(subset=['Age'])
data_wo_age_missing_values.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,714.0,714.0,714,714.0,714.0,714.0,714.0,712
unique,,,2,,,,,3
top,,,male,,,,,S
freq,,,453,,,,,554
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,34.694514,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,52.91893,
min,0.0,1.0,,0.42,0.0,0.0,0.0,
25%,0.0,1.0,,20.125,0.0,0.0,8.05,
50%,0.0,2.0,,28.0,0.0,0.0,15.7417,
75%,1.0,3.0,,38.0,1.0,1.0,33.375,


In [11]:
# Again Checking Missing valuse 
data_wo_age_missing_values.isnull().sum()
#Embarked seems to have missing values 

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [12]:
data_wo_age_missing_values[data_wo_age_missing_values['Embarked'].isnull()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,female,38.0,0,0,80.0,
829,1,1,female,62.0,0,0,80.0,


In [13]:
# lets drop them
data_wo_missing_values = data_wo_age_missing_values.dropna(subset=['Embarked'])
data_wo_missing_values.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,712.0,712.0,712,712.0,712.0,712.0,712.0,712
unique,,,2,,,,,3
top,,,male,,,,,S
freq,,,453,,,,,554
mean,0.404494,2.240169,,29.642093,0.514045,0.432584,34.567251,
std,0.491139,0.836854,,14.492933,0.930692,0.854181,52.938648,
min,0.0,1.0,,0.42,0.0,0.0,0.0,
25%,0.0,1.0,,20.0,0.0,0.0,8.05,
50%,0.0,2.0,,28.0,0.0,0.0,15.64585,
75%,1.0,3.0,,38.0,1.0,1.0,33.0,


In [14]:
inputs = data_wo_missing_values.drop(['Survived','Fare'], axis=1)
inputs['FamilyMembersCount'] = inputs['SibSp'] + inputs['Parch']
inputs = inputs.drop(['SibSp','Parch'], axis=1)
output = data_wo_missing_values['Survived']

In [15]:
import statsmodels.api as sm
inputs = sm.add_constant(inputs)
inputs.head(1)

  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,const,Pclass,Sex,Age,Embarked,FamilyMembersCount
0,1.0,3,male,22.0,S,1


In [16]:
# Creating Dummy varibales
# for P class
inputs = pd.get_dummies(inputs,  columns=['Pclass','Embarked','Sex'])
#droping one dummies for each category to avoid multicollinearity
inputs = inputs.drop(['Pclass_1','Embarked_C','Sex_male'], axis =1)
inputs

Unnamed: 0,const,Age,FamilyMembersCount,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Sex_female
0,1.0,22.0,1,0,1,0,1,0
1,1.0,38.0,1,0,0,0,0,1
2,1.0,26.0,0,0,1,0,1,1
3,1.0,35.0,1,0,0,0,1,1
4,1.0,35.0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...
885,1.0,39.0,5,0,1,1,0,1
886,1.0,27.0,0,1,0,0,1,0
887,1.0,19.0,0,0,0,0,1,1
889,1.0,26.0,0,0,0,0,0,0


# Checking Assumptions

###  Checking Multi Colinearity

In [36]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [61]:
vif = pd.DataFrame()
for i in range(inputs.shape[1]):
    vif["VIF"] = [variance_inflation_factor(inputs.values, i) for i in range(inputs.shape[1])]

vif['Features'] = inputs.columns
vif
# All on them are under 5. Good!

Unnamed: 0,VIF,Features
0,17.65906,const
1,1.302284,Age
2,1.151598,FamilyMembersCount
3,1.708314,Pclass_2
4,1.969321,Pclass_3
5,1.23979,Embarked_Q
6,1.333787,Embarked_S
7,1.101978,Sex_female


# Scaling data 

In [83]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(inputs)

input_scaled  = pd.DataFrame(scaler.transform(inputs))
input_scaled.columns

RangeIndex(start=0, stop=8, step=1)

In [84]:
logit_reg = sm.Logit(output, inputs)
logit_reg

<statsmodels.discrete.discrete_model.Logit at 0x153a38a98c8>

In [90]:
reg_results = logit_reg.fit()

Optimization terminated successfully.
         Current function value: 0.445985
         Iterations 6


In [91]:
reg_results.summary()

0,1,2,3
Dep. Variable:,Survived,No. Observations:,712.0
Model:,Logit,Df Residuals:,704.0
Method:,MLE,Df Model:,7.0
Date:,"Sun, 29 Dec 2019",Pseudo R-squ.:,0.3391
Time:,23:17:13,Log-Likelihood:,-317.54
converged:,True,LL-Null:,-480.45
Covariance Type:,nonrobust,LLR p-value:,1.836e-66

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.8646,0.419,4.451,0.000,1.044,2.686
Age,-0.0421,0.008,-5.122,0.000,-0.058,-0.026
FamilyMembersCount,-0.1981,0.074,-2.662,0.008,-0.344,-0.052
Pclass_2,-1.2375,0.296,-4.183,0.000,-1.817,-0.658
Pclass_3,-2.4614,0.294,-8.383,0.000,-3.037,-1.886
Embarked_Q,-0.8691,0.596,-1.458,0.145,-2.037,0.299
Embarked_S,-0.4562,0.270,-1.693,0.090,-0.984,0.072
Sex_female,2.6610,0.221,12.032,0.000,2.228,3.094


### Checking Accuracy using confusion matrix

In [93]:
reg_results.pred_table()

array([[363.,  61.],
       [ 82., 206.]])

In [94]:
cm_df = pd.DataFrame(reg_results.pred_table())
cm_df.columns = ['Predicted 0','Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,363.0,61.0
Actual 1,82.0,206.0


### Using Accuracy Formula

In [97]:
# Create an array (so it is easier to calculate the accuracy)
cm = np.array(cm_df)
# Calculate the accuracy of the model
accuracy_train = (cm[0,0]+cm[1,1])/cm.sum()
accuracy_train

0.7991573033707865

# Predicting Values

Look for the way to not repeat all the preocess as train data and automate the process for new data. Can create OOP design to call functions to apply on new data.