In [44]:
import pandas as pd
import matplotlib as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [45]:
# Load the Titanic passenger data from the CSV next to the notebook and preview it.
titanic_csv = "titanic.csv"
df_titanic = pd.read_csv(titanic_csv)
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [46]:
# (number of rows, number of columns) of the loaded frame
df_titanic.shape

(891, 12)

### Handling Null values in the dataset

In [47]:
# Count the missing values in every column.
df_titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [48]:
# 'Cabin' is mostly empty because a cabin number is only recorded for
# passengers who had a cabin. Instead of imputing it, the column is turned into
# a numeric flag (0 = no cabin, 1 = has cabin). First, inspect the null mask.
df_titanic['Cabin'].isna()

0       True
1      False
2       True
3      False
4       True
       ...  
886     True
887    False
888     True
889    False
890     True
Name: Cabin, Length: 891, dtype: bool

In [49]:
# Binary flag: 1 when a cabin number is recorded, 0 when it is missing.
# The vectorised notna() replaces the row-by-row
# `apply(lambda x: 1 if x == False else 0)`; 'int64' keeps the same dtype the
# Python-int results of apply() produced.
df_titanic['has_cabin'] = df_titanic['Cabin'].notna().astype('int64')

In [50]:
# The raw cabin string is now represented by the has_cabin flag, so drop it.
# Re-running this cell on its own raises KeyError because 'Cabin' is already gone.
df_titanic = df_titanic.drop(columns=['Cabin'])
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,has_cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [51]:
# Find the most frequent port of embarkation; the missing 'Embarked' values
# are filled with it in the next step.
df_titanic['Embarked'].value_counts(sort=True, dropna=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [52]:
# Fill the missing ports with the most frequent one ('S' for this data set).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call operates on a temporary object
# under pandas Copy-on-Write and would leave the NaNs in df_titanic.
most_common_port = df_titanic['Embarked'].mode()[0]
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(most_common_port)

In [53]:
# Impute the missing ages with the mean age.
# Assign back rather than using fillna(inplace=True) on the selected column;
# the chained in-place form does not update df_titanic under pandas
# Copy-on-Write.
# NOTE(review): the mean is computed over the whole data set, before the
# train/test split, so test rows influence the imputed value.
df_titanic['Age'] = df_titanic['Age'].fillna(df_titanic['Age'].mean())

In [54]:
# Confirm that no column has missing values left after imputation.
df_titanic.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
has_cabin      0
dtype: int64

### Converting categorical values to numeric values in the dataset

In [55]:
# Drop the columns that are not used as model features:
# the passenger id, the name and the ticket.
unused_columns = ['PassengerId', 'Name', 'Ticket']
df_titanic = df_titanic.drop(columns=unused_columns)

In [56]:
# Encode 'Sex' as an integer: female -> 0, male -> 1.
# Any value outside this mapping becomes NaN and makes astype(int) raise,
# which is also what happens if this cell is executed a second time.
sex_codes = {'female': 0, 'male': 1}
df_titanic['Sex'] = df_titanic['Sex'].map(sex_codes).astype(int)
df_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,has_cabin
0,0,3,1,22.0,1,0,7.25,S,0
1,1,1,0,38.0,1,0,71.2833,C,1
2,1,3,0,26.0,0,0,7.925,S,0
3,1,1,0,35.0,1,0,53.1,S,1
4,0,3,1,35.0,0,0,8.05,S,0


In [57]:
# Replace the port letters in 'Embarked' with integer codes.
# NOTE(review): LabelEncoder numbers the labels in sorted order, which puts an
# arbitrary ordering on the ports; confirm this is acceptable for the linear
# model trained later, or consider one-hot encoding instead.
le_embarked = LabelEncoder()
le_embarked.fit(df_titanic['Embarked'])
df_titanic['Embarked'] = le_embarked.transform(df_titanic['Embarked'])
df_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,has_cabin
0,0,3,1,22.0,1,0,7.25,2,0
1,1,1,0,38.0,1,0,71.2833,0,1
2,1,3,0,26.0,0,0,7.925,2,0
3,1,1,0,35.0,1,0,53.1,2,1
4,0,3,1,35.0,0,0,8.05,2,0


In [58]:
# Separate the target ('Survived') from the feature columns.
target_column = 'Survived'
y = df_titanic[target_column]
X = df_titanic.drop(columns=[target_column])

In [59]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Decision Tree classifier 

In [60]:
# Fit a decision tree with default hyper-parameters (seeded for repeatability)
# on the training partition, then show the fitted estimator.
model_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
model_dt

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [61]:
# Decision-tree predictions for the training and the held-out rows.
y_train_pred, y_test_pred = model_dt.predict(X_train), model_dt.predict(X_test)

In [62]:
# Report accuracy on the training rows and on the held-out rows.
for label, y_true, y_pred in (
    ('Training Accuracy score ', y_train, y_train_pred),
    ('Test Accuracy score ', y_test, y_test_pred),
):
    print(label, accuracy_score(y_true, y_pred))

Training Accuracy score  0.985553772070626
Test Accuracy score  0.7761194029850746


In [63]:
# Confusion matrix on the held-out rows (true labels vs. current predictions).
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[138,  30],
       [ 30,  70]], dtype=int64)

### Logistic Regression Classifier

In [64]:
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler

In [69]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [68]:
# Fit a logistic regression with default settings on the standardised
# training features, then show the fitted estimator.
lr_model = LogisticRegression().fit(X_train_scaled, y_train)
lr_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [70]:
# Logistic-regression predictions for the training and the held-out rows.
# These names previously held the decision-tree predictions; from this cell on
# they refer to the logistic-regression output.
y_train_pred, y_test_pred = (
    lr_model.predict(X_train_scaled),
    lr_model.predict(X_test_scaled),
)

In [71]:
# Report accuracy on the training rows and on the held-out rows.
for label, y_true, y_pred in (
    ('Training Accuracy score ', y_train, y_train_pred),
    ('Test Accuracy score ', y_test, y_test_pred),
):
    print(label, accuracy_score(y_true, y_pred))

Training Accuracy score  0.7961476725521669
Test Accuracy score  0.8059701492537313


In [72]:
# Confusion matrix on the held-out rows (true labels vs. current predictions).
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[141,  27],
       [ 25,  75]], dtype=int64)