# Import Library & Dataset

In [1]:
import pandas as pd, numpy as np

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Data Preprocessing

In [3]:
# first - checking null data
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# input data fpr null data in column 'Age'
data.loc[data['Age'].isnull(), 'Age'] = data['Age'].median()

In [6]:
# second - checking duplicate data
data.duplicated().sum()

0

In [7]:
# engineering column 'Sex'
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,1


In [8]:
# last - define X & y
X = data[['Pclass','Age','SibSp','Parch','Fare','Sex_male']]
y = data['Survived']

# Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [51]:
# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# agar splitting tidak berubah, diberi random_state
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 45)

In [11]:
# scaling
from sklearn.preprocessing import StandardScaler

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

In [52]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
30,1,40.0,0,0,27.7208,1
31,1,28.0,1,0,146.5208,0
243,3,22.0,0,0,7.125,1
576,2,34.0,0,0,13.0,0
844,3,17.0,0,0,8.6625,1


In [16]:
X_train_scaled.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,-1.566215,-0.341959,0.417782,-0.47041,1.257933,0.722058
1,-0.37099,-0.106926,-0.454537,-0.47041,-0.391244,0.722058
2,0.824235,-0.576992,-0.454537,-0.47041,-0.495797,0.722058
3,0.824235,-0.655336,1.290101,2.146247,0.060235,-1.384931
4,0.824235,-0.968714,-0.454537,-0.47041,-0.48286,0.722058


# Classification Model

## Library

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [18]:
# evaluation metrics
def classification_eval (aktual, prediksi, name):
    cm = confusion_matrix(aktual, prediksi)
    tp = cm[1][1]
    tn = cm[0][0]
    fp = cm[0][1]
    fn = cm[1][0]
    
    accuracy = round((tp+tn) / (tp+tn+fp+fn) * 100, 2)
    precision = round((tp) / (tp+fp) * 100, 2)
    recall = round((tp) / (tp+fn) * 100, 2)
    
    print('Evaluation Model:', name)
    print(cm)
    print('Accuracy   :', accuracy, '%')
    print('Precision  :', precision, '%')
    print('Recall     :', recall, '%')

## KNN

In [35]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_scaled,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [36]:
y_train_pred = knn.predict(X_train_scaled)
y_test_pred = knn.predict(X_test_scaled)

In [37]:
classification_eval(y_train, y_train_pred, 'KNN Training')

Evaluation Model: KNN Training
[[393  45]
 [ 65 209]]
Accuracy   : 84.55 %
Precision  : 82.28 %
Recall     : 76.28 %


In [38]:
classification_eval(y_test, y_test_pred, 'KNN Testing')

Evaluation Model: KNN Testing
[[98 13]
 [14 54]]
Accuracy   : 84.92 %
Precision  : 80.6 %
Recall     : 79.41 %


## Decision Tree

In [43]:
dectree = DecisionTreeClassifier(max_depth=7) # bisa cek pake gini atau pake entropy
dectree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [44]:
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

In [59]:
pd.DataFrame(y_test_pred, columns=['Survived'])

Unnamed: 0,Survived
0,0
1,1
2,0
3,1
4,0
...,...
174,1
175,0
176,0
177,0


In [45]:
classification_eval(y_train, y_train_pred, 'Decision Tree Training')

Evaluation Model: Decision Tree Training
[[419  19]
 [ 66 208]]
Accuracy   : 88.06 %
Precision  : 91.63 %
Recall     : 75.91 %


In [46]:
classification_eval(y_test, y_test_pred, 'Decision Tree Testing')

Evaluation Model: Decision Tree Testing
[[101  10]
 [ 17  51]]
Accuracy   : 84.92 %
Precision  : 83.61 %
Recall     : 75.0 %


## Random Forest

In [53]:
ranfor = RandomForestClassifier(max_depth=7)
ranfor.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [54]:
y_train_pred = ranfor.predict(X_train)
y_test_pred = ranfor.predict(X_test)

In [55]:
classification_eval(y_train, y_train_pred, 'Random Forest Training')

Evaluation Model: Random Forest Training
[[416  15]
 [ 54 227]]
Accuracy   : 90.31 %
Precision  : 93.8 %
Recall     : 80.78 %


In [56]:
classification_eval(y_test, y_test_pred, 'Random Forest testing')

Evaluation Model: Random Forest testing
[[109   9]
 [ 17  44]]
Accuracy   : 85.47 %
Precision  : 83.02 %
Recall     : 72.13 %
