In [31]:
## Import Block
from os import path, getcwd
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import joblib
import json

In [32]:
## Path to dataset
dataset = path.join(getcwd(), "..", "data", "titanic.csv")

## Read data
df = pd.read_csv(dataset)
df = df.dropna()

df.head(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S


In [33]:
df = df.drop(columns=["PassengerId", "Name", "SibSp", "Ticket", "Embarked", "Cabin"], axis=1)
df.head(7)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare
1,1,1,female,38.0,0,71.2833
3,1,1,female,35.0,0,53.1
6,0,1,male,54.0,0,51.8625
10,1,3,female,4.0,1,16.7
11,1,1,female,58.0,0,26.55
21,1,2,male,34.0,0,13.0
23,1,1,male,28.0,0,35.5


In [34]:
def resolve_gender(sex:str=None):
    female = 'female'
    male = 'male'
    if sex == female:
        return 0
    elif sex == male:
        return 1

def resolve_survival(survived=None):
    if survived == 1:
        return True
    elif survived == 0:
        return False

In [35]:
df["Sex"] = df["Sex"].apply(resolve_gender)
df["Survived"] = df["Survived"].apply(resolve_survival).astype(str)
df.head(7)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare
1,True,1,0,38.0,0,71.2833
3,True,1,0,35.0,0,53.1
6,False,1,1,54.0,0,51.8625
10,True,3,0,4.0,1,16.7
11,True,1,0,58.0,0,26.55
21,True,2,1,34.0,0,13.0
23,True,1,1,28.0,0,35.5


In [36]:
x = df.drop(columns=["Survived"])
y = df["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [37]:
model = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=5,
    random_state=0
)
model.fit(x_train, y_train)

In [38]:
prediction = model.predict(x_test)
accuracy_score(y_test, prediction)

0.7837837837837838

In [39]:
joblib.dump(model, "titanic_survival_model.joblib")

['titanic_survival_model.joblib']

In [40]:
tree.export_graphviz(
    model,
    out_file="titanic_survival_tree.dot",
    feature_names=x.columns,
    class_names = sorted(y.unique()),
    label="all",
    rounded=True,
    filled=True
)