In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt # for plotting graphs.

from sklearn.model_selection import train_test_split # for splitting the data into training and testing data.
from sklearn.naive_bayes import GaussianNB # importing the Guassian Naive Bayes model.

np.set_printoptions(suppress=True, precision=6) # set the printing options

In [4]:
df = pd.read_csv("titanic-data.csv")
df.head()

Unnamed: 0,passenger_id,name,p_class,gender,age,sib_sp,parch,ticket,fare,cabin,embarked,survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [6]:
# dropping certain columns that don't have any impact on the survival rate.
df.drop(["passenger_id", "name", "sib_sp", "parch", "ticket", "cabin", "embarked"], axis=1, inplace=True)
df.head()

Unnamed: 0,p_class,gender,age,fare,survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [8]:
target = df["survived"] # creating a series for the survived column.
inputs = df.drop("survived", axis=1) # creating a separate dataframe by removing the survived column.

display(target.head())
display(inputs.head())

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

Unnamed: 0,p_class,gender,age,fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [10]:
dummies = pd.get_dummies(inputs["gender"]) # converting the gender column into dummy variables.

display(dummies.head())
print(dummies.dtypes)

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


female    bool
male      bool
dtype: object


In [12]:
# concatenating the inputs dataframe with the dummies dataframe.
inputs = pd.concat([inputs, dummies], axis=1)
inputs.head()

Unnamed: 0,p_class,gender,age,fare,female,male
0,3,male,22.0,7.25,False,True
1,1,female,38.0,71.2833,True,False
2,3,female,26.0,7.925,True,False
3,1,female,35.0,53.1,True,False
4,3,male,35.0,8.05,False,True


In [14]:
# dropping the gender column because we now have the female and male columns.
inputs.drop(["gender"], axis=1, inplace=True)
inputs.head()

Unnamed: 0,p_class,age,fare,female,male
0,3,22.0,7.25,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.925,True,False
3,1,35.0,53.1,True,False
4,3,35.0,8.05,False,True


In [16]:
# it can be observed that some columns contains null values.
inputs.age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: age, dtype: float64

In [18]:
# one way of handling these null values is to fill those values with the mean value of the whole column.
# we can also make these values an integer type values.
inputs["age"] = inputs["age"].fillna(inputs["age"].mean())

inputs.age[:10]
# as we can see the 5th row in the earlier cell was null and now it is replaced with the mean value.

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)
# 20% test size and 80% train size.
print(len(X_train), len(X_test), len(inputs))

# calculating training and testing data percentage.
print(len(X_train) / len(inputs)) # training data %
print(len(X_test) / len(inputs)) # testing data %

712 179 891
0.7991021324354658
0.20089786756453423


In [22]:
# we are using the Gaussian Naive Bayes model.
model = GaussianNB()
model.fit(X_train, y_train)

In [24]:
model.score(X_test, y_test)

0.7262569832402235

In [26]:
pred = np.array(model.predict(X_test))
pred_probability = np.array(model.predict_proba(X_test)) # calculating the probabilities

print(pred[:5])
for i in range(1, 6):
    print(pred_probability[i][0], end=", ")

[1 1 0 1 0]
0.05712905937258038, 0.9929221655414385, 0.021357888616471792, 0.9927510695220138, 0.9892898230898665, 