In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [2]:
data = pd.read_csv("titanic.csv")
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
print(data['Survived'].unique())
data['Survived'].value_counts()

[0 1]


0    549
1    342
Name: Survived, dtype: int64

In [4]:
data.shape

(891, 12)

In [5]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
for val in data:
   print(data[val].value_counts())
   print()

1      1
599    1
588    1
589    1
590    1
      ..
301    1
302    1
303    1
304    1
891    1
Name: PassengerId, Length: 891, dtype: int64

0    549
1    342
Name: Survived, dtype: int64

3    491
1    216
2    184
Name: Pclass, dtype: int64

Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: Name, Length: 891, dtype: int64

male      577
female    314
Name: Sex, dtype: int64

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88,

In [7]:
# Drop the columns
data = data.drop(['Cabin', 'Embarked', 'Pclass', 'PassengerId', 'Name', 'Ticket'], axis=1)

In [8]:
data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
0,0,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,1,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,0,male,35.0,0,0,8.05


In [9]:
data.shape

(891, 6)

In [10]:
data = data.dropna(subset =['Age'], axis=0)

In [11]:
data.dtypes

Survived      int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object

In [12]:
print(data['Sex'].unique())
data['Sex'].value_counts()

['male' 'female']


male      453
female    261
Name: Sex, dtype: int64

In [13]:
from sklearn import preprocessing
  
# label_encoder object knows 
# how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column
data['Sex']= label_encoder.fit_transform(data['Sex'])
  
data['Sex'].unique()

array([1, 0])

In [14]:
data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
0,0,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,1,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,0,1,35.0,0,0,8.05


In [15]:
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

In [16]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
    return p_x_given_y

In [17]:
def naive_bayes_gaussian(df, X, Y):
    # get feature names
    features = list(df.columns)[1:]

    # calculate prior
    prior = calculate_prior(df, Y)
    #print(prior)

    Y_pred = []
    # loop over every data sample
    for x in X:
        # calculate likelihood
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_gaussian(df, features[i], x[i], Y, labels[j])

        # calculate posterior probability (numerator only)
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred) 

In [18]:
data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
0,0,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,1,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,0,1,35.0,0,0,8.05


In [19]:
import numpy as np
max_vals = np.max(np.abs(data))
max_vals

Survived      1.0000
Sex           1.0000
Age          80.0000
SibSp         5.0000
Parch         6.0000
Fare        512.3292
dtype: float64

In [20]:
df_max_scaled = data.copy()
  
# apply normalization techniques on Column 1
column = 'Age'
df_max_scaled[column] = df_max_scaled[column] /df_max_scaled[column].abs().max()
column = 'Fare'
df_max_scaled[column] = df_max_scaled[column] /df_max_scaled[column].abs().max()
  
# view normalized data
display(df_max_scaled)
data = df_max_scaled
df_max_scaled.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
0,0,1,0.2750,1,0,0.014151
1,1,0,0.4750,1,0,0.139136
2,1,0,0.3250,0,0,0.015469
3,1,0,0.4375,1,0,0.103644
4,0,1,0.4375,0,0,0.015713
...,...,...,...,...,...,...
885,0,0,0.4875,0,5,0.056848
886,0,1,0.3375,0,0,0.025374
887,1,0,0.2375,0,0,0.058556
889,1,1,0.3250,0,0,0.058556


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
0,0,1,0.275,1,0,0.014151
1,1,0,0.475,1,0,0.139136
2,1,0,0.325,0,0,0.015469
3,1,0,0.4375,1,0,0.103644
4,0,1,0.4375,0,0,0.015713


In [21]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=.20, random_state=50)

In [22]:
X_test = test.iloc[:,1:].values
Y_test = test.iloc[:,0].values
print(X_test)
print(Y_test)

[[1.         0.65       0.         0.         0.02635025]
 [0.         0.1625     0.         1.         0.03806147]
 [0.         0.4        0.         0.         0.14891148]
 [0.         0.3        0.         0.         0.13526459]
 [1.         0.2625     0.         0.         0.01646071]
 [1.         0.175      5.         2.         0.0915427 ]
 [1.         0.425      0.         0.         0.01267896]
 [0.         0.375      0.         0.         0.1111184 ]
 [0.         0.025      3.         2.         0.05445717]
 [1.         0.225      0.         0.         0.02244651]
 [0.         0.1875     0.         0.         0.01410226]
 [1.         0.425      0.         0.         0.01571255]
 [1.         0.625      2.         0.         0.26086743]
 [1.         0.075      0.         1.         0.02434958]
 [0.         0.3875     1.         0.         0.03513366]
 [0.         0.2625     1.         0.         0.01917712]
 [0.         0.1125     2.         2.         0.06709553]
 [1.         0

In [23]:
Y_pred = naive_bayes_gaussian(train, X=X_test, Y="Survived")

In [24]:
from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(Y_test, Y_pred))
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))
print(f1_score(Y_test, Y_pred))

[[66 14]
 [17 46]]
              precision    recall  f1-score   support

           0       0.80      0.82      0.81        80
           1       0.77      0.73      0.75        63

    accuracy                           0.78       143
   macro avg       0.78      0.78      0.78       143
weighted avg       0.78      0.78      0.78       143

0.7479674796747968
