In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
le = LabelEncoder()

In [2]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
885,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
886,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,45.0,1,2,W./C. 6607,23.4500,,S
887,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
df.set_index("PassengerId", inplace = True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Name      889 non-null    object 
 3   Sex       889 non-null    object 
 4   Age       889 non-null    float64
 5   SibSp     889 non-null    int64  
 6   Parch     889 non-null    int64  
 7   Ticket    889 non-null    object 
 8   Fare      889 non-null    float64
 9   Cabin     202 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.3+ KB


In [5]:
df.drop(columns = ["Name", "Ticket", "Cabin"], inplace=True)

In [6]:
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,45.0,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [7]:
df.Fare.describe()

count    889.000000
mean      32.096681
std       49.697504
min        0.000000
25%        7.895800
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

# Making ranges for fare and age

<ul>
    <li>Below 1st queartile</li>
    <li>Between 1st and 2nd quartile</li>
    <li>Between 2nd and 3rd quartile</li>
    <li>Above 3rd quartile</li>
</ul>

In [8]:
encoded = [0,7.895800,14.454200,31, 513]
i = 0
for minlim, maxlim in zip(encoded[:-1], encoded[1:]):
    filt = (df.Fare >= minlim) & (df.Fare < maxlim)
    df.loc[filt, "Fare"] = i
    i+=1

In [9]:
df.Age.describe()

count    889.000000
mean      32.699854
std       14.346821
min        0.420000
25%       22.000000
50%       32.000000
75%       45.000000
max       80.000000
Name: Age, dtype: float64

In [10]:
encoded = [0.42,22,32,45, 81]
i = 0
for minlim, maxlim in zip(encoded[:-1], encoded[1:]):
    filt = (df.Age >= minlim) & (df.Age < maxlim)
    df.loc[filt, "Age"] = i
    i+=1

In [11]:
for column in ["Sex", "Embarked"]:
    df[column] = le.fit_transform(df[column])
    print(df[column].unique())

[1 0]
[2 0 1]


In [12]:
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,1.0,1,0,0.0,2
2,1,1,0,2.0,1,0,3.0,0
3,1,3,0,1.0,0,0,1.0,2
4,1,1,0,2.0,1,0,3.0,2
5,0,3,1,2.0,0,0,1.0,2
...,...,...,...,...,...,...,...,...
887,0,2,1,1.0,0,0,1.0,2
888,1,1,0,0.0,0,0,2.0,2
889,0,3,0,3.0,1,2,2.0,2
890,1,1,1,1.0,0,0,2.0,0


# Splitting dataset into features and target

In [13]:
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Splitting dataset into train and test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Model development

In [15]:
model = BernoulliNB()
model.fit(X_train, y_train)
print("Target: Survived, Accuracy:", round(model.score(X_test, y_test)*100, 3))
print(f"Confusion Matrix:\n{confusion_matrix(y_test, model.predict(X_test))}")

Target: Survived, Accuracy: 79.775
Confusion Matrix:
[[90 14]
 [22 52]]


# We perform the last three cells for other columns as the target

In [16]:
for target in df.columns[1:]:
    y = df[[target]]
    X = df[list(set(df.columns) - set(y.columns))]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    model.fit(X_train, y_train)
    
    print("Target: "+target+",  Accuracy:", round(model.score(X_test, y_test)*100, 3))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, model.predict(X_test))}\n\n")

Target: Pclass,  Accuracy: 59.551
Confusion Matrix:
[[23  2 21]
 [18  0 19]
 [11  1 83]]


Target: Sex,  Accuracy: 75.281
Confusion Matrix:
[[37 19]
 [25 97]]


Target: Age,  Accuracy: 30.337
Confusion Matrix:
[[17  9  1 17]
 [ 4  3  4 35]
 [10  3  0 24]
 [ 9  6  2 34]]


Target: SibSp,  Accuracy: 67.978
Confusion Matrix:
[[103  16   0   0   0   0]
 [ 26  17   0   0   1   0]
 [  4   1   0   0   0   0]
 [  0   3   0   0   2   0]
 [  0   2   0   0   1   0]
 [  1   1   0   0   0   0]]


Target: Parch,  Accuracy: 75.281
Confusion Matrix:
[[120  19   0   0]
 [  6  12   0   0]
 [ 13   5   2   0]
 [  1   0   0   0]]


Target: Fare,  Accuracy: 44.382
Confusion Matrix:
[[ 5 30  4  3]
 [ 4 37  5  2]
 [ 1 10 19 10]
 [ 1  8 21 18]]


Target: Embarked,  Accuracy: 73.596
Confusion Matrix:
[[  0   0  35]
 [  0   4  11]
 [  0   1 127]]


