## Author: Abel Fereja
## Date: 14-Nov-2022
## Dataset and code source: 
https://github.com/codebasics/py/blob/master/ML/14_naive_bayes/14_naive_bayes_1_titanic_survival_prediction.ipynbipynb
## Tutorial video:
https://www.youtube.com/watch?v=PPeaRc-r1OI
## Dataset description: 
This dataset contains details about passengers from the titanic disaster. The untimate objective of the project is to predict if passenger has survived or not from the accident using Naive bayes prediction model. Important step data preprocessing phases are also covered before making the prediction.
## Credit goes to Dhaval Patel
I created this repository by following his tutorial video on Youtube at codebasics and the codes from the above github repository.


In [94]:
import pandas as pd

In [95]:
df = pd.read_csv("Titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [96]:
# drop some of the features which we will not use in the prediction
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [97]:
inputs = df.drop('Survived',axis='columns')
target = df.Survived

In [98]:
inputs

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,,23.4500
889,1,male,26.0,30.0000


In [99]:
#inputs.Sex = inputs.Sex.map({'male': 1, 'female': 2})

In [100]:
inputs

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,,23.4500
889,1,male,26.0,30.0000


In [101]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [102]:
print(dummies)

     female  male
0         0     1
1         1     0
2         1     0
3         1     0
4         0     1
..      ...   ...
886       0     1
887       1     0
888       1     0
889       0     1
890       0     1

[891 rows x 2 columns]


In [103]:
# adding the new female and male columns with integer values in teh table
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


In [104]:
inputs

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.2500,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.9250,1,0
3,1,female,35.0,53.1000,1,0
4,3,male,35.0,8.0500,0,1
...,...,...,...,...,...,...
886,2,male,27.0,13.0000,0,1
887,1,female,19.0,30.0000,1,0
888,3,female,,23.4500,1,0
889,1,male,26.0,30.0000,0,1


In [105]:
# droping the sex and male columns as one column is enough to represent sex
inputs.drop(['Sex','male'],axis='columns',inplace=True)
inputs.head(8)

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0
5,3,,8.4583,0
6,1,54.0,51.8625,0
7,3,2.0,21.075,0


In [106]:
#. to check if we have any NaN value 
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [107]:
inputs.Age[:10]   # check the first ten rows of the age column

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [108]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean()) # to fill the NaN values by the mean of teh ages
inputs.head(10)

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0
5,3,29.699118,8.4583,0
6,1,54.0,51.8625,0
7,3,2.0,21.075,0
8,3,27.0,11.1333,1
9,2,14.0,30.0708,1


In [109]:
# split our dataset into train and test and our test size is set 30%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.3)

In [110]:
len(X_train)

623

In [111]:
len(X_test)

268

In [112]:
len(inputs)

891

In [113]:
# importing the Gaussian Naive Bayes formula

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [114]:
model.fit(X_train,y_train)

GaussianNB()

In [115]:
# GaussianNB(priors=None, var_smoothing=1e-09)

In [116]:
model.score(X_test,y_test)

0.7910447761194029

In [117]:
X_test[0:10]

Unnamed: 0,Pclass,Age,Fare,female
76,3,29.699118,7.8958,0
674,2,29.699118,0.0,0
579,3,32.0,7.925,0
2,3,26.0,7.925,1
745,1,70.0,71.0,0
264,3,29.699118,7.75,1
612,3,29.699118,15.5,1
555,1,62.0,26.55,0
68,3,17.0,7.925,1
212,3,22.0,7.25,0


In [118]:
y_test[0:10]

76     0
674    0
579    1
2      1
745    0
264    0
612    1
555    0
68     1
212    0
Name: Survived, dtype: int64

In [119]:
model.predict(X_test[0:10])

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 0])

In [120]:
model.predict_proba(X_test[:10])

array([[0.96201983, 0.03798017],
       [0.91678944, 0.08321056],
       [0.96272791, 0.03727209],
       [0.4473058 , 0.5526942 ],
       [0.2669877 , 0.7330123 ],
       [0.45904748, 0.54095252],
       [0.45927097, 0.54072903],
       [0.63340548, 0.36659452],
       [0.39807695, 0.60192305],
       [0.95719249, 0.04280751]])

In [121]:
# calculationg the score using cross validation
from sklearn.model_selection import cross_val_score
result = cross_val_score(GaussianNB(),X_train, y_train, cv=5)
#result.mean()

In [122]:
type(result)

numpy.ndarray

In [123]:
result

array([0.784     , 0.728     , 0.792     , 0.79032258, 0.71774194])

In [125]:
result.mean()

0.7624129032258065