In [69]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB # importing the Guassian Naive Bayes model.

np.set_printoptions(suppress=True, precision=6)

In [70]:
df = pd.read_csv("/content/Titanic-Dataset.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
print(df.columns)
df.drop(["Passenger_Id", "Name", "Sib_sp", "Parch", "Ticket", "Cabin", "Embarked"], axis=1, inplace=True, errors='ignore')
print(df.head())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   PassengerId  Survived  Pclass     Sex   Age  SibSp     Fare
0            1         0       3    male  22.0      1   7.2500
1            2         1       1  female  38.0      1  71.2833
2            3         1       3  female  26.0      0   7.9250
3            4         1       1  female  35.0      1  53.1000
4            5         0       3    male  35.0      0   8.0500


In [72]:
print("Columns in the DataFrame:")
print(df.columns)

if "Survived" in df.columns:

    target = df["Survived"]

    inputs = df.drop("Survived", axis=1)


    display(target.head())
    display(inputs.head())
else:
    print("The 'Survived' column is not found in the DataFrame.")

Columns in the DataFrame:
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare'], dtype='object')


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare
0,1,3,male,22.0,1,7.25
1,2,1,female,38.0,1,71.2833
2,3,3,female,26.0,0,7.925
3,4,1,female,35.0,1,53.1
4,5,3,male,35.0,0,8.05


In [73]:
print("Columns in the inputs DataFrame:")
print(inputs.columns)

if "Sex" in inputs.columns:

    dummies = pd.get_dummies(inputs["Sex"])


    display(dummies.head())

    print(dummies.dtypes)
else:
    print("The 'Sex' column is not found in the inputs DataFrame.")

Columns in the inputs DataFrame:
Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare'], dtype='object')


Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


female    bool
male      bool
dtype: object


In [74]:
inputs = pd.concat([inputs, dummies], axis=1)
inputs.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,female,male
0,1,3,male,22.0,1,7.25,False,True
1,2,1,female,38.0,1,71.2833,True,False
2,3,3,female,26.0,0,7.925,True,False
3,4,1,female,35.0,1,53.1,True,False
4,5,3,male,35.0,0,8.05,False,True


In [75]:
inputs.drop(["Sex"], axis=1, inplace=True)
inputs.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Fare,female,male
0,1,3,22.0,1,7.25,False,True
1,2,1,38.0,1,71.2833,True,False
2,3,3,26.0,0,7.925,True,False
3,4,1,35.0,1,53.1,True,False
4,5,3,35.0,0,8.05,False,True


In [76]:
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [77]:
inputs["Age"] = inputs["Age"].fillna(inputs["Age"].mean())
inputs.Age[:10]

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: Age, dtype: float64

In [78]:
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)

print(len(X_train), len(X_test), len(inputs))

# calculating training and testing data percentage.

print(len(X_train) / len(inputs)) # training data %
print(len(X_test) / len(inputs)) # testing data %

712 179 891
0.7991021324354658
0.20089786756453423


In [79]:
model = GaussianNB()
model.fit(X_train, y_train)

In [80]:
model.score(X_test, y_test)

0.8212290502793296

In [81]:
pred = np.array(model.predict(X_test))
pred_probability = np.array(model.predict_proba(X_test))

print(pred[:5])
for i in range(1, 6):
    print(pred_probability[i][0], end=", ")

[1 1 1 0 1]
0.024406613167262108, 0.039307675397449095, 0.9821901530125247, 0.0030035089689066934, 0.5860979827575742, 

In [82]:
pred = np.array(model.predict(X_test))
pred_probability = np.array(model.predict_proba(X_test))

print("First 5 predictions:", pred[:5])

print("Prediction probabilities for the first 5 samples:")
for i in range(5):
    print(f"Sample {i+1}: Probability of not surviving: {pred_probability[i][0]:.4f}, Probability of surviving: {pred_probability[i][1]:.4f}")

survival_predictions = ["Survived" if p == 1 else "Did not survive" for p in pred]

print("\nSurvival Predictions:")
for i in range(5):
    print(f"Sample {i+1}: {survival_predictions[i]}, Probability: {pred_probability[i][1]:.4f}")

First 5 predictions: [1 1 1 0 1]
Prediction probabilities for the first 5 samples:
Sample 1: Probability of not surviving: 0.0002, Probability of surviving: 0.9998
Sample 2: Probability of not surviving: 0.0244, Probability of surviving: 0.9756
Sample 3: Probability of not surviving: 0.0393, Probability of surviving: 0.9607
Sample 4: Probability of not surviving: 0.9822, Probability of surviving: 0.0178
Sample 5: Probability of not surviving: 0.0030, Probability of surviving: 0.9970

Survival Predictions:
Sample 1: Survived, Probability: 0.9998
Sample 2: Survived, Probability: 0.9756
Sample 3: Survived, Probability: 0.9607
Sample 4: Did not survive, Probability: 0.0178
Sample 5: Survived, Probability: 0.9970
