In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the dataset and keep only the target plus the model features.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = pd.read_csv("Titanic-Dataset.csv")
df = df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

# Handle missing values: replace NaNs in the numeric columns with the column median.
# NOTE(review): the median (and the mode below) are computed on the full dataset
# before the train/test split, so test rows influence the fill values. Fit the
# imputer on the training split only if a leakage-free evaluation is required.
imputer = SimpleImputer(strategy='median')
df[['Age', 'Fare']] = imputer.fit_transform(df[['Age', 'Fare']])

# Replace missing embarkation ports with the most frequent one.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which warns on pandas 2.x and
# does not update `df` at all under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Convert the categorical port codes into integer codes.
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

# Split the data into train and test sets
X = df.drop('Survived', axis=1)  # features: every column except the target
y = df['Survived']
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the Gaussian Naive Bayes classifier
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
# confusion_matrix rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Confusion Matrix:
 [[86 19]
 [37 37]]
Accuracy: 0.6871508379888268


In [5]:
# Confusion Matrix
# A confusion matrix is a table used to evaluate the performance of a classification algorithm. It provides a summary of the classification results by showing the counts of true positives, false positives, true negatives, and false negatives.

# Here’s how the matrix is structured for a binary classification problem:

# True Positive (TP): The number of instances where the model correctly predicted the positive class.
# False Positive (FP): The number of instances where the model incorrectly predicted the positive class (i.e., predicted positive when it was actually negative).
# True Negative (TN): The number of instances where the model correctly predicted the negative class.
# False Negative (FN): The number of instances where the model incorrectly predicted the negative class (i.e., predicted negative when it was actually positive).

# For a binary classification, scikit-learn's confusion_matrix(y_true, y_pred) puts the actual classes in rows and the predicted classes in columns, with labels in sorted order (0 = negative, 1 = positive):

#                    Predicted Negative    Predicted Positive
# Actual Negative    TN                    FP
# Actual Positive    FN                    TP
# Example:
# If you have a confusion matrix:
# [[50, 10],
#  [ 5, 35]]

# This means:

# 50 True Negatives
# 10 False Positives
# 5 False Negatives
# 35 True Positives



# Accuracy Score
# Accuracy is a metric that measures the proportion of correctly classified instances out of the total instances. It’s calculated using the following formula:

# Accuracy = (Number of Correct Predictions) / (Total Number of Predictions)
 

# In terms of the confusion matrix:

# Accuracy = (TP + TN) / (TP + TN + FP + FN)


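# For example, take the confusion matrix
# [[50, 10],
#  [ 5, 35]]

# Then, reading it in scikit-learn's [[TN, FP], [FN, TP]] layout:

# TP = 35
# TN = 50
# FP = 10
# FN = 5
# The accuracy would be:

# Accuracy = (35 + 50) / (35 + 50 + 10 + 5) = 85 / 100

#  = 0.85 or 85%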
 


