# Session 11-12: Naïve Bayes

# Exercise: Titanic survival prediction

You are a data scientist tasked with analyzing the Titanic dataset.

Your goal is to predict passenger survival using a Naïve Bayes classifier.

Dataset link: https://www.kaggle.com/competitions/titanic

# Step 1. Data Preparation

## Goal: Load the data, handle missing values, and select features.

In [None]:
# Step 1: Data preparation

import pandas as pd
import numpy as np

In [None]:
# Load the Titanic dataset (expects train.csv in the working directory)
data = pd.read_csv("train.csv")

# Display basic information: the first rows, then column dtypes and
# non-null counts
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
data.info()

In [None]:
# Predictor columns fed to the classifier
features = ["Pclass", "Sex", "Age", "Fare"]

# Take an independent copy of those columns so the edits made in later
# cells do not trigger pandas' SettingWithCopyWarning
X = data.loc[:, features].copy()

# Label column: survival (0 = No, 1 = Yes)
y = data["Survived"]

# Count the missing entries in each predictor column
print(X.isna().sum())

In [None]:
# Fill the gaps in Age with the column median.
# NOTE(review): the median is taken over every row of X, i.e. before the
# train/test split in Step 2, so rows that later land in the test set
# contribute to the imputed value.
age_median = X["Age"].median()
X["Age"] = X["Age"].fillna(age_median)

# Encode the categorical Sex column as integers: male -> 0, female -> 1
sex_codes = {"male": 0, "female": 1}
X["Sex"] = X["Sex"].map(sex_codes)

# Step 2. Train–Test Split

## Goal: Split the dataset into 70% training and 30% testing.

In [None]:
# Step 2: Train-test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed seed makes the split
# repeatable, and stratifying on y preserves the class distribution in
# both subsets.
split_options = {"test_size": 0.30, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the (rows, columns) shape of each feature subset
for caption, subset in (
    ("Training set size:", X_train),
    ("Testing set size:", X_test),
):
    print(caption, subset.shape)

# Step 3. Model Development (Naïve Bayes)

## Goal: Build and train a Gaussian Naïve Bayes classifier.

We use Gaussian Naïve Bayes because features like Age and Fare are continuous.

In [None]:
# Step 3: Model development

from sklearn.naive_bayes import GaussianNB

# Initialize Gaussian Naïve Bayes classifier (constructed with no arguments,
# i.e. the library's default settings)
model = GaussianNB()

# Train the model on the training split only (X_train / y_train from Step 2).
# As the last expression of the cell, this call's return value is what the
# notebook displays as the cell output.
model.fit(X_train, y_train)

# Step 4. Model Evaluation

## Goal: Evaluate using Accuracy, Precision, Recall, ROC-AUC.

In [None]:
# Step 4: Model evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Second column of predict_proba for every test row; used as the score
# for ROC-AUC (and later for the ROC curve)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Hard class-label predictions for the test rows
y_pred = model.predict(X_test)

# Label-based metrics compare y_pred with the true labels;
# ROC-AUC uses the probability scores instead
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print one line per metric under a common heading
print("Model performance:")
for caption, score in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("ROC-AUC  :", roc_auc),
):
    print(caption, score)

## Drawing the ROC curve

In [None]:
# (1): Bring in the plotting library and the ROC helpers
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

# (2): False Positive Rate (FPR) and True Positive Rate (TPR) at each
# decision threshold, computed from the predicted probabilities
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_proba)

# (3): Area Under the Curve (AUC) from the FPR/TPR points; this rebinds
# the roc_auc name that Step 4 assigned from roc_auc_score
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# (4): Plot ROC curve

plt.figure()
# Model curve, with its AUC (3 decimals) shown in the legend
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")

# Diagonal line = random classifier (reference baseline)
plt.plot([0, 1], [0, 1], linestyle="--", label="Random classifier")

# Labels and title. The title names the model actually trained in Step 3
# (Gaussian Naive Bayes); it previously said "SVM".
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Naive Bayes Titanic Survival Prediction")
plt.legend()
plt.show()

# Step 5. Confusion Matrix Analysis

## Goal: Compute and interpret TP, FP, TN, FN.

In [None]:
# Step 5: Confusion matrix analysis

from sklearn.metrics import confusion_matrix

# Compute confusion matrix (rows = true class, columns = predicted class).
# Passing labels=[0, 1] fixes the row/column order and guarantees a 2x2
# result even if one class is absent from y_test or y_pred, so the
# four-way unpack below cannot fail.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Unpack confusion matrix values: with labels [0, 1], flattening row by
# row gives TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()

print("Confusion Matrix:")
print(cm)

In [None]:
# Print each confusion-matrix count next to a plain-language meaning
print("\nInterpretation:")
for caption, count, meaning in (
    ("True Positives (TP):", TP, "- Correctly predicted survivors"),
    ("False Positives (FP):", FP, "- Predicted survived but did not"),
    ("True Negatives (TN):", TN, "- Correctly predicted non-survivors"),
    ("False Negatives (FN):", FN, "- Predicted non-survivor but survived"),
):
    print(caption, count, meaning)