# Titanic Survival Prediction: KNN and Naive Bayes from Scratch

This notebook implements the **K-Nearest Neighbors (KNN)** and **Naive Bayes** algorithms from scratch using only NumPy and Pandas. The models are evaluated on the Titanic dataset.

## 1. Data Preprocessing

Preprocessing is crucial for machine learning. In this section, we:
- Handle missing values: missing 'Age' and 'Fare' values are filled with the column median, and missing 'Embarked' values are filled with the most common value (the mode).
- Select features: 'PassengerId', 'Name', 'Ticket', and 'Cabin' are removed because they contain high-cardinality or otherwise irrelevant information.
- Encode features: 'Sex' and 'Embarked' are converted into numerical codes.
- Scale features: for KNN, features are standardized (mean = 0, std = 1) because distance calculations are sensitive to the scale of each variable.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def preprocess_data(df):
    """Return a cleaned, fully numeric copy of a Titanic passenger table.

    The input frame is not modified. The returned frame has the
    'PassengerId', 'Name', 'Ticket' and 'Cabin' columns removed, missing
    'Age'/'Fare' values replaced by the column median, missing 'Embarked'
    values replaced by the most frequent port, and 'Sex'/'Embarked'
    replaced by integer codes.
    """
    sex_codes = {'male': 0, 'female': 1}
    port_codes = {'S': 0, 'C': 1, 'Q': 2}

    # drop() hands back a new frame, so the assignments below never touch
    # the caller's data.
    cleaned = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Numeric gaps are filled with the median of the same column.
    for column in ('Age', 'Fare'):
        column_median = cleaned[column].median()
        cleaned[column] = cleaned[column].fillna(column_median)

    # The categorical gap is filled with the most frequent port.
    most_common_port = cleaned['Embarked'].mode()[0]
    embarked_filled = cleaned['Embarked'].fillna(most_common_port)

    # Swap the category labels for their integer codes.
    cleaned['Sex'] = cleaned['Sex'].map(sex_codes)
    cleaned['Embarked'] = embarked_filled.map(port_codes)
    return cleaned

def normalize(X, mean=None, std=None):
    """Standardize the columns of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to scale.
    mean, std : array-like of shape (n_features,), optional
        Statistics to scale with. When omitted they are computed from X
        itself. Pass the statistics of the training data here to put other
        data (e.g. a test set) on the same scale.

    Returns
    -------
    The scaled data, ``(X - mean) / std``. Columns whose standard deviation
    is zero are only centered, so they come out as zeros instead of NaN/inf.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)
    # A constant column has std == 0; dividing by it would yield NaN (0/0)
    # or inf. Dividing by 1 instead leaves the centered values untouched.
    safe_std = np.where(np.asarray(std) == 0, 1.0, std)
    return (X - mean) / safe_std

# Load the raw passenger tables and the reference labels for the test set.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# NOTE(review): the 'Survived' column of this file is used as the test
# labels below. If this is Kaggle's sample submission (a gender-based
# baseline) rather than ground truth, the reported accuracies measure
# agreement with that baseline only -- confirm.
y_test_labels = pd.read_csv('gender_submission.csv')

train_processed = preprocess_data(train_df)
test_processed = preprocess_data(test_df)

# Split features from the target. The test table has no 'Survived' column,
# so all of its remaining columns are features.
x_train = train_processed.drop('Survived', axis=1).values
y_train = train_processed['Survived'].values
x_test = test_processed.values
# NOTE(review): assumes the rows of gender_submission.csv are in the same
# order as the rows of test.csv -- confirm.
y_test = y_test_labels['Survived'].values

# Standardize for KNN. The test set must be scaled with the TRAINING mean
# and std: scaling it with its own statistics would place train and test
# points in different feature spaces and distort the distances between them.
train_mean = x_train.mean(axis=0)
train_std = x_train.std(axis=0)
train_std[train_std == 0] = 1.0  # constant feature: avoid division by zero
x_train_norm = normalize(x_train)
x_test_norm = (x_test - train_mean) / train_std

## 2. K-Nearest Neighbors (KNN)

KNN is a non-parametric, lazy learning algorithm. It classifies a point based on the labels of its $k$ closest neighbors.

In [None]:
class KNN:
    """K-Nearest Neighbors classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default 5
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=5):
        # With k < 1 there would be no neighbors to vote, which previously
        # surfaced only at predict time as an obscure argmax error.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training data; KNN does all of its work at predict time.

        X is array-like of shape (n_samples, n_features) and y is array-like
        of shape (n_samples,). Both are converted to NumPy arrays so that
        lists and pandas objects are indexed by position.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have different lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with the predicted label of every row of X."""
        X = np.asarray(X, dtype=np.float64)
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        """Predict the label of a single sample x of shape (n_features,)."""
        # Euclidean distance from x to every training row in one
        # vectorized step instead of a Python-level loop.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        # A stable sort keeps equidistant neighbors in training order, so
        # the choice of the k nearest is deterministic.
        k_indices = np.argsort(distances, kind='stable')[:self.k]
        # Majority vote. np.unique works for any label type (ints, floats,
        # strings), unlike np.bincount which only accepts non-negative ints.
        # On a tied vote the smallest label wins, matching bincount+argmax.
        labels, votes = np.unique(self.y_train[k_indices], return_counts=True)
        return labels[np.argmax(votes)]

## 3. Naive Bayes

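Gaussian Naive Bayes assumes that, within each class, the features are conditionally independent and each one follows a normal distribution. It uses Bayes' Theorem to calculate the posterior probability for each class and predicts the class with the highest posterior.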

In [None]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log-posterior
    (log prior plus the sum of per-feature log-likelihoods).
    """

    def fit(self, X, y):
        """Estimate the per-class feature means, variances and priors.

        X is array-like of shape (n_samples, n_features) and y is array-like
        of shape (n_samples,) holding the class labels.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            # Small constant keeps the variance strictly positive so a
            # constant feature does not cause a division by zero.
            self._var[idx, :] = X_c.var(axis=0) + 1e-9
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Return an array with the predicted class of every row of X."""
        X = np.asarray(X, dtype=np.float64)
        return np.array([self._get_prediction(x) for x in X])

    def _get_prediction(self, x):
        """Return the class with the highest log-posterior for sample x."""
        log_posteriors = []
        for idx in range(len(self._classes)):
            log_prior = np.log(self._priors[idx])
            log_likelihood = np.sum(self._log_pdf(idx, x))
            log_posteriors.append(log_prior + log_likelihood)
        return self._classes[np.argmax(log_posteriors)]

    def _log_pdf(self, class_idx, x):
        """Per-feature Gaussian log-density of x under class `class_idx`.

        Evaluated directly in log space. Taking np.log of the density
        instead underflows to log(0) = -inf for points far from the class
        mean, which makes every class score -inf and the argmax meaningless.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

    def _pdf(self, class_idx, x):
        """Per-feature Gaussian density of x under class `class_idx`.

        No longer used for prediction (see `_log_pdf`); kept so existing
        callers of this helper continue to work.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(- (x - mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

## 4. Evaluation and Visualization

In [None]:
# --- Fit both classifiers and score them against the reference labels ---

# KNN is distance based, so it is trained and evaluated on the
# standardized features.
knn = KNN(k=5)
knn.fit(x_train_norm, y_train)
knn_preds = knn.predict(x_test_norm)
knn_acc = np.mean(knn_preds == y_test)

# Naive Bayes is trained and evaluated on the unscaled features.
nb = NaiveBayes()
nb.fit(x_train, y_train)
nb_preds = nb.predict(x_test)
nb_acc = np.mean(nb_preds == y_test)

# Accuracy is the fraction of predictions equal to the reference label.
print(f"KNN Accuracy: {knn_acc:.4f}")
print(f"Naive Bayes Accuracy: {nb_acc:.4f}")

# --- Bar chart comparing the two accuracies ---
models = ['KNN', 'Naive Bayes']
accuracies = [knn_acc, nb_acc]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, accuracies, color=['blue', 'green'])
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')
ax.set_ylim(0, 1.0)
# Print each accuracy just above its bar.
for i, v in enumerate(accuracies):
    ax.text(i, v + 0.02, f"{v:.4f}", ha='center')
plt.show()