In [2]:
import numpy as np
import pandas as pd

In [3]:
def load_data(data_dir='titanic'):
    """Load the Titanic CSV files and return NumPy feature/label arrays.

    Reads ``train.csv``, ``test.csv`` and ``gender_submission.csv`` from
    *data_dir*. The submission file is merged onto the test rows by
    ``PassengerId`` to give them a ``Survived`` column.

    Args:
        data_dir: Directory containing the three CSV files. Defaults to
            ``'titanic'`` (relative to the current working directory).

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The ``X`` arrays hold
        the columns ``Pclass, Sex, Age, Fare, FamilySize`` in that order; the
        ``y`` arrays hold the ``Survived`` column of the matching frame.
    """
    from pathlib import Path

    data_dir = Path(data_dir)
    train = pd.read_csv(data_dir / 'train.csv')
    test = pd.read_csv(data_dir / 'test.csv')
    submission = pd.read_csv(data_dir / 'gender_submission.csv')

    # Default merge is an inner join: test rows without a matching
    # PassengerId in the submission file are dropped.
    # NOTE(review): y_test is taken from gender_submission.csv -- confirm that
    # file holds real outcomes rather than a baseline prediction.
    test = test.merge(submission, on='PassengerId')

    def preprocess(df):
        """Return a copy of *df* with gaps filled and categoricals encoded."""
        df = df.copy()
        # Each frame is imputed with its own medians (train and test are
        # processed independently).
        # NOTE(review): consider reusing the training medians for the test set.
        df['Age'] = df['Age'].fillna(df['Age'].median())
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())
        df['Embarked'] = df['Embarked'].fillna('S')

        # Values outside these mappings become NaN.
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
        df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
        # Number of relatives aboard; the passenger is not counted.
        df['FamilySize'] = df['SibSp'] + df['Parch']
        return df

    train = preprocess(train)
    test = preprocess(test)

    # 'Embarked' is encoded above but is not part of the feature matrix.
    features = ['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize']
    X_train = train[features].values
    y_train = train['Survived'].values
    X_test = test[features].values
    y_test = test['Survived'].values

    return X_train, y_train, X_test, y_test

In [4]:
def naive_bayes(X_train, y_train):
    """Fit per-class Gaussian parameters for every feature column.

    Args:
        X_train: Feature matrix, indexed by a boolean row mask built from
            ``y_train``.
        y_train: Class label for each row of ``X_train``.

    Returns:
        Dict mapping each distinct label (in ``np.unique`` order) to a tuple
        ``(means, stds)`` of per-feature statistics for that class.
    """

    def _class_stats(label):
        # Rows belonging to this class only.
        members = X_train[y_train == label]
        mu = np.mean(members, axis=0)
        sigma = np.std(members, axis=0)
        # A constant feature has zero spread; substitute a tiny value so the
        # parameters stay usable as divisors.
        sigma = np.where(sigma == 0, 1e-9, sigma)
        return mu, sigma

    return {label: _class_stats(label) for label in np.unique(y_train)}

def predict_naive_bayes(model, X):
    """Predict the most likely class for each sample under a Gaussian model.

    Every class is scored with the summed per-feature Gaussian
    log-likelihood. No class prior is applied, so classes are compared on
    likelihood alone. Ties go to the class that appears first in ``model``.

    Args:
        model: Dict mapping class label to ``(means, stds)`` arrays of
            per-feature parameters, as returned by ``naive_bayes``.
        X: Samples to classify, one row per sample.

    Returns:
        NumPy array with one predicted label per row of ``X``.

    Raises:
        ValueError: If ``X`` has samples but ``model`` has no classes.
    """
    X = np.asarray(X)
    if len(X) == 0:
        return np.array([])
    if X.ndim == 1:
        # A 1-D input is read as one scalar per sample, matching what
        # row-by-row iteration over it would yield.
        X = X[:, np.newaxis]

    labels = list(model)
    if not labels:
        raise ValueError("model contains no classes")

    # scores[i, j] is the log-likelihood of sample j under class i.
    scores = np.empty((len(labels), len(X)))
    for i, (means, stds) in enumerate(model.values()):
        variances = stds ** 2
        # The normalising term depends only on the class, so it is computed
        # once per class instead of once per sample. The 1e-9 guards against
        # log(0) and division by zero.
        log_norm = -0.5 * np.sum(np.log(2 * np.pi * variances + 1e-9))
        scores[i] = log_norm - np.sum(
            (X - means) ** 2 / (2 * variances + 1e-9), axis=1
        )

    # argmax returns the first maximum, so ties resolve to the earlier class.
    return np.array(labels)[np.argmax(scores, axis=0)]