In [2]:
import pandas as pd
import numpy as np

# --------------------------------------------------
# 1. Load data
# --------------------------------------------------
# The CSV location is kept as separate directory and file-name parts.
path = "../Data/"
fname = "titanic.csv"
fpath = f"{path}{fname}"
# The comma is passed explicitly as the field separator.
df = pd.read_csv(fpath, sep=",")

# --------------------------------------------------
# 2. Inspect & select data
# --------------------------------------------------
features = ["Pclass", "Sex", "Age", "Parch"]
target = "Survived"

# Take an explicit copy of the column subset: the preprocessing step assigns
# to columns of `df`, and writing into the result of a selection can raise
# pandas' SettingWithCopyWarning.
df = df[features + [target]].copy()
print(df)
# --------------------------------------------------
# 3. Preprocessing
# --------------------------------------------------

# Encode Sex numerically: male=0, female=1
sex_codes = {"male": 0, "female": 1}
df["Sex"] = df["Sex"].map(sex_codes)

# Replace missing ages with the rounded mean age.
# NOTE(review): the mean is taken over every row of df, i.e. before the
# train/test split further down — confirm this is intended.
mean_age = df["Age"].mean().round()
df["Age"] = df["Age"].fillna(mean_age)

# --------------------------------------------------
# 4. Train/test split (80/20)
# --------------------------------------------------
# Seed NumPy's global RNG so the shuffle is reproducible.
np.random.seed(42)
n_rows = len(df)
indices = np.random.permutation(n_rows)
train_size = int(0.8 * n_rows)

# The first 80% of the shuffled positions go to training, the rest to testing.
train_idx, test_idx = indices[:train_size], indices[train_size:]
train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

# Feature matrices and label vectors as NumPy arrays.
X_train, y_train = train_df[features].values, train_df[target].values
X_test, y_test = test_df[features].values, test_df[target].values

# --------------------------------------------------
# 5. Naive Bayes (Gaussian) – training
# --------------------------------------------------
classes = np.unique(y_train)

# Per-class parameters, each keyed by class label.
means, variances, priors = {}, {}, {}
n_train = X_train.shape[0]

for label in classes:
    class_rows = X_train[y_train == label]
    # Feature-wise mean and variance of the rows belonging to this class.
    means[label] = class_rows.mean(axis=0)
    # A small constant keeps the variance strictly positive (stability).
    variances[label] = class_rows.var(axis=0) + 1e-6
    # Fraction of training rows that carry this label.
    priors[label] = class_rows.shape[0] / n_train

# --------------------------------------------------
# 6. Classify test data
# --------------------------------------------------
def gaussian_log_likelihood(x, mean, var):
    """Return the summed Gaussian log-density of ``x`` over all elements.

    Each element of ``x`` is scored under a normal distribution with the
    matching element of ``mean`` and ``var``; the per-element terms are then
    added up into a single value.
    """
    log_norm = np.log(2 * np.pi * var)
    scaled_sq_dev = ((x - mean) ** 2) / var
    return -0.5 * np.sum(log_norm + scaled_sq_dev)

predictions = []

for sample in X_test:
    # Unnormalised log-posterior per class: log prior + log likelihood.
    scores = [
        np.log(priors[label])
        + gaussian_log_likelihood(sample, means[label], variances[label])
        for label in classes
    ]
    # Predict the class whose score is largest.
    predictions.append(classes[np.argmax(scores)])

y_pred = np.array(predictions)

# --------------------------------------------------
# 7. Evaluate result
# --------------------------------------------------
# Share of test rows whose predicted label equals the true label.
matches = y_pred == y_test
accuracy = matches.mean()
print("Accuracy:", accuracy)

     Pclass     Sex   Age  Parch  Survived
0         3    male  22.0      0         0
1         1  female  38.0      0         1
2         3  female  26.0      0         1
3         1  female  35.0      0         1
4         3    male  35.0      0         0
..      ...     ...   ...    ...       ...
886       2    male  27.0      0         0
887       1  female  19.0      0         1
888       3  female   NaN      2         0
889       1    male  26.0      0         1
890       3    male  32.0      0         0

[891 rows x 5 columns]
Accuracy: 0.7988826815642458
