In [1]:
# Gruppe 4: Survived ausgleichen
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE  # SMOTE importieren

path = "../data/"
file_path = path + "titanic.csv"
target = "Survived"


def prepare_features(raw):
    """Return a numeric, gap-free copy of the Titanic frame for resampling.

    Steps, in order:
      1. Drop the free-text / identifier columns (ignored if absent).
      2. Fill missing ``Embarked`` values with the most frequent port.
      3. One-hot encode ``Sex`` and ``Embarked`` with ``drop_first=True``.
      4. Fill missing ``Age`` values with the column mean.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least ``Sex``, ``Embarked`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw`` itself is not modified.
    """
    # 1. Drop columns that are pure text (names, ticket numbers, cabin).
    prepared = raw.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'], errors='ignore')

    # 2. get_dummies encodes a missing category as all-zero dummies. Combined
    #    with drop_first=True that is indistinguishable from the dropped first
    #    level, so a missing port would silently be recorded as a real one.
    #    Impute the most frequent port before encoding instead.
    if prepared['Embarked'].isna().any():
        most_common = prepared['Embarked'].mode()
        if not most_common.empty:
            prepared['Embarked'] = prepared['Embarked'].fillna(most_common.iloc[0])

    # 3. Turn the text categories (sex, port) into indicator columns.
    prepared = pd.get_dummies(prepared, columns=['Sex', 'Embarked'], drop_first=True)

    # 4. Fill missing ages, otherwise SMOTE rejects the input.
    prepared['Age'] = prepared['Age'].fillna(prepared['Age'].mean())
    return prepared


# In a notebook kernel __name__ is "__main__", so this runs exactly like a
# plain cell. The guard only keeps the file I/O from running when the helper
# above is imported from elsewhere (e.g. for a quick test).
if __name__ == "__main__":
    df = pd.read_csv(file_path)

    # --- PREPARATION FOR SMOTE (mandatory) ---
    # SMOTE needs clean, numeric data without gaps.
    df = prepare_features(df)

    X, y = df.drop(columns=[target]), df[target]

    # Fail before anything is written: the random samplers would otherwise
    # produce their files and only the SMOTE step would stop on the NaNs.
    columns_with_gaps = X.columns[X.isna().any()].tolist()
    if columns_with_gaps:
        raise ValueError(f"Missing values remain in columns: {columns_with_gaps}")

    # 1. Undersample (shrink the majority class)
    X_u, y_u = RandomUnderSampler(random_state=42).fit_resample(X, y)
    pd.concat([X_u, y_u], axis=1).to_csv(path + "titanic_undersampled.csv", index=False)

    # 2. Oversample (duplicate minority-class rows)
    X_o, y_o = RandomOverSampler(random_state=42).fit_resample(X, y)
    pd.concat([X_o, y_o], axis=1).to_csv(path + "titanic_oversampled.csv", index=False)

    # 3. SMOTE (generate new rows from nearest neighbours)
    # NOTE(review): SMOTE interpolates every column, so the one-hot dummies and
    # integer-valued columns such as Pclass/Parch are treated as continuous.
    # SMOTENC may be the better fit for this mix - confirm.
    smote = SMOTE(random_state=42)
    X_s, y_s = smote.fit_resample(X, y)
    pd.concat([X_s, y_s], axis=1).to_csv(path + "titanic_smote.csv", index=False)

    print("Fertig: titanic_undersampled.csv, titanic_oversampled.csv & titanic_smote.csv")

Fertig: titanic_undersampled.csv, titanic_oversampled.csv & titanic_smote.csv


In [2]:
import pandas as pd
import numpy as np

path = "../data/"
files = ["titanic.csv"] #, "titanic_oversampled.csv", "titanic_undersampled.csv"


def gaussian_log_likelihood(x, mean, var):
    """Log-density of ``x`` under independent Gaussians, summed over features.

    Parameters
    ----------
    x : numpy.ndarray
        One sample of shape (d,) or a batch of shape (n, d).
    mean, var : numpy.ndarray
        Per-feature mean and variance, shape (d,).

    Returns
    -------
    A scalar for a single sample, or an array of shape (n,) for a batch.
    """
    # Summing over the last axis keeps the single-sample result unchanged
    # while also allowing a whole matrix to be scored at once.
    return -0.5 * np.sum(np.log(2 * np.pi * var) + ((x - mean) ** 2) / var, axis=-1)


def fit_gaussian_nb(X, y, var_eps=1e-6):
    """Estimate per-class feature means, variances and class priors.

    Parameters
    ----------
    X : numpy.ndarray of shape (n, d)
    y : numpy.ndarray of shape (n,)
    var_eps : float
        Added to every variance so a constant feature cannot cause a
        division by zero.

    Returns
    -------
    (classes, means, variances, priors) where ``classes`` is the sorted array
    of distinct labels and the other three are dicts keyed by label.
    """
    classes = np.unique(y)
    means, variances, priors = {}, {}, {}
    for c in classes:
        X_c = X[y == c]
        means[c] = X_c.mean(axis=0)
        variances[c] = X_c.var(axis=0) + var_eps
        priors[c] = X_c.shape[0] / X.shape[0]
    return classes, means, variances, priors


def predict_gaussian_nb(X, classes, means, variances, priors):
    """Return, for every row of ``X``, the label with the highest log-posterior."""
    log_posteriors = np.column_stack([
        np.log(priors[c]) + gaussian_log_likelihood(X, means[c], variances[c])
        for c in classes
    ])
    return classes[np.argmax(log_posteriors, axis=1)]


# In a notebook kernel __name__ is "__main__", so this runs exactly like a
# plain cell. The guard only keeps the file I/O from running when the helpers
# above are imported from elsewhere (e.g. for a quick test).
if __name__ == "__main__":
    for file_name in files:

        # --------------------------------------------------
        # 1. Load data
        # --------------------------------------------------
        df = pd.read_csv(path + file_name, delimiter=',')

        # --------------------------------------------------
        # 2. Inspect & select columns
        # --------------------------------------------------
        # NOTE(review): the resampled CSVs named in the comment on `files` are
        # written after one-hot encoding (Sex -> Sex_male), so selecting the
        # raw "Sex" column would presumably raise KeyError for them - confirm
        # before enabling them.
        features = ["Pclass", "Sex", "Age", "Parch"]
        target = "Survived"

        # .copy() so the column assignments below modify an independent frame
        # rather than a slice of the loaded one (avoids SettingWithCopyWarning).
        df = df[features + [target]].copy()
        print(df)

        # --------------------------------------------------
        # 3. Preprocessing (part 1)
        # --------------------------------------------------

        # Encode Sex: male=0, female=1
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
        if df["Sex"].isna().any():
            # Unmapped or missing labels would turn into NaN and silently
            # poison the class means and variances.
            raise ValueError(f"{file_name}: 'Sex' contains values other than 'male'/'female'")

        # --------------------------------------------------
        # 4. Train/test split (80/20)
        # --------------------------------------------------
        np.random.seed(42)
        indices = np.random.permutation(len(df))
        train_size = int(0.8 * len(df))

        train_idx = indices[:train_size]
        test_idx = indices[train_size:]

        # --------------------------------------------------
        # 3b. Preprocessing (part 2): impute Age
        # --------------------------------------------------
        # The fill value is taken from the training rows only, so no
        # information from the test rows leaks into preprocessing.
        age_fill = df["Age"].iloc[train_idx].mean().round()
        df["Age"] = df["Age"].fillna(age_fill)

        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]

        X_train = train_df[features].values
        y_train = train_df[target].values

        X_test = test_df[features].values
        y_test = test_df[target].values

        # --------------------------------------------------
        # 5. Naive Bayes (Gaussian) - training
        # --------------------------------------------------
        classes, means, variances, priors = fit_gaussian_nb(X_train, y_train)

        # --------------------------------------------------
        # 6. Classify the test data
        # --------------------------------------------------
        y_pred = predict_gaussian_nb(X_test, classes, means, variances, priors)

        # --------------------------------------------------
        # 7. Evaluate
        # --------------------------------------------------
        accuracy = np.mean(y_pred == y_test)
        print("Accuracy:", accuracy)


     Pclass     Sex   Age  Parch  Survived
0         3    male  22.0      0         0
1         1  female  38.0      0         1
2         3  female  26.0      0         1
3         1  female  35.0      0         1
4         3    male  35.0      0         0
..      ...     ...   ...    ...       ...
886       2    male  27.0      0         0
887       1  female  19.0      0         1
888       3  female   NaN      2         0
889       1    male  26.0      0         1
890       3    male  32.0      0         0

[891 rows x 5 columns]
Accuracy: 0.7988826815642458
