In [3]:
import pandas as pd

# Relative path of the Titanic passenger CSV used by this notebook.
CSV_PATH = "../data/titanic.csv"

# Read the CSV into a DataFrame; the bare name on the last line makes the
# notebook render the frame as the cell output.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
import pandas as pd

# Load the same file the other cells read. The directory is spelled in
# lower case ('data'); the previous 'Data' spelling only resolves on
# case-insensitive file systems.
df = pd.read_csv('../data/titanic.csv')
col = 'Embarked'

# Absolute frequency of every value in the column; dropna=False keeps the
# missing values as their own row.
anzahl = df[col].value_counts(dropna=False)
# Share of each value in percent, derived from the counts so the column is
# only tallied once and both series share exactly the same index.
prozent = anzahl / anzahl.sum() * 100

# Combine both series into one overview table and format the percentage as
# text with two decimals and a trailing percent sign.
uebersicht = pd.DataFrame({'Anzahl': anzahl, 'Prozent': prozent})
uebersicht['Prozent'] = uebersicht['Prozent'].round(2).astype(str) + '%'

print(uebersicht)

          Anzahl Prozent
Embarked                
S            644  72.28%
C            168  18.86%
Q             77   8.64%
NaN            2   0.22%


In [14]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Step 1: load and prepare the data.
# The samplers below need numeric input, so the text column 'Sex' is
# encoded as 0/1 and only a handful of numeric columns are kept as an
# example. Rows with a missing value in any kept column are dropped.
df = (
    pd.read_csv('../data/titanic.csv')
    .assign(Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}))
    .loc[:, ['Survived', 'Pclass', 'Sex', 'Age', 'Fare']]
    .dropna()
)

# Separate the feature matrix (X) from the target column (y).
y = df['Survived']
X = df.drop(columns='Survived')

print(f"Original Verteilung: {y.value_counts().to_dict()}")

# --- 1. UNDERSAMPLING (RandomUnderSampler) ---
# Randomly removes rows from the majority class.
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)
print(f"Nach Undersampling: {y_under.value_counts().to_dict()}")

# --- 2. OVERSAMPLING (RandomOverSampler) ---
# Randomly duplicates rows of the minority class.
ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X, y)
print(f"Nach Oversampling:  {y_over.value_counts().to_dict()}")

# --- 3. SMOTE (Synthetic Minority Over-sampling Technique) ---
# Generates new, synthetic data points for the minority class.
# NOTE(review): 'Pclass' and 'Sex' are categorical codes; synthetic rows may
# presumably receive in-between values for them - confirm whether a
# categorical-aware variant is wanted here.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
print(f"Nach SMOTE:         {y_smote.value_counts().to_dict()}")

Original Verteilung: {0: 424, 1: 290}
Nach Undersampling: {0: 290, 1: 290}
Nach Oversampling:  {0: 424, 1: 424}
Nach SMOTE:         {0: 424, 1: 424}


In [23]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Input file and the name of the label column to balance on.
path = "../data/titanic.csv"
target = "Survived"

# Load the full table and split it into features and target.
df = pd.read_csv(path)
X = df.drop(columns=[target])
y = df[target]

# Random undersampling with a fixed seed; features and target are glued
# back together column-wise and written without the index.
under_sampler = RandomUnderSampler(random_state=42)
X_u, y_u = under_sampler.fit_resample(X, y)
undersampled = pd.concat([X_u, y_u], axis=1)
undersampled.to_csv("../data/titanic_undersampled.csv", index=False)

# Random oversampling with the same seed, exported the same way.
over_sampler = RandomOverSampler(random_state=42)
X_o, y_o = over_sampler.fit_resample(X, y)
oversampled = pd.concat([X_o, y_o], axis=1)
oversampled.to_csv("../data/titanic_oversampled.csv", index=False)

print("Fertig: titanic_undersampled.csv & titanic_oversampled.csv")

Fertig: titanic_undersampled.csv & titanic_oversampled.csv
