In [1]:

import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold, train_test_split

# Evaluate a Balanced Random Forest in two stages:
#   1. stratified 5-fold cross-validation on the train/validation CSV,
#   2. a final fit on that whole CSV, scored once on the separate test CSV.
# Both stages report the F-beta score with the same beta.

# Load the combined training/validation data and split off the target column.
train_val_data = pd.read_csv('../data/train_data_2024-08-01.csv')
X_train_val = train_val_data.drop(columns=['UKATEGORIE'])
y_train_val = train_val_data['UKATEGORIE']

# beta > 1 makes the F-beta score weight recall more heavily than precision.
FBETA_BETA = 2

# Single source of truth for the estimator configuration, so the per-fold
# models and the final model cannot drift apart.
BRF_PARAMS = {
    'n_estimators': 2060,
    'max_depth': 16,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 10,
    'random_state': 42,
    'replacement': True,
    'bootstrap': False,
    'class_weight': {0: 1, 1: 2},  # class 1 gets twice the weight of class 0
    'sampling_strategy': 'all',
}

# Stratified K-fold setup; the fixed seed makes the fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_val_fbeta_scores = []  # one validation F-beta score per fold

for train_index, val_index in skf.split(X_train_val, y_train_val):
    # split() yields positional indices, hence .iloc.
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    # Fit a new model on this fold's training part only.
    model = BalancedRandomForestClassifier(**BRF_PARAMS)
    model.fit(X_train, y_train)

    # Score the fold on its held-out validation part.
    y_val_pred = model.predict(X_val)
    val_fbeta = fbeta_score(y_val, y_val_pred, beta=FBETA_BETA)
    train_val_fbeta_scores.append(val_fbeta)

# Mean validation F-beta score across all folds.
average_train_val_fbeta = sum(train_val_fbeta_scores) / len(train_val_fbeta_scores)

# Load the separate test data set (not used anywhere above).
test_data = pd.read_csv('../data/test_data_nichtAnfassen.csv')
X_test = test_data.drop(columns=['UKATEGORIE'])
y_test = test_data['UKATEGORIE']

# Train the final model on the complete train/validation data. A fresh
# estimator is created explicitly instead of re-fitting whatever object the
# last CV fold left behind; the configuration and seed are the same.
model = BalancedRandomForestClassifier(**BRF_PARAMS)
model.fit(X_train_val, y_train_val)

# Predict on the test data and score it with the same metric as the folds.
y_test_pred = model.predict(X_test)
test_fbeta = fbeta_score(y_test, y_test_pred, beta=FBETA_BETA)

# Report both scores.
print(f'Durchschnittlicher Train-Validation F-beta Score: {average_train_val_fbeta}')
print(f'Test F-beta Score: {test_fbeta}')

Durchschnittlicher Train-Validation F-beta Score: 0.49176958782013636
Test F-beta Score: 0.49596309111880044
