In [1]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, fbeta_score
import pandas as pd

train_data_loaded = pd.read_csv('../data/train_data_2024-08-01.csv')

X = train_data_loaded.drop(columns=['UKATEGORIE'])
y = train_data_loaded['UKATEGORIE']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# KFold-Konfiguration
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Definieren des F-beta-Scores mit beta = 2
beta = 2
fbeta_scorer = make_scorer(fbeta_score, beta=beta)

# Initialisierung der Modelle
knn = KNeighborsClassifier(n_neighbors=6, leaf_size=41, weights='distance', p=1)
bal_rf = BalancedRandomForestClassifier(n_estimators=2060, max_depth=16, max_features='sqrt',
                                        min_samples_leaf=2, min_samples_split=10, random_state=42, replacement=True, bootstrap=False, class_weight={0:1, 1:2})
log_reg = LogisticRegression(C=1, max_iter=1700, penalty='l2', solver='liblinear',
                             tol=0.0001, random_state=42, class_weight={0:1, 1:9})
rf = RandomForestClassifier(n_estimators=344, min_samples_split=400, min_samples_leaf=8,
                            max_features='log2', max_depth=15, random_state=42, class_weight={0:1, 1:9})

# SMOTE
sm = SMOTE(random_state=42)

# Score Arrays für die Modelle
scores = {
    'KNN': [],
    'BalancedRandomForest': [],
    'LogisticRegression': [],
    'RandomForest': []
}

# Fit und Evaluation für jedes Modell mit Cross-Validation
for train_index, test_index in kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Anwendung von SMOTE auf das Trainingsset
    X_res_fold, y_res_fold = sm.fit_resample(X_train_fold, y_train_fold)

    # KNN
    knn.fit(X_res_fold, y_res_fold)
    y_pred_knn = knn.predict(X_val_fold)
    score_knn = fbeta_score(y_val_fold, y_pred_knn, beta=beta, average='weighted')
    scores['KNN'].append(score_knn)

    # Balanced RandomForestClassifier
    bal_rf.fit(X_res_fold, y_res_fold)
    y_pred_bal_rf = bal_rf.predict(X_val_fold)
    score_bal_rf = fbeta_score(y_val_fold, y_pred_bal_rf, beta=beta, average='weighted')
    scores['BalancedRandomForest'].append(score_bal_rf)

    # Logistic Regression
    log_reg.fit(X_res_fold, y_res_fold)
    y_pred_log_reg = log_reg.predict(X_val_fold)
    score_log_reg = fbeta_score(y_val_fold, y_pred_log_reg, beta=beta)
    scores['LogisticRegression'].append(score_log_reg)

    # Random Forest
    rf.fit(X_res_fold, y_res_fold)
    y_pred_rf = rf.predict(X_val_fold)
    score_rf = fbeta_score(y_val_fold, y_pred_rf, beta=beta)
    scores['RandomForest'].append(score_rf)

# Durchschnittliche F-beta-Scores berechnen
averages = {model: np.mean(score_list) for model, score_list in scores.items()}

# Ergebnisse ausgeben
print("Durchschnittliche F-beta-Scores:")
for model, avg_score in averages.items():
    print(f"{model}: {avg_score:.4f}")

# Ergebnisse auf dem Test-Set berechnen
X_res_test, y_res_test = sm.fit_resample(X_train, y_train)

# KNN
knn.fit(X_res_test, y_res_test)
y_pred_test_knn = knn.predict(X_test)
score_test_knn = fbeta_score(y_test, y_pred_test_knn, beta=beta, average='weighted')

# Balanced RandomForestClassifier
bal_rf.fit(X_res_test, y_res_test)
y_pred_test_bal_rf = bal_rf.predict(X_test)
score_test_bal_rf = fbeta_score(y_test, y_pred_test_bal_rf, beta=beta, average='weighted')

# Logistic Regression
log_reg.fit(X_res_test, y_res_test)
y_pred_test_log_reg = log_reg.predict(X_test)
score_test_log_reg = fbeta_score(y_test, y_pred_test_log_reg, beta=beta)

# Random Forest
rf.fit(X_res_test, y_res_test)
y_pred_test_rf = rf.predict(X_test)
score_test_rf = fbeta_score(y_test, y_pred_test_rf, beta=beta)

print("\nF-beta-Scores auf dem Test-Set:")
print(f"KNN: {score_test_knn:.4f}")
print(f"BalancedRandomForest: {score_test_bal_rf:.4f}")
print(f"LogisticRegression: {score_test_log_reg:.4f}")
print(f"RandomForest: {score_test_rf:.4f}")

  warn(
  warn(
  warn(
  warn(
  warn(


Durchschnittliche F-beta-Scores:
KNN: 0.6750
BalancedRandomForest: 0.6845
LogisticRegression: 0.4747
RandomForest: 0.4746


  warn(



F-beta-Scores auf dem Test-Set:
KNN: 0.6791
BalancedRandomForest: 0.6824
LogisticRegression: 0.4817
RandomForest: 0.4799
