In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score, cross_validate
# Load the semicolon-separated accident dataset.
df = pd.read_csv(
    '../data/GeneralDatensatz18-21ohneGeo-mitLockdown_mitCorona.csv',
    sep=';',
)

# Predictor columns used by both classifiers.
feature_columns = [
    'UMONAT', 'USTUNDE', 'UWOCHENTAG', 'UART', 'USTRZUSTAND', 'BEZ',
    'UTYP1', 'ULICHTVERH',
    'IstRad', 'IstPKW', 'IstFuss', 'IstKrad', 'IstGkfz', 'IstSonstige',
    'LOCKDOWN', 'COVID',
]
X = df[feature_columns]

# Binary target: fatal and severe accidents versus minor ones.
# UKATEGORIE values 1 and 2 map to 1 (severe or fatal); everything else maps to 0 (minor).
severe_categories = [1, 2]
y = df['UKATEGORIE'].isin(severe_categories).astype(int)

# Cross-validation setup: 5 shuffled folds with a fixed seed so that every
# model and metric is evaluated on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

import matplotlib.pyplot as plt

# Accumulators for the mean cross-validated scores (F-beta and ROC AUC),
# one entry per tested class weight, for each of the two models.
f1_scores_log_reg, f1_scores_rf = [], []
roc_scores_log_reg, roc_scores_rf = [], []

# Candidate weights for the positive class (class 0 always keeps weight 1).
# Author's note: a slightly smaller range would do, since the scores stop changing.
weights = range(25)

# F-beta with beta = 2 weights recall higher than precision.
beta = 2
fbeta_scorer = make_scorer(fbeta_score, beta=beta)

def _mean_cv_scores(model, features, target, cv, scorer):
    """Cross-validate ``model`` once and return its mean F-beta and ROC AUC.

    Both metrics are computed from the same fitted folds via a single
    ``cross_validate`` call, so each model is trained only once per fold
    instead of once per metric.

    Parameters:
        model: unfitted scikit-learn classifier.
        features: feature matrix passed to ``cross_validate``.
        target: binary target vector.
        cv: cross-validation splitter.
        scorer: scorer object used for the F-beta metric.

    Returns:
        Tuple ``(mean_fbeta, mean_roc_auc)`` averaged over the folds.
    """
    results = cross_validate(
        model,
        features,
        target,
        cv=cv,
        scoring={'fbeta': scorer, 'roc_auc': 'roc_auc'},
    )
    return results['test_fbeta'].mean(), results['test_roc_auc'].mean()


for i in weights:
    # Class 0 keeps weight 1; only the positive class weight is varied.
    class_weight = {0: 1, 1: i}

    # Logistic regression
    log_reg = LogisticRegression(max_iter=1000, class_weight=class_weight)
    fbeta_log_reg, auc_log_reg = _mean_cv_scores(log_reg, X, y, kf, fbeta_scorer)
    f1_scores_log_reg.append(fbeta_log_reg)
    roc_scores_log_reg.append(auc_log_reg)

    # Random forest
    rf_clas = RandomForestClassifier(
        n_estimators=100, max_depth=5, random_state=42, class_weight=class_weight
    )
    # Fix: the ROC AUC is now computed for the random forest itself; previously
    # the logistic regression model was scored here by mistake.
    fbeta_rf, auc_rf = _mean_cv_scores(rf_clas, X, y, kf, fbeta_scorer)
    print(f"Weight: {i} F1 Score Random Forest (k-fold): {fbeta_rf}")

    f1_scores_rf.append(fbeta_rf)
    roc_scores_rf.append(auc_rf)

    
# Plot the mean F-beta and ROC AUC of both models against the class weight.
# Title and y-label name the metrics actually shown (F-beta and AUC, not F1).
fig, ax = plt.subplots(figsize=(10, 6))

score_curves = (
    (f1_scores_log_reg, 'Logistic Regression Fbeta'),
    (f1_scores_rf, 'Random Forest Fbeta'),
    (roc_scores_log_reg, 'Logistic Regression AUC'),
    (roc_scores_rf, 'Random Forest AUC'),
)
for scores, label in score_curves:
    ax.plot(weights, scores, marker='o', label=label)

ax.set_title('Fbeta- und AUC-Scores für verschiedene Gewichte')
ax.set_xlabel('Gewicht')
ax.set_ylabel('Score')
ax.legend()
ax.grid(True)
plt.show()

Weight: 0 F1 Score Random Forest (k-fold): 0.0
Weight: 1 F1 Score Random Forest (k-fold): 0.0
Weight: 2 F1 Score Random Forest (k-fold): 0.027125280723094492
Weight: 3 F1 Score Random Forest (k-fold): 0.20903599567062905
Weight: 4 F1 Score Random Forest (k-fold): 0.3470483431192305
Weight: 5 F1 Score Random Forest (k-fold): 0.40508855576285896
Weight: 6 F1 Score Random Forest (k-fold): 0.44846325707848367
Weight: 7 F1 Score Random Forest (k-fold): 0.49339236975929107
Weight: 8 F1 Score Random Forest (k-fold): 0.49941402834242554
Weight: 9 F1 Score Random Forest (k-fold): 0.49711183207185095
Weight: 10 F1 Score Random Forest (k-fold): 0.4904704749559997
Weight: 11 F1 Score Random Forest (k-fold): 0.4824361694421631
Weight: 12 F1 Score Random Forest (k-fold): 0.47812259599672113
Weight: 13 F1 Score Random Forest (k-fold): 0.4761801025577081
Weight: 14 F1 Score Random Forest (k-fold): 0.47597380162804387


KeyboardInterrupt: 