In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score

In [2]:
# Load the accident data set (semicolon-separated CSV) and derive features / target.
df = pd.read_csv(
    '../data/GeneralDatensatz18-21ohneGeo-mitLockdown_mitCorona_mitF.csv',
    sep=';',
)

# Columns used as model inputs.
FEATURE_COLUMNS = [
    'UMONAT', 'USTUNDE', 'UWOCHENTAG', 'UART', 'USTRZUSTAND', 'BEZ', 'UTYP1',
    'ULICHTVERH', 'IstRad', 'IstPKW', 'IstFuss', 'IstKrad', 'IstGkfz',
    'IstSonstige', 'LOCKDOWN', 'COVID',
]
X = df.loc[:, FEATURE_COLUMNS]

# Binary target: 1 when UKATEGORIE is 1 or 2, otherwise 0.
# Per the original note: 1 = severe or fatal accident, 0 = minor accident.
SEVERE_OR_FATAL_CODES = [1, 2]
y = df['UKATEGORIE'].isin(SEVERE_OR_FATAL_CODES).astype(int)


In [3]:
# Stratified 5-fold splitter used by the cross-validation loop.
skf = StratifiedKFold(n_splits=5)

# Per-fold F-beta scores, one list each for the trained model, the
# class-frequency-weighted random baseline and the majority-class baseline.
fbetas_rf, fbetas_random, fbetas_mehrheit = [], [], []

In [4]:
# F-beta configuration: beta > 1 weights recall more heavily than precision.
beta = 2
# Scorer object wrapping fbeta_score with the beta chosen above.
fbeta_scorer = make_scorer(score_func=fbeta_score, beta=beta)

In [5]:
# Stratified K-fold cross-validation: logistic regression compared against two
# naive baselines (class-frequency-weighted random guessing, always class 0).

# Start from empty result lists so that re-running this cell does not append a
# second set of fold scores to the results of an earlier run.
fbetas_rf = []
fbetas_random = []
fbetas_mehrheit = []

# One local generator for all folds: reproducible on every run of this cell
# without reseeding NumPy's global random state inside the loop.
rng = np.random.default_rng(42)

# Split on the selected feature matrix X and the binary target y.
# Slicing df positionally (all-but-last column as features, last column as
# target) bypasses the feature selection, takes whatever column happens to be
# last as the target, and passes NaN-containing columns to the model.
for counter, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the logistic regression model; class 1 is up-weighted 9:1.
    # (Result names keep their historical "_rf" suffix because later cells read them.)
    model = LogisticRegression(
        max_iter=150,
        C=0.069,
        solver='lbfgs',
        penalty='l2',
        tol=0.001,
        class_weight={0: 1, 1: 9},
    )
    model.fit(X_train, y_train)

    # Predict on the held-out fold.
    y_pred_rf = model.predict(X_test)

    # Score the model on this fold.
    # NOTE(review): average='weighted' averages the per-class F-beta by class
    # support, whereas fbeta_scorer (built with make_scorer) uses the default
    # average — confirm which variant is intended for reporting.
    fbeta_rf = fbeta_score(y_test, y_pred_rf, beta=beta, average='weighted')
    print(f"Fold nummer {counter}: score ist {fbeta_rf} ")
    fbetas_rf.append(fbeta_rf)

    # Baseline 1: random predictions drawn with the class frequencies observed
    # in this test fold.
    class_probabilities = y_test.value_counts(normalize=True).to_dict()
    y_pred_random_weighted = rng.choice(
        list(class_probabilities.keys()),
        size=y_test.shape,
        p=list(class_probabilities.values()),
    )
    fbeta_random_weighted = fbeta_score(
        y_test, y_pred_random_weighted, beta=beta, average='weighted'
    )
    fbetas_random.append(fbeta_random_weighted)

    # Baseline 2: always predict class 0 (assumed to be the majority class —
    # TODO confirm against the class distribution of y).
    y_pred_zeros = np.zeros(y_test.shape, dtype=int)
    fbeta_mehrheit = fbeta_score(y_test, y_pred_zeros, beta=beta, average='weighted')
    fbetas_mehrheit.append(fbeta_mehrheit)
    
    

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Mean F-beta score over all folds for the model and for both baselines.
mean_scores_rf, mean_scores_random, mean_scores_mehrheit = (
    np.mean(fold_scores)
    for fold_scores in (fbetas_rf, fbetas_random, fbetas_mehrheit)
)

In [None]:
# Report the mean F-beta score of the model and of the two baselines.
# The model trained in the cross-validation loop is a LogisticRegression, so it
# is labelled as such (the variable keeps its historical "_rf" suffix).
print(f"Logistic Regression Mean score: {mean_scores_rf}")
print(f"Weighted Random Prediction Mean score: {mean_scores_random}")
print(f"Weighted Mehrheit Prediction Mean score: {mean_scores_mehrheit}")


In [None]:

# Print classification reports for the last fold only: y_test and the
# prediction arrays still hold the values from the final loop iteration.
# The model trained in the cross-validation loop is a LogisticRegression, so
# the heading names it accordingly (the variable keeps its "_rf" suffix).
print("Logistic Regression Classification Report (Last Fold):")
print(classification_report(y_test, y_pred_rf))

print("Weighted Random Prediction Classification Report (Last Fold):")
print(classification_report(y_test, y_pred_random_weighted))