In [94]:
import numpy as np
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report
from collections import Counter
import pandas as pd
from sklearn.model_selection import StratifiedKFold,  cross_val_score
from sklearn.metrics import make_scorer, fbeta_score

In [95]:
# Lade Datensatz
train_data_loaded = pd.read_csv('../data/train_data_2024-08-01.csv')
X = train_data_loaded.drop(columns=['UKATEGORIE'])
y = train_data_loaded['UKATEGORIE']


In [96]:
# StratifiedKFold initialisieren
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Variablen für Ergebnisse
fbetas_rf = []
fbetas_random = []
fbetas_mehrheit = []

In [97]:
# Liste von beta-Werten
beta_values = [0.7, 0.9, 1, 2, 3, 5, 10, 50, 10000]

# Variablen für Ergebnisse
results = {}
results_rf = {}
results_random = {}
results_mehrheit = {}


In [98]:
for beta in beta_values:
    fbeta_scorer = make_scorer(fbeta_score, beta=beta)
    model_logreg = LogisticRegression(C=1, max_iter=1700, penalty='l2', solver='liblinear', tol=0.0001, random_state=42, class_weight={0: 1, 1: 9})
    logreg_scores = cross_val_score(model_logreg, X, y, cv=skf, scoring=fbeta_scorer)
    results[beta] = logreg_scores.mean()

    print(f'LogReg F betas für beta = {beta}: {logreg_scores}')
    print(f'LogReg F-beta Score (mean) für beta = {beta}: {logreg_scores.mean()}')

    # Variablen für Ergebnisse
    fbetas_rf = []
    fbetas_random = []
    fbetas_mehrheit = []
    
    counter = 1
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Drucke die ersten 10 values von y_train
        print(f"Die ersten 20 Werte von y_train für beta = {beta}:               ", y_train.values[:20])
        # Drucke die ersten 10 values von y_test
        print(f"Die ersten 20 Werte von y_test für beta = {beta}:                ", y_test.values[:20])
        
        # Random Forest Modell trainieren
        model_logreg.fit(X_train, y_train)
        y_pred_rf = model_logreg.predict(X_test)
        print(f"Ersten 20 Vorhersagen mit dem Modell für beta = {beta}:          ", y_pred_rf[:20])
        
        # Zufällige Vorhersagen basierend auf den Klassenwahrscheinlichkeiten erstellen
        class_counts = Counter(y_test)
        total_samples = len(y_test)
        class_probabilities = {cls: count / total_samples for cls, count in class_counts.items()}
        
        np.random.seed(42)
        y_pred_random_weighted = np.random.choice(
            list(class_probabilities.keys()),
            size=y_test.shape,
            p=list(class_probabilities.values())
        )
        
        # Drucke die ersten 10 Werte
        print(f"Ersten 20 Vorhersagen mit dem _random_weighted für beta = {beta}:", y_pred_random_weighted[:20])
        
        # Leistung der gewichteten zufälligen Vorhersagen bewerten
        fbeta_random_weighted = fbeta_score(y_test, y_pred_random_weighted, beta=beta, average='weighted')
      
        fbetas_random.append(fbeta_random_weighted)
        
        # Immer Mehrheitsklasse vorhersagen
        y_pred_majority = np.full(y_test.shape, y_train.mode().iloc[0])
        
        # Drucke die ersten 10 Werte
        print(f"Ersten 20 Vorhersagen mit dem y_pred_majority für beta = {beta}: ", y_pred_majority[:20])
        
        # Leistung der Vorhersagen der Mehrheitsklasse bewerten
        fbeta_mehrheit = fbeta_score(y_test, y_pred_majority, beta=beta, average='weighted')
        
       # print(f"Fold Nummer {counter}: fbeta_random_weighted - Score ist {fbeta_random_weighted}")
       # print(f"Fold Nummer {counter}: fbeta_mehrheit - Score ist {fbeta_mehrheit}")
        fbetas_mehrheit.append(fbeta_mehrheit)
        
        counter += 1
        
    results_rf[beta] = np.mean(logreg_scores)
    results_random[beta] = np.mean(fbetas_random)
    results_mehrheit[beta] =  np.mean(fbetas_mehrheit)
    
       

LogReg F betas für beta = 0.7: [0.23675471 0.23367634 0.23619902 0.23103375 0.23201623]
LogReg F-beta Score (mean) für beta = 0.7: 0.2339360085670165
Die ersten 20 Werte von y_train für beta = 0.7:                [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]
Die ersten 20 Werte von y_test für beta = 0.7:                 [0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
Ersten 20 Vorhersagen mit dem Modell für beta = 0.7:           [1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0]
Ersten 20 Vorhersagen mit dem _random_weighted für beta = 0.7: [0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0]
Ersten 20 Vorhersagen mit dem y_pred_majority für beta = 0.7:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Die ersten 20 Werte von y_train für beta = 0.7:                [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0]
Die ersten 20 Werte von y_test für beta = 0.7:                 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
Ersten 20 Vorhersagen mit dem Modell für beta = 0.7:           [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Ersten 20 

In [99]:

# Ergebnisse ausgeben
for beta in results.keys():
    print(f'Beta: {beta}')
    print(f'F-beta Score (mean) logReg: {results[beta]}')
 #   print(f'F-beta Score (mean) logReg händischer StratifiedFold: {results_rf[beta]}')
    print(f'F-beta Score (mean) Weighted Random Prediction: {results_random[beta]}')
    print(f'F-beta Score (mean) Mehrheit Prediction: {results_mehrheit[beta]}')
    print()  # Add an empty line for better readability

Beta: 0.7
F-beta Score (mean) logReg: 0.2339360085670165
F-beta Score (mean) Weighted Random Prediction: 0.7413436411632752
F-beta Score (mean) Mehrheit Prediction: 0.7545596779138349

Beta: 0.9
F-beta Score (mean) logReg: 0.26880414850039197
F-beta Score (mean) Weighted Random Prediction: 0.7416703503314792
F-beta Score (mean) Mehrheit Prediction: 0.7693231723706073

Beta: 1
F-beta Score (mean) logReg: 0.28777772921160116
F-beta Score (mean) Weighted Random Prediction: 0.7418151577302602
F-beta Score (mean) Mehrheit Prediction: 0.7760396157975746

Beta: 2
F-beta Score (mean) logReg: 0.48240489744404214
F-beta Score (mean) Weighted Random Prediction: 0.7426462385123551
F-beta Score (mean) Mehrheit Prediction: 0.8167984428179098

Beta: 3
F-beta Score (mean) logReg: 0.6228114595968568
F-beta Score (mean) Weighted Random Prediction: 0.7429245423154188
F-beta Score (mean) Mehrheit Prediction: 0.8313531025898367

Beta: 5
F-beta Score (mean) logReg: 0.7587059999909253
F-beta Score (mean) Wei

In [100]:

# Klassifikationsberichte für den letzten Fold ausgeben
print("Random Forest Classification Report (Last Fold):")
print(classification_report(y_test, y_pred_rf))

print("Weighted Random Prediction Classification Report (Last Fold):")
print(classification_report(y_test, y_pred_random_weighted))

Random Forest Classification Report (Last Fold):
              precision    recall  f1-score   support

           0       0.91      0.23      0.37      6787
           1       0.17      0.87      0.29      1231

    accuracy                           0.33      8018
   macro avg       0.54      0.55      0.33      8018
weighted avg       0.80      0.33      0.36      8018

Weighted Random Prediction Classification Report (Last Fold):
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      6787
           1       0.16      0.15      0.15      1231

    accuracy                           0.74      8018
   macro avg       0.50      0.50      0.50      8018
weighted avg       0.74      0.74      0.74      8018

