In [1]:
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import numpy as np

In [2]:
# Lade Datensatz
train_data_loaded = pd.read_csv('../../data/train_data_2024-08-01.csv')
X = train_data_loaded.drop(columns=['UKATEGORIE'])
y = train_data_loaded['UKATEGORIE']


In [3]:
# StratifiedKFold initialisieren
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)



In [4]:
beta = 2
fbeta_scorer= make_scorer(fbeta_score, beta=beta)

In [5]:
# SMOTE
sm = SMOTE(random_state=42)

# KNN Initialisierung
knn = KNeighborsClassifier(n_neighbors=6, leaf_size=41, weights='distance', p=1)

# Listen für die Ergebnisse
fbetas_knn = []
fbetas_random = []
fbetas_mehrheit = []

counter = 1
for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Anwendung von SMOTE auf Trainingsset
    X_res_fold, y_res_fold = sm.fit_resample(X_train_fold, y_train_fold)

    # Trainiere KNN
    knn.fit(X_res_fold, y_res_fold)
    y_pred_knn = knn.predict(X_test_fold)
    fbeta_knn = fbeta_score(y_test_fold, y_pred_knn, beta=beta)
    fbetas_knn.append(fbeta_knn)

    print(f"Fold Nummer {counter} - KNN F-beta Score: {fbeta_knn}")
    
    # Zufällige Vorhersagen entsprechend der Klassenverteilung
    class_counts = Counter(y_test_fold)
    total_samples = len(y_test_fold)
    class_probabilities = {cls: count / total_samples for cls, count in class_counts.items()}
    
    np.random.seed(42)
    y_pred_random_weighted = np.random.choice(
        list(class_probabilities.keys()),
        size=y_test_fold.shape,
        p=list(class_probabilities.values())
    )
    
    fbeta_random_weighted = fbeta_score(y_test_fold, y_pred_random_weighted, beta=beta)
    fbetas_random.append(fbeta_random_weighted)

    print(f"Fold Nummer {counter} - Zufällige Vorhersagen F-beta Score: {fbeta_random_weighted}")
    
    # Immer Mehrheitsklasse vorhersagen
    majority_class = y_train_fold.mode().iloc[0]
    y_pred_majority = np.full(y_test_fold.shape, majority_class)
    
    fbeta_mehrheit = fbeta_score(y_test_fold, y_pred_majority, beta=beta)
    fbetas_mehrheit.append(fbeta_mehrheit)

    print(f"Fold Nummer {counter} - Mehrheitsklasse F-beta Score: {fbeta_mehrheit}")
    
    counter += 1

# Durchschnittliche F-beta-Scores berechnen und ausgeben
print("\nDurchschnittliche F-beta-Scores:")
print(f"KNN: {np.mean(fbetas_knn):.4f} (std: {np.std(fbetas_knn):.4f})")
print(f"Zufällige Vorhersagen: {np.mean(fbetas_random):.4f} (std: {np.std(fbetas_random):.4f})")
print(f"Mehrheitsklasse: {np.mean(fbetas_mehrheit):.4f} (std: {np.std(fbetas_mehrheit):.4f})")

Fold Nummer 1 - KNN F-beta Score: 0.3209690665206679
Fold Nummer 1 - Zufällige Vorhersagen F-beta Score: 0.16413522782949536
Fold Nummer 1 - Mehrheitsklasse F-beta Score: 0.0
Fold Nummer 2 - KNN F-beta Score: 0.3046448087431694
Fold Nummer 2 - Zufällige Vorhersagen F-beta Score: 0.1347378735913768
Fold Nummer 2 - Mehrheitsklasse F-beta Score: 0.0
Fold Nummer 3 - KNN F-beta Score: 0.3204254158712844
Fold Nummer 3 - Zufällige Vorhersagen F-beta Score: 0.15910574412532638
Fold Nummer 3 - Mehrheitsklasse F-beta Score: 0.0
Fold Nummer 4 - KNN F-beta Score: 0.2933425797503467
Fold Nummer 4 - Zufällige Vorhersagen F-beta Score: 0.14768276762402088
Fold Nummer 4 - Mehrheitsklasse F-beta Score: 0.0
Fold Nummer 5 - KNN F-beta Score: 0.3079138664106433
Fold Nummer 5 - Zufällige Vorhersagen F-beta Score: 0.15270292340356034
Fold Nummer 5 - Mehrheitsklasse F-beta Score: 0.0

Durchschnittliche F-beta-Scores:
KNN: 0.3095 (std: 0.0104)
Zufällige Vorhersagen: 0.1517 (std: 0.0101)
Mehrheitsklasse: 0.000

In [6]:
# Durchschnittliche Genauigkeiten berechnen
mean_scores = np.mean(fbetas_knn)
mean_scores_random = np.mean(fbetas_random)
mean_scores_mehrheit = np.mean(fbetas_mehrheit)

In [7]:
print(f"Modell Mean score: {mean_scores}")
print(f"Weighted Random Prediction Mean score: {mean_scores_random}")
print(f"Weighted Mehrheit Prediction Mean score: {mean_scores_mehrheit}")


Modell Mean score: 0.3094591474592224
Weighted Random Prediction Mean score: 0.15167290731475594
Weighted Mehrheit Prediction Mean score: 0.0
