In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import pandas as pd
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import StratifiedKFold

In [2]:
train_data_loaded = pd.read_csv('../data/train_data_2024-08-01.csv')

X = train_data_loaded.drop(columns=['UKATEGORIE'])
y = train_data_loaded['UKATEGORIE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KFold-Konfiguration
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Definieren des F-beta-Scores mit beta = 2
beta = 2
fbeta_scorer = make_scorer(fbeta_score, beta=beta)

In [3]:
# Parameter-Suchräume definieren
#class_weights = ['balanced', {0: 1, 1: 2, 2: 1}, {0: 1, 1: 1, 2: 2}, None]
#class_weights = ['balanced', {0:1, 1: 1}, {0:1, 1: 3}, {0:1, 1: 6}, {0:1, 1: 7}, {0:1, 1: 8}, {0:1, 1: 9}, {0:1, 1: 10}, None]

knn_param_space = {
    'n_neighbors': (1200, 1700, 20),
    'leaf_size': (10, 50),
    'p': (1, 2),
    'weights': Categorical(['uniform', 'distance'])
}




In [4]:
# Anwendung von SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [5]:
# K-Nearest Neighbors
knn_opt = BayesSearchCV(estimator=KNeighborsClassifier(), search_spaces=knn_param_space, n_iter=32, cv=kf, n_jobs=-1, scoring=fbeta_scorer, random_state=42)
knn_opt.fit(X_res, y_res)
print("Best parameters for KNN: ", knn_opt.best_params_)

Best parameters for KNN:  OrderedDict({'leaf_size': 43, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'})


In [6]:
# Modell evaluieren

# Best parameters from BayesSearchCV
best_knn_params = knn_opt.best_params_

# Erstelle das KNN Modell mit den besten Parametern
knn_best = KNeighborsClassifier(**best_knn_params)

# Führe die Kreuzvalidierung durch
scores = cross_val_score(knn_best, X_res, y_res, cv=kf, scoring=fbeta_scorer, n_jobs=-1)

# Ausgabe der Ergebnisse
print(f"Kreuzvalidierungsergebnisse: {scores}")
print(f"Durchschnittlicher Wert: {scores.mean()} ± {scores.std()}")

Kreuzvalidierungsergebnisse: [0.8893356  0.88770944 0.88630104 0.89089332 0.88754421]
Durchschnittlicher Wert: 0.8883567213095602 ± 0.0015935827402731794


In [7]:
print("Best fbeta_score for KNN: ", knn_opt.best_score_)


Best fbeta_score for KNN:  0.8883567213095602


In [8]:
# Perform Cross-Validation auf Trainingsdaten
train_scores = cross_val_score(knn_best, X_train, y_train, cv=kf, scoring=fbeta_scorer)

# Perform Cross-Validation auf Testdaten
test_scores = cross_val_score(knn_best, X_test, y_test, cv=kf, scoring=fbeta_scorer)

print("Cross-Validation Scores für Trainingsdaten: ", train_scores)
print("Mean Cross-Validation Score für Trainingsdaten: ", train_scores.mean())
print("Standard Deviation Cross-Validation Score für Trainingsdaten: ", train_scores.std())

print("\nCross-Validation Scores für Testdaten: ", test_scores)

Cross-Validation Scores für Trainingsdaten:  [0.01888218 0.02386335 0.01512478 0.02011061 0.01513241]
Mean Cross-Validation Score für Trainingsdaten:  0.018622664498935026
Standard Deviation Cross-Validation Score für Trainingsdaten:  0.003291297713917394

Cross-Validation Scores für Testdaten:  [0.01485149 0.00499002 0.01986097 0.00498504 0.004995  ]
