In [4]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import fbeta_score, make_scorer

train_data_loaded = pd.read_csv('../data/train_data_2024-08-01.csv')
X = train_data_loaded.drop(columns=['UKATEGORIE'])
y = train_data_loaded['UKATEGORIE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

beta = 2
fbeta_scorer = make_scorer(fbeta_score, beta=beta)


for i in range(1, 20):
    fbeta_scores_SMOTE = []
    # Loop über jeden Split
    for train_index, test_index in sf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
        # Anwendung von SMOTE
        sm = SMOTE(random_state=42)
        X_res, y_res = sm.fit_resample(X_train, y_train)
    
        # Trainieren Sie den Klassifikator
        knn = KNeighborsClassifier(n_neighbors=i, weights='distance', p=1)
        knn.fit(X_res, y_res)
    
        # Vorhersagen auf den Testdaten machen
        y_pred = knn.predict(X_test)
        y_pred_train = knn.predict(X_train)
    
        # F-beta-Score berechnen und zur Liste hinzufügen
        fbeta = cross_val_score(knn, X, y, scoring=fbeta_scorer)
        fbeta_scores_SMOTE.append(fbeta)
    
        
    # print(f"SMOTE fbeta scores für {i} neighbors (Train und Test): ", fbeta_train, fbeta_test)

# Durchschnittlichen F-beta-Score über alle Folds ausgeben
    print(f"Average F-beta score for {i} neighbors:" , pd.Series(fbeta_scores_SMOTE).mean())





Average F-beta score for 1 neighbors: [0.19562716 0.18277622 0.18039539 0.20551297 0.19331586]
Average F-beta score for 2 neighbors: [0.13420449 0.13015761 0.12890694 0.14700735 0.14263074]
Average F-beta score for 3 neighbors: [0.13243832 0.11764706 0.12068966 0.1345737  0.11733673]
Average F-beta score for 4 neighbors: [0.10752688 0.10843373 0.09535271 0.12181617 0.11464497]
Average F-beta score for 5 neighbors: [0.08915023 0.08679245 0.08970727 0.09900057 0.08773585]
Average F-beta score for 6 neighbors: [0.08498854 0.08392142 0.08285714 0.08955793 0.08195159]
Average F-beta score for 7 neighbors: [0.07361488 0.07239382 0.07795958 0.07807981 0.06759367]
Average F-beta score for 8 neighbors: [0.06418985 0.0669383  0.06214799 0.07469926 0.05932698]
Average F-beta score for 9 neighbors: [0.05883507 0.05780913 0.05481597 0.05768479 0.05753852]
Average F-beta score for 10 neighbors: [0.04919323 0.05775255 0.05011792 0.05494505 0.05300353]
Average F-beta score for 11 neighbors: [0.0386215

In [6]:
# für leaf_size

for i in range(1, 200, 20):
    fbeta_scores_SMOTE_leaf = []
    # Loop über jeden Split
    for train_index, test_index in sf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
        # Anwendung von SMOTE
        sm = SMOTE(random_state=42)
        X_res, y_res = sm.fit_resample(X_train, y_train)
    
        # Trainieren Sie den Klassifikator
        knn = KNeighborsClassifier(leaf_size=i, weights='distance', p=1)
        knn.fit(X_res, y_res)
    
        # Vorhersagen auf den Testdaten machen
        y_pred = knn.predict(X_test)
        y_pred_train = knn.predict(X_train)
    
        # F-beta-Score berechnen und zur Liste hinzufügen
        fbeta = fbeta_score(y_test, y_pred, beta=2)
        fbeta_scores_SMOTE_leaf.append(fbeta)
    
        
    # print(f"SMOTE fbeta scores für {i} neighbors (Train und Test): ", fbeta_train, fbeta_test)

# Durchschnittlichen F-beta-Score über alle Folds ausgeben
    print(f"Average F-beta score for leaf_size = {i}: " , pd.Series(fbeta_scores_SMOTE_leaf).mean())



Average F-beta score for leaf_size = 1:  0.3203308512923251
Average F-beta score for leaf_size = 21:  0.3203308512923251
Average F-beta score for leaf_size = 41:  0.3203308512923251
Average F-beta score for leaf_size = 61:  0.3203308512923251
Average F-beta score for leaf_size = 81:  0.3203308512923251
Average F-beta score for leaf_size = 101:  0.3203308512923251
Average F-beta score for leaf_size = 121:  0.3203308512923251
Average F-beta score for leaf_size = 141:  0.3203308512923251
Average F-beta score for leaf_size = 161:  0.3203308512923251
Average F-beta score for leaf_size = 181:  0.3203308512923251
