In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('../data/weblogs.csv')
df.head()

df = df.drop(['NIGHT', 'ID'], axis=1)
df = df.drop('OTHER_METHOD', axis=1)

X = df.loc[:, df.columns != 'ROBOT']
y = df['ROBOT']

numeric_cols = X.select_dtypes(['int64', 'float64']).columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [6]:
knn = KNeighborsClassifier()

### Eerste model trainen

In [2]:
param_grid = {
    'n_neighbors': [10, 20, 30, 40, 50], 
    'weights': ['uniform', 'distance'], 
    'metric': ['euclidean', 'manhattan'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree']
}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)

print(f"Best params: {knn_gscv.best_params_}\nBest score: {knn_gscv.best_score_}")

Best params: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'distance'}
Best score: 0.9766522028993115


In [3]:
final_knn = knn_gscv.best_estimator_
final_knn.fit(X_train, y_train)
final_knn.score(X_test, y_test)

from sklearn.metrics import classification_report, confusion_matrix

y_pred = final_knn.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10785
           1       0.94      0.94      0.94      2686

    accuracy                           0.98     13471
   macro avg       0.97      0.96      0.96     13471
weighted avg       0.98      0.98      0.98     13471

[[10638   147]
 [  161  2525]]


In [5]:
import pickle
filename = '../modellen/knn_classifier_final_model_auto_manhattan_n-20_distance.sav'
pickle.dump(final_knn, open(filename, 'wb'))

### Tweede model trainen

In [6]:
param_grid2 = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['auto']
}

test = GridSearchCV(knn, param_grid2, cv=5)
test.fit(X_train, y_train)

print(f"Best params: {test.best_params_}\nBest score: {test.best_score_}")

Best params: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}
Best score: 0.9765037334106517


In [7]:
final_knn2 = test.best_estimator_
final_knn2.fit(X_train, y_train)
final_knn2.score(X_test, y_test)

from sklearn.metrics import classification_report, confusion_matrix

y_pred = final_knn2.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     10785
           1       0.95      0.94      0.94      2686

    accuracy                           0.98     13471
   macro avg       0.97      0.96      0.96     13471
weighted avg       0.98      0.98      0.98     13471

[[10642   143]
 [  171  2515]]


In [8]:
import pickle
filename = '../modellen/knn_classifier_final_model_auto_manhattan_n-10_distance.sav'
pickle.dump(final_knn2, open(filename, 'wb'))


### Derde model trainen

In [7]:
param_grid3 = {
    'n_neighbors': [5, 10, 15, 20],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['ball_tree', 'kd_tree']
}

grid_search_3 = GridSearchCV(knn, param_grid3, cv=5)
grid_search_3.fit(X_train, y_train)

print(f"Best params: {grid_search_3.best_params_}\nBest score: {grid_search_3.best_score_}")

Best params: {'algorithm': 'kd_tree', 'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'distance'}
Best score: 0.9766336431368765


In [8]:
final_knn3 = grid_search_3.best_estimator_
final_knn3.fit(X_train, y_train)
final_knn3.score(X_test, y_test)

from sklearn.metrics import classification_report, confusion_matrix

y_pred = final_knn3.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10785
           1       0.94      0.94      0.94      2686

    accuracy                           0.98     13471
   macro avg       0.97      0.96      0.96     13471
weighted avg       0.98      0.98      0.98     13471

[[10638   147]
 [  161  2525]]
