In [27]:
# data manipulation
import pandas as pd
import numpy as np

# model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, recall_score, precision_score

# saving
import joblib

In [28]:
X_train = joblib.load('data/X_train_preprocessed.pkl')
X_test = joblib.load('data/X_test_preprocessed.pkl')
y_train = joblib.load('data/y_train_preprocessed.pkl')
y_test = joblib.load('data/y_test_preprocessed.pkl')

In [29]:
knn = KNeighborsClassifier()
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [30]:
param_grid = [
    {
        'metric': ['minkowski'],
        'p': [1, 2],  # hyperparametro usado apenas com a metrica minkowski
        'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 20],
        'weights': ['uniform', 'distance']
    },
    {
        'metric': ['euclidean', 'manhattan'],
        'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 20],
        'weights': ['uniform', 'distance']
    }
]

In [46]:
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=skf,
    scoring='f1',
    n_jobs=-1
)

In [47]:
grid_search.fit(X_train, y_train)

In [48]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df.sort_values(by='mean_test_score', ascending=False)
pd.set_option('display.max_columns', None)
results_df.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
32,0.01015,0.001921,0.096353,0.010993,euclidean,1,,uniform,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",0.581633,0.542714,0.545455,0.487047,0.531915,0.467662,0.572864,0.540541,0.528497,0.603015,0.540134,0.038802,1
2,0.008054,0.001837,0.054172,0.003919,minkowski,1,2.0,uniform,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.581633,0.542714,0.545455,0.487047,0.531915,0.467662,0.572864,0.540541,0.528497,0.603015,0.540134,0.038802,1
3,0.008707,0.001556,0.054052,0.006192,minkowski,1,2.0,distance,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.581633,0.542714,0.545455,0.487047,0.531915,0.467662,0.572864,0.540541,0.528497,0.603015,0.540134,0.038802,1
33,0.010014,0.001442,0.053645,0.008005,euclidean,1,,distance,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",0.581633,0.542714,0.545455,0.487047,0.531915,0.467662,0.572864,0.540541,0.528497,0.603015,0.540134,0.038802,1
1,0.009026,0.002262,0.408406,0.046432,minkowski,1,1.0,distance,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.53,0.550265,0.505263,0.523077,0.443182,0.459184,0.535354,0.47191,0.476684,0.55102,0.504594,0.037223,5


In [50]:
preds = grid_search.best_estimator_.predict(X_train)
print(classification_report(y_train, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4898
           1       1.00      1.00      1.00       966

    accuracy                           1.00      5864
   macro avg       1.00      1.00      1.00      5864
weighted avg       1.00      1.00      1.00      5864



In [51]:
joblib.dump(grid_search.best_estimator_, 'models/knn.pkl')

['models/knn.pkl']