In [1]:
!git clone https://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@github.com/DadeOrsu/dm_project24_group_6.git

Cloning into 'dm_project24_group_6'...
remote: Enumerating objects: 1280, done.[K
remote: Counting objects: 100% (309/309), done.[K
remote: Compressing objects: 100% (200/200), done.[K
remote: Total 1280 (delta 201), reused 185 (delta 96), pack-reused 971 (from 1)[K
Receiving objects: 100% (1280/1280), 51.44 MiB | 7.28 MiB/s, done.
Resolving deltas: 100% (850/850), done.
Updating files: 100% (39/39), done.


In [2]:
cd dm_project24_group_6/src/task4_machine_learning/

/content/dm_project24_group_6/src/task4_machine_learning


# KNN classification

This notebook aims to solve task 4 of classfication by using KNN algorithm as showed during the course.

In [3]:
import pandas as pd
from os import path
import numpy as np
from preprocessing import get_train_test_data

X_train, y_train, X_test, y_test, columns_to_keep = get_train_test_data()

In [4]:
X_train = X_train.dropna()
X_test = X_test.dropna()
y_train = y_train[X_train.index]
y_test = y_test[X_test.index]

In [5]:
X_train.shape

(554459, 10)

In [6]:
from sklearn.metrics import classification_report
def report_scores(test_label, test_pred):
    print(classification_report(test_label,
                            test_pred,
                            target_names=['0', '1']))

# Hyperparameter search

Since the dataset is highly imbalanced, we try to overcome this by balancing the two classes.

In [8]:
from sklearn.utils import resample
import pandas as pd

# Combine X_train and y_train in a unique dataset
train_set = pd.concat([X_train, y_train], axis=1)
train_set.columns = list(X_train.columns) + ['label']  # Dai un nome alla colonna 'label'

# Divide between majority and minority classes
majority_class = train_set[train_set['label'] == 0]
minority_class = train_set[train_set['label'] == 1]

# Number of sampling to mantain
samples_per_class = len(majority_class)

# oversampling of the minority class
minority_upsampled = resample(
    minority_class,
    replace=True,                # Sampling with replacement
    n_samples=samples_per_class, # Number of sampling equals to majority class
    random_state=42
)

# combine the two classes in a balanced dataset
balanced_train_set = pd.concat([majority_class, minority_upsampled])

# divide between features and label
X_train = balanced_train_set.drop(columns=['label'])
y_train = balanced_train_set['label']

print("Distribution of the balaced classses:")
print(y_train.value_counts())


Distribution of the balaced classses:
label
0    460644
1    460644
Name: count, dtype: int64


We procede in the following steps:
1. We define the hyperparameters of the model so that we can tune them later by using a grid search.
2. We split the training data into a training and a validation set. The data is divided into 80% training and 20% validation.
3. The code iterates through a Parameter Grid to find the best hyperparameters for the model. The result of each combination of parameters is stored inside the `params_tested` list, so that they can be analyzed later.

In [10]:
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.neighbors import KNeighborsClassifier

NUM_FOLDS = 5
RANDOM_SEED = 42

# Grid of hyperparameters
hyper_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'kd_tree'],
    'metric': ['euclidean', 'manhattan'],
    'leaf_size': [20, 30],
}


grid_params = ParameterGrid(hyper_params)

X_train_set, X_val_set, Y_train_set, Y_val_set = train_test_split(
    X_train,y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_SEED,
    shuffle=True
)

params_tested = list()

for comb in grid_params:
    knn = KNeighborsClassifier(**comb, n_jobs=-1)
    knn.fit(X_train_set, Y_train_set)
    Y_pred_train_set = knn.predict(X_train_set)
    Y_pred_val_set = knn.predict(X_val_set)
    train_f_score = f1_score(Y_train_set, Y_pred_train_set, average='macro')
    val_f_score = f1_score(Y_val_set, Y_pred_val_set, average='macro')
    new_comb = comb
    new_comb|={
        'train_f_score': train_f_score,
        'val_f_score': val_f_score
    }
    print(comb)
    report_scores(Y_val_set, Y_pred_val_set)
    params_tested.append(new_comb)




{'algorithm': 'auto', 'leaf_size': 20, 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform', 'train_f_score': 0.9151837333853025, 'val_f_score': 0.8314009806451783}
              precision    recall  f1-score   support

           0       0.93      0.72      0.81     92129
           1       0.77      0.95      0.85     92129

    accuracy                           0.83    184258
   macro avg       0.85      0.83      0.83    184258
weighted avg       0.85      0.83      0.83    184258

{'algorithm': 'auto', 'leaf_size': 20, 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance', 'train_f_score': 1.0, 'val_f_score': 0.8550826225257774}
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     92129
           1       0.78      0.99      0.87     92129

    accuracy                           0.86    184258
   macro avg       0.88      0.86      0.86    184258
weighted avg       0.88      0.86      0.86    184258

{'algorithm'

Since the research of the best hyperparameters is computationally expensive, we store the results contained in the params_tested list in a CSV file. This way, we can analyze the results later without having to re-run the code.

In [11]:
import json

params_df=pd.DataFrame(params_tested)

params_df.sort_values(by='val_f_score',ascending=False)

params_df.to_csv('params_knn/test_f1_averaged.csv')

In [12]:
pd.read_csv('params_knn/test_f1_averaged.csv').sort_values(by='val_f_score',ascending=False).head(10)

Unnamed: 0.1,Unnamed: 0,algorithm,leaf_size,metric,n_neighbors,weights,train_f_score,val_f_score
43,43,kd_tree,30,manhattan,3,distance,1.0,0.855431
7,7,auto,20,manhattan,3,distance,1.0,0.855431
31,31,kd_tree,20,manhattan,3,distance,1.0,0.855431
19,19,auto,30,manhattan,3,distance,1.0,0.855431
37,37,kd_tree,30,euclidean,3,distance,1.0,0.855083
25,25,kd_tree,20,euclidean,3,distance,1.0,0.855083
1,1,auto,20,euclidean,3,distance,1.0,0.855083
13,13,auto,30,euclidean,3,distance,1.0,0.855083
18,18,auto,30,manhattan,3,uniform,0.916102,0.8317
42,42,kd_tree,30,manhattan,3,uniform,0.916102,0.8317


After analysing the results, we can choose the best hyperparameters and train the model with the entire training data. Finally, we can use the test data to evaluate the model.

In [13]:
from sklearn.neighbors import KNeighborsClassifier
best_model = KNeighborsClassifier(
    algorithm='kd_tree',
    leaf_size=30,
    weights='distance',
    metric='manhattan',
    n_neighbors=3,
)
best_model.fit(X_train, y_train)

In [14]:
test_pred_knn = best_model.predict(X_test)

In [15]:
report_scores(y_test, test_pred_knn)

              precision    recall  f1-score   support

           0       0.87      0.75      0.80     30219
           1       0.19      0.35      0.25      5187

    accuracy                           0.69     35406
   macro avg       0.53      0.55      0.53     35406
weighted avg       0.77      0.69      0.72     35406

