In [35]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import joblib
import matplotlib.pyplot as plt

## Load Features


In [36]:
def _count_shared_rows(reference, candidate):
    """Count the rows of `candidate` that also occur, bit-for-bit, in `reference`.

    Both arguments are array-likes whose first axis indexes samples. Rows are
    compared through their raw bytes, so only exact duplicates are counted.
    """
    reference_rows = {row.tobytes() for row in np.ascontiguousarray(reference)}
    return sum(row.tobytes() in reference_rows for row in np.ascontiguousarray(candidate))


X_train = np.load("../features/X_train_scaled.npy")
X_val = np.load("../features/X_val_scaled.npy")
X_test = np.load("../features/X_test_scaled.npy")
y_train = np.load("../features/y_train.npy")
y_val = np.load("../features/y_val.npy")
y_test = np.load("../features/y_test.npy")

# Each feature matrix needs exactly one label per sample; checking here gives a
# clear message instead of a failure later inside the estimator.
assert len(X_train) == len(y_train), "train features and labels differ in length"
assert len(X_val) == len(y_val), "validation features and labels differ in length"
assert len(X_test) == len(y_test), "test features and labels differ in length"

# Leakage guard: a held-out row that is also a training row sits at distance
# zero from its own copy, which inflates held-out accuracy for a KNN model.
for _split_name, _X_split in (("validation", X_val), ("test", X_test)):
    _n_shared = _count_shared_rows(X_train, _X_split)
    if _n_shared:
        print(
            f"WARNING: {_n_shared} of {len(_X_split)} {_split_name} rows are "
            "exact copies of training rows; held-out scores will be optimistic."
        )

## Global Parameters



In [37]:
# Hyper-parameter candidates for the KNN grid search:
# odd neighbour counts from 3 to 11, both vote weightings, two distance metrics.
param_grid = {
    'n_neighbors': list(range(3, 12, 2)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

In [38]:
# Integer id for every class name, assigned in listing order.
# "unknown" (the last id) is the label emitted for rejected predictions.
CLASS_MAPPING = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ("cardboard", "glass", "metal", "paper", "plastic", "trash", "unknown")
    )
}

## Model Training


In [39]:
# Exhaustive 3-fold cross-validated search over param_grid, scored by accuracy
# and run on all available cores.
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)

# With the default error_score, a candidate whose fit or scoring fails is
# recorded as NaN and the search simply carries on, so failures are easy to
# miss. List the affected parameter sets explicitly.
cv_mean_scores = np.asarray(grid_search.cv_results_["mean_test_score"], dtype=float)
failed_candidates = [
    params
    for params, score in zip(grid_search.cv_results_["params"], cv_mean_scores)
    if np.isnan(score)
]
if failed_candidates:
    print(
        f"\n WARNING: {len(failed_candidates)} of {len(cv_mean_scores)} "
        "candidates have a NaN CV score:"
    )
    for params in failed_candidates:
        print(f"   {params}")

print(f"\n Best parameters: {grid_search.best_params_}")
print(f" Best CV score: {grid_search.best_score_:.4f}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits

 Best parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
 Best CV score: 0.8685


 0.86513345 0.86712095 0.86172629 0.86342987        nan 0.86030664
        nan 0.86172629        nan 0.86541738        nan 0.86484952
        nan 0.86002271]


## Model Evaluation

In [40]:
# Score the model selected by the grid search on the validation split; the
# search was built with scoring='accuracy', so this is an accuracy value.
# NOTE(review): the recorded run shows 1.0000 here against a best CV score of
# 0.8685 — worth checking the validation split for overlap with training data.
val_score = grid_search.score(X_val, y_val)
print(f"Validation accuracy: {val_score:.4f}")

Validation accuracy: 1.0000


## Save Model


In [41]:
# Choose the estimator to export. Unless the validation accuracy reaches 0.80,
# keep the estimator the grid search already fitted on the training split;
# otherwise refit the winning configuration on train and validation together.
if not (val_score >= 0.80):
    print("\nUsing train-only model (validation score too low)")
    knn = grid_search.best_estimator_
else:
    print("\nTraining final model on train+val...")
    X_combined = np.vstack((X_train, X_val))
    y_combined = np.concatenate((y_train, y_val))

    # fit() returns the estimator itself, so construction and fitting can be chained.
    knn = KNeighborsClassifier(**grid_search.best_params_).fit(X_combined, y_combined)


Training final model on train+val...


In [42]:
# Evaluate `knn` (the estimator chosen in the previous cell) on the held-out
# test split: overall accuracy first, then the per-class
# precision / recall / F1 report.
y_pred = knn.predict(X_test)
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")


Test Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

   cardboard       1.00      1.00      1.00        73
       glass       1.00      1.00      1.00       118
       metal       1.00      1.00      1.00        95
       paper       1.00      1.00      1.00       135
     plastic       1.00      1.00      1.00       110
       trash       1.00      1.00      1.00        33

    accuracy                           1.00       564
   macro avg       1.00      1.00      1.00       564
weighted avg       1.00      1.00      1.00       564



In [43]:
# Persist the selected classifier with joblib so it can be reloaded later
# without retraining.
# NOTE(review): assumes the ../models directory already exists — confirm.
joblib.dump(knn, "../models/knn_model.pkl")
print( "Model Saved Successfully")

Model Saved Successfully


## Model Prediction with Rejection


In [58]:
import numpy as np
from collections import Counter

def knn_predict_with_rejection(model, X, threshold=0.6, class_mapping=None):
    """Predict class ids with a fitted KNN classifier, rejecting unsure samples.

    The confidence of a prediction is the largest class probability reported by
    ``model.predict_proba``. For uniform weights that is the share of neighbours
    voting for the winning class; for distance weights it is the weighted
    share, so the result always agrees with ``model.predict``.

    The previous implementation read ``model._y``, which stores positions into
    ``model.classes_`` rather than the labels themselves, and then used those
    positions as ``CLASS_MAPPING`` keys, raising ``KeyError`` for every
    accepted sample.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    X : array-like of samples to classify.
    threshold : float, default 0.6
        Predictions whose confidence is strictly below this value are replaced
        by the "unknown" id.
    class_mapping : dict, optional
        Maps each label in ``model.classes_`` to its integer id and must contain
        an ``"unknown"`` entry. Defaults to the notebook-level ``CLASS_MAPPING``.

    Returns
    -------
    numpy.ndarray
        One mapped class id per sample.

    Raises
    ------
    KeyError
        If a predicted label or ``"unknown"`` is missing from the mapping.
    """
    if class_mapping is None:
        class_mapping = CLASS_MAPPING

    probabilities = np.asarray(model.predict_proba(X), dtype=float)
    if probabilities.size == 0:
        return np.array([])

    # Column of the most probable class per sample, and that probability.
    winners = probabilities.argmax(axis=1)
    confidences = probabilities[np.arange(len(winners)), winners]

    # Translate column positions back to the labels the model was trained on.
    labels = np.asarray(model.classes_)[winners]

    unknown_id = class_mapping["unknown"]
    final_preds = [
        unknown_id if confidence < threshold else class_mapping[label]
        for label, confidence in zip(labels.tolist(), confidences.tolist())
    ]
    return np.array(final_preds)

# UNKNOWN_LABEL = 6
#
# def knn_predict_with_rejection(model, X, threshold=0.6):
#     distances, indices = model.kneighbors(X)
#     neighbor_labels = model._y[indices]
#     preds = model.predict(X)
#
#     final_preds = []
#     for i, pred in enumerate(preds):
#         votes = np.sum(neighbor_labels[i] == pred)
#         confidence = votes / model.n_neighbors
#
#         if confidence < threshold:
#             final_preds.append(UNKNOWN_LABEL)
#         else:
#             final_preds.append(pred)
#
#     return np.array(final_preds)


In [54]:
# import numpy as np
#
# def knn_confidence(model, X):
#     # 1. Get the indices of the neighbors
#     distances, indices = model.kneighbors(X)
#
#     # 2. Get the actual labels of those neighbors
#     # We use .ravel() if your target y was a column vector
#     neighbor_labels = model._y[indices]
#
#     # 3. Get the model's final predictions
#     preds = model.predict(X)
#
#     confs = []
#     for i, pred in enumerate(preds):
#         # Ensure we are comparing the row of neighbors to the single prediction
#         # We use np.equal to be safe, then sum the Trues
#         votes = np.sum(neighbor_labels[i] == pred)
#
#         # Calculate proportion
#         confs.append(votes / model.n_neighbors)
#
#     return np.array(confs)


In [57]:
# knn_conf = knn_confidence(knn, X_val)
#
# plt.figure(figsize=(7, 5))
# plt.hist(knn_conf, bins=20)
# plt.axvline(0.6, linestyle='--')
# plt.xlabel("KNN Vote Confidence")
# plt.ylabel("Number of Samples")
# plt.title("KNN Confidence Distribution (Validation Set)")
# plt.show()
