In [47]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib
import os

In [48]:
# load features...

X_features = np.load("../features/X_features.npy")
y_labels = np.load("../features/y_labels.npy")

print("Features loaded successfully")
print("X_features shape:", X_features.shape)
print("y_labels shape:", y_labels.shape)


Features loaded successfully
X_features shape: (2849, 8196)
y_labels shape: (2849,)


In [49]:
# Split to train and testing:

# step 1: split data (train and test):
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_labels,
    test_size=0.2,
    random_state=42, # I need the shuffle to be fixed not changeble
    stratify=y_labels # split the data while keeping the same class proportions
)

print("Train size: " ,X_train.shape)
print ("Test Validation size: ", X_test.shape)

# step 2: Feature Scaling - fit for training , transform for testing

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Feature scaling done")

Train size:  (2279, 8196)
Test Validation size:  (570, 8196)
Feature scaling done


In [52]:
# Train KNN model:

# Try different K values (odd) 1-3-5-7-9...
k_values = [1,3,5,7,9,11]
# try different weighting schemes (uniform, distance)
weighting_schemes = ["uniform", "distance"]
# choose the best model and save it
best_accuracy = 0
best_k = None
best_weight = None
best_model = None

for k in k_values:
    for weight in weighting_schemes:
        print(f"Training k-NN with k={k}, weights={weight}")
        knn = KNeighborsClassifier(n_neighbors=k,weights=weight,metric="cosine")

        knn.fit(X_train,y_train)

        y_val = knn.predict(X_test)
        acc = accuracy_score(y_test, y_val)
        print(f"Validation Accuracy: {acc:.4f}")

        if acc >best_accuracy:
            best_accuracy = acc
            best_k = k
            best_weight = weight
            best_model = knn
print("\nBest k-NN Model:")
print("Best k:", best_k)
print("Best weighting:", best_weight)
print("Best validation accuracy:", best_accuracy)

# save best model
joblib.dump(best_model,"D:\Documents\GitHub\Material-Stream-Identification-System\models\knn_best.pkl")

print("Best knn model saved")


  joblib.dump(best_model,"D:\Documents\GitHub\Material-Stream-Identification-System\models\knn_best.pkl")


Training k-NN with k=1, weights=uniform
Validation Accuracy: 0.6684
Training k-NN with k=1, weights=distance
Validation Accuracy: 0.6684
Training k-NN with k=3, weights=uniform
Validation Accuracy: 0.5702
Training k-NN with k=3, weights=distance
Validation Accuracy: 0.6491
Training k-NN with k=5, weights=uniform
Validation Accuracy: 0.5632
Training k-NN with k=5, weights=distance
Validation Accuracy: 0.6491
Training k-NN with k=7, weights=uniform
Validation Accuracy: 0.5842
Training k-NN with k=7, weights=distance
Validation Accuracy: 0.6526
Training k-NN with k=9, weights=uniform
Validation Accuracy: 0.5561
Training k-NN with k=9, weights=distance
Validation Accuracy: 0.6351
Training k-NN with k=11, weights=uniform
Validation Accuracy: 0.5702
Training k-NN with k=11, weights=distance
Validation Accuracy: 0.6368

Best k-NN Model:
Best k: 1
Best weighting: uniform
Best validation accuracy: 0.6684210526315789
Best knn model saved
