In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Configuration ---
# Everything a reader may want to tune lives here, so the model and the
# printed messages below can never drift apart.
TRAIN_CSV = './Data/salary.train.processed.csv'
TEST_CSV = './Data/salary.test.processed.csv'
TARGET_COL = 'label'
N_NEIGHBORS = 5  # k: number of neighbors that vote on each prediction

# --- Step 1: Load Your PROCESSED Data ---
# Both files are indexed by their 'id' column so it is not treated as a feature.
try:
    train_df = pd.read_csv(TRAIN_CSV, index_col='id')
    test_df = pd.read_csv(TEST_CSV, index_col='id')
except FileNotFoundError:
    # Give a friendly hint, then re-raise so the cell still fails loudly
    # instead of continuing with undefined DataFrames.
    print("Error: Could not find the processed CSV files.")
    print("Please make sure 'salary.train.processed.csv' and 'salary.test.processed.csv' are in the './Data/' folder.")
    raise

# --- Step 2: Separate Features (X) and Target (y) ---
X_train = train_df.drop(columns=TARGET_COL)
y_train = train_df[TARGET_COL]
X_test = test_df.drop(columns=TARGET_COL)
y_test = test_df[TARGET_COL]

# KNN distances are computed feature-by-feature, so the test matrix must have
# exactly the same columns, in the same order, as the training matrix.
if list(X_test.columns) != list(X_train.columns):
    mismatched_cols = set(X_train.columns) ^ set(X_test.columns)
    if mismatched_cols:
        raise ValueError(
            "Train/test feature columns differ: "
            f"{sorted(mismatched_cols, key=str)}"
        )
    # Same columns, different order: reorder test to match train.
    X_test = X_test[X_train.columns]

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

# --- Step 3: Initialize and Train the KNN Model ---
# We'll start with k = N_NEIGHBORS neighbors.
# 'n_jobs=-1' uses all your CPU cores to speed up finding neighbors.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, n_jobs=-1)

print("\nTraining the KNN model...")
knn.fit(X_train, y_train)
print("Model training complete!")

# --- Step 4: Evaluate the Model ---
print("Evaluating the model on the test set...")
y_pred_knn = knn.predict(X_test)

# Check accuracy (fraction of test rows whose predicted label matches y_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"\nKNN Model Accuracy on Test Data (k={N_NEIGHBORS}): {accuracy_knn * 100:.2f}%")

# Get a detailed per-class report (precision / recall / f1 / support)
print(f"\nKNN Classification Report (k={N_NEIGHBORS}):")
print(classification_report(y_test, y_pred_knn, digits=4))

Training features shape: (16720, 56)
Testing features shape: (4180, 56)

Training the KNN model...
Model training complete!
Evaluating the model on the test set...

KNN Model Accuracy on Test Data (k=5): 80.24%

KNN Classification Report (k=5):
              precision    recall  f1-score   support

         0.0     0.8406    0.8121    0.8261      2416
         1.0     0.7541    0.7891    0.7712      1764

    accuracy                         0.8024      4180
   macro avg     0.7973    0.8006    0.7986      4180
weighted avg     0.8041    0.8024    0.8029      4180

