## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    KFold
)

## Read Dataset

In [2]:
# Load the transformed dataset from the working directory, report its
# dimensions, and preview the first rows.
csv_path = "Transformed_Data.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

(18900, 20)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area3,Wilderness_Area4,Soil_Type3,Soil_Type4,Soil_Type10,Soil_Type23,Soil_Type29,Soil_Type30,Soil_Type32,Cover_Type
0,0.48875,0.242953,1.579103,-0.24244,-0.103541,0.840184,0.856814,-2.074582,-2.035292,0.828308,1,0,0,0,0,0,0,0,0,1
1,0.923949,0.924811,-1.243057,-0.080129,-0.914803,-0.563925,-0.384762,0.664501,0.749468,-0.02912,0,0,0,0,0,1,0,0,0,1
2,1.30176,0.543035,0.285613,0.287345,-0.751288,1.38065,0.547494,-1.044093,-0.969251,1.453142,1,0,0,0,0,1,0,0,0,1
3,0.503097,0.76748,-1.125467,-0.680049,-0.820478,1.037096,0.285514,0.232543,0.075034,-0.038221,1,0,0,0,0,0,0,0,0,1
4,1.220459,-1.652851,-1.007877,0.835735,0.226095,0.818457,0.21215,1.266796,0.531908,0.212173,1,0,0,0,0,0,0,0,0,1


## Separate response and predictor variables

In [3]:
# Response variable: the "Cover_Type" label column.
cover_type = df["Cover_Type"]
# Predictors: every other column of the frame.
features = df.drop("Cover_Type", axis="columns")

In [4]:
## Ten-feature subset. Per the original author's note, these were selected
## based on a Random Forest model, which was the best model out of all tried.
subset = [
    # terrain / distance measurements
    "Elevation",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    # illumination and orientation
    "Hillshade_9am",
    "Aspect",
    "Hillshade_Noon",
    "Hillshade_3pm",
    # the only indicator column kept in the subset
    "Wilderness_Area4",
]

# Restrict the predictors to the selected columns, in the order listed above.
features_10 = features.loc[:, subset]

## Hyperparameter optimization for KNN Classifier

In [5]:
## Find the optimal n_neighbors using RandomizedSearchCV
knn = KNeighborsClassifier()

# Candidate neighbour counts: the odd values 1, 3, ..., 49 (25 candidates).
param_grid_knn = {
    "n_neighbors": np.arange(1, 50, 2)
}

# Try 20 randomly sampled candidates, scoring each by 5-fold cross-validated
# accuracy on the full predictor set. random_state fixes which candidates are
# sampled, and n_jobs=-1 runs the fits in parallel.
# NOTE(review): cv=5 uses scikit-learn's default splitter for an integer cv,
# whereas the evaluation cells further down use KFold(shuffle=True,
# random_state=447) -- confirm the difference in splitting is intended.
random_search_knn = RandomizedSearchCV(
    knn,
    param_distributions=param_grid_knn,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=447,
    n_jobs=-1,
)
random_search_knn.fit(features, cover_type)
print("Best K: ", random_search_knn.best_params_)

Best K:  {'n_neighbors': 1}


In [6]:
## Find the optimal n_neighbors using GridSearchCV
# Candidate neighbour counts: the odd values 1, 3, ..., 49 (25 candidates).
param_grid = {'n_neighbors': np.arange(1, 50, 2)}
knn = KNeighborsClassifier()

# Exhaustively score every candidate by 5-fold cross-validated accuracy on the
# full predictor set. n_jobs=-1 matches the randomized search and only
# parallelises the fits; the selected parameters are unaffected.
grid_search_knn = GridSearchCV(
    knn,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_knn.fit(features, cover_type)
print("Best K: ", grid_search_knn.best_params_)

Best K:  {'n_neighbors': 1}


Both RandomizedSearchCV and GridSearchCV suggest that the best n_neighbors is 1.

## KNN Classifier - with all features

In [7]:
## 1-nearest-neighbour classifier (the n_neighbors value reported by the searches).
knn = KNeighborsClassifier(n_neighbors=1)

# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment reproducible.
kfold = 5
kf = KFold(n_splits=kfold, shuffle=True, random_state=447)

# Held-out accuracy of each fold, using every predictor column.
scores = cross_val_score(knn, features, cover_type, scoring='accuracy', cv=kf)

# Tabulate the per-fold accuracies, then report their mean.
fold_ids = np.arange(1, kfold + 1)
results = pd.DataFrame({'Fold': fold_ids, 'Accuracy': scores})
print(results)
print(f'Average accuracy: {scores.mean()}')

   Fold  Accuracy
0     1  0.792328
1     2  0.795238
2     3  0.793122
3     4  0.796296
4     5  0.790741
Average accuracy: 0.7935449735449736


## KNN Classifier - with 10 features

In [8]:
# Held-out accuracy of each fold, using only the 10-feature subset. The
# classifier (knn) and the splitter (kf, kfold) are the ones defined in the
# previous evaluation cell.
scores = cross_val_score(knn, features_10, cover_type, scoring='accuracy', cv=kf)

# Tabulate the per-fold accuracies, then report their mean.
fold_ids = np.arange(1, kfold + 1)
results = pd.DataFrame({'Fold': fold_ids, 'Accuracy': scores})
print(results)
print(f'Average accuracy: {scores.mean()}')

   Fold  Accuracy
0     1  0.783598
1     2  0.775926
2     3  0.779630
3     4  0.793651
4     5  0.775926
Average accuracy: 0.7817460317460319


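As seen, reducing the number of features did not improve the accuracy: the average cross-validated accuracy dropped from about 0.794 with all features to about 0.782 with the 10-feature subset.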