## Import Libraries

In [18]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

## Read Dataset

In [19]:
# Load the pre-transformed dataset from the working directory, report its
# (rows, columns) dimensions, and preview the first five records.
df = pd.read_csv("Transformed_Data.csv")
print((len(df.index), len(df.columns)))
df.head(n=5)

(18900, 20)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area3,Wilderness_Area4,Soil_Type3,Soil_Type4,Soil_Type10,Soil_Type23,Soil_Type29,Soil_Type30,Soil_Type32,Cover_Type
0,0.48875,0.242953,1.579103,-0.24244,-0.103541,0.840184,0.856814,-2.074582,-2.035292,0.828308,1,0,0,0,0,0,0,0,0,1
1,0.923949,0.924811,-1.243057,-0.080129,-0.914803,-0.563925,-0.384762,0.664501,0.749468,-0.02912,0,0,0,0,0,1,0,0,0,1
2,1.30176,0.543035,0.285613,0.287345,-0.751288,1.38065,0.547494,-1.044093,-0.969251,1.453142,1,0,0,0,0,1,0,0,0,1
3,0.503097,0.76748,-1.125467,-0.680049,-0.820478,1.037096,0.285514,0.232543,0.075034,-0.038221,1,0,0,0,0,0,0,0,0,1
4,1.220459,-1.652851,-1.007877,0.835735,0.226095,0.818457,0.21215,1.266796,0.531908,0.212173,1,0,0,0,0,0,0,0,0,1


## Holdout set prediction intervals

In [20]:
## Prediction intervals for a categorical response
## This code is a modified version from lecture notes stat447-classification-predintervals.pdf in Python.
def category_pred_interval(prob_matrix, labels, levels=(0.5, 0.8), sep=""):
    """Build prediction sets ("intervals") for a categorical response.

    For every row of ``prob_matrix`` the categories are ranked from most to
    least probable, and the shortest leading run whose cumulative probability
    reaches a coverage level is kept. The kept labels are joined into a single
    string (most probable label first).

    Parameters
    ----------
    prob_matrix : array-like of shape (n, J)
        Each row is a probability mass function over the J categories.
    labels : sequence of length J
        Short names for the categories, in the column order of ``prob_matrix``.
    levels : sequence of float, default (0.5, 0.8)
        Coverage levels. Each level produces one entry in the result, keyed
        ``"pred<percent>"`` (0.5 -> ``"pred50"``, 0.8 -> ``"pred80"``). Levels
        that round to the same percent share a key; the later one wins.
    sep : str, default ""
        Separator placed between labels in each output string. The default
        reproduces plain concatenation; pass e.g. ``","`` when labels are
        longer than one character so the sets stay unambiguous.

    Returns
    -------
    dict
        Maps each ``"pred<percent>"`` key to a list of n strings.

    Raises
    ------
    ValueError
        If ``prob_matrix`` is not 2-D, if ``labels`` does not have one entry
        per column, or if a row's cumulative probability never reaches a
        requested level (e.g. a row of zeros or NaNs).
    """
    probs = np.asarray(prob_matrix, dtype=float)
    label_arr = np.asarray(labels)
    if probs.ndim != 2:
        raise ValueError("prob_matrix must be 2-dimensional (n cases x J categories)")
    if label_arr.ndim != 1 or label_arr.shape[0] != probs.shape[1]:
        raise ValueError("labels must contain exactly one entry per column of prob_matrix")

    # Floating-point slack: a cumulative sum such as 0.7 + 0.1 evaluates to
    # 0.7999999999999999, which an exact ``>= 0.8`` test would reject and
    # thereby pull one label too many into the set.
    tol = 1e-9

    levels = tuple(levels)
    keys = [f"pred{int(round(float(level) * 100))}" for level in levels]
    result = {key: [None] * probs.shape[0] for key in keys}

    for i, p in enumerate(probs):
        # Stable sort on the negated row: decreasing probability, with ties
        # resolved deterministically in favour of the earlier label.
        order = np.argsort(-p, kind="stable")
        cumulative = np.cumsum(p[order])  # cumulative sum from largest
        ordered_labels = label_arr[order]
        for key, level in zip(keys, levels):
            reached = np.flatnonzero(cumulative >= level - tol)
            if reached.size == 0:
                raise ValueError(
                    f"row {i} of prob_matrix never reaches cumulative probability {level}"
                )
            # First position reaching the level is the last label included.
            result[key][i] = sep.join(map(str, ordered_labels[: reached[0] + 1]))
    return result

In [21]:
## Hold out 30% of the rows for testing; the fixed random_state keeps the
## partition reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)

## Separate the predictors from the response ("Cover_Type") in each partition.
X_train, y_train = train_df.drop(columns=["Cover_Type"]), train_df["Cover_Type"]
X_test, y_test = test_df.drop(columns=["Cover_Type"]), test_df["Cover_Type"]

In [22]:
## KNN Classifier - with all features
# Fits a 1-nearest-neighbour classifier on every column of X_train.
# NOTE(review): with n_neighbors=1, predict_proba presumably puts all of the
# probability mass on the single neighbour's class (0/1 rows), so 50% and 80%
# prediction sets built from it would always be the same single label. The
# two cross-tabulations printed later in this notebook are identical, which is
# consistent with that. Confirm whether k=1 is intended for the interval
# analysis or whether a larger k should be used there.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

In [23]:
## KNN Classifier holdout set prediction intervals
# Class-membership probabilities for the held-out rows; the columns follow
# the order of knn.classes_.
prob_matrix = knn.predict_proba(X_test)
labels = knn.classes_

# Convert each probability row into its 50% and 80% prediction sets.
intervals = category_pred_interval(prob_matrix, labels)
pred50, pred80 = intervals["pred50"], intervals["pred80"]

# Pair the true labels with the predicted sets, one frame per coverage level.
data50 = {'True_Labels': y_test, 'Pred50': pred50}
data80 = {'True_Labels': y_test, 'Pred80': pred80}
df50, df80 = pd.DataFrame(data50), pd.DataFrame(data80)

# Cross-tabulate: rows are true classes, columns are predicted label sets.
cross_tab50 = pd.crosstab(index=df50['True_Labels'], columns=df50['Pred50'])
cross_tab80 = pd.crosstab(index=df80['True_Labels'], columns=df80['Pred80'])

print("50% Prediction Interval:")
print(cross_tab50)

print("80% Prediction Interval:")
print(cross_tab80)

50% Prediction Interval:
Pred50         1    2    3    4    5    6    7
True_Labels                                   
1            507  161    0    0   41    6  116
2            174  442   16    3  112   30   24
3              1   17  613   44   21  124    0
4              0    0   25  774    0   18    0
5             16   35   13    0  731    9    0
6              3   18   80   27   17  644    0
7             49    8    0    0    2    0  749
80% Prediction Interval:
Pred80         1    2    3    4    5    6    7
True_Labels                                   
1            507  161    0    0   41    6  116
2            174  442   16    3  112   30   24
3              1   17  613   44   21  124    0
4              0    0   25  774    0   18    0
5             16   35   13    0  731    9    0
6              3   18   80   27   17  644    0
7             49    8    0    0    2    0  749
