# K-Nearest Neighbors Classification

This is a user-friendly Jupyter Notebook version of a KNN classifier. It allows users to:

- Upload a dataset
- Choose parameters for K-Nearest Neighbors (KNN)
- Select cross validation methods
- View classification results (accuracy, sensitivity, specificity)
- Visualize the confusion matrix
- Visualize the score report for each sample


## Import Libraries

In [6]:
# Import required libraries
import numpy as np # Numerical operations
import pandas as pd  # For data manipulation and analysis
from sklearn.model_selection import LeaveOneOut, StratifiedKFold  # Cross-validation strategies
from sklearn.neighbors import KNeighborsClassifier  # KNN model
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score  # Evaluation metrics
import ipywidgets as widgets
from ipywidgets import IntSlider, Dropdown, Button, HBox, VBox, Label, Layout   # For UI Control
from IPython.display import display, clear_output, HTML
import io  # Provides tools for handling input/output
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from collections import defaultdict

# Shared output area: run_knn() prints its report and draws its plots inside
# this widget. It is cleared right after creation so it starts out empty.
output = widgets.Output()
output.clear_output()

## Interactive UI
* Click the "Upload File" button and choose the input .csv file.
* Then choose parameters for KNN and cross-validation methods
  1. Pick a value for K
  2. Select a distance metric method
  3. Choose a weight scheme on how the items in nearest neighbors are weighted
  4. Choose a cross validation method (leave-one-out or K-fold). If K-fold is chosen, select the number of folds.
* Click the green button to run KNN and check the results.

In [7]:
# Input file widget: accepts a single .csv file.
# run_knn() expects the file to contain the columns 'NAME' and 'CLASS';
# every other column is used as a feature.
uploader = widgets.FileUpload(
    accept='.csv',
    multiple=False,
    description="Upload File"
)

display(HTML("<b>Import CSV File</b>"))
display(uploader)

# K (number of neighbors)
k_widget = widgets.BoundedIntText(
    value=5,                  # Default value
    min=1, max=20, step=1,    # Allowed range and increment
    description='Number of nearest neighbors (K):',  # Label shown next to the input
    style={'description_width': 'initial'}  # Ensures the full description is visible (doesn't get truncated)
)

# Distance metric selection (Dropdown).
# The selected string is passed unchanged as `metric` to KNeighborsClassifier.
metric_widget = widgets.Dropdown(
    options=['euclidean', 'manhattan', 'chebyshev'],  # Available distance metrics
    value='euclidean',              # Default metric
    description='Distance Metric:', 
    style={'description_width': 'initial'}  
)

# Weight function selection (Dropdown).
# These names are notebook-specific; map_weights() and get_weight_array()
# translate them into actual neighbor weights.
weights_widget = widgets.Dropdown(
    options=['one', 'inverse', 'inversesquare'],  # Custom weight options
    value='one',                  # Default: every neighbor counts equally
    description='Weight Function:',  
    style={'description_width': 'initial'}  
)

# Dropdown to select cross-validation method
cv_widget = widgets.Dropdown(
    options=['Leave-One-Out', 'K-Fold'],      # Available CV methods
    value='K-Fold',                           # Default CV method
    description='Cross Validation Method:',   
    style={'description_width': 'initial'}    
)

# Numeric input for number of folds (only relevant for K-Fold)
fold_widget = widgets.BoundedIntText(
    value=5,           # Default value for folds
    min=2,             # Minimum allowed number of folds
    max=20,            # Maximum allowed number of folds
    step=1,            # Increment step
    description='Number of folds:',  # Label shown next to the input
    style={'description_width': 'initial'}  # Prevent label truncation
)

# Output area for dynamically showing/hiding fold input
cv_output = widgets.Output()

# Keep the fold-count input visible only while 'K-Fold' is the selected CV method
def update_cv_ui(change):
    """Refresh the area below the CV dropdown.

    Parameters
    ----------
    change : dict or None
        Change notification supplied by ``cv_widget.observe``; it is not
        inspected, the current widget value is read instead.
    """
    wants_folds = cv_widget.value == 'K-Fold'
    with cv_output:
        # Always start from an empty area so the input never shows up twice
        cv_output.clear_output()
        if wants_folds:
            display(fold_widget)
cv_widget.observe(update_cv_ui, names='value') # update_cv_ui() is run when cv_widget value changes

# Show the KNN hyperparameter inputs in the order they are listed in the instructions
display(k_widget,
        metric_widget,
        weights_widget
)
# Display the CV dropdown menu
display(cv_widget)
# Placeholder area right below the dropdown; update_cv_ui() fills or empties it
display(cv_output)
# Manually call update function to set the initial state (since default is 'K-Fold')
update_cv_ui(None)


def run_knn(b):
    """Button callback: load the uploaded CSV and run the KNN evaluation.

    Everything printed or plotted here is captured by the ``output`` widget.
    The CSV must contain the columns 'NAME' and 'CLASS'; all remaining
    columns are used as features.

    Parameters
    ----------
    b : ipywidgets.Button
        The button that triggered the callback (unused).
    """
    with output:  # All printed output will appear inside the 'output' widget
        output.clear_output()  # Clear previous output

        if not uploader.value:
            print("⚠️ No file uploaded.")
            return

        uploaded_file = uploader.value[0]           # Get the first and only uploaded file
        content = uploaded_file['content']          # Binary content of the file
        file_name = uploaded_file['name']

        try:
            data = pd.read_csv(io.BytesIO(content))     # Read into a DataFrame from memory buffer
            X = data.drop(columns=['NAME', 'CLASS'])    # Use all columns except 'NAME' and 'CLASS' as features
            y = data['CLASS']                           # Use 'CLASS' as the target label
        except Exception as e:
            # Report the problem and stop: without X and y there is nothing
            # to evaluate, and continuing would raise a NameError.
            print(f"❌ Error: {e}")
            return

        # Read hyperparameters and CV settings from widgets
        k = k_widget.value                          # Number of neighbors
        metric = metric_widget.value                # Distance metric
        weights = weights_widget.value              # Weighting method
        cv_method = cv_widget.value                 # Cross-validation method
        n = 1 if cv_method == 'Leave-One-Out' else fold_widget.value  # LOOCV if 1 split, K-fold if otherwise

        # Call the function with all collected inputs to run KNN model and evaluate
        print(f"\n\nRunning KNN with k={k}")
        print(f"Cross-Validation was performed using {cv_method}")
        print(f"Input file name: {file_name}\n")

        evaluate_knn(X, y, k, metric, weights, n)

# Run button: clicking it calls run_knn(), which reads the current widget values
run_button = widgets.Button(description='Run KNN', button_style='success')
run_button.on_click(run_knn)

# === Final Output Display ===
# The button is shown first; results written to `output` appear right below it.
display(
    run_button,
    output
)

FileUpload(value=(), accept='.csv', description='Upload File')

BoundedIntText(value=5, description='Number of nearest neighbors (K):', max=20, min=1, style=DescriptionStyle(…

Dropdown(description='Distance Metric:', options=('euclidean', 'manhattan', 'chebyshev'), style=DescriptionSty…

Dropdown(description='Weight Function:', options=('one', 'inverse', 'inversesquare'), style=DescriptionStyle(d…

Dropdown(description='Cross Validation Method:', index=1, options=('Leave-One-Out', 'K-Fold'), style=Descripti…

Output()

Button(button_style='success', description='Run KNN', style=ButtonStyle())

Output()

## Define functions
Below are helper functions for the notebook. If you have run all cells, you can skip them.

In [8]:
# Customized function for calculating weights
def map_weights(weight):
    """Translate a UI weight option into a value for the classifier's ``weights`` argument.

    Parameters
    ----------
    weight : str
        'one', 'inverse', or anything else (treated as inverse-square).

    Returns
    -------
    str or callable
        'uniform' for 'one', 'distance' for 'inverse'; otherwise a function
        mapping distances ``d`` to ``1 / (d**2 + 1e-5)``.
    """
    if weight == 'one':
        return 'uniform'
    if weight == 'inverse':
        return 'distance'

    def inverse_square(d):
        # The small constant keeps the weight finite when a distance is zero
        return 1 / (d**2 + 1e-5)

    return inverse_square

def get_weight_array(dists, k, weights):
    """Return the per-neighbor weights used when accumulating class scores.

    Parameters
    ----------
    dists : numpy.ndarray
        Distances from one sample to its neighbors.
    k : int
        Number of neighbors; only used to size the all-ones array.
    weights : str
        'one' -> all ones, 'inverse' -> 1 / (d + 1e-5),
        anything else -> 1 / (d**2 + 1e-5).

    Returns
    -------
    numpy.ndarray
        One weight per neighbor.
    """
    if weights == "one":
        return np.ones(k)

    # Both remaining schemes are reciprocals; only the damped quantity differs
    damped = dists if weights == "inverse" else dists**2
    return 1 / (damped + 1e-5)
       

In [9]:
# Calculate overall accuracy, sensitivity and selectivity for each class
def evaluate_knn(X, y, k, metric, weights, n):
    """Run cross-validated KNN and display the full report.

    Prints overall accuracy plus per-class sensitivity, selectivity and
    non-error rate for the training and validation predictions, then draws
    both confusion matrices and the per-sample score plots.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample.
    y : iterable
        Class label of each sample, in the same order as the rows of ``X``.
    k : int
        Number of nearest neighbors.
    metric : str
        Distance metric name forwarded to the classifier.
    weights : str
        Weight scheme name ('one', 'inverse' or 'inversesquare').
    n : int
        1 selects leave-one-out; any other value is the number of folds.
    """
    class_names = list(dict.fromkeys(y))       # This preserves the order of class names in the dataset
    label_to_index = {label: idx for idx, label in enumerate(class_names)}
    y_encoded = np.array([label_to_index[label] for label in y])            # Encode class labels as indices
    class_indices = list(range(len(class_names)))        # A range of indices for class names (eg. 0,1,2)

    # Get scores and predicted class labels for training and testing samples via cross validation
    val_true, val_pred, val_scores, train_score_sums, train_score_counts, train_labels = run_cross_validation(
        X, y_encoded, k, metric, weights, class_indices, n
    )
    # Average each sample's training scores over the folds in which it was a training sample
    train_score_avgs = train_score_sums / train_score_counts[:, np.newaxis]

    # Training prediction = class with the highest averaged score
    train_pred = np.argmax(train_score_avgs, axis=1)
    train_true = list(train_labels)

    train_cm = confusion_matrix(train_true, train_pred, labels=class_indices)
    val_cm = confusion_matrix(val_true, val_pred, labels=class_indices)

    train_acc = accuracy_score(train_true, train_pred)
    val_acc = accuracy_score(val_true, val_pred)

    # Take the number of correct predictions from the confusion-matrix diagonal.
    # Recomputing it as int(accuracy * n_samples) can truncate a value such as
    # 56.999... down to 56 and report one correct prediction too few.
    train_correct = int(np.trace(train_cm))
    val_correct = int(np.trace(val_cm))

    display(HTML('<div style="text-align:left;"><strong>Summary Report</strong></div>'))
    print(f"Training\t{train_acc * 100:.2f}% ({train_correct}/{len(train_true)})")
    print(f"Validation\t{val_acc * 100:.2f}% ({val_correct}/{len(val_true)})")

    train_recall = recall_score(train_true, train_pred, labels=class_indices, average=None, zero_division=0)
    train_specificity = compute_specificity(train_cm)

    val_recall = recall_score(val_true, val_pred, labels=class_indices, average=None, zero_division=0)
    val_specificity = compute_specificity(val_cm)

    for i, cls in enumerate(class_names):  # Loop through each class
        train_sens = train_recall[i] * 100  # Sensitivity = Recall
        val_sens = val_recall[i] * 100
        train_spec = train_specificity[i] * 100
        val_spec = val_specificity[i] * 100
        train_ner = (train_recall[i] + train_specificity[i]) / 2 * 100  # Non-error rate
        val_ner = (val_recall[i] + val_specificity[i]) / 2 * 100

        # Extract confusion matrix values (one-vs-rest counts for class i)
        tp_train = train_cm[i][i]
        fn_train = train_cm[i, :].sum() - tp_train
        fp_train = train_cm[:, i].sum() - tp_train
        tn_train = train_cm.sum() - (tp_train + fn_train + fp_train)

        tp_val = val_cm[i][i]
        fn_val = val_cm[i, :].sum() - tp_val
        fp_val = val_cm[:, i].sum() - tp_val
        tn_val = val_cm.sum() - (tp_val + fn_val + fp_val)

        # Print table with metrics
        print("")
        print(f"{cls}")
        print(f"{'':<10}{'Sensitivity':>20}{'Selectivity':>25}{'Non-Error Rate':>25}")
        print(f"{'Training':<10}{train_sens:>14.2f}% ({tp_train}/{tp_train+fn_train}){train_spec:>17.2f}% ({tn_train}/{tn_train+fp_train}){train_ner:>21.2f}%")
        print(f"{'Validation':<10}{val_sens:>14.2f}% ({tp_val}/{tp_val+fn_val}){val_spec:>17.2f}% ({tn_val}/{tn_val+fp_val}){val_ner:>21.2f}%")

    plot_confusion_matrix_heatmap_with_metrics(train_cm, class_names, train_true, train_pred, "\nTraining Confusion Matrix\n")  # Training confusion matrix
    plot_confusion_matrix_heatmap_with_metrics(val_cm, class_names, val_true, val_pred, "\nValidation Confusion Matrix\n")  # Validation confusion matrix

    display(HTML('<div style="text-align:center;"><strong>Score Report</strong></div>'))

    plot_scores(train_score_avgs, class_names, y_encoded, "Training")
    plot_scores(val_scores, class_names, y_encoded, "Validation")


def accumulate_scores(score_matrix, idx, neighbor_labels, weights_arr, class_indices):
    """Add one sample's weighted neighbor votes to its row of ``score_matrix``.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        Score table indexed as ``[sample, class]``; modified in place.
    idx : int
        Row of the sample being scored.
    neighbor_labels : iterable
        Class index of each neighbor.
    weights_arr : iterable
        Weight of each neighbor, aligned with ``neighbor_labels``.
    class_indices : iterable of int
        Class columns to update; neighbor labels outside this set are ignored.
    """
    # First total the weights per class, then add each total to the matrix once
    votes = {}
    for label, weight in zip(neighbor_labels, weights_arr):
        votes[label] = votes.get(label, 0.0) + weight

    for cls_idx in class_indices:
        score_matrix[idx, cls_idx] += votes.get(cls_idx, 0.0)

# Obtain classification results
def run_cross_validation(X, y_encoded, k, metric, weights, class_indices, n):
    """Cross-validate a KNN classifier and collect predictions and class scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (rows are selected with ``.iloc``).
    y_encoded : numpy.ndarray
        Class index of each sample.
    k : int
        Number of nearest neighbors.
    metric : str
        Distance metric name forwarded to the classifier.
    weights : str
        Weight scheme name understood by ``map_weights`` / ``get_weight_array``.
    class_indices : list of int
        All class indices; defines the columns of the score matrices.
    n : int
        1 selects leave-one-out; any other value is the number of
        stratified, shuffled folds (fixed random_state=42).

    Returns
    -------
    tuple
        ``(val_true, val_pred, sample_scores, train_score_sums,
        train_score_counts, train_labels)`` where the first two are lists in
        fold order, the score arrays are indexed by original sample position,
        ``train_score_counts[i]`` is the number of folds in which sample ``i``
        was a training sample, and ``train_labels[i]`` is its class index.
    """
    splitter = LeaveOneOut() if n == 1 else StratifiedKFold(n_splits=n, shuffle=True, random_state=42)
    model = KNeighborsClassifier(n_neighbors=k, metric=metric, weights=map_weights(weights))
    n_samples = len(X)
    n_classes = len(class_indices)

    val_true, val_pred = [], []
    sample_scores = np.zeros((n_samples, n_classes))
    train_score_sums = np.zeros((n_samples, n_classes))
    train_score_counts = np.zeros(n_samples)
    train_labels = [None] * n_samples

    for train_idx, test_idx in splitter.split(X, y_encoded):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        model.fit(X_train, y_train)
        val_pred.extend(model.predict(X_test))
        val_true.extend(y_test)

        # Validation scores: each held-out sample is scored exactly once
        neigh_dists_test, neigh_indices_test = model.kneighbors(X_test, n_neighbors=k)
        for test_i, neighbors, dists in zip(test_idx, neigh_indices_test, neigh_dists_test):
            weights_arr = get_weight_array(dists, k, weights)
            accumulate_scores(sample_scores, test_i, y_train[neighbors], weights_arr, class_indices)

        # Training scores: summed over every fold in which the sample was used
        # for fitting; the caller averages them with train_score_counts.
        # NOTE(review): X_train is passed explicitly, so each training sample
        # is presumably returned as one of its own neighbors (distance 0);
        # confirm this is the intended definition of the training score.
        neigh_dists_train, neigh_indices_train = model.kneighbors(X_train, n_neighbors=k)
        for train_i, neighbors, dists in zip(train_idx, neigh_indices_train, neigh_dists_train):
            weights_arr = get_weight_array(dists, k, weights)
            accumulate_scores(train_score_sums, train_i, y_train[neighbors], weights_arr, class_indices)
            train_score_counts[train_i] += 1
            train_labels[train_i] = y_encoded[train_i]
    return val_true, val_pred, sample_scores, train_score_sums, train_score_counts, train_labels

def compute_specificity(cm):
    """Return the one-vs-rest specificity TN / (TN + FP) of every class.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows = true class, columns = predicted class).

    Returns
    -------
    numpy.ndarray
        One specificity per class; 0.0 where a class has no negatives at all.
    """
    true_pos = np.diag(cm)
    false_pos = cm.sum(axis=0) - true_pos     # predicted as the class, but not the class
    false_neg = cm.sum(axis=1) - true_pos     # the class, but predicted as something else
    true_neg = cm.sum() - (true_pos + false_pos + false_neg)

    negatives = true_neg + false_pos
    specificity = np.zeros(len(cm), dtype=float)
    # Entries with a zero denominator keep the 0.0 they were initialised with
    np.divide(true_neg, negatives, out=specificity, where=negatives != 0)
    return specificity

In [10]:
# Plot confusion matrix heatmaps for training and validation datasets
def plot_confusion_matrix_heatmap_with_metrics(cm, labels, true_labels, pred_labels, title):
    """Draw a confusion matrix heatmap with an extra recall column and precision row.

    Parameters
    ----------
    cm : numpy.ndarray
        n x n confusion matrix (rows = true class, columns = predicted class).
    labels : sequence
        Class names used as tick labels; its length defines n.
    true_labels, pred_labels : sequence
        True and predicted class indices; recall and precision are computed
        with ``labels=range(n)``, so they must be encoded as 0..n-1.
    title : str
        Figure title.
    """
    total = cm.sum()
    n = len(labels)

    # Create annotated matrix (n+1)x(n+1)
    annotated_cm = np.empty((n + 1, n + 1), dtype=object)

    # Fill confusion matrix cells with "count/total" and the share of all samples
    for i in range(n):
        for j in range(n):
            count = cm[i, j]
            percent = round(count / total * 100)
            annotated_cm[i, j] = f"{count}/{total}\n({percent}%)"

    # Add recall (rightmost column)
    recalls = recall_score(true_labels, pred_labels, labels=range(n), average=None, zero_division=0)
    for i in range(n):
        correct = cm[i, i]
        total_true = cm[i].sum()
        recall_pct = round(recalls[i] * 100)
        # A class without true samples is shown with denominator 1 instead of 0
        annotated_cm[i, -1] = f"{correct}/{total_true if total_true else 1}\n({recall_pct}%)"

    # Add precision (bottom row)
    precisions = precision_score(true_labels, pred_labels, labels=range(n), average=None, zero_division=0)
    for j in range(n):
        correct = cm[j, j]
        total_pred = cm[:, j].sum()
        precision_pct = round(precisions[j] * 100)
        # A class that was never predicted is shown with denominator 1 instead of 0
        annotated_cm[-1, j] = f"{correct}/{total_pred if total_pred else 1}\n({precision_pct}%)"

    annotated_cm[-1, -1] = ""  # bottom-right corner

    # Extend the original confusion matrix with zeros to match shape
    extended_cm = np.zeros((n + 1, n + 1))
    extended_cm[:n, :n] = cm

    # Create extended label set: the extra column holds recall, the extra row precision
    xticklabels = list(labels) + ["Recall"]
    yticklabels = list(labels) + ["Precision"]

    # Plot
    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(extended_cm, annot=annotated_cm, fmt="", cmap="Greens",
                xticklabels=xticklabels, yticklabels=yticklabels, cbar=False,
                linewidths=0, linecolor='gray')
    ax.xaxis.set_ticks_position('top')       # Move ticks to top

    # Determine number of rows and columns
    nrows, ncols = extended_cm.shape
    
    # Highlight last row and last column
    for i in range(nrows):
        for j in range(ncols):
            if i == nrows - 1 or j == ncols - 1:
                # Calculate patch position: seaborn heatmap uses (col, row) as (x, y)
                rect = plt.Rectangle(
                    (j, i), 1, 1,
                    fill=True,
                    facecolor='white',   # White background sets the metric cells apart from the heatmap
                    edgecolor='black',
                    linewidth=0
                )
                ax.add_patch(rect)
    
    # Redraw annotations on top, otherwise the white patches would cover the text
    for t in ax.texts:
        t.set_zorder(10)
    
    plt.title(title)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.tight_layout()
    plt.show()

In [11]:
# Plot score plots for training and testing samples
def plot_scores(scores, class_names, true_labels, score_set):
    """Draw a stacked bar chart of the class scores of every sample.

    For each sample, the score of its true class is stacked upwards from
    zero and the scores of all other classes are stacked downwards.

    Parameters
    ----------
    scores : numpy.ndarray
        Score table indexed as ``[sample, class]``.
    class_names : sequence
        Class names, used for the legend; its length is the number of classes.
    true_labels : sequence of int
        True class index of each sample.
    score_set : str
        Name shown in the plot title (e.g. "Training").
    """
    n_items = scores.shape[0]
    n_classes = len(class_names)

    # One distinct colour per class, cycling if there are more classes than colours
    palette = plt.colormaps['tab10']
    class_colors = {c: palette(c % palette.N) for c in range(n_classes)}

    item_positions = range(n_items)
    labels = np.asarray(true_labels)[:n_items]

    # Current top of the upward stack and bottom of the downward stack per item
    stack_up = np.zeros(n_items)
    stack_down = np.zeros(n_items)

    plt.figure(figsize=(14, 6))

    for cls in range(n_classes):
        cls_scores = scores[:, cls]
        is_true_class = labels == cls

        # True-class scores point up, all others point down
        heights = np.where(is_true_class, cls_scores, -cls_scores)
        bases = np.where(is_true_class, stack_up, stack_down)

        # Move the stack edges so the next class is drawn beyond this one
        stack_up[is_true_class] += cls_scores[is_true_class]
        stack_down[~is_true_class] -= cls_scores[~is_true_class]

        # Draw bars for this class across all items
        plt.bar(item_positions, heights, bottom=bases,
                color=[class_colors[cls]] * n_items, width=0.6)

    plt.xlabel('Item ID')
    plt.ylabel('Score')
    plt.title(f'Scores per Item ({score_set})')

    # Custom legend
    legend_elements = [
        Patch(facecolor=class_colors[c], label=f'{class_names[c]}(Class {c})')
        for c in range(n_classes)
    ]
    plt.legend(handles=legend_elements, loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=n_classes)
    plt.grid(True, linestyle=':', linewidth=0.5)
    plt.tight_layout()
    plt.show()
    