# Assignment 1
K-Nearest Neighbors (KNN): Classifier & Regressor on Iris Dataset

In [1]:
! pip install --upgrade pip
! pip install numpy scipy matplotlib scikit-learn pandas pytest

Collecting pip
  Using cached pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.3-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
C:\Users\Black Parrot\miniconda3\python.exe -m pip install --upgrade pip


Collecting pytest
  Downloading pytest-9.0.2-py3-none-any.whl.metadata (7.6 kB)
Collecting iniconfig>=1.0.1 (from pytest)
  Downloading iniconfig-2.3.0-py3-none-any.whl.metadata (2.5 kB)
Downloading pytest-9.0.2-py3-none-any.whl (374 kB)
Downloading iniconfig-2.3.0-py3-none-any.whl (7.5 kB)
Installing collected packages: iniconfig, pytest
Successfully installed iniconfig-2.3.0 pytest-9.0.2


In [5]:
# Iris_KNN_solution.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier as SKKNNClassifier, KNeighborsRegressor as SKKNNRegressor
from knn_from_scratch import KNNClassifierScratch, KNNRegressorScratch

## Objectives

Understand KNN algorithm principles for Classification & Regression

Implement from scratch + compare with scikit-learn

Analyze impact of distance metrics, feature scaling & hyperparameters

Perform visualization, evaluation & reporting

## Dataset

Iris dataset from sklearn.datasets 

150 samples → 4 numeric features → 3 classes

In [8]:
# 1) Load dataset & EDA
data = load_iris()
X = data['data']
y = data['target']
feature_names = data['feature_names']

df = pd.DataFrame(X, columns=feature_names)
df['target'] = y
print(df.describe())
print(df['target'].value_counts())

       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  
target
0    50
1    50
2    50
Name: count, dtype: int64


## Tasks

### Part A: Data Understanding & Preprocessing

Load the dataset & convert into a DataFrame

Summary statistics + class distribution

Visualizations:

Pairplot with class coloring

Correlation heatmap

Standardization (train-test split applied before scaling)

In [9]:
# Quick pairplot-like scatter matrix (matplotlib)
pd.plotting.scatter_matrix(df.iloc[:, :4], figsize=(10,10), diagonal='hist')
plt.suptitle('Iris pairwise plots')
plt.tight_layout()
plt.savefig('iris_pairwise.png')
plt.close()

In [10]:
# 2) Train/test split and standardize (scaler fitted on train only)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)


### Part B: From-Scratch KNN

You must implement:

KNN Classifier

KNN Regressor

With the following features:

Euclidean & Manhattan distance

Uniform & Distance-based weighting

Tie-breaking rules

Methods to include:

fit(X_train, y_train)
predict(X_test, k=3, metric='euclidean', weights='uniform')

In [11]:
# 3) From-scratch KNN classifier
knn_scratch = KNNClassifierScratch()
knn_scratch.fit(X_train_s, y_train)
y_pred_scratch = knn_scratch.predict(X_test_s, k=5, metric='euclidean', weights='uniform')
print("Scratch KNN accuracy (k=5):", accuracy_score(y_test, y_pred_scratch))

Scratch KNN accuracy (k=5): 0.9333333333333333


In [12]:
# 4) scikit-learn KNN classifier
clf = SKKNNClassifier(n_neighbors=5, weights='uniform', p=2)
clf.fit(X_train_s, y_train)
y_pred_sk = clf.predict(X_test_s)
print("sklearn KNN accuracy (k=5):", accuracy_score(y_test, y_pred_sk))
print("Confusion matrix (sklearn):\\n", confusion_matrix(y_test, y_pred_sk))


sklearn KNN accuracy (k=5): 0.9333333333333333
Confusion matrix (sklearn):\n [[10  0  0]
 [ 0 10  0]
 [ 0  2  8]]


### Part C: Classification Experiments

Use:

k ∈ {1,3,5,7,9,11}

Metrics: Manhattan (p=1), Euclidean (p=2)

Weights: uniform / distance

Evaluate using:
✓ Accuracy
✓ Confusion matrix
✓ Cross-validation (5-Fold)

Plot Accuracy vs k (for each metric & weight type)

In [13]:
# 5) CV over k and weights
param_grid = {'n_neighbors': [1,3,5,7,9,11], 'weights': ['uniform', 'distance'], 'p':[1,2]}
grid = GridSearchCV(SKKNNClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(scaler.transform(X), y)
print("GridSearch best:", grid.best_params_, "best_score:", grid.best_score_)

GridSearch best: {'n_neighbors': 7, 'p': 2, 'weights': 'distance'} best_score: 0.9666666666666666


### Part D: Regression Experiments

Regression target options (choose one):

1️⃣ Predict Petal Length from other features

2️⃣ Create synthetic target:

y=0.5⋅sepal_length+noise

Evaluate:

- RMSE

- R² score

- Plot RMSE vs k

In [16]:
# 6) Regression: predict petal length (index 2) from others
X_reg = X[:, [0,1,3]]
y_reg = X[:, 2]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
scaler_r = StandardScaler().fit(Xr_train)
Xr_train_s = scaler_r.transform(Xr_train); Xr_test_s = scaler_r.transform(Xr_test)

reg_scratch = KNNRegressorScratch()
reg_scratch.fit(Xr_train_s, yr_train)
yreg_pred_scratch = reg_scratch.predict(Xr_test_s, k=5, metric='euclidean', weights='uniform')
print("Regression RMSE (scratch, k=5):", mean_squared_error(yr_test, yreg_pred_scratch))
print("Regression R2 (scratch, k=5):", r2_score(yr_test, yreg_pred_scratch))

reg_sk = SKKNNRegressor(n_neighbors=5, weights='uniform', p=2); reg_sk.fit(Xr_train_s, yr_train)
yreg_pred_sk = reg_sk.predict(Xr_test_s)
print("Regression RMSE (sklearn, k=5):", mean_squared_error(yr_test, yreg_pred_sk))
print("Regression R2 (sklearn, k=5):", r2_score(yr_test, yreg_pred_sk))

Regression RMSE (scratch, k=5): 0.1024933333333333
Regression R2 (scratch, k=5): 0.9687271371179631
Regression RMSE (sklearn, k=5): 0.10249333333333328
Regression R2 (sklearn, k=5): 0.9687271371179632


### Part E: Visualization

2D decision boundaries for at least two feature pairs

Scatter of Actual vs Predicted (regression)

In [None]:
# Visualize decision boundary for two feature pairs (2D)
# We'll plot on a grid for pair (sepal length, petal length) and (petal width, petal length)
import numpy as np
def plot_decision_boundary(X_train2, y_train, model_predict_fn, filename, title):
    # X_train2: (n,2)
    margin = 0.5
    x_min, x_max = X_train2[:,0].min()-margin, X_train2[:,0].max()+margin
    y_min, y_max = X_train2[:,1].min()-margin, X_train2[:,1].max()+margin
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    pred = model_predict_fn(grid)
    zz = pred.reshape(xx.shape)
    plt.figure(figsize=(6,5))
    plt.contourf(xx, yy, zz, alpha=0.3)
    plt.scatter(X_train2[:,0], X_train2[:,1], c=y_train, edgecolor='k', s=40)
    plt.title(title)
    plt.savefig(filename); plt.close()

In [19]:
# Example: use sklearn KNN trained on two features (after scaling those features)
# Pair 1: sepal length (0) and petal length (2)
pair_idxs = (0,2)
X_pair = X[:, list(pair_idxs)]
Xtr_p, Xte_p, ytr_p, yte_p = train_test_split(X_pair, y, test_size=0.2, random_state=1, stratify=y)
scaler_pair = StandardScaler().fit(Xtr_p)
Xtr_ps = scaler_pair.transform(Xtr_p)
clf2 = SKKNNClassifier(n_neighbors=5)
clf2.fit(Xtr_ps, ytr_p)
def predict_fn(grid):
    return clf2.predict(scaler_pair.transform(grid))
plot_decision_boundary(Xtr_p, ytr_p, predict_fn, 'iris_decision_pair1.png', 'Decision boundary (sepal_len vs petal_len)')



In [18]:
# Pair 2: petal width (3) vs petal length (2)
pair_idxs = (3,2)
X_pair = X[:, list(pair_idxs)]
Xtr_p, Xte_p, ytr_p, yte_p = train_test_split(X_pair, y, test_size=0.2, random_state=1, stratify=y)
scaler_pair = StandardScaler().fit(Xtr_p)
Xtr_ps = scaler_pair.transform(Xtr_p)
clf2 = SKKNNClassifier(n_neighbors=5)
clf2.fit(Xtr_ps, ytr_p)
def predict_fn2(grid):
    return clf2.predict(scaler_pair.transform(grid))
plot_decision_boundary(Xtr_p, ytr_p, predict_fn2, 'iris_decision_pair2.png', 'Decision boundary (petal_width vs petal_len)')

print("Plots saved: iris_pairwise.png, iris_decision_pair1.png, iris_decision_pair2.png")


Plots saved: iris_pairwise.png, iris_decision_pair1.png, iris_decision_pair2.png
