### KNN Regression From Scratch – Diamond Price Prediction

In [69]:
#  Importing required libraries for data loading and manipulation
import pandas as pd
import numpy as np

##### Load the Diamonds dataset

In [70]:
# Loading the diamonds.csv dataset into pandas DataFrame to check the data

df=pd.read_csv(fr"diamonds.csv")
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [71]:
df.shape

(53940, 10)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [73]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

##### Separate input features (X) and target (price)

In [74]:
# Dropping price column from features and storing it separately as target variable(y)

X=df.drop('price',axis=1) 

y=df['price'] # target variable

In [75]:
from sklearn.model_selection import train_test_split

##### Split data into Train and Test (75:25)

In [76]:
# Splitting data to avoid overfitting and evaluate on unseen data

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25, random_state=42 )

In [77]:
print('Shape of Training Feature DataSet =',X_train.shape)
print('Shape of Training Target DataSet =',y_train.shape)

print('Shape of testing Feature DataSet =',X_test.shape)
print('Shape of Testing Target DataSet =',y_test.shape)

Shape of Training Feature DataSet = (40455, 9)
Shape of Training Target DataSet = (40455,)
Shape of testing Feature DataSet = (13485, 9)
Shape of Testing Target DataSet = (13485,)


In [78]:
# Taking 2000 random samples from train data manually to make the scratch KNN run faster

X_train = X_train.sample(5000, random_state=42)
y_train = y_train.loc[X_train.index]

##### Identify categorical and numerical columns

In [46]:
# Listing categorical columns for encoding and numeric columns for scaling

categorical_columns=["cut","color","clarity"]

numerical_columns=X.columns.difference(categorical_columns)

print("Categorical Columns:",categorical_columns)
print("numeric columns:",numerical_columns)

Categorical Columns: ['cut', 'color', 'clarity']
numeric columns: Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [47]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

##### Build preprocessing pipeline (scale numbers + encode categories)

In [None]:
# Fitting preprocessing pipeline on training data (scaling numeric + encoding categorical) using ColumnTransformer
# so all features become numeric and ready for KNN distance calculations.

preprocessing=ColumnTransformer(transformers=[('nums',StandardScaler(),numerical_columns),('catg',OneHotEncoder(),categorical_columns)])

##### Fit on train and transform both train and test

In [49]:
# Fitting preprocessing only on train data to avoid data leakage, then applying same rules to test data

X_train_processed=preprocessing.fit_transform(X_train)

X_test_processed=preprocessing.transform(X_test)

In [None]:
# Also tested scaling and One-Hot Encoding manually on selected columns to understand how preprocessing works internally.
# This was done step-by-step before combining the final feature matrix.

scale = StandardScaler()
X_train_numeric=scale.fit_transform(X_train[numerical_columns])
X_train_numeric=pd.DataFrame(X_train_numeric)  
X_train_numeric

Unnamed: 0,0,1,2,3,4,5
0,1.520198,-0.857361,0.747757,1.516961,1.493731,1.353417
1,-0.604589,0.064380,-1.121402,-0.490394,-0.533053,-0.489823
2,0.544531,0.135283,0.280467,0.726735,0.653802,0.700301
3,0.436123,-0.715555,0.280467,0.681320,0.653802,0.569677
4,-0.864767,0.489799,-0.654112,-0.926381,-0.916499,-0.852665
...,...,...,...,...,...,...
995,-0.214322,-0.573748,-0.186822,-0.008992,0.005596,-0.068926
996,-0.886448,0.915218,-0.186822,-1.035378,-0.980406,-0.896206
997,0.197627,0.277090,-0.654112,0.436078,0.379912,0.439054
998,-1.059900,-0.006523,0.747757,-1.235205,-1.281685,-1.230022


In [51]:
X_test_numeric=scale.transform(X_test[numerical_columns])
X_test_numeric=pd.DataFrame(X_test_numeric)  
X_test_numeric

Unnamed: 0,0,1,2,3,4,5
0,-1.233352,0.277090,-0.654112,-1.634860,-1.619482,-1.578350
1,-0.496181,-1.211877,-0.186822,-0.299650,-0.323071,-0.431768
2,-0.886448,0.277090,-1.121402,-0.917298,-0.943888,-0.881693
3,-0.821404,-0.644652,-0.186822,-0.771969,-0.806943,-0.838152
4,1.606924,0.418896,-1.121402,1.516961,1.457212,1.527581
...,...,...,...,...,...,...
13480,-0.539544,-0.999167,0.747757,-0.426813,-0.396108,-0.504337
13481,1.780376,-1.353683,-0.654112,1.635041,1.703713,1.440499
13482,0.869753,0.489799,-1.588692,0.990144,0.936821,1.019602
13483,0.436123,0.347993,0.747757,0.581407,0.617283,0.642246


In [52]:
encoder=OneHotEncoder(sparse_output=False)
X_train_categoric=encoder.fit_transform(X_train[categorical_columns])
X_train_categoric=pd.DataFrame(X_train_categoric)  
X_train_categoric

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
997,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [53]:
X_test_categoric=encoder.transform(X_test[categorical_columns])
X_test_categoric=pd.DataFrame(X_test_categoric)  
X_test_categoric

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13481,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13482,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13483,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Combining scaled numeric features and encoded categorical features into final matrices
# which are used as input for scratch KNN regression model.

X_train_after_pre=np.hstack([X_train_numeric,X_train_categoric])
X_train_after_pre

array([[ 1.52019838, -0.85736112,  0.74775701, ...,  1.        ,
         0.        ,  0.        ],
       [-0.60458863,  0.06438008, -1.12140187, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.54453088,  0.13528325,  0.28046729, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.19762687,  0.27708959, -0.65411215, ...,  0.        ,
         0.        ,  0.        ],
       [-1.05990013, -0.00652309,  0.74775701, ...,  0.        ,
         0.        ,  1.        ],
       [-1.05990013,  0.7025086 , -1.5886916 , ...,  0.        ,
         0.        ,  0.        ]])

In [55]:
X_test_after_pre=np.hstack([X_test_numeric,X_test_categoric])
X_test_after_pre

array([[-1.23335214,  0.27708959, -0.65411215, ...,  0.        ,
         1.        ,  0.        ],
       [-0.49618113, -1.21187697, -0.18682243, ...,  0.        ,
         0.        ,  1.        ],
       [-0.88644813,  0.27708959, -1.12140187, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.86975338,  0.48979909, -1.5886916 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.43612338,  0.34799275,  0.74775701, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.50116788,  0.77341177,  0.74775701, ...,  0.        ,
         0.        ,  0.        ]])

### Preprocessing Done in Two Ways
- Built a pipeline using **ColumnTransformer** to scale numeric columns and one-hot encode categorical columns for model training.
- Also performed **manual scaling and encoding** on the same columns separately to understand how mean/std and category vectors are generated.
- Finally, both processed outputs were combined into numeric matrices and passed to the scratch KNN regression model.


##### Define Euclidean distance function

In [56]:
# Creating manual function to calculate straight-line distance between two diamonds

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1-x2)**2))

##### Build KNN regression logic from scratch

In [57]:
# Looping test data and finding k nearest train prices, then averaging them to predict price

def knn_prediction(X_train, y_train, X_test,k):
    predicted_prices=[]
    for i in X_test:
        distance=[]
        for j in range(len(X_train)):
            dist=euclidean_distance(i, X_train[j])
            distance.append((dist,y_train.iloc[j]))
        distance.sort(key=lambda x: x[0])
        nearest_neighbors = distance[:k]
        average_price = np.mean([price for dstnc, price in nearest_neighbors])
        predicted_prices.append(average_price)  
    return np.array(predicted_prices)


##### Run scratch KNN and generate predictions

In [66]:
# Calling the manual KNN function with k=5 to predict test data prices

k = 5
manual_KNN_prediction = knn_prediction(X_train_after_pre, y_train,  X_test_after_pre,  k)

In [67]:
print(pd.DataFrame(manual_KNN_prediction))

             0
0        872.8
1       2038.6
2       1743.6
3       1124.8
4      10239.6
...        ...
13480   1900.6
13481   9247.2
13482   6902.6
13483   4051.8
13484   3936.6

[13485 rows x 1 columns]


In [63]:
len(y_pred_scratch)

13485

##### Evaluate scratch model performance

In [64]:
# Checking error using MAE and RMSE to know how close predictions are to real values
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae=mean_squared_error(y_test,y_pred_scratch)
rmse = np.sqrt(mean_squared_error(y_test,y_pred_scratch))

In [65]:
print("manual KNN MAE:", mae)
print("Manual KNN RMSE:", rmse)

manual KNN MAE: 1553033.8438650353
Manual KNN RMSE: 1246.2077851887443


##### Train KNN using sklearn in-built algorithm

In [24]:
# Training sklearn KNN regressor with same k value for fair comparison

from sklearn.neighbors import KNeighborsRegressor

knn_sklearn = KNeighborsRegressor(n_neighbors=5)
knn_sklearn.fit(X_train_processed, y_train)

y_pred_sklearn = knn_sklearn.predict(X_test_processed)


In [25]:
y_pred_sklearn

array([ 759. , 1930.8, 1770.2, ..., 6361.8, 4157.2, 3347.2])

##### Evaluate sklearn model performance

In [26]:
# Evaluating sklearn predictions using MAE and RMSE metrics

mae_sk = mean_absolute_error(y_test, y_pred_sklearn)
rmse_sk = np.sqrt(mean_squared_error(y_test, y_pred_sklearn))

print("Sklearn KNN MAE:", mae_sk)
print("Sklearn KNN RMSE:", rmse_sk)


Sklearn KNN MAE: 683.4076232851316
Sklearn KNN RMSE: 1197.2283727307984


##### Compare scratch vs in-built model results

In [27]:
# Creating comparison table to show difference between manual and sklearn KNN performance

comparison = pd.DataFrame({
    "Model": ["Scratch KNN", "Sklearn KNN"],
    "MAE": [mae, mae_sk],
    "RMSE": [rmse, rmse_sk]})
comparison

Unnamed: 0,Model,MAE,RMSE
0,Scratch KNN,1433356.0,1197.228373
1,Sklearn KNN,683.4076,1197.228373


### Conclusion
- Built KNN regression from scratch and tested it on Diamonds dataset.
- Manual model MAE and sklearn model MAE are different but close, which is normal.
- Scratch KNN helped me understand distance-based prediction clearly.
- Both models generated 13,485 predictions, so data preprocessing was correct.