In [9]:
import pandas as pd
import numpy as np

data = pd.read_csv("groceries.csv")
print(data.head())
print(data.info())

   Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
2       3   6353  8808     7684    2405              3516        7844      2
3       3  13265  1196     4221    6404               507        1788      1
4       3  22615  5410     7198    3915              1777        5185      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Region            440 non-null    int64
 1   Fresh             440 non-null    int64
 2   Milk              440 non-null    int64
 3   Grocery           440 non-null    int64
 4   Frozen            440 non-null    int64
 5   Detergents_Paper  440 non-null    int64
 6   Delicassen        440 non-null    int64
 7   class             440 no

In [10]:
print(data["class"].value_counts())

class
2    180
3    173
1     87
Name: count, dtype: int64


setp 1: pre processing

In [11]:
# checking if the dtaset has any missing values
print(data.isnull().sum())


Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64


Since the data is already in numeric format no need to perform encoding

In [12]:
from sklearn.preprocessing import StandardScaler

X = data.drop("class", axis=1).values
y = data["class"].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (352, 7)
X_test shape: (88, 7)
y_train shape: (352,)
y_test shape: (88,)


step 2: scratch implementation of KNN

In [23]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [24]:
from collections import Counter

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_neighbor_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]

In [25]:
knn_scratch = KNN(k=5)
knn_scratch.fit(X_train, y_train)
y_pred_scratch = knn_scratch.predict(X_test)

accuracy_scratch = np.mean(y_pred_scratch == y_test)
print("Scratch KNN Accuracy:", accuracy_scratch)

Scratch KNN Accuracy: 0.8636363636363636


step 3: KNN sklearn

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Sklearn KNN Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Sklearn KNN Accuracy: 0.8636363636363636
Confusion Matrix:
 [[12  2  2]
 [ 4 32  3]
 [ 0  1 32]]
Classification Report:
               precision    recall  f1-score   support

           1       0.75      0.75      0.75        16
           2       0.91      0.82      0.86        39
           3       0.86      0.97      0.91        33

    accuracy                           0.86        88
   macro avg       0.84      0.85      0.84        88
weighted avg       0.87      0.86      0.86        88

