<a href="https://colab.research.google.com/github/AfiyaHumaira/KNN-from-Scratch/blob/main/KNN_from_Scratch_Iris_and_News_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load required libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [2]:
# Euclidean distance calculation
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two indexable sequences.

    Components are paired by position over ``range(len(a))``, so ``b`` must
    provide at least ``len(a)`` elements; any extra elements of ``b`` are
    not read.
    """
    squared_sum = sum((a[idx] - b[idx]) ** 2 for idx in range(len(a)))
    return squared_sum ** 0.5

In [3]:
# Custom KNN prediction
def knn_predict(x_test, X_train, y_train, k):
    """Predict the label of ``x_test`` by majority vote of its k nearest training points.

    Args:
        x_test: Feature vector to classify.
        X_train: Sequence of training feature vectors.
        y_train: Sequence of (hashable) labels, aligned with ``X_train``.
        k: Number of neighbours that vote. If ``k`` exceeds the number of
            training points, every training point votes.

    Returns:
        The most frequent label among the k nearest neighbours. When several
        labels share the top count, the one whose first vote comes from the
        closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, the training set is empty, or
            ``X_train`` and ``y_train`` have different lengths.
    """
    # A negative k would otherwise slice "all but the last |k|" neighbours.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train and y_train differ in length: {len(X_train)} != {len(y_train)}"
        )

    neighbours = [
        (euclidean_distance(x_test, row), label)
        for row, label in zip(X_train, y_train)
    ]
    # Sort on distance only: labels are never compared, and the stable sort
    # keeps equidistant points in training order.
    neighbours.sort(key=lambda pair: pair[0])

    votes = {}
    for _, label in neighbours[:k]:
        votes[label] = votes.get(label, 0) + 1

    # dicts preserve insertion order and max() returns the first maximum, so
    # ties are resolved deterministically in favour of the nearest neighbour
    # instead of depending on set iteration order.
    return max(votes, key=votes.get)


In [4]:
# Accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true label.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.

    Returns:
        A float in ``[0, 1]``. Empty input yields ``0.0`` rather than a
        division-by-zero error.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    total = len(y_true)
    if total != len(y_pred):
        raise ValueError(
            f"y_true and y_pred differ in length: {total} != {len(y_pred)}"
        )
    if total == 0:
        return 0.0

    correct = sum(1 for actual, predicted in zip(y_true, y_pred) if actual == predicted)
    return correct / total


In [5]:
# Confusion Matrix
def confusion_matrix(y_true, y_pred, num_classes):
    """Build a ``num_classes`` x ``num_classes`` table of prediction counts.

    Labels are used directly as list indices: the row index is the true
    label and the column index is the predicted label.
    """
    counts = [[0] * num_classes for _ in range(num_classes)]
    for idx in range(len(y_true)):
        actual, predicted = y_true[idx], y_pred[idx]
        counts[actual][predicted] += 1
    return counts

In [6]:
# Precision, Recall, F1
def metrics(cm):
    """Compute per-class precision, recall and F1 from a square confusion matrix.

    Args:
        cm: Square matrix (list of lists) where ``cm[i][j]`` counts samples
            with true class ``i`` that were predicted as class ``j``.

    Returns:
        A tuple ``(precision, recall, f1)`` of lists, one entry per class.
        A score whose denominator is zero (class never predicted, class
        absent from the data, or precision + recall == 0) is reported as 0.0.
    """
    n_classes = len(cm)
    precision = []
    recall = []
    f1 = []
    for i in range(n_classes):
        tp = cm[i][i]
        # Column i without the diagonal: other classes predicted as i.
        fp = sum(cm[j][i] for j in range(n_classes) if j != i)
        # Row i without the diagonal: class i predicted as something else.
        fn = sum(cm[i][j] for j in range(n_classes) if j != i)

        # Guard zero denominators explicitly instead of adding an epsilon,
        # which would bias every score (a perfect class would score < 1.0).
        pre = tp / (tp + fp) if (tp + fp) else 0.0
        rec = tp / (tp + fn) if (tp + fn) else 0.0
        f1_score = 2 * pre * rec / (pre + rec) if (pre + rec) else 0.0

        precision.append(pre)
        recall.append(rec)
        f1.append(f1_score)
    return precision, recall, f1

In [9]:
# Run KNN from scratch
def run_custom_knn(X, y, dataset_name, class_count,
                   splits=(0.2, 0.3, 0.4, 0.5), k_values=range(1, 11), random_state=1):
    """Grid-search the from-scratch KNN over test sizes and k, then report the best setting.

    Args:
        X: Sequence of feature vectors.
        y: Sequence of integer class labels (used as confusion-matrix indices).
        dataset_name: Name shown in the printed header.
        class_count: Number of classes; sets the confusion-matrix size.
        splits: Test-set fractions to try.
        k_values: Neighbour counts to try for each split.
        random_state: Seed passed to ``train_test_split`` so splits are repeatable.

    Returns:
        ``(best_k, best_split)`` for the combination with the highest accuracy.
        The first combination reaching that accuracy is kept.

    NOTE: k and the split are selected using accuracy on the same test set
    that is reported afterwards, so the printed metrics are optimistic.
    """
    best_k = 1
    best_split = 0.3
    best_accuracy = 0

    print(f"\n--- {dataset_name} - Custom KNN ---")
    for split in splits:
        # The split does not depend on k, so build it once per test size
        # instead of once per (split, k) pair.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split, random_state=random_state
        )
        for k in k_values:
            y_pred = [knn_predict(x, X_train, y_train, k) for x in X_test]
            acc = accuracy(y_test, y_pred)
            if acc > best_accuracy:
                best_accuracy = acc
                best_k = k
                best_split = split

    print(f"Best K = {best_k}, Best Split = {best_split}, Accuracy = {round(best_accuracy, 4)}")

    # Final evaluation: the same seed reproduces the winning split exactly.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=best_split, random_state=random_state
    )
    final_preds = [knn_predict(x, X_train, y_train, best_k) for x in X_test]

    cm = confusion_matrix(y_test, final_preds, class_count)
    prec, rec, f1 = metrics(cm)

    print("Confusion Matrix:")
    for row in cm:
        print(row)
    for i in range(class_count):
        print(f"Class {i}: Precision={round(prec[i],4)}, Recall={round(rec[i],4)}, F1={round(f1[i],4)}")

    return best_k, best_split

In [10]:
# Run Sklearn KNN
def run_sklearn_knn(X, y, dataset_name, k, split):
    """Fit scikit-learn's KNN with the given ``k`` and test fraction and print its report.

    Uses ``random_state=1`` for the train/test split and prints the
    ``classification_report`` for the held-out part.
    """
    print(f"\n--- {dataset_name} - Sklearn KNN ---")
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=split, random_state=1
    )
    classifier = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)
    print(classification_report(test_y, classifier.predict(test_X)))


In [11]:
# Dataset 1: Iris
# ---------------------------
# Features and targets are converted to plain Python lists before use.
iris = load_iris()
iris_X, iris_y = iris.data.tolist(), iris.target.tolist()

# Search k / test size with the scratch KNN (3 classes), then run
# scikit-learn's KNN with the selected k and split.
k_iris, split_iris = run_custom_knn(iris_X, iris_y, "Iris Dataset", 3)
run_sklearn_knn(iris_X, iris_y, "Iris Dataset", k_iris, split_iris)


--- Iris Dataset - Custom KNN ---
Best K = 1, Best Split = 0.2, Accuracy = 1.0
Confusion Matrix:
[11, 0, 0]
[0, 13, 0]
[0, 0, 6]
Class 0: Precision=1.0, Recall=1.0, F1=1.0
Class 1: Precision=1.0, Recall=1.0, F1=1.0
Class 2: Precision=1.0, Recall=1.0, F1=1.0

--- Iris Dataset - Sklearn KNN ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [13]:
# Dataset 2: News
# ---------------------------
# Absolute path: the CSV must exist at /content (adjust the path otherwise).
news_df = pd.read_csv("/content/news_dataset_200.csv")

# Encode the text categories as integer class ids. Series.map yields NaN for
# any value missing from the mapping, which would turn the column into floats
# and break the list indexing in confusion_matrix, so fail fast instead.
label_to_id = {"sports": 0, "politics": 1}
mapped_labels = news_df["label"].map(label_to_id)
if mapped_labels.isna().any():
    unmapped = sorted(news_df.loc[mapped_labels.isna(), "label"].astype(str).unique())
    raise ValueError(f"Unmapped or missing news labels: {unmapped}")
news_df["label"] = mapped_labels

# Bag-of-words counts, densified so rows can be indexed element by element.
vectorizer = CountVectorizer()
news_X = vectorizer.fit_transform(news_df["message"]).toarray()
news_y = news_df["label"].tolist()

# The class count follows the mapping so the two cannot drift apart.
k_news, split_news = run_custom_knn(news_X, news_y, "News Dataset", len(label_to_id))
run_sklearn_knn(news_X, news_y, "News Dataset", k_news, split_news)


--- News Dataset - Custom KNN ---
Best K = 1, Best Split = 0.2, Accuracy = 1.0
Confusion Matrix:
[23, 0]
[0, 17]
Class 0: Precision=1.0, Recall=1.0, F1=1.0
Class 1: Precision=1.0, Recall=1.0, F1=1.0

--- News Dataset - Sklearn KNN ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

