<a href="https://colab.research.google.com/github/Aruuu62/Flower-and-News-Classification-with-Custom-Evaluation-Metrics-knn/blob/main/KNN_From_Scratch_LabReport02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KNN from Scratch: Iris and News Classification
Lab Report 02 for CSE312


In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as SklearnKNN


In [2]:
class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

    def predict(self, X_test):
        predictions = []
        for x in X_test:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            most_common = Counter(k_nearest_labels).most_common(1)
            predictions.append(most_common[0][0])
        return np.array(predictions)


In [3]:
def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

def confusion_matrix(y_true, y_pred, labels):
    matrix = np.zeros((len(labels), len(labels)), dtype=int)
    label_index = {label: idx for idx, label in enumerate(labels)}
    for t, p in zip(y_true, y_pred):
        matrix[label_index[t]][label_index[p]] += 1
    return matrix

def precision_recall_f1(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels)
    precision, recall, f1 = [], [], []
    for i in range(len(labels)):
        TP = cm[i][i]
        FP = sum(cm[j][i] for j in range(len(labels)) if j != i)
        FN = sum(cm[i][j] for j in range(len(labels)) if j != i)
        p = TP / (TP + FP) if (TP + FP) else 0
        r = TP / (TP + FN) if (TP + FN) else 0
        f = 2 * p * r / (p + r) if (p + r) else 0
        precision.append(p)
        recall.append(r)
        f1.append(f)
    return precision, recall, f1


In [4]:
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, list(set(y))))
p, r, f = precision_recall_f1(y_test, y_pred, list(set(y)))
print("Precision:", p)
print("Recall:", r)
print("F1 Score:", f)


Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Precision: [np.float64(1.0), np.float64(1.0), np.float64(1.0)]
Recall: [np.float64(1.0), np.float64(1.0), np.float64(1.0)]
F1 Score: [np.float64(1.0), np.float64(1.0), np.float64(1.0)]


In [5]:
# Simulated small dataset
news_data = pd.DataFrame({
    'text': [
        "government election policy parliament",
        "team won the final match",
        "new rules announced by the government",
        "sports championship live score",
        "minister held a press conference",
        "player breaks world record in sprint"
    ],
    'label': ['politics', 'sports', 'politics', 'sports', 'politics', 'sports']
})

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(news_data['text']).toarray()
le = LabelEncoder()
y = le.fit_transform(news_data['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

knn_news = KNNClassifier(k=3)
knn_news.fit(X_train, y_train)
y_pred_news = knn_news.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_news))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_news, list(set(y))))
p, r, f = precision_recall_f1(y_test, y_pred_news, list(set(y)))
print("Precision:", p)
print("Recall:", r)
print("F1 Score:", f)


Accuracy: 0.5
Confusion Matrix:
 [[1 0]
 [1 0]]
Precision: [np.float64(0.5), 0]
Recall: [np.float64(1.0), np.float64(0.0)]
F1 Score: [np.float64(0.6666666666666666), 0]


In [6]:
# Comparison with sklearn
sk_knn = SklearnKNN(n_neighbors=3)
sk_knn.fit(X_train, y_train)
y_pred_sklearn = sk_knn.predict(X_test)

print("Scikit-learn Accuracy:", accuracy_score(y_test, y_pred_sklearn))
print("Scikit-learn Confusion Matrix:\n", confusion_matrix(y_test, y_pred_sklearn, list(set(y))))
p, r, f = precision_recall_f1(y_test, y_pred_sklearn, list(set(y)))
print("Scikit-learn Precision:", p)
print("Scikit-learn Recall:", r)
print("Scikit-learn F1 Score:", f)


Scikit-learn Accuracy: 0.5
Scikit-learn Confusion Matrix:
 [[1 0]
 [1 0]]
Scikit-learn Precision: [np.float64(0.5), 0]
Scikit-learn Recall: [np.float64(1.0), np.float64(0.0)]
Scikit-learn F1 Score: [np.float64(0.6666666666666666), 0]
