<a href="https://colab.research.google.com/github/Dipu1222/KNN-from-Scratch-Flower-and-News-Classification-with-Custom-Evaluation-Metrics/blob/main/222002120_CSE412_222D3_LabReport02_knnFromScratch_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
from sklearn.metrics import classification_report


In [18]:
# News datasets
sports_articles = [
    "The football team celebrated their historic league win last night.",
    "The basketball player made a game-winning three-point shot.",
    "The coach emphasized teamwork after the team’s victory.",
    "Rain delayed the second innings of the cricket match.",
    "The new striker scored in his debut match.",
    "The tennis star advanced to the finals after a tough match.",
    "The swimming champion broke another world record.",
    "Fans flooded the stadium to cheer for their team.",
    "The referee’s controversial decision sparked debate.",
    "The club announced the signing of a famous midfielder.",
    "The athlete underwent surgery after a knee injury.",
    "The Olympics opening ceremony dazzled viewers worldwide.",
    "The marathon attracted runners from over 50 countries.",
    "The sprinter qualified for the finals with a personal best.",
    "The team unveiled their new kit for the season.",
    "The manager said the goal is to reach the top four.",
    "The goalkeeper saved two crucial penalties.",
    "The final score left fans stunned in disbelief.",
    "The championship ended with a thrilling penalty shootout.",
    "The young gymnast impressed judges with her routine.",
    "The cycling tour was one of the most competitive ever.",
    "The boxing match ended in a surprise knockout.",
    "The player was awarded the golden boot.",
    "The training camp focused on fitness and tactics.",
    "The hockey team made a strong comeback in the second half.",
    "The athlete thanked his coach after winning gold.",
    "The baseball team extended their winning streak.",
    "The fencing finals showcased incredible talent.",
    "The relay team set a national record.",
    "The wrestler dominated his opponent in the final round.",
    "The ice skater performed a flawless routine.",
    "The match ended in a dramatic tie.",
    "The coach made five substitutions before halftime.",
    "The tennis match lasted over five hours.",
    "The ski resort hosted the world championship.",
    "The fans held a banner for their favorite player.",
    "The referee handed out three red cards.",
    "The club reached the finals for the first time in history.",
    "The volleyball team won all sets in the tournament.",
    "The rugby match drew a record-breaking crowd.",
    "The team held a press conference after the loss.",
    "The cricket captain scored a double century.",
    "The table tennis player is ranked number one.",
    "The racing driver secured pole position.",
    "The stadium will host the next World Cup match.",
    "The team’s strategy was praised by analysts.",
    "The forward was named Player of the Match.",
    "The match will be replayed due to weather issues.",
    "The league is set to introduce VAR next season.",
    "The sportsperson announced retirement after 20 years.",
]

politics_articles = [
    "The president addressed the nation on economic reforms.",
    "Parliament approved the controversial new tax bill.",
    "The prime minister met with world leaders at the summit.",
    "The election commission released the final results.",
    "The new budget focuses on healthcare and education.",
    "The opposition party staged a protest in the capital.",
    "The government launched a campaign against corruption.",
    "Lawmakers debated the immigration policy in Parliament.",
    "The senator proposed amendments to the labor law.",
    "The president signed an executive order on climate policy.",
    "The ruling party gained majority in the national election.",
    "The foreign minister held talks with international diplomats.",
    "The candidate launched her campaign in three major cities.",
    "The government faces backlash over fuel price hikes.",
    "The cabinet reshuffle was announced last week.",
    "The mayor promised to improve infrastructure in rural areas.",
    "The bill was blocked in the upper house of Parliament.",
    "Political analysts predict a tight race in the upcoming polls.",
    "The party leader resigned amid allegations of misconduct.",
    "The president’s approval rating dropped significantly.",
    "Voters expressed concerns over rising inflation.",
    "The constitution was amended to extend presidential terms.",
    "The campaign focused on youth and employment issues.",
    "The government imposed a nationwide lockdown.",
    "The minister of health presented the COVID-19 response plan.",
    "The opposition accused the administration of censorship.",
    "The finance minister defended the new tax reforms.",
    "Political rallies were suspended due to security threats.",
    "A new political alliance was formed to challenge the incumbents.",
    "The prime minister will visit the flood-affected regions.",
    "Civil rights groups criticized the proposed surveillance law.",
    "The election was declared free and fair by observers.",
    "The party's manifesto promises affordable housing.",
    "The government launched a digital ID program.",
    "A diplomatic dispute arose between neighboring countries.",
    "The president met with tech leaders to discuss regulations.",
    "The policy aims to reduce the wealth gap.",
    "The legislative assembly will convene next Monday.",
    "A whistleblower exposed misuse of public funds.",
    "The speaker of the house called for a special session.",
    "International sanctions were lifted after peace talks.",
    "The new law mandates transparency in public spending.",
    "The opposition walked out during the budget session.",
    "The government emphasized gender equality in new policies.",
    "Political unrest continues in the capital city.",
    "The prime minister launched a rural electrification program.",
    "The president vetoed the proposed defense bill.",
    "The education reform act passed with a narrow margin.",
    "The ruling coalition lost support in recent polls.",
    "Public opinion is divided over the new privacy regulations.",
]

news_data = sports_articles + politics_articles
news_labels = [0]*len(sports_articles) + [1]*len(politics_articles)


In [19]:
# 2. KNN from Scratch
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

class CustomKNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return [self._predict(x) for x in X]

    def _predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        return Counter(k_labels).most_common(1)[0][0]

In [29]:
# 3. Evaluation Metrics from Scratch
def accuracy(y_true, y_pred):
    return np.mean(np.array(y_true) == np.array(y_pred))

def confusion_matrix(y_true, y_pred):
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred))
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred))
    return np.array([[tn, fp],[fn, tp]])


def precision(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1][1] / (cm[1][1] + cm[0][1])

def recall(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1][1] / (cm[1][1] + cm[1][0])

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * p * r / (p + r)


In [30]:
# 4. Run on News Dataset
def run_news_classification(test_size=0.2, k=3):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(news_data).toarray()
    y = np.array(news_labels)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = CustomKNN(k=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"Results for News Dataset (k={k}, test_size={test_size}):")
    print("Accuracy:", accuracy(y_test, y_pred))
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred))
    print("Precision:", precision(y_test, y_pred))
    print("Recall:", recall(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))

    # Comparison with sklearn
    clf = SklearnKNN(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred_sklearn = clf.predict(X_test)

    print("\n[Scikit-learn Results]")
    print(classification_report(y_test, y_pred_sklearn))

run_news_classification(k=5, test_size=0.2)

Results for News Dataset (k=5, test_size=0.2):
Accuracy: 0.8
Confusion Matrix: [[11  1]
 [ 3  5]]
Precision: 0.8333333333333334
Recall: 0.625
F1 Score: 0.7142857142857143

[Scikit-learn Results]
              precision    recall  f1-score   support

           0       0.85      0.92      0.88        12
           1       0.86      0.75      0.80         8

    accuracy                           0.85        20
   macro avg       0.85      0.83      0.84        20
weighted avg       0.85      0.85      0.85        20



In [32]:
# 5. Run on Iris Dataset
def run_iris_classification(test_size=0.2, k=3):
    iris = load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = CustomKNN(k=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\nResults for Iris Dataset (k={k}, test_size={test_size}):")
    print("Accuracy:", accuracy(y_test, y_pred))
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred))
    print("Precision:", precision(y_test, y_pred))
    print("Recall:", recall(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    # Sklearn KNN
    clf = SklearnKNN(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred_sklearn = clf.predict(X_test)

    print("\n[Scikit-learn Results]")
    print(classification_report(y_test, y_pred_sklearn))

run_iris_classification(k=5, test_size=0.2)



Results for Iris Dataset (k=5, test_size=0.2):
Accuracy: 1.0
Confusion Matrix: [[10  0]
 [ 0  9]]
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

[Scikit-learn Results]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

