<a href="https://colab.research.google.com/github/BBC-TECH/Assignment-4/blob/main/Experiment4ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment 4: Naive Bayes (from-scratch implementation compared with scikit-learn)

In [6]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [7]:

# Read the SMS spam CSV and discard every column that contains no values at all.
df = pd.read_csv("/content/sample_data/spam.csv", encoding="latin-1").dropna(axis=1, how="all")

# Give the first two columns fixed names, whatever the CSV header calls them.
df = df.rename(
    columns={
        df.columns[0]: "Label",
        df.columns[1]: "Message",
    }
)

# Binary target: ham -> 0, spam -> 1 (any other label value becomes NaN).
df = df.assign(Label=df["Label"].map({"ham": 0, "spam": 1}))

# Raw message text is the input, the encoded label is the target.
X, y = df["Message"].values, df["Label"].values

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("✅ Dataset Loaded Successfully", df.head(), sep="\n")


✅ Dataset Loaded Successfully
   Label                                            Message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [8]:
# 2. Naive Bayes From Scratch

class NaiveBayesScratch:
    """Multinomial Naive Bayes classifier over tokenised documents.

    Each document is an iterable of tokens. Training counts how often every
    token occurs in each class; prediction picks the class with the highest
    log prior plus the sum of additively smoothed log token likelihoods.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels (``np.unique(y)``).
        class_priors: ``{label: fraction of training documents with that label}``.
        word_counts: ``{label: Counter of token occurrences in that class}``.
        class_word_totals: ``{label: total number of tokens in that class}``.
        vocab: list of every distinct token seen in training (order unspecified).
        V: size of ``vocab``.
    """

    def __init__(self, alpha=1.0):  # Laplace smoothing
        """Store the additive smoothing constant (``alpha=1.0`` is Laplace smoothing)."""
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class token counts.

        Parameters
        ----------
        X : sequence of token iterables
            One entry per document. May be a plain list of lists or a NumPy
            object array; documents may have different lengths.
        y : sequence of labels
            One label per document, aligned with ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same length.
        """
        labels = np.asarray(y)
        n_docs = len(X)
        if n_docs != len(labels):
            raise ValueError(
                f"X and y must have the same length (got {n_docs} and {len(labels)})"
            )

        self.classes = np.unique(labels)
        self.class_priors = {}
        self.word_counts = {c: Counter() for c in self.classes}
        self.class_word_totals = {}

        # Single pass over the data. Iterating instead of boolean-mask indexing
        # means X does not have to be an ndarray, so ragged lists work as-is.
        docs_per_class = Counter()
        for doc, label in zip(X, labels):
            self.word_counts[label].update(doc)
            docs_per_class[label] += 1

        for c in self.classes:
            self.class_word_totals[c] = sum(self.word_counts[c].values())
            self.class_priors[c] = docs_per_class[c] / n_docs

        # The vocabulary is the union of the tokens seen in any class.
        vocab = set()
        for c in self.classes:
            vocab.update(self.word_counts[c].keys())
        self.vocab = list(vocab)
        self.V = len(self.vocab)

    def predict(self, X):
        """Return the most probable class label for every document in ``X``.

        Parameters
        ----------
        X : iterable of token iterables
            Documents to classify. Tokens never seen for a class contribute
            only the smoothing mass ``alpha`` to that class's likelihood.

        Returns
        -------
        numpy.ndarray
            One predicted label per document. When scores tie, the class that
            comes first in ``self.classes`` is returned.
        """
        # These depend only on the class, so compute them once rather than
        # once per word of every document.
        log_priors = {c: np.log(self.class_priors[c]) for c in self.classes}
        denominators = {
            c: self.class_word_totals[c] + self.alpha * self.V for c in self.classes
        }

        preds = []
        for doc in X:
            class_scores = {}
            for c in self.classes:
                counts = self.word_counts[c]
                denom = denominators[c]
                log_prob = log_priors[c]
                for word in doc:
                    log_prob += np.log((counts.get(word, 0) + self.alpha) / denom)
                class_scores[c] = log_prob
            preds.append(max(class_scores, key=class_scores.get))
        return np.array(preds)

In [9]:
# 3. Evaluation Function
def evaluate_model(vectorizer, method="Scratch NB"):
    """Vectorise the train/test split, fit one classifier, and report its metrics.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    vectorizer : class
        A vectoriser class (not an instance), e.g. ``CountVectorizer`` or
        ``TfidfVectorizer``. It is instantiated with default arguments and
        fitted on ``X_train``.
    method : str, default "Scratch NB"
        ``"Scratch NB"`` uses ``NaiveBayesScratch``, ``"Sklearn NB"`` uses
        ``MultinomialNB``. Any other value falls through to
        ``LogisticRegression``.

    Returns
    -------
    list
        ``[method, vectorizer.__name__, accuracy, precision, recall, f1]``.
        The same metrics and the confusion matrix are also printed.
    """
    vec = vectorizer()
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)

    if method == "Scratch NB":
        # Fetch the vocabulary once; calling get_feature_names_out() for every
        # token rebuilds the whole feature-name array each time.
        feature_names = vec.get_feature_names_out()

        # Convert to tokens for scratch NB.
        # NOTE: only the non-zero column indices of each row are used, so every
        # term appears once per document and the counts / TF-IDF weights stored
        # in the matrix are not passed to the scratch model.
        X_train_tokens = [[feature_names[i] for i in X_train_vec[row].indices]
                          for row in range(X_train_vec.shape[0])]
        X_test_tokens = [[feature_names[i] for i in X_test_vec[row].indices]
                         for row in range(X_test_vec.shape[0])]

        nb = NaiveBayesScratch(alpha=1.0)
        nb.fit(np.array(X_train_tokens, dtype=object), y_train)
        y_pred = nb.predict(np.array(X_test_tokens, dtype=object))

    elif method == "Sklearn NB":
        nb = MultinomialNB(alpha=1.0)
        nb.fit(X_train_vec, y_train)
        y_pred = nb.predict(X_test_vec)

    else:  # Logistic Regression
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train_vec, y_train)
        y_pred = clf.predict(X_test_vec)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n📌 {method} with {vectorizer.__name__}")
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1:", f1)
    print("Confusion Matrix:\n", cm)

    return [method, vectorizer.__name__, acc, prec, rec, f1]


In [11]:
# 4. Run All Models

# Evaluate every (model, vectoriser) pair. Models vary in the outer loop, so
# the rows come out as Scratch NB, Sklearn NB, then Logistic Regression, each
# with CountVectorizer first and TfidfVectorizer second.
results = [
    evaluate_model(vectorizer_cls, method=model_name)
    for model_name in ("Scratch NB", "Sklearn NB", "Logistic Regression")
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
]



📌 Scratch NB with CountVectorizer
Accuracy: 0.9847533632286996
Precision: 0.9925373134328358
Recall: 0.8926174496644296
F1: 0.9399293286219081
Confusion Matrix:
 [[965   1]
 [ 16 133]]

📌 Scratch NB with TfidfVectorizer
Accuracy: 0.9847533632286996
Precision: 0.9925373134328358
Recall: 0.8926174496644296
F1: 0.9399293286219081
Confusion Matrix:
 [[965   1]
 [ 16 133]]

📌 Sklearn NB with CountVectorizer
Accuracy: 0.9865470852017937
Precision: 0.9855072463768116
Recall: 0.912751677852349
F1: 0.9477351916376306
Confusion Matrix:
 [[964   2]
 [ 13 136]]

📌 Sklearn NB with TfidfVectorizer
Accuracy: 0.9596412556053812
Precision: 1.0
Recall: 0.697986577181208
F1: 0.8221343873517787
Confusion Matrix:
 [[966   0]
 [ 45 104]]

📌 Logistic Regression with CountVectorizer
Accuracy: 0.9802690582959641
Precision: 1.0
Recall: 0.8523489932885906
F1: 0.9202898550724637
Confusion Matrix:
 [[966   0]
 [ 22 127]]

📌 Logistic Regression with TfidfVectorizer
Accuracy: 0.9721973094170404
Precision: 1.0
Recal

In [12]:
# 5. Results Table

# One row per evaluate_model() call, columns in the order that function returns them.
results_df = pd.DataFrame(
    data=results,
    columns=[
        "Model_Type",
        "Vectorizer_Type",
        "Accuracy_Score",
        "Precision_Score",
        "Recall_Score",
        "F1_Score",
    ],
)

# Heading and table go out in one call; the newline separator reproduces
# the output of two consecutive print() calls.
print("\n📊 Final Comparative Results:\n", results_df, sep="\n")


📊 Final Comparative Results:

            Model_Type  Vectorizer_Type  Accuracy_Score  Precision_Score  \
0           Scratch NB  CountVectorizer        0.984753         0.992537   
1           Scratch NB  TfidfVectorizer        0.984753         0.992537   
2           Sklearn NB  CountVectorizer        0.986547         0.985507   
3           Sklearn NB  TfidfVectorizer        0.959641         1.000000   
4  Logistic Regression  CountVectorizer        0.980269         1.000000   
5  Logistic Regression  TfidfVectorizer        0.972197         1.000000   

   Recall_Score  F1_Score  
0      0.892617  0.939929  
1      0.892617  0.939929  
2      0.912752  0.947735  
3      0.697987  0.822134  
4      0.852349  0.920290  
5      0.791946  0.883895  
