<a href="https://colab.research.google.com/github/Apoorvmittal11/23-CS-072-ML-LAB-EXPERIMENT/blob/main/23-CS-072%20EXPERIMENT4/23_CS_072_Experiment_4(Naive_Bayes).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load & Preprocess the Dataset

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw SMS corpus, keep the two columns of interest, give them
# lowercase names and turn the text labels into integers (ham = 0, spam = 1).
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['Category', 'Message']]
    .rename(columns={'Category': 'label', 'Message': 'message'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

print("Dataset size:", df.shape)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


Dataset size: (5572, 2)
Train size: (4457,) Test size: (1115,)


Implement the multinomial Naive Bayes algorithm with Laplace smoothing:

$$P(\text{spam} \mid d) \;\propto\; P(\text{spam}) \prod_{w \in d} P(w \mid \text{spam})$$

In [6]:
import numpy as np
from collections import defaultdict, Counter

class NaiveBayesScratch:
    """Multinomial Naive Bayes text classifier with additive (Laplace) smoothing.

    Documents are plain strings that are tokenised with ``str.split()``.
    For a document ``d`` and class ``c`` the model scores

        log P(c) + sum over tokens w in d of
            log((count(w, c) + alpha) / (total_tokens(c) + alpha * |V|))

    and predicts the class with the highest score.

    Attributes (populated by ``fit``):
        alpha: additive smoothing strength.
        class_priors: ``{class: fraction of training documents in that class}``.
        word_counts: ``{class: Counter of token occurrences in that class}``.
        class_totals: ``{class: total number of tokens seen in that class}``.
        vocab: set of every distinct token seen during training.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_priors = {}
        self.word_counts = {}
        self.class_totals = {}
        self.vocab = set()

    def fit(self, X, y):
        """Estimate class priors and per-class token counts.

        Args:
            X: iterable of document strings.
            y: iterable of class labels, aligned with ``X``.

        Every call starts from a clean state, so refitting an existing
        instance does not carry over vocabulary from a previous fit.
        """
        docs_by_class = defaultdict(list)
        for text, label in zip(X, y):
            docs_by_class[label].append(text)

        # Count the documents that were actually grouped rather than calling
        # len(X): this keeps the priors summing to 1 and also accepts
        # iterators/generators, which have no len().
        total_docs = sum(len(docs) for docs in docs_by_class.values())
        self.class_priors = {
            c: len(docs) / total_docs for c, docs in docs_by_class.items()
        }

        self.word_counts = {c: Counter() for c in docs_by_class}
        self.class_totals = {}
        # Reset the vocabulary as well; otherwise a second fit() would keep
        # tokens from the previous training set and inflate the smoothing
        # denominator used in predict().
        self.vocab = set()

        for c, docs in docs_by_class.items():
            class_counter = self.word_counts[c]
            for text in docs:
                tokens = text.split()
                class_counter.update(tokens)
                self.vocab.update(tokens)
            self.class_totals[c] = sum(class_counter.values())

    def predict(self, X):
        """Return the most probable class for every document in ``X``.

        Args:
            X: iterable of document strings.

        Returns:
            A list with one predicted label per document. When several
            classes tie, the one encountered first during ``fit`` wins.
        """
        # Everything below depends only on the fitted state, so compute it
        # once instead of once per token.
        vocab_size = len(self.vocab)
        log_priors = {c: np.log(p) for c, p in self.class_priors.items()}
        denominators = {
            c: self.class_totals[c] + self.alpha * vocab_size
            for c in self.class_priors
        }

        preds = []
        for text in X:
            tokens = text.split()
            scores = {}
            for c in self.class_priors:
                class_counter = self.word_counts[c]
                denominator = denominators[c]
                score = log_priors[c]
                for word in tokens:
                    # Counter returns 0 for unseen tokens without inserting
                    # them, so out-of-vocabulary words get the smoothed mass.
                    score += np.log((class_counter[word] + self.alpha) / denominator)
                scores[c] = score
            preds.append(max(scores, key=scores.get))
        return preds


Train and evaluate Naive Bayes on the same dataset using both CountVectorizer
and TfidfVectorizer.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Convert text → Bag of Words.
# The vectorizer is fitted on the training messages only (fit_transform);
# the test messages are then encoded with that same fitted vectorizer
# (transform), so both matrices share one column layout.
vectorizer = CountVectorizer()
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

# Convert sparse matrix to "word word word..." format for scratch model
def transform_for_scratch(X, vocab):
    """Rebuild whitespace-joined token strings from a sparse count matrix.

    Args:
        X: row-iterable sparse matrix whose rows expose ``.indices`` (column
            ids of the stored entries) and ``.data`` (their integer counts).
        vocab: mapping ``token -> column index``, e.g. a fitted vectorizer's
            ``vocabulary_``.

    Returns:
        A list with one string per row, in which every token is repeated as
        many times as its count and tokens are separated by single spaces.
        A row without stored entries yields an empty string.
    """
    # Invert the vocabulary so that column ids can be turned back into tokens.
    index_to_token = dict((col, token) for token, col in vocab.items())

    def _row_to_text(sparse_row):
        tokens = []
        for col, n_occurrences in zip(sparse_row.indices, sparse_row.data):
            tokens += [index_to_token[col]] * n_occurrences
        return " ".join(tokens)

    return [_row_to_text(sparse_row) for sparse_row in X]

# Turn the count matrices back into whitespace-joined token strings, because
# NaiveBayesScratch tokenises its input with str.split().
X_train_scratch = transform_for_scratch(X_train_cv, vectorizer.vocabulary_)
X_test_scratch = transform_for_scratch(X_test_cv, vectorizer.vocabulary_)

# Train model (alpha=1.0 is the additive smoothing strength)
nb = NaiveBayesScratch(alpha=1.0)
nb.fit(X_train_scratch, y_train)

# Predictions on the held-out messages; reused by the metrics cells below
y_pred = nb.predict(X_test_scratch)




In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text → TF-IDF.
# As with the count features, the vectorizer is fitted on the training
# messages only and then reused unchanged to encode the test messages.
vectorizer_tfidf = TfidfVectorizer()
X_train_tf = vectorizer_tfidf.fit_transform(X_train)
X_test_tf = vectorizer_tfidf.transform(X_test)

# Convert TF-IDF to pseudo-counts for scratch model
def tfidf_to_counts(X, vocab, scale=100):
    """Approximate a sparse weight matrix by repeated-token strings.

    Each stored weight is multiplied by ``scale`` and truncated with
    ``int()``; the corresponding token is then repeated that many times.
    Entries whose scaled weight truncates to zero (or below) are dropped.

    Args:
        X: row-iterable sparse matrix whose rows expose ``.indices`` and
            ``.data``.
        vocab: mapping ``token -> column index``.
        scale: multiplier applied to every weight before truncation.

    Returns:
        A list with one space-joined string of repeated tokens per row.
    """
    index_to_token = {col: token for token, col in vocab.items()}

    def _repeated_tokens(sparse_row):
        for col, weight in zip(sparse_row.indices, sparse_row.data):
            copies = int(weight * scale)  # truncation, not rounding
            if copies > 0:
                yield from [index_to_token[col]] * copies

    return [" ".join(_repeated_tokens(sparse_row)) for sparse_row in X]

# TfidfVectorizer conversion: the scratch model works on token strings, so the
# TF-IDF weights are turned into repeated tokens (tfidf_to_counts is called
# with its default scale of 100).
X_train_scratch_tf = tfidf_to_counts(X_train_tf, vectorizer_tfidf.vocabulary_)
X_test_scratch_tf = tfidf_to_counts(X_test_tf, vectorizer_tfidf.vocabulary_)


# Train model (alpha=1.0 is the additive smoothing strength)
nb_tfidf = NaiveBayesScratch(alpha=1.0)
nb_tfidf.fit(X_train_scratch_tf, y_train)

# Predictions on the held-out messages; reused by the metrics cells below
y_pred_tf = nb_tfidf.predict(X_test_scratch_tf)

Report accuracy, precision, recall, F1-score, and confusion matrix.

In [13]:
# Metrics for the CountVectorizer-based predictions, printed one per line in
# the order accuracy, precision, recall, F1, followed by the confusion matrix.
count_scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)

print("CountVectorizer Results")
for metric_heading, metric_fn in count_scorers:
    print(metric_heading, metric_fn(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

CountVectorizer Results
Accuracy: 0.9928251121076234
Precision: 1.0
Recall: 0.9463087248322147
F1: 0.9724137931034482
Confusion Matrix:
 [[966   0]
 [  8 141]]


In [14]:
# Metrics for the TF-IDF-based predictions, printed one per line in the order
# accuracy, precision, recall, F1, followed by the confusion matrix.
tfidf_scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)

print("\nTfidfVectorizer Results")
for metric_heading, metric_fn in tfidf_scorers:
    print(metric_heading, metric_fn(y_test, y_pred_tf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tf))



TfidfVectorizer Results
Accuracy: 0.9856502242152466
Precision: 0.934640522875817
Recall: 0.959731543624161
F1: 0.9470198675496688
Confusion Matrix:
 [[956  10]
 [  6 143]]


Create a results table to summarize your findings. Example format:

In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Collect metrics from Task 3 and Task 4 ---
# Every scorer is evaluated against the same held-out labels, in the order
# accuracy, precision, recall, F1.
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)

# CountVectorizer
acc_count, prec_count, rec_count, f1_count = [
    metric_fn(y_test, y_pred) for metric_fn in metric_fns
]

# TF-IDF
acc_tfidf, prec_tfidf, rec_tfidf, f1_tfidf = [
    metric_fn(y_test, y_pred_tf) for metric_fn in metric_fns
]

# --- Build results table ---
# One row per feature representation; 1.0 is the smoothing value both models
# were trained with.
columns = ["Model", "Vectorizer", "Reg. λ", "Accuracy", "Precision", "Recall", "F1"]
results = [
    ["Naive Bayes", "Count", 1.0, acc_count, prec_count, rec_count, f1_count],
    ["Naive Bayes", "TF-IDF", 1.0, acc_tfidf, prec_tfidf, rec_tfidf, f1_tfidf],
]
results_df = pd.DataFrame(results, columns=columns)

# Display nicely
print(results_df)


         Model Vectorizer  Reg. λ  Accuracy  Precision    Recall        F1
0  Naive Bayes      Count     1.0  0.992825   1.000000  0.946309  0.972414
1  Naive Bayes     TF-IDF     1.0  0.985650   0.934641  0.959732  0.947020
