##**Part** III: Comparative Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
#load data
df = pd.read_csv("/content/spam.csv", encoding="latin-1")[['v1', 'v2']]
df.columns = ['label', 'text']
y = df['label'].map({'ham': 0, 'spam': 1}).values
X_raw = df['text'].values

In [4]:
#Naive Bayes Implementation
class NaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_priors = {}
        self.word_likelihoods = {}
        self.vocab_size = 0
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_docs = len(y)
        self.class_priors = {c: np.sum(y == c) / n_docs for c in self.classes}
        word_counts = {c: np.zeros(X.shape[1]) for c in self.classes}
        class_word_totals = {c: 0 for c in self.classes}
        for c in self.classes:
            X_c = X[y == c]
            word_counts[c] = np.sum(X_c, axis=0)
            class_word_totals[c] = np.sum(word_counts[c])
        self.vocab_size = X.shape[1]
        self.word_likelihoods = {
            c: (word_counts[c] + self.alpha) /
               (class_word_totals[c] + self.alpha * self.vocab_size)
            for c in self.classes
        }

    def predict(self, X):
        predictions = []
        for x in X:
            class_scores = {}
            for c in self.classes:
                score = np.log(self.class_priors[c])
                score += np.sum(x * np.log(self.word_likelihoods[c]))
                class_scores[c] = score
            predictions.append(max(class_scores, key=class_scores.get))
        return np.array(predictions)

In [5]:
#Logistic Regression
class LogisticRegressionScratch:
    def __init__(self, learning_rate=0.01, epochs=500, lambda_=0.0):
        self.lr = learning_rate
        self.epochs = epochs
        self.lambda_ = lambda_
        self.weights = None
        self.bias = 0

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        for _ in range(self.epochs):
            z = np.dot(X, self.weights) + self.bias
            h = self._sigmoid(z)
            dw = (1/m) * np.dot(X.T, (h - y)) + (self.lambda_ / m) * self.weights
            db = (1/m) * np.sum(h - y)
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        z = np.dot(X, self.weights) + self.bias
        return (self._sigmoid(z) >= 0.5).astype(int)

In [6]:
# to train & evaluate
def evaluate_model(model, X_train, X_test, y_train, y_test, name):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1": f1_score(y_test, preds)
    }

In [7]:
results = []
#native bayes using both
for vec_name, vectorizer in [("Count", CountVectorizer()), ("TF-IDF", TfidfVectorizer())]:
    X_vec = vectorizer.fit_transform(X_raw).toarray()
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=y)
    nb = NaiveBayes(alpha=1.0)
    res = evaluate_model(nb, X_train, X_test, y_train, y_test, f"Naive Bayes ({vec_name})")
    results.append(res)
#logistic using both
scaler = StandardScaler()
for vec_name, vectorizer in [("Count", CountVectorizer()), ("TF-IDF", TfidfVectorizer())]:
    X_vec = vectorizer.fit_transform(X_raw).toarray()
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=y)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    log_reg = LogisticRegressionScratch(learning_rate=0.01, epochs=1000, lambda_=0.1)
    res = evaluate_model(log_reg, X_train, X_test, y_train, y_test, f"Logistic Regression ({vec_name})")
    results.append(res)

In [8]:
df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))

                       Model  Accuracy  Precision   Recall       F1
         Naive Bayes (Count)  0.981166   0.915584 0.946309 0.930693
        Naive Bayes (TF-IDF)  0.961435   1.000000 0.711409 0.831373
 Logistic Regression (Count)  0.990135   0.985915 0.939597 0.962199
Logistic Regression (TF-IDF)  0.983857   0.951724 0.926174 0.938776
