In [14]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [15]:
# Load the SMS corpus, give the two columns fixed names, and encode the
# label column numerically (ham -> 0, spam -> 1). Labels other than
# 'ham'/'spam' become NaN because Series.map leaves unmapped values missing.
data = pd.read_csv("/content/spam.csv")
data.columns = ['Category', 'Message']
data = data.assign(Category=data['Category'].map({'ham': 0, 'spam': 1}))

In [16]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Category'],
    test_size=0.2,
    random_state=42,
)

class CountVectorizerCustom:
  """Minimal bag-of-words vectorizer.

  Documents are lower-cased and split on whitespace; no punctuation
  stripping or stop-word removal is performed.
  """

  def fit(self, documents):
    """Learn the vocabulary from an iterable of strings.

    Sets ``self.vocab`` to a dict mapping each distinct token to its column
    index, with indices assigned in sorted token order. Returns ``self``.
    """
    tokens = {token for text in documents for token in text.lower().split()}
    self.vocab = dict(zip(sorted(tokens), range(len(tokens))))
    return self


  def transform(self, documents):
    """Turn documents into a matrix of raw token counts.

    Returns a NumPy array with one row per document and one column per
    vocabulary entry. Tokens that were not seen during ``fit`` are ignored.
    """
    matrix = []
    for text in documents:
      counts = [0] * len(self.vocab)
      for token in text.lower().split():
        column = self.vocab.get(token)
        if column is not None:
          counts[column] += 1
      matrix.append(counts)
    return np.array(matrix)

class TfidfVectorizerCustom:
  """Minimal TF-IDF vectorizer.

  Documents are lower-cased and split on whitespace. Term frequency is the
  token count divided by the number of tokens in the document, and inverse
  document frequency uses the smoothed form ``log((N + 1) / (df + 1)) + 1``.
  Rows are not length-normalised.
  """

  def fit(self, documents):
    """Learn the vocabulary and IDF weights from an iterable of strings.

    Sets:
      self.docs  -- the tokenised training documents (list of token lists)
      self.vocab -- dict mapping token -> column index, in sorted token order
      self.idf   -- dict mapping token -> smoothed IDF weight

    Returns ``self``.
    """
    self.docs = [doc.lower().split() for doc in documents]
    # Count every token once per document. A single pass replaces the
    # per-word scan over all token lists, which was O(vocab * docs * length).
    doc_freq = Counter(word for doc in self.docs for word in set(doc))
    self.vocab = {word: idx for idx, word in enumerate(sorted(doc_freq))}
    N = len(self.docs)
    self.idf = {
      word: math.log((N + 1) / (doc_freq[word] + 1)) + 1
      for word in self.vocab
    }
    return self


  def transform(self, documents):
    """Turn documents into a TF-IDF matrix.

    Returns a float NumPy array of shape ``(n_documents, len(self.vocab))``.
    Tokens not seen during ``fit`` contribute nothing to the row but still
    count towards the document length used for term frequency.
    """
    rows = []
    for doc in documents:
      tokens = doc.lower().split()
      # Normalise by the number of tokens, not by len(doc): doc is the raw
      # string, so len(doc) would be its character count.
      n_tokens = len(tokens)
      vec = [0.0] * len(self.vocab)
      for word, count in Counter(tokens).items():
        idx = self.vocab.get(word)
        if idx is not None:
          vec[idx] = (count / n_tokens) * self.idf[word]
      rows.append(vec)
    # The explicit reshape keeps the result 2-D even when no documents are given.
    return np.array(rows, dtype=float).reshape(len(rows), len(self.vocab))

class MultinomialNBCustom:
  """Multinomial Naive Bayes classifier with additive smoothing.

  Parameters:
    alpha -- smoothing constant added to every per-class feature total
             before normalisation (default 1.0).
  """

  def __init__(self, alpha=1.0):
    self.alpha = alpha


  def fit(self, X, y):
    """Estimate class priors and per-class feature likelihoods.

    X is an array-like of shape (n_samples, n_features) holding non-negative
    feature values; y is an array-like of n_samples class labels.

    Sets:
      self.classes      -- sorted array of the distinct labels in y
      self.class_priors -- dict label -> fraction of samples with that label
      self.likelihoods  -- dict label -> array of smoothed, normalised
                           feature probabilities for that label

    Returns ``self``.
    """
    # Coerce so that plain lists work as well as NumPy arrays.
    X = np.asarray(X)
    y = np.asarray(y)
    self.classes = np.unique(y)
    self.class_priors = {}
    self.likelihoods = {}

    for c in self.classes:
      X_c = X[y == c]
      self.class_priors[c] = X_c.shape[0] / X.shape[0]
      word_counts = np.sum(X_c, axis=0) + self.alpha
      self.likelihoods[c] = word_counts / np.sum(word_counts)
    return self


  def predict(self, X):
    """Return the most probable class label for every row of X.

    X is an array-like of shape (n_samples, n_features). The result is a
    1-D array of labels drawn from ``self.classes``. When several classes
    tie, the one that comes first in ``self.classes`` is returned.
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
      return np.array([], dtype=self.classes.dtype)

    # Take the logs once instead of once per sample and class.
    log_priors = np.log([self.class_priors[c] for c in self.classes])
    log_likelihoods = np.log(np.vstack([self.likelihoods[c] for c in self.classes]))

    # scores[i, k] = log P(class k) + sum_j X[i, j] * log P(feature j | class k)
    scores = X @ log_likelihoods.T + log_priors
    return self.classes[np.argmax(scores, axis=1)]


In [17]:
def train_and_evaluate(vectorizer, X_train, X_test, y_train, y_test):
    """Fit a vectorizer and a Naive Bayes model, then print test metrics.

    The vectorizer is fitted on X_train only and then used to transform both
    splits. A MultinomialNBCustom model with alpha=1.0 is trained on the
    training features and ``y_train.values``. Accuracy, precision, recall,
    F1, the confusion matrix and a classification report for the test split
    are printed. Nothing is returned.
    """
    vectorizer.fit(X_train)
    train_features = vectorizer.transform(X_train)
    test_features = vectorizer.transform(X_test)

    classifier = MultinomialNBCustom(alpha=1.0)
    classifier.fit(train_features, y_train.values)
    predictions = classifier.predict(test_features)

    # Compute every metric before printing so the report is emitted in one go.
    scores = [
        ("Accuracy", accuracy_score(y_test, predictions)),
        ("Precision", precision_score(y_test, predictions)),
        ("Recall", recall_score(y_test, predictions)),
        ("F1-score", f1_score(y_test, predictions)),
    ]
    matrix = confusion_matrix(y_test, predictions)

    print(f"\nUsing {vectorizer.__class__.__name__}:")
    for label, value in scores:
        print(f"{label}: {value:.4f}")
    print("Confusion Matrix:")
    print(matrix)
    print("\nClassification Report:\n", classification_report(y_test, predictions, target_names=["Ham", "Spam"]))

In [None]:
# Run the same train/evaluate pipeline with each hand-written vectorizer.
count_vec = CountVectorizerCustom()
tfidf_vec = TfidfVectorizerCustom()

for vectorizer in (count_vec, tfidf_vec):
    train_and_evaluate(vectorizer, X_train, X_test, y_train, y_test)


Using CountVectorizerCustom:
Accuracy: 0.9865
Precision: 1.0000
Recall: 0.8993
F1-score: 0.9470
Confusion Matrix:
[[966   0]
 [ 15 134]]

Classification Report:
               precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       966
        Spam       1.00      0.90      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

