In [None]:
import numpy as np
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split


# Toy labelled corpus of (message text, label) pairs: three "spam" messages
# followed by three "ham" messages.
data = [
    (message, "spam")
    for message in (
        "buy cheap meds now",
        "limited offer, click now",
        "win money instantly",
    )
] + [
    (message, "ham")
    for message in (
        "meeting at noon",
        "schedule the call",
        "let's have lunch tomorrow",
    )
]


In [None]:
# Unzip the (text, label) pairs into two parallel tuples.
texts, labels = zip(*data)

# Hold out a third of the samples for testing. With only six samples an
# unstratified shuffle can leave the two splits with lopsided class balance, so
# stratify on the labels to keep the spam/ham ratio the same in both splits.
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.33,
    random_state=42,
    stratify=labels,
)


def tokenize(text):
    """Lowercase ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of regex word characters, so punctuation and
    whitespace (including apostrophes) act as separators and are dropped.
    An input without any word characters yields an empty list.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

In [None]:
class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    ``fit`` and ``predict`` are defined inside the class body, in the same cell
    as the class statement, because a ``def`` placed in a later notebook cell is
    not bound to the class and ``instance.fit(...)`` would fail.

    Parameters
    ----------
    tokenizer : callable, optional
        Function mapping one text string to an iterable of tokens. When omitted,
        the notebook-level ``tokenize`` function is looked up at call time.

    Attributes
    ----------
    classes : set or None
        Distinct labels passed to the last ``fit`` call; ``None`` before fitting.
    vocab : set
        Every token seen during the last ``fit`` call.
    word_freq : dict
        ``word_freq[label][token]`` is how often ``token`` occurred in training
        texts carrying ``label``.
    class_counts : dict
        Number of training texts per label, in first-seen order.
    total_words : dict
        Total number of tokens per label.
    """

    def __init__(self, tokenizer=None):
        self.tokenizer = tokenizer
        self.classes = None
        self.vocab = set()
        self.word_freq = {}
        self.class_counts = {}
        self.total_words = {}

    def _tokens(self, text):
        """Split ``text`` with the injected tokenizer, else the module-level one."""
        split = self.tokenizer if self.tokenizer is not None else tokenize
        return split(text)

    def fit(self, X, y):
        """Collect per-class document and token counts from the training data.

        Parameters
        ----------
        X : iterable of str
            Training texts.
        y : iterable
            Label of each text, aligned with ``X``.

        All statistics, including the vocabulary, are rebuilt from scratch, so
        calling ``fit`` again replaces the previous model instead of mixing the
        old vocabulary into the new smoothing denominator.
        """
        self.classes = set(y)
        self.vocab = set()
        self.class_counts = defaultdict(int)
        self.word_freq = {c: defaultdict(int) for c in self.classes}
        self.total_words = defaultdict(int)

        for text, label in zip(X, y):
            self.class_counts[label] += 1
            for word in self._tokens(text):
                self.vocab.add(word)
                self.word_freq[label][word] += 1
                self.total_words[label] += 1

    def predict(self, X):
        """Return the most probable label for each text in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to classify.

        Returns
        -------
        list
            One predicted label per input text, in input order.

        Raises
        ------
        ValueError
            If the classifier has not been fitted on at least one sample.

        Notes
        -----
        Classes are scored in the order they were first seen during ``fit``, so
        a tie is always resolved in favour of the earliest-seen class rather
        than depending on set iteration order. Prediction never modifies the
        fitted counts.
        """
        if not self.class_counts:
            raise ValueError(
                "NaiveBayesClassifier must be fitted on at least one sample "
                "before calling predict()"
            )

        # These terms do not depend on the text being scored, so compute once.
        n_samples = sum(self.class_counts.values())
        vocab_size = len(self.vocab)
        log_priors = {
            c: np.log(count / n_samples) for c, count in self.class_counts.items()
        }
        # .get avoids inserting a zero entry into the defaultdict on lookup.
        denominators = {
            c: self.total_words.get(c, 0) + vocab_size for c in self.class_counts
        }

        predictions = []
        for text in X:
            words = self._tokens(text)
            scores = {}
            for c, log_prior in log_priors.items():
                counts = self.word_freq[c]
                denominator = denominators[c]
                log_prob = log_prior
                for word in words:
                    # Laplace smoothing: unseen words count as one occurrence.
                    word_prob = (counts.get(word, 0) + 1) / denominator
                    log_prob += np.log(word_prob)
                scores[c] = log_prob
            # max() keeps the first maximum, i.e. the earliest-seen class on ties.
            predictions.append(max(scores, key=scores.get))
        return predictions

In [None]:
# Build a classifier, fit it on the training split, then predict labels for
# the held-out test texts.
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)
# y_pred receives whatever predict returns for X_test.
y_pred = nb.predict(X_test)