In [1]:
import numpy as np
from collections import defaultdict

class NaiveBayesSpamFilter:
    """Multinomial Naive Bayes spam/ham classifier with add-one smoothing.

    Emails are lower-cased and split on whitespace. Training accumulates
    per-class token counts; prediction compares the log-posterior of the
    two classes. Any label other than ``'spam'`` is treated as ham.
    """

    def __init__(self):
        """Create an empty, untrained filter."""
        self.vocab = set()  # every distinct token seen during training
        self.word_counts_spam = defaultdict(int)  # token -> occurrences in spam
        self.word_counts_ham = defaultdict(int)  # token -> occurrences in ham
        self.total_spam_words = 0  # total tokens (with repeats) in spam emails
        self.total_ham_words = 0  # total tokens (with repeats) in ham emails
        self.num_spam = 0  # number of spam training emails
        self.num_ham = 0  # number of ham training emails

    def preprocess(self, email):
        """Return the lower-cased, whitespace-separated tokens of *email*."""
        return email.lower().split()

    def train(self, emails, labels):
        """Accumulate counts from paired *emails* and *labels*.

        May be called repeatedly; counts add up across calls. Pairs are
        formed with ``zip``, so surplus items in the longer input are ignored.
        """
        for email, label in zip(emails, labels):
            words = self.preprocess(email)
            self.vocab.update(words)
            if label == 'spam':
                self.num_spam += 1
                self.total_spam_words += len(words)
                counts = self.word_counts_spam
            else:
                self.num_ham += 1
                self.total_ham_words += len(words)
                counts = self.word_counts_ham
            for word in words:
                counts[word] += 1

    def calculate_word_prob(self, word, label):
        """Return the add-one smoothed P(word | label).

        Raises:
            ValueError: if no tokens have been seen in training, in which
                case the smoothed estimate is undefined.
        """
        # Use .get() rather than indexing: indexing a defaultdict inserts the
        # missing key, so scoring unseen words would silently grow the model.
        if label == 'spam':
            count = self.word_counts_spam.get(word, 0)
            total = self.total_spam_words
        else:
            count = self.word_counts_ham.get(word, 0)
            total = self.total_ham_words
        denominator = total + len(self.vocab)
        if denominator == 0:
            raise ValueError(
                "cannot estimate word probabilities: no words seen in training"
            )
        return (count + 1) / denominator

    def calculate_prior(self, label):
        """Return the fraction of training emails belonging to *label*.

        Raises:
            ValueError: if the filter has not been trained on any email.
        """
        total_emails = self.num_spam + self.num_ham
        if total_emails == 0:
            raise ValueError("cannot compute priors: the filter has not been trained")
        if label == 'spam':
            return self.num_spam / total_emails
        return self.num_ham / total_emails

    def _log_prior(self, label):
        """Return log P(label), mapping a zero prior to -inf without a warning."""
        prior = self.calculate_prior(label)
        # np.log(0) also yields -inf but emits a divide-by-zero RuntimeWarning.
        return np.log(prior) if prior > 0 else -np.inf

    def predict(self, email):
        """Classify *email* as ``'spam'`` or ``'ham'``.

        Ties go to ``'ham'``. The trained counts are not modified.

        Raises:
            ValueError: if the filter has not been trained.
        """
        words = self.preprocess(email)

        spam_log_prob = self._log_prior('spam')
        ham_log_prob = self._log_prior('ham')

        # Sum logs instead of multiplying probabilities to avoid underflow.
        for word in words:
            spam_log_prob += np.log(self.calculate_word_prob(word, 'spam'))
            ham_log_prob += np.log(self.calculate_word_prob(word, 'ham'))

        return 'spam' if spam_log_prob > ham_log_prob else 'ham'

    def evaluate(self, emails, labels):
        """Return the fraction of *labels* that ``predict`` reproduces.

        Raises:
            ValueError: if *labels* is empty.
        """
        # len() rather than truthiness so array-like label containers work too.
        if len(labels) == 0:
            raise ValueError("cannot evaluate on an empty set of labels")
        correct = sum(
            1 for email, label in zip(emails, labels) if self.predict(email) == label
        )
        return correct / len(labels)

# Demo: fit the filter on a toy corpus and report accuracy on a held-out email
if __name__ == "__main__":
    # Five hand-written messages with their ground-truth classes
    emails = [
        "Win money now",
        "Hi, how are you",
        "Congratulations, you won",
        "Let’s meet for lunch",
        "Claim your free prize now",
    ]
    labels = ['spam', 'ham', 'spam', 'ham', 'spam']

    # The first four examples train the model; the remaining one is the test set
    train_emails, test_emails = emails[:4], emails[4:]
    train_labels, test_labels = labels[:4], labels[4:]

    # Fit the classifier on the training portion
    classifier = NaiveBayesSpamFilter()
    classifier.train(train_emails, train_labels)

    # Score the held-out portion and print the result as a percentage
    accuracy = classifier.evaluate(test_emails, test_labels)
    print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 100.00%
