In [1]:
import numpy as np
from collections import defaultdict

# Tokenization function: Split text into words
def tokenize(text):
    """Split ``text`` into lowercase, whitespace-separated tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens. Punctuation is left attached to its
        word, because the only separator used is whitespace.
    """
    lowered = text.lower()
    # split() without arguments collapses whitespace runs and drops empty tokens
    return lowered.split()

# Example usage of tokenization: the mixed-case sample below comes back
# as a list of lowercase words.
email = "Win a prize now"
tokens = tokenize(email)
print("Tokens:", tokens)


Tokens: ['win', 'a', 'prize', 'now']


In [2]:
# Example training data: list of (email text, label) tuples.
# The toy set is balanced: three "Spam" and three "Not Spam" examples.
training_data = [
    ("Win a prize now", "Spam"),
    ("Meeting tomorrow", "Not Spam"),
    ("Win money", "Spam"),
    ("Update meeting", "Not Spam"),
    ("Prize guaranteed", "Spam"),
    ("Tomorrow's update", "Not Spam")
]

# Display training data
print("Training Data:", training_data)


Training Data: [('Win a prize now', 'Spam'), ('Meeting tomorrow', 'Not Spam'), ('Win money', 'Spam'), ('Update meeting', 'Not Spam'), ('Prize guaranteed', 'Spam'), ("Tomorrow's update", 'Not Spam')]


In [3]:
# Step 1: Calculate word counts for each label
def calculate_word_counts(training_data):
    """Count tokens and emails per label.

    Args:
        training_data: Iterable of ``(email_text, label)`` pairs.

    Returns:
        A ``(word_counts, email_counts)`` tuple where ``word_counts`` maps
        each label to a ``defaultdict(int)`` of token -> occurrences, and
        ``email_counts`` maps each label to the number of emails seen.
        The labels "Spam" and "Not Spam" are always present (possibly with
        zero counts); any other label found in ``training_data`` is added
        as it is encountered.
    """
    # Pre-seed the two default labels so callers always find these keys,
    # even for empty or single-class training data.
    word_counts = {
        "Spam": defaultdict(int),
        "Not Spam": defaultdict(int)
    }
    email_counts = {
        "Spam": 0,
        "Not Spam": 0
    }

    for email, label in training_data:
        # Register labels that were not pre-seeded instead of raising KeyError.
        if label not in word_counts:
            word_counts[label] = defaultdict(int)
            email_counts[label] = 0

        email_counts[label] += 1
        for word in tokenize(email):
            word_counts[label][word] += 1

    return word_counts, email_counts

# Calculate word counts and email counts from training data.
# word_counts: label -> {token: occurrences}; email_counts: label -> number of emails.
word_counts, email_counts = calculate_word_counts(training_data)
print("Word Counts:", word_counts)
print("Email Counts:", email_counts)


Word Counts: {'Spam': defaultdict(<class 'int'>, {'win': 2, 'a': 1, 'prize': 2, 'now': 1, 'money': 1, 'guaranteed': 1}), 'Not Spam': defaultdict(<class 'int'>, {'meeting': 2, 'tomorrow': 1, 'update': 2, "tomorrow's": 1})}
Email Counts: {'Spam': 3, 'Not Spam': 3}


In [4]:
# Step 2: Calculate prior probabilities
def calculate_prior_probabilities(email_counts):
    """Turn per-label email counts into prior probabilities.

    Args:
        email_counts: Mapping of label -> number of emails with that label.

    Returns:
        A dict mapping each label to its share of the total email count.

    Raises:
        ZeroDivisionError: If labels are present but all counts sum to zero.
    """
    total_emails = sum(email_counts.values())
    priors = {}
    for label in email_counts:
        priors[label] = email_counts[label] / total_emails
    return priors

# Calculate priors: each label's fraction of the training emails
priors = calculate_prior_probabilities(email_counts)
print("Prior Probabilities:", priors)


Prior Probabilities: {'Spam': 0.5, 'Not Spam': 0.5}


In [5]:
# Collect the vocabulary: every distinct token that appears in any training email
vocab = {word for text, _label in training_data for word in tokenize(text)}
print("Vocabulary:", vocab)


Vocabulary: {'tomorrow', 'meeting', "tomorrow's", 'a', 'prize', 'now', 'guaranteed', 'win', 'update', 'money'}


In [6]:
# Step 3: Calculate likelihoods with Laplace smoothing
def calculate_likelihoods(word_counts, vocab):
    """Compute Laplace-smoothed word likelihoods for every label.

    For each label and each vocabulary word the likelihood is
    ``(count + 1) / (total_words + vocab_size)``, where ``count`` is how
    often the word occurred under that label, ``total_words`` is the sum
    of all counts for the label and ``vocab_size`` is ``len(vocab)``.

    Args:
        word_counts: Mapping of label -> mapping of word -> count. Plain
            dicts and defaultdicts are both accepted; the input is not
            modified.
        vocab: Collection of all known words.

    Returns:
        A dict mapping each label to a ``defaultdict(float)`` of
        word -> smoothed likelihood, with one entry per vocabulary word.
    """
    vocab_size = len(vocab)
    likelihoods = {}

    for label, counts in word_counts.items():
        # The denominator is the same for every word of a label; compute it once.
        denominator = sum(counts.values()) + vocab_size
        label_likelihoods = defaultdict(float)
        for word in vocab:
            # Use .get() rather than counts[word]: indexing a defaultdict would
            # insert a zero entry into the caller's word_counts for every
            # missing word, and indexing a plain dict would raise KeyError.
            label_likelihoods[word] = (counts.get(word, 0) + 1) / denominator
        likelihoods[label] = label_likelihoods

    return likelihoods

# Calculate likelihoods: label -> {word: Laplace-smoothed likelihood} for every vocabulary word
likelihoods = calculate_likelihoods(word_counts, vocab)
print("Likelihoods:", likelihoods)


Likelihoods: {'Spam': defaultdict(<class 'float'>, {'tomorrow': 0.05555555555555555, 'meeting': 0.05555555555555555, "tomorrow's": 0.05555555555555555, 'a': 0.1111111111111111, 'prize': 0.16666666666666666, 'now': 0.1111111111111111, 'guaranteed': 0.1111111111111111, 'win': 0.16666666666666666, 'update': 0.05555555555555555, 'money': 0.1111111111111111}), 'Not Spam': defaultdict(<class 'float'>, {'tomorrow': 0.125, 'meeting': 0.1875, "tomorrow's": 0.125, 'a': 0.0625, 'prize': 0.0625, 'now': 0.0625, 'guaranteed': 0.0625, 'win': 0.0625, 'update': 0.1875, 'money': 0.0625})}


In [7]:
# Step 4: Predict function
def predict(email, priors, likelihoods):
    """Return the label with the highest log-space score for ``email``.

    Each label's score starts at ``log(prior)`` and adds ``log(likelihood)``
    for every token of the email that has an entry in that label's
    likelihood table; tokens without an entry are skipped.

    Args:
        email: Raw email text; it is tokenized with ``tokenize``.
        priors: Mapping of label -> prior probability.
        likelihoods: Mapping of label -> mapping of word -> likelihood.

    Returns:
        The label whose accumulated score is largest. On a tie, the label
        that comes first in ``priors`` wins.
    """
    words = tokenize(email)

    log_scores = {}
    for label, prior in priors.items():
        # Work with sums of logs instead of products of probabilities
        score = np.log(prior)
        for word in words:
            if word not in likelihoods[label]:
                # Word has no likelihood entry for this label: ignore it
                continue
            score = score + np.log(likelihoods[label][word])
        log_scores[label] = score

    # Pick the label with the largest accumulated log score
    return max(log_scores, key=log_scores.get)

# Example usage: classify a message whose words all occur in the "Spam" training emails
email = "win a prize now"
predicted_class = predict(email, priors, likelihoods)
print("Predicted class:", predicted_class)


Predicted class: Spam
