In [1]:
from collections import defaultdict
import math

In [2]:
def train_naive_bayes(data):
    """Collect the count statistics needed by a multinomial Naive Bayes model.

    Args:
        data: Iterable of ``(words, label)`` pairs, where ``words`` is an
            iterable of tokens belonging to one document.

    Returns:
        A 3-tuple ``(class_counts, word_counts, vocab)``:

        * ``class_counts`` -- ``defaultdict(int)`` mapping label to the
          number of documents seen with that label.
        * ``word_counts`` -- ``defaultdict`` mapping label to a
          ``defaultdict(int)`` of token -> occurrences within that label.
        * ``vocab`` -- ``set`` of every distinct token seen in any document.
    """
    docs_per_class = defaultdict(int)
    tokens_per_class = defaultdict(lambda: defaultdict(int))
    vocabulary = set()

    for tokens, category in data:
        docs_per_class[category] = docs_per_class[category] + 1
        for token in tokens:
            # The per-class table is created lazily on the first token, so a
            # label whose documents are all empty never gets an entry here.
            per_class = tokens_per_class[category]
            per_class[token] = per_class[token] + 1
            vocabulary.add(token)

    return docs_per_class, tokens_per_class, vocabulary

In [3]:
def calculate_probabilities(class_counts, word_counts, vocab, text, alpha=1):
    """Score a tokenized document against every class (multinomial Naive Bayes).

    For each label in ``class_counts`` the score is the unnormalized log
    posterior::

        log P(label) + sum(log P(word | label) for word in text)

    with additive smoothing for the likelihood::

        P(word | label) = (count(word, label) + alpha)
                          / (total_words(label) + alpha * len(vocab))

    The lookup is read-only: words or labels missing from ``word_counts`` are
    treated as a count of zero and are never inserted into it (even when it is
    a ``defaultdict``), so scoring does not grow the trained model. Plain
    ``dict`` inputs are accepted as well.

    Args:
        class_counts: Mapping of label -> number of training documents.
        word_counts: Mapping of label -> mapping of word -> occurrence count.
        vocab: Sized collection of distinct training words; only its length
            is used.
        text: Iterable of tokens to score. A token that appears several times
            contributes several times. One-shot iterators are supported.
        alpha: Smoothing constant added to every word count (default 1).
            It should be positive: with ``alpha=0`` a word that was never
            seen for a class has probability zero and the logarithm fails.

    Returns:
        dict mapping each label to its log score, in the iteration order of
        ``class_counts``. Empty when ``class_counts`` is empty.
    """
    total_reviews = sum(class_counts.values())
    vocab_size = len(vocab)
    # Materialize once so every class sees the same tokens, even if ``text``
    # is a generator that can only be consumed a single time.
    tokens = list(text)
    probabilities = {}

    for label, doc_count in class_counts.items():
        # Prior probability: log P(Class)
        log_prob = math.log(doc_count / total_reviews)

        # ``.get`` instead of indexing: indexing a defaultdict would insert
        # zero-count entries as a side effect of merely reading.
        label_counts = word_counts.get(label, {})
        denominator = sum(label_counts.values()) + vocab_size * alpha

        # Likelihood with add-alpha smoothing: log P(w | Class)
        for word in tokens:
            log_prob += math.log((label_counts.get(word, 0) + alpha) / denominator)

        probabilities[label] = log_prob

    return probabilities

In [4]:
def classify(class_counts, word_counts, vocab, text, alpha=1):
    """Return the most probable class label for a tokenized document.

    Args:
        class_counts: Mapping of label -> number of training documents.
        word_counts: Mapping of label -> mapping of word -> occurrence count.
        vocab: Collection of distinct training words.
        text: Iterable of tokens to classify.
        alpha: Smoothing constant forwarded to ``calculate_probabilities``
            (default 1, the value that was previously always used).

    Returns:
        The label with the highest log score. On a tie, the label that comes
        first in the iteration order of the score dict wins.

    Raises:
        ValueError: If the model contains no classes, so there is nothing to
            choose from.
    """
    probabilities = calculate_probabilities(
        class_counts, word_counts, vocab, text, alpha=alpha
    )
    if not probabilities:
        # max() on an empty dict raises ValueError too, but with a message
        # that says nothing about the actual cause.
        raise ValueError("cannot classify: the model has no classes (class_counts is empty)")
    return max(probabilities, key=probabilities.get)

In [5]:
# Training data: each entry is (list of tokens in the review, genre label).
# There are 2 'Comedy' reviews and 3 'Action' reviews.
reviews = [
    (['fun', 'couple', 'love', 'love'], 'Comedy'),
    (['fast', 'furious', 'shoot'], 'Action'),
    (['couple', 'fly', 'fast', 'fun', 'fun'], 'Comedy'),
    (['furious', 'shoot', 'shoot', 'fun'], 'Action'),
    (['fly', 'fast', 'shoot', 'love'], 'Action')
]

# Train the Naive Bayes classifier: gather per-class document counts,
# per-class word counts and the overall vocabulary.
class_counts, word_counts, vocab = train_naive_bayes(reviews)

# New, unlabeled document (already tokenized) to classify.
D = ['fast', 'couple', 'shoot', 'fly']

# Classify the new document: pick the label with the highest log score
# (default smoothing alpha=1) and print it.
predicted_class = classify(class_counts, word_counts, vocab, D)
print(predicted_class)

Action
