# Bag-of-words Naive Bayes classifier

In [2]:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Toy sentiment corpus, written as (text, label) pairs so each document sits
# next to its class; it is unzipped into the parallel lists used below.
corpus = [
    ("I love this movie", "positive"),
    ("This film is great", "positive"),
    ("Amazing performance and beautiful cinematography", "positive"),
    ("I hate this movie", "negative"),
    ("This film is awful", "negative"),
    ("Terrible acting and boring plot", "negative"),
]
docs = [text for text, _ in corpus]
labels = [label for _, label in corpus]

# Bag-of-words: turn each document into a vector of token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

# Multinomial Naive Bayes with add-one (Laplace) smoothing, alpha=1.
clf = MultinomialNB(alpha=1.0).fit(X, labels)

# Inspect the fitted parameters; both arrays are in log-space.
print("Class log priors:", clf.class_log_prior_)
print("Feature log probs (per class):\n", clf.feature_log_prob_)

# Held-out sentences, encoded with the vocabulary learned from `docs`.
test_docs = ["What a fantastic movie", "I did not like the film"]
X_test = vectorizer.transform(test_docs)
predictions = clf.predict(X_test)
log_probs = clf.predict_log_proba(X_test)

# NOTE(review): in the recorded output both test documents score an exact tie
# (-0.693 = log 0.5 for each class). Their only in-vocabulary tokens, "movie"
# and "film", appear once under each label in `docs`, so the "negative"
# predictions carry no sentiment signal. Presumably the tie is broken by class
# order -- confirm against the scikit-learn docs before reading anything into it.
print("\nVocabulary:", vectorizer.vocabulary_)
for i, doc in enumerate(test_docs):
    pred, logp = predictions[i], log_probs[i]
    print(f"\nDocument: {doc}")
    print(f"Prediction: {pred}")
    print(f"Log probabilities: {logp}")



Class log priors: [-0.69314718 -0.69314718]
Feature log probs (per class):
 [[-2.67414865 -3.36729583 -2.67414865 -2.67414865 -3.36729583 -2.67414865
  -3.36729583 -2.67414865 -3.36729583 -2.67414865 -2.67414865 -3.36729583
  -2.67414865 -3.36729583 -2.67414865 -2.67414865 -2.26868354]
 [-3.36729583 -2.67414865 -2.67414865 -3.36729583 -2.67414865 -3.36729583
  -2.67414865 -2.67414865 -2.67414865 -3.36729583 -2.67414865 -2.67414865
  -2.67414865 -2.67414865 -3.36729583 -3.36729583 -2.26868354]]

Vocabulary: {'love': 11, 'this': 16, 'movie': 12, 'film': 7, 'is': 10, 'great': 8, 'amazing': 1, 'performance': 13, 'and': 2, 'beautiful': 4, 'cinematography': 6, 'hate': 9, 'awful': 3, 'terrible': 15, 'acting': 0, 'boring': 5, 'plot': 14}

Document: What a fantastic movie
Prediction: negative
Log probabilities: [-0.69314718 -0.69314718]

Document: I did not like the film
Prediction: negative
Log probabilities: [-0.69314718 -0.69314718]
