In [2]:
import re
from collections import defaultdict, Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
# Toy sentiment corpus: ten hand-written reviews per label, flattened into a
# list of (text, label) tuples in positive -> negative -> neutral order.
data = [
    (review, sentiment)
    for sentiment, reviews in (
        ("positive", (
            "I love this product, it's fantastic!",
            "Absolutely wonderful, would buy again!",
            "This item exceeded my expectations, highly recommend!",
            "Great quality, fast shipping, and excellent customer service!",
            "Very pleased with this purchase, works perfectly!",
            "Amazing experience, I'll definitely be back for more!",
            "This is my favorite product, couldn't be happier!",
            "The service was outstanding, I am extremely satisfied!",
            "Superb performance and reliable, worth every penny!",
            "Top-notch quality and affordable, highly recommended!",
        )),
        ("negative", (
            "This is terrible, I hate it",
            "Worst experience, totally disappointed",
            "The product broke after one use, very poor quality!",
            "Extremely dissatisfied, not worth the money",
            "I regret buying this, it was a waste of money",
            "Poor customer service and bad quality",
            "This item did not work as expected, I'm very disappointed",
            "The quality is horrible, don't buy this!",
            "Not what I was expecting, very low quality",
            "This was a mistake, I would not recommend it to anyone",
        )),
        ("neutral", (
            "It's okay, not the best but decent",
            "This product is average, does the job",
            "Nothing special, but it works as expected",
            "The quality is fine, nothing extraordinary",
            "It's a functional product, meets basic requirements",
            "This is a standard item, performs as described",
            "Average experience, could be better",
            "The product is fair for the price",
            "Not too bad, but not impressive either",
            "It's neither great nor terrible, just okay",
        )),
    )
    for review in reviews
]


In [4]:
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of word characters (letters, digits, underscore),
    so punctuation is discarded and a contraction such as "it's" is split at
    the apostrophe into "it" and "s".
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]


# Gather the training vocabulary together with per-class word and document counts
def build_word_counts(X_train, y_train):
    """Count tokens and documents per class over the training examples.

    Args:
        X_train: iterable of raw text documents.
        y_train: iterable of class labels, paired positionally with ``X_train``.

    Returns:
        A ``(class_word_counts, class_doc_counts, vocabulary)`` tuple:
        ``class_word_counts`` maps each label to a ``Counter`` of its tokens,
        ``class_doc_counts`` is a ``Counter`` of how often each label occurs in
        ``y_train``, and ``vocabulary`` is the set of every token seen.
    """
    # Document counts come straight from the label sequence.
    class_doc_counts = Counter(y_train)
    per_class_tokens = defaultdict(Counter)
    seen_tokens = set()

    for document, label in zip(X_train, y_train):
        tokens = tokenize(document)
        per_class_tokens[label].update(tokens)
        seen_tokens.update(tokens)

    return per_class_tokens, class_doc_counts, seen_tokens


def calculate_class_probabilities(class_word_counts, class_doc_counts, vocabulary, k_smoothing=1):
    """Estimate Naive Bayes log-priors and add-k smoothed log-likelihoods.

    Args:
        class_word_counts: mapping label -> Counter of token counts for that class.
        class_doc_counts: mapping label -> number of training documents with that label.
        vocabulary: collection of every token the model knows about.
        k_smoothing: additive smoothing constant ``k`` (must be non-negative).

    Returns:
        A ``(class_probabilities, word_probabilities)`` tuple where
        ``class_probabilities[label]`` is ``log P(label)`` and
        ``word_probabilities[label][word]`` is ``log P(word | label)`` for every
        word in ``vocabulary``.

    Raises:
        ValueError: if ``k_smoothing`` is negative, which would otherwise yield
            NaN log-probabilities or a division by zero.
    """
    if k_smoothing < 0:
        raise ValueError(f"k_smoothing must be non-negative, got {k_smoothing}")

    vocab_size = len(vocabulary)
    total_docs = sum(class_doc_counts.values())

    class_probabilities = {}
    word_probabilities = {}
    for label, doc_count in class_doc_counts.items():
        class_probabilities[label] = np.log(doc_count / total_docs)  # log(P(label))

        word_counts = class_word_counts[label]
        # Add-k smoothing: P(word|label) = (count + k) / (total words + k * vocab_size).
        # The denominator is the same for every word of a class, so compute it once.
        denominator = sum(word_counts.values()) + k_smoothing * vocab_size
        word_probabilities[label] = {
            word: np.log((word_counts[word] + k_smoothing) / denominator)
            for word in vocabulary
        }

    return class_probabilities, word_probabilities

def predict(text, class_probabilities, word_probabilities, vocabulary):
    """Return the label with the highest Naive Bayes log-score for ``text``.

    The score of a label is its log-prior plus the log-likelihood of every
    token of ``text`` that appears in ``vocabulary``; out-of-vocabulary tokens
    are ignored. Ties go to the label that comes first in
    ``class_probabilities``.

    Args:
        text: raw input string to classify.
        class_probabilities: mapping label -> log P(label).
        word_probabilities: mapping label -> {word: log P(word | label)}.
        vocabulary: collection of known tokens.

    Returns:
        The best-scoring label.

    Raises:
        ValueError: if ``class_probabilities`` is empty (raised by ``max``).
    """
    # Tokenising and vocabulary filtering do not depend on the label: do them once.
    known_words = [word for word in tokenize(text) if word in vocabulary]

    # Fallback log-probability for a vocabulary word missing from a label's
    # table. It is loop-invariant, so evaluate np.log a single time instead of
    # once per token per label as an eagerly-computed ``.get`` default.
    fallback_log_prob = np.log(1 / (len(vocabulary) + 1))

    scores = {}
    for label, log_prior in class_probabilities.items():
        # Start with log(P(label)) and add log(P(word|label)) in token order.
        score = log_prior
        for word in known_words:
            score += word_probabilities[label].get(word, fallback_log_prob)
        scores[label] = score

    return max(scores, key=scores.get)



In [5]:
# Unzip the (text, label) pairs into two parallel tuples.
texts, labels = zip(*data)
# Hold out 20% of the examples for evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Word/document counts and the vocabulary come from the training split only.
class_word_counts, class_doc_counts, vocabulary = build_word_counts(X_train, y_train)
# NOTE(review): vocab_size is not referenced again in the visible cells.
vocab_size = len(vocabulary)

# Re-estimate the model for each smoothing constant and report held-out accuracy.
# The names assigned inside the loop stay bound afterwards, so once the loop finishes
# class_probabilities / word_probabilities hold the model from the last iteration (k=1).
for k in [0.25, 0.75, 1]:
    class_probabilities, word_probabilities = calculate_class_probabilities(class_word_counts, class_doc_counts, vocabulary, k_smoothing=k)
    y_pred = [predict(text, class_probabilities, word_probabilities, vocabulary) for text in X_test]
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for k={k}: {accuracy}")

Accuracy for k=0.25: 0.6666666666666666
Accuracy for k=0.75: 0.6666666666666666
Accuracy for k=1: 0.6666666666666666


In [14]:
# Ad-hoc check on an unseen phrase. This uses whatever class_probabilities / word_probabilities
# are currently bound in the kernel; after the k-sweep loop that is the k=1 model.
# NOTE(review): the execution count jumps from 5 to 14 — confirm no intervening cell rebound these names.
predict("Okay, wonderful", class_probabilities, word_probabilities, vocabulary)

'neutral'