# In [4]:  (Jupyter notebook cell prompt)
import numpy as np

class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier for tokenised documents.

    Each document is an iterable of hashable tokens (typically a list of
    strings).  Word likelihoods are estimated per class from token counts
    with additive (Laplace) smoothing.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels seen in training.
        class_priors: array of P(class), aligned with ``classes``.
        vocabulary: list of distinct training tokens, in first-seen order.
        vocabulary_size: ``len(vocabulary)``.
        word_probs: array of shape ``(len(classes), vocabulary_size)``
            holding the smoothed P(word | class).
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted classifier.

        Args:
            alpha: additive smoothing constant added to every word count.
        """
        self.alpha = alpha  # Laplace smoothing parameter
        self.classes = None
        self.class_priors = None
        self.word_probs = None
        self.vocabulary = []
        self.vocabulary_size = 0
        # token -> column index into word_probs; gives O(1) lookups instead
        # of scanning the vocabulary list for every token.
        self._word_index = {}

    def fit(self, X, y):
        """Estimate class priors and smoothed word likelihoods.

        Args:
            X: sequence of documents, each an iterable of hashable tokens.
                Plain Python lists and NumPy object arrays both work.
            y: sequence of class labels, one per document.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        labels = np.asarray(y)
        if len(X) != len(labels):
            raise ValueError(
                f"X and y must have the same length "
                f"(got {len(X)} and {len(labels)})"
            )

        # `sample_class` maps every sample to its row in `self.classes`.
        # This replaces the boolean-mask selection `X[y == cls]`, which only
        # works for NumPy arrays: with plain lists `y == cls` is just False
        # and `X[False]` silently selects the first document.
        self.classes, sample_class, class_counts = np.unique(
            labels, return_inverse=True, return_counts=True
        )
        self.class_priors = class_counts / len(labels)

        # Build the vocabulary in first-seen order so the column layout of
        # word_probs is reproducible between runs.
        word_index = {}
        for doc in X:
            for word in doc:
                word_index.setdefault(word, len(word_index))
        self._word_index = word_index
        self.vocabulary = list(word_index)
        self.vocabulary_size = len(word_index)

        # Count word occurrences for each class.
        word_counts_per_class = np.zeros(
            (len(self.classes), self.vocabulary_size)
        )
        for doc, class_row in zip(X, sample_class):
            for word in doc:
                word_counts_per_class[class_row, word_index[word]] += 1

        # Apply Laplace smoothing, then normalise each class row so it sums
        # to one.
        smoothed_counts = word_counts_per_class + self.alpha
        class_word_counts = smoothed_counts.sum(axis=1, keepdims=True)
        self.word_probs = smoothed_counts / class_word_counts

    def predict(self, X):
        """Predict a class label for every document in ``X``.

        Tokens that were not seen during ``fit`` are ignored.

        Args:
            X: iterable of documents, each an iterable of tokens.

        Returns:
            A list with one entry per document: the element of
            ``self.classes`` with the highest log-posterior score, or the
            string ``"unknown"`` when every class score is NaN.
        """
        # Take the logs once instead of once per token.
        log_priors = np.log(self.class_priors)
        log_word_probs = np.log(self.word_probs)
        word_index = self._word_index

        predictions = []
        for doc in X:
            # Copy so the in-place accumulation never touches log_priors.
            class_scores = log_priors.copy()
            for word in doc:
                word_idx = word_index.get(word)
                if word_idx is not None:
                    class_scores += log_word_probs[:, word_idx]
            if not np.isnan(class_scores).all():
                predicted_class = self.classes[np.argmax(class_scores)]
            else:
                # No usable score for any class: fall back to a sentinel.
                predicted_class = "unknown"
            predictions.append(predicted_class)
        return predictions

# Demo: train on four tiny labelled reviews, then classify two new ones.
clf = MultinomialNaiveBayes(alpha=1.0)

# Training labels, aligned one-to-one with the documents in X_train.
y_train = ["positive", "negative", "positive", "negative"]
X_train = [
    ["good", "product", "great"],
    ["poor", "service"],
    ["awesome", "experience", "good"],
    ["poor", "product"],
]

# Unlabelled documents to classify; "bad" does not occur in the training data.
X_test = [
    ["good", "service"],
    ["poor", "product", "bad"],
]

clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# Report each prediction with a 1-based example number.
for i, prediction in enumerate(predictions):
    print(f"Example {i+1}: {prediction}")

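# Recorded output of this cell as originally run:
#     ValueError: 'g' is not in list
# Cause: X_train and y_train are plain Python lists, so inside fit() the
# expression `y == cls` is simply False and `X[y == cls]` becomes X[0].
# The loops then iterate the first document's words character by character,
# and `self.vocabulary.index('g')` fails because 'g' is not a vocabulary word.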