In [1]:
class CustimNaiveBayes:
    """Multinomial Naive Bayes classifier over word-count vectors.

    After ``fit`` the model holds:

    * ``class_priors``: ``{class: P(class)}`` estimated as the fraction of
      training samples carrying that label.
    * ``word_likelihoods``: ``{class: {word: P(word | class)}}`` with additive
      (Laplace/Lidstone) smoothing controlled by ``alpha``.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted model.

        Args:
            alpha: Additive smoothing constant added to every word count.
        """
        self.alpha = alpha
        self.class_priors = {}
        self.word_likelihoods = {}
        self.vocab_size = 0
        self.feature_names = []

    def fit(self, X_vec, y_labels, feature_names):
        """Estimate class priors and smoothed per-class word likelihoods.

        Args:
            X_vec: 2-D count matrix (samples x vocabulary). It must expose
                ``.shape``, boolean-mask row indexing and ``.sum(axis=0)``.
            y_labels: One label per row of ``X_vec``.
            feature_names: Vocabulary words, aligned with the columns of
                ``X_vec``.
        """
        # A plain list compared with ``==`` yields a single bool rather than
        # a mask, so normalise the labels to an array first.
        labels = np.asarray(y_labels)

        self.vocab_size = X_vec.shape[1]
        self.feature_names = feature_names
        total_samples = len(labels)

        # Start from a clean slate so refitting does not keep stale classes.
        self.class_priors = {}
        self.word_likelihoods = {}

        for c in np.unique(labels):
            X_c = X_vec[labels == c]

            # P(c): without this, prediction has no classes to iterate over.
            self.class_priors[c] = X_c.shape[0] / total_samples

            # Flatten to 1-D so per-word indexing also works when the column
            # sum comes back 2-D (as it does for matrix-like inputs).
            word_counts = np.asarray(X_c.sum(axis=0)).ravel()
            denominator = word_counts.sum() + self.alpha * self.vocab_size

            self.word_likelihoods[c] = {
                word: (word_counts[i] + self.alpha) / denominator
                for i, word in enumerate(self.feature_names)
            }

    def predict_one(self, sample_vector):
        """Return the most probable class for a single count vector.

        Scores each class with
        ``log P(c) + sum_w count(w) * log P(w | c)`` and returns the class
        with the highest score (the first one encountered wins ties).

        Args:
            sample_vector: Word counts aligned with ``feature_names``. Objects
                exposing ``.toarray()`` are densified first.

        Returns:
            The best class label, or ``None`` if the model has not been
            fitted (no class priors are available).
        """
        if hasattr(sample_vector, "toarray"):
            sample_vector = sample_vector.toarray()
        counts = np.asarray(sample_vector).ravel()

        # Only words that occur in the sample contribute to the score.
        present = np.flatnonzero(counts > 0)

        best_class = None
        max_log_prob = -np.inf

        for c, prior in self.class_priors.items():
            likelihoods = self.word_likelihoods[c]
            current_log_prob = np.log(prior)

            for i in present:
                likelihood = likelihoods.get(self.feature_names[i], 0)
                # Skip non-positive likelihoods: log() is undefined there.
                if likelihood > 0:
                    # Multinomial assumption: each occurrence of the word
                    # contributes one log P(w | c) term.
                    current_log_prob += counts[i] * np.log(likelihood)

            if current_log_prob > max_log_prob:
                max_log_prob = current_log_prob
                best_class = c

        return best_class

    def predict(self, X_vec):
        """Predict a class for every row of ``X_vec``.

        Returns:
            A NumPy array with one predicted label per row.
        """
        return np.array([self.predict_one(sample) for sample in X_vec])


In [None]:
import numpy as np
import re
import math
from collections import defaultdict, Counter

# --- Helper Functions and Classes ---

def clean_text(text):
    """Normalise raw review text to lowercase words separated by single spaces.

    Steps, in order:
      1. Lowercase the text.
      2. Replace HTML line breaks (``<br>``, ``<br/>``, ``<br />``) with a space.
      3. Replace every character that is not ``a-z`` or whitespace with a space.
      4. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if nothing but non-letters was supplied.
    """
    text = text.lower()
    # The slash is optional so a plain ``<br>`` does not leave a stray "br"
    # token behind once the angle brackets are stripped in the next step.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Words filtered out by the tokenizers below (kept in alphabetical order).
STOP_WORDS = {
    'a', 'about', 'all', 'an', 'and', 'are', 'as', 'at',
    'be', 'but', 'by',
    'can',
    'even',
    'film', 'for', 'from',
    'good',
    'had', 'has', 'have',
    'if', 'in', 'is', 'it',
    'just',
    'like',
    'more', 'movie', 'my',
    'no', 'not',
    'of', 'on', 'one', 'only', 'or', 'out',
    'really',
    'see', 'she', 'so', 'some', 'story',
    'that', 'the', 'their', 'there', 'this', 'time', 'to',
    'up',
    'very',
    'was', 'what', 'when', 'which', 'who', 'with', 'would',
}

def tokenize_with_stopwords(text):
    """Split cleaned text on spaces, dropping empty tokens and stop words.

    NOTE(review): despite the "with_stopwords" name, tokens found in
    ``STOP_WORDS`` are removed here - confirm this is the intended behaviour.

    Args:
        text: Raw input string; it is passed through ``clean_text`` first.

    Returns:
        A list of the remaining tokens in their original order.
    """
    kept = []
    for word in clean_text(text).split(" "):
        if not word or word in STOP_WORDS:
            continue
        kept.append(word)
    return kept

def tokenize_without_stopwords(text):
    """Split cleaned text on spaces, keeping only informative tokens.

    A token is kept when it is not in ``STOP_WORDS`` and is longer than one
    character (which also discards empty strings).

    Args:
        text: Raw input string; it is passed through ``clean_text`` first.

    Returns:
        A list of the remaining tokens in their original order.
    """
    def is_informative(word):
        return not (word in STOP_WORDS or len(word) <= 1)

    return list(filter(is_informative, clean_text(text).split(" ")))

    