In [1]:
class CustomNaiveBayes:
    """
    Custom implementation of the Naive Bayes classifier for text classification
    using the Bag-of-Words model and Laplace Smoothing.

    After ``fit``:
    - class_priors maps each class label to P(C)
    - word_likelihoods maps each class label to a {word: P(W|C)} dictionary
    """
    def __init__(self, alpha=1.0):
        # alpha is the Laplace smoothing factor (default to 1.0 for Add-One smoothing)
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.class_priors = {}        # Stores P(C) for each class
        self.word_likelihoods = {}    # Stores P(W|C) for each word W and class C
        self.vocab_size = 0           # Size of the vocabulary (V)
        self.feature_names = []       # Vocabulary words, aligned with the feature columns
        self._log_likelihoods = {}    # Stores log P(W|C) per class as an array in column order

    def fit(self, X_vec, y_labels, feature_names):
        """
        Calculates the prior probabilities P(C) and the conditional likelihoods P(W|C).

        Any state left over from a previous call to fit is discarded.

        Parameters:
        - X_vec: Document-Term Matrix (BoW), shape (n_samples, n_features)
        - y_labels: Array of class labels (0 or 1)
        - feature_names: List of words corresponding to the features (vocabulary)

        Raises:
        - ValueError: if the number of rows and labels differ, or the number of
          feature names does not match the number of columns.
        """
        X_vec = np.asarray(X_vec)
        y_labels = np.asarray(y_labels)
        if X_vec.shape[0] != len(y_labels):
            raise ValueError("X_vec and y_labels must contain the same number of samples")
        if len(feature_names) != X_vec.shape[1]:
            raise ValueError("feature_names must have one entry per column of X_vec")

        self.vocab_size = X_vec.shape[1]
        self.feature_names = feature_names
        total_samples = len(y_labels)

        self.class_priors = {}
        self.word_likelihoods = {}
        self._log_likelihoods = {}

        for c in np.unique(y_labels):
            in_class = (y_labels == c)

            # 1. Prior probability P(C): share of training samples labelled c
            self.class_priors[c] = np.sum(in_class) / total_samples

            # 2. Conditional likelihoods P(W|C)
            # Word Count W in C, one entry per vocabulary column
            word_counts = np.sum(X_vec[in_class], axis=0)

            # Denominator: Sum of all tokens in C + (alpha * Vocab Size)
            denominator = word_counts.sum() + (self.alpha * self.vocab_size)

            if denominator > 0:
                # P(W|C) = (Word Count + alpha) / (Total Tokens in C + alpha*V)
                probabilities = (word_counts + self.alpha) / denominator
            else:
                # Only reachable with alpha == 0 and a class without any tokens:
                # no word has evidence, so every likelihood is zero.
                probabilities = np.zeros(self.vocab_size)

            self.word_likelihoods[c] = dict(zip(self.feature_names, probabilities))

            # Logs are taken once here instead of once per word per prediction.
            # log(0) = -inf is intended: it marks words that are impossible under c.
            with np.errstate(divide='ignore'):
                self._log_likelihoods[c] = np.log(probabilities)

    def predict_one(self, sample_vector):
        """
        Classifies a single sample using log probabilities.

        The score of class C is log P(C) + sum over words of count * log P(W|C)
        (Multinomial Naive Bayes). A word with zero likelihood under C (possible
        only without smoothing) drives that class's score to -inf rather than
        being ignored. Ties go to the class that comes first in class_priors.
        If every class scores -inf, the class with the largest prior is returned.
        Returns None only when the model has not been fitted.
        """
        counts = np.asarray(sample_vector)
        # Restrict to words present in the sample so that 0 * -inf never occurs
        present = counts > 0
        present_counts = counts[present]

        best_class = None
        max_log_prob = -np.inf

        for c, prior in self.class_priors.items():
            current_log_prob = np.log(prior) + float(
                np.dot(present_counts, self._log_likelihoods[c][present])
            )

            # Check if this class is the best one so far
            if current_log_prob > max_log_prob:
                max_log_prob = current_log_prob
                best_class = c

        if best_class is None and self.class_priors:
            best_class = max(self.class_priors, key=self.class_priors.get)

        return best_class

    def predict(self, X_vec):
        """Predicts classes for an entire dataset (one label per row of X_vec)."""
        predictions = [self.predict_one(sample) for sample in X_vec]
        return np.array(predictions)

**Part a**

In [3]:
import numpy as np
import re
import math
from collections import defaultdict, Counter

# --- Helper Functions and Classes ---

def clean_text(text):
    """Cleans raw text.

    Steps, in order: lowercase, replace HTML line breaks with a space, replace
    every character that is not a-z or whitespace with a space, then collapse
    whitespace runs into single spaces and trim both ends.

    Returns the cleaned string (possibly empty).
    """
    text = text.lower()
    # Accept <br>, <br/> and <br /> alike. Without the optional slash a bare
    # "<br>" loses its brackets in the next step and leaves a stray "br" token.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Words dropped by tokenize_without_stopwords.
# NOTE(review): the list includes 'not', 'no' and 'good', which may carry
# sentiment signal — confirm that removing them is intended.
STOP_WORDS = {
    'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'this', 'that',
    'was', 'as', 'for', 'with', 'movie', 'film', 'but', 'on', 'are',
    'not', 'have', 'be', 'one', 'all', 'at', 'by', 'an', 'who', 'so',
    'from', 'like', 'there', 'or', 'just', 'about', 'out', 'if', 'has',
    'what', 'some', 'good', 'can', 'more', 'when', 'very', 'up', 'no',
    'time', 'my', 'even', 'would', 'she', 'which', 'only', 'really',
    'see', 'story', 'their', 'had',
}

def tokenize_with_stopwords(text):
    """Minimal preprocessing (keeps stop words).

    Returns the list of whitespace-separated tokens of the cleaned text.
    A document that cleans to an empty string yields an empty list.
    """
    # split() rather than split(' '): ''.split(' ') returns [''], which would
    # add an empty-string token to the vocabulary for empty documents.
    return clean_text(text).split()

def tokenize_without_stopwords(text):
    """Full preprocessing: clean the text, then drop stop words and tokens of length <= 1."""
    kept = []
    for token in clean_text(text).split(' '):
        # Skip one-letter/empty tokens and anything listed in STOP_WORDS
        if len(token) <= 1 or token in STOP_WORDS:
            continue
        kept.append(token)
    return kept

class CustomCountVectorizer:
    """Bag-of-Words Vectorizer.

    Learns a vocabulary ordered by descending corpus frequency and encodes
    documents as rows of per-word counts.
    """
    def __init__(self, preprocessor_func, max_features=None):
        # preprocessor_func: callable turning one raw document into a list of tokens
        self.preprocessor = preprocessor_func
        # max_features: keep only this many most frequent words (falsy = keep all)
        self.max_features = max_features
        self.vocabulary = {}      # word -> column index
        self.feature_names = []   # column index -> word

    def fit(self, raw_documents):
        """Build the vocabulary from the token frequencies of raw_documents."""
        frequency = {}
        for document in raw_documents:
            for term in self.preprocessor(document):
                frequency[term] = frequency.get(term, 0) + 1

        # Most frequent first; the sort is stable, so ties keep first-seen order
        ranked = list(frequency.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        if self.max_features:
            ranked = ranked[:self.max_features]

        self.feature_names = [term for term, _ in ranked]
        self.vocabulary = {term: position for position, term in enumerate(self.feature_names)}

    def transform(self, raw_documents):
        """Return an integer count matrix of shape (len(raw_documents), len(vocabulary))."""
        matrix = np.zeros((len(raw_documents), len(self.vocabulary)), dtype=int)

        for row, document in enumerate(raw_documents):
            for term in self.preprocessor(document):
                column = self.vocabulary.get(term)
                # Tokens outside the learned vocabulary are ignored
                if column is not None:
                    matrix[row, column] += 1
        return matrix

    def fit_transform(self, raw_documents):
        """Learn the vocabulary from raw_documents and return their count matrix."""
        self.fit(raw_documents)
        return self.transform(raw_documents)

class CustomNaiveBayes:
    """Naive Bayes classifier with Laplace Smoothing.

    After ``fit``, class_priors maps each class label to P(C) and
    word_likelihoods maps each class label to a {word: P(W|C)} dictionary.
    """
    def __init__(self, alpha=1.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha            # Laplace smoothing factor
        self.class_priors = {}
        self.word_likelihoods = {}
        self.vocab_size = 0
        self.feature_names = []
        self._log_likelihoods = {}    # class label -> array of log P(W|C) in column order

    def fit(self, X_vec, y_labels, feature_names):
        """Estimate P(C) and the smoothed P(W|C) from a count matrix.

        Parameters:
        - X_vec: count matrix of shape (n_samples, n_features)
        - y_labels: class label of each row of X_vec
        - feature_names: word for each column of X_vec

        State from an earlier fit is discarded, so classes absent from
        y_labels do not linger in class_priors / word_likelihoods.

        Raises ValueError when the inputs' sizes do not agree.
        """
        X_vec = np.asarray(X_vec)
        y_labels = np.asarray(y_labels)
        if X_vec.shape[0] != len(y_labels):
            raise ValueError("X_vec and y_labels must contain the same number of samples")
        if len(feature_names) != X_vec.shape[1]:
            raise ValueError("feature_names must have one entry per column of X_vec")

        self.vocab_size = X_vec.shape[1]
        self.feature_names = feature_names
        total_samples = len(y_labels)

        self.class_priors = {}
        self.word_likelihoods = {}
        self._log_likelihoods = {}

        for c in np.unique(y_labels):
            in_class = (y_labels == c)
            self.class_priors[c] = np.sum(in_class) / total_samples

            word_counts = np.sum(X_vec[in_class], axis=0)
            denominator = word_counts.sum() + (self.alpha * self.vocab_size)
            if denominator > 0:
                probabilities = (word_counts + self.alpha) / denominator
            else:
                # alpha == 0 and the class has no tokens: nothing has evidence
                probabilities = np.zeros(self.vocab_size)

            self.word_likelihoods[c] = dict(zip(self.feature_names, probabilities))
            # log(0) = -inf is intended: the word is impossible under class c
            with np.errstate(divide='ignore'):
                self._log_likelihoods[c] = np.log(probabilities)

    def predict_one(self, sample_vector):
        """Return the class maximising log P(C) + sum(count * log P(W|C)) for one sample.

        Ties go to the class that comes first in class_priors. If every class
        scores -inf, the class with the largest prior is returned. Returns
        None only when the model has not been fitted.
        """
        counts = np.asarray(sample_vector)
        # Only words present in the sample contribute; this also avoids 0 * -inf
        present = counts > 0
        present_counts = counts[present]

        best_class = None
        max_log_prob = -np.inf
        for c, prior in self.class_priors.items():
            current_log_prob = np.log(prior) + float(
                np.dot(present_counts, self._log_likelihoods[c][present])
            )
            if current_log_prob > max_log_prob:
                max_log_prob = current_log_prob
                best_class = c

        if best_class is None and self.class_priors:
            best_class = max(self.class_priors, key=self.class_priors.get)
        return best_class

    def predict(self, X_vec):
        """Predict a class label for every row of X_vec."""
        return np.array([self.predict_one(sample) for sample in X_vec])


# --- Demonstration Setup (Mock Data) ---

# Three hand-picked reviews used as a tiny corpus for the likelihood demo below.
# Labels in raw_sentiments follow the same order: 1 = positive, 0 = negative.
# Note: in the second review the doubled quotes around `has got all the polari`
# are adjacent string literals, which Python concatenates; the quote characters
# themselves are therefore not part of the resulting text.
raw_reviews = [
    "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.", # Positive (1)
    "A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only ""has got all the polari"" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece.", # Positive (1)
    "The worst, most disappointing movie I have ever seen. Absolutely terrible and boring and a waste of money." # Negative (0)
]
raw_sentiments = np.array([1, 1, 0])  # one label per entry of raw_reviews

# --- Scenario 1: With Stop Words ---
print("[Scenario 1: Minimal Preprocessing (Stop Words are kept)]")

# Vocabulary and counts built with the tokenizer that keeps stop words
vectorizer_min = CustomCountVectorizer(tokenize_with_stopwords)
vectorizer_min.fit(raw_reviews)
X_min = vectorizer_min.transform(raw_reviews)

model_min = CustomNaiveBayes(1.0)
model_min.fit(X_min, raw_sentiments, vectorizer_min.feature_names)

# Smoothed likelihoods under the positive class (label 1);
# .get falls back to 0 for a word that is not in the vocabulary
p_wonderful_min = model_min.word_likelihoods[1].get('wonderful', 0)
p_the_min = model_min.word_likelihoods[1].get('the', 0)

print(f"Vocabulary Size: {len(vectorizer_min.feature_names)}")
print(f"P('the' | Positive) (Stop Word): {p_the_min:.6f}")
print(f"P('wonderful' | Positive) (Sentiment Word): {p_wonderful_min:.6f}")


# --- Scenario 2: Without Stop Words ---
print("\n[Scenario 2: Full Preprocessing (Stop Words are removed)]")

# Same pipeline as Scenario 1, but with the tokenizer that drops stop words
vectorizer_full = CustomCountVectorizer(tokenize_without_stopwords)
vectorizer_full.fit(raw_reviews)
X_full = vectorizer_full.transform(raw_reviews)

model_full = CustomNaiveBayes(1.0)
model_full.fit(X_full, raw_sentiments, vectorizer_full.feature_names)

# Likelihood of the sentiment word under the positive class (label 1)
p_wonderful_full = model_full.word_likelihoods[1].get('wonderful', 0)

print(f"Vocabulary Size: {len(vectorizer_full.feature_names)}")
print(f"P('wonderful' | Positive) (Sentiment Word): {p_wonderful_full:.6f}")


# --- Comparison ---
# Prints P('wonderful' | Positive) from both scenarios side by side, then their ratio.
# The ratio divides by p_wonderful_min, which is 0 only if 'wonderful' is missing
# from the Scenario 1 vocabulary (the .get default); with raw_reviews above it is present.

print(f"\n--- Comparison: Effect of Stop Word Removal on P('wonderful' | Positive) ---")
print(f"P('wonderful' | Positive) in Scenario 1 (With Stop Words): {p_wonderful_min:.6f}")
print(f"P('wonderful' | Positive) in Scenario 2 (Without Stop Words): {p_wonderful_full:.6f}")
print(f"Ratio of increase: {p_wonderful_full / p_wonderful_min:.2f} times")

[Scenario 1: Minimal Preprocessing (Stop Words are kept)]
Vocabulary Size: 128
P('the' | Positive) (Stop Word): 0.050000
P('wonderful' | Positive) (Sentiment Word): 0.006667

[Scenario 2: Full Preprocessing (Stop Words are removed)]
Vocabulary Size: 91
P('wonderful' | Positive) (Sentiment Word): 0.010870

--- Comparison: Effect of Stop Word Removal on P('wonderful' | Positive) ---
P('wonderful' | Positive) in Scenario 1 (With Stop Words): 0.006667
P('wonderful' | Positive) in Scenario 2 (Without Stop Words): 0.010870
Ratio of increase: 1.63 times


**Part b**

In [11]:
import numpy as np
import pandas as pd
import re
import math
from collections import defaultdict

# --- 1. Helper Functions & Classes ---

def clean_text(text):
    """Normalize raw review text to lowercase a-z words separated by single spaces.

    Steps, in order: lowercase, replace HTML line breaks with a space, replace
    every character that is not a-z or whitespace with a space, then collapse
    whitespace runs and trim both ends. Returns the cleaned string.
    """
    text = text.lower()
    # The slash is optional so that <br>, <br/> and <br /> are all removed;
    # otherwise a bare "<br>" would survive as a stray "br" token.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Tokens removed by tokenize_without_stopwords.
# NOTE(review): 'not', 'no' and 'good' are on this list although they may carry
# sentiment signal — confirm that dropping them is intended.
STOP_WORDS = {
    'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'this', 'that',
    'was', 'as', 'for', 'with', 'movie', 'film', 'but', 'on', 'are',
    'not', 'have', 'be', 'one', 'all', 'at', 'by', 'an', 'who', 'so',
    'from', 'like', 'there', 'or', 'just', 'about', 'out', 'if', 'has',
    'what', 'some', 'good', 'can', 'more', 'when', 'very', 'up', 'no',
    'time', 'my', 'even', 'would', 'she', 'which', 'only', 'really',
    'see', 'story', 'their', 'had',
}

def tokenize_without_stopwords(text):
    """Clean the text, split it on single spaces, and drop stop words and tokens of length <= 1."""
    kept = []
    for token in clean_text(text).split(' '):
        if len(token) <= 1 or token in STOP_WORDS:
            continue
        kept.append(token)
    return kept

class CustomCountVectorizer:
    """Bag-of-Words vectorizer with a frequency-ranked, optionally truncated vocabulary."""
    def __init__(self, preprocessor_func, max_features=None):
        # preprocessor_func: callable turning one raw document into a list of tokens
        self.preprocessor = preprocessor_func
        # max_features: keep only this many most frequent words (falsy = keep all)
        self.max_features = max_features
        self.vocabulary = {}      # word -> column index
        self.feature_names = []   # column index -> word

    def fit(self, raw_documents):
        """Count every token in raw_documents and keep the most frequent ones as the vocabulary."""
        frequency = {}
        for document in raw_documents:
            for term in self.preprocessor(document):
                frequency[term] = frequency.get(term, 0) + 1

        # Descending frequency; the stable sort keeps first-seen order among ties
        ranked = list(frequency.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        if self.max_features:
            ranked = ranked[:self.max_features]

        self.feature_names = [term for term, _ in ranked]
        self.vocabulary = {term: position for position, term in enumerate(self.feature_names)}

    def transform(self, raw_documents):
        """Encode raw_documents as an integer count matrix (documents x vocabulary)."""
        matrix = np.zeros((len(raw_documents), len(self.vocabulary)), dtype=int)
        for row, document in enumerate(raw_documents):
            for term in self.preprocessor(document):
                column = self.vocabulary.get(term)
                # Out-of-vocabulary tokens are skipped
                if column is not None:
                    matrix[row, column] += 1
        return matrix

    def fit_transform(self, raw_documents):
        """Fit on raw_documents, then return their count matrix."""
        self.fit(raw_documents)
        return self.transform(raw_documents)

class CustomNaiveBayes:
    """Multinomial Naive Bayes over Bag-of-Words counts with additive (Laplace) smoothing.

    After ``fit``, class_priors maps each class label to P(C) and
    word_likelihoods maps each class label to a {word: P(W|C)} dictionary.
    """
    def __init__(self, alpha=1.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha            # smoothing factor added to every word count
        self.class_priors = {}
        self.word_likelihoods = {}
        self.vocab_size = 0
        self.feature_names = []
        self._log_likelihoods = {}    # class label -> array of log P(W|C) in column order

    def fit(self, X_vec, y_labels, feature_names):
        """Estimate P(C) and P(W|C) = (count + alpha) / (tokens in C + alpha * V).

        Parameters:
        - X_vec: count matrix of shape (n_samples, n_features)
        - y_labels: class label of each row of X_vec
        - feature_names: word for each column of X_vec

        State from an earlier fit is discarded first, so refitting on data
        with fewer classes does not keep stale priors or likelihoods.

        Raises ValueError when the inputs' sizes do not agree.
        """
        X_vec = np.asarray(X_vec)
        y_labels = np.asarray(y_labels)
        if X_vec.shape[0] != len(y_labels):
            raise ValueError("X_vec and y_labels must contain the same number of samples")
        if len(feature_names) != X_vec.shape[1]:
            raise ValueError("feature_names must have one entry per column of X_vec")

        self.vocab_size = X_vec.shape[1]
        self.feature_names = feature_names
        total_samples = len(y_labels)

        self.class_priors = {}
        self.word_likelihoods = {}
        self._log_likelihoods = {}

        for c in np.unique(y_labels):
            in_class = (y_labels == c)
            self.class_priors[c] = np.sum(in_class) / total_samples

            word_counts = np.sum(X_vec[in_class], axis=0)
            denominator = word_counts.sum() + (self.alpha * self.vocab_size)
            if denominator > 0:
                probabilities = (word_counts + self.alpha) / denominator
            else:
                # alpha == 0 and the class has no tokens: nothing has evidence
                probabilities = np.zeros(self.vocab_size)

            self.word_likelihoods[c] = dict(zip(self.feature_names, probabilities))
            # Logs are computed once here rather than per word on every prediction.
            # log(0) = -inf is intended: the word is impossible under class c.
            with np.errstate(divide='ignore'):
                self._log_likelihoods[c] = np.log(probabilities)

    def predict_one(self, sample_vector):
        """Return the class maximising log P(C) + sum(count * log P(W|C)) for one sample.

        A word with zero likelihood under a class (possible only with alpha == 0)
        makes that class's score -inf instead of being skipped. Ties go to the
        class that comes first in class_priors. If every class scores -inf, the
        class with the largest prior is returned. Returns None only when the
        model has not been fitted.
        """
        counts = np.asarray(sample_vector)
        # Only words present in the sample contribute; this also avoids 0 * -inf
        present = counts > 0
        present_counts = counts[present]

        best_class = None
        max_log_prob = -np.inf
        for c, prior in self.class_priors.items():
            current_log_prob = np.log(prior) + float(
                np.dot(present_counts, self._log_likelihoods[c][present])
            )
            if current_log_prob > max_log_prob:
                max_log_prob = current_log_prob
                best_class = c

        if best_class is None and self.class_priors:
            best_class = max(self.class_priors, key=self.class_priors.get)
        return best_class

    def predict(self, X_vec):
        """Predict a class label for every row of X_vec."""
        return np.array([self.predict_one(sample) for sample in X_vec])

def calculate_metrics(y_true, y_pred):
    """Compute precision, recall and F1 for the positive class (label 1).

    Parameters:
    - y_true: true labels (0 or 1), any array-like
    - y_pred: predicted labels (0 or 1), same length as y_true

    Returns a dict with keys 'Precision', 'Recall' and 'F1-Score', all Python
    floats. A metric whose denominator is zero is reported as 0.0.
    """
    # Coerce to arrays: with plain lists, `y_true == 1` is a single False and
    # every count would silently come out as 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return {
        'Precision': float(precision),
        'Recall': float(recall),
        'F1-Score': float(f1_score),
    }

# --- 2. Data Preparation ---

SEED = 42  # passed as random_state to manual_stratified_split for both splits below

def manual_stratified_split(X, y, test_size, random_state, stratify):
    """Split X and y into train and test parts, preserving class proportions.

    Each class found in `stratify` is shuffled and cut separately, so both
    parts receive (up to flooring) the same share of every class. The train
    part is shuffled once more at the end; the test part stays grouped by class.

    Parameters:
    - X, y: array-likes of equal length (samples and labels)
    - test_size: fraction of each class assigned to the test part, in [0, 1]
    - random_state: seed for the shuffles; the same seed gives the same split
    - stratify: array-like of the labels to stratify on (same length as X)

    Returns (X_train, X_test, y_train, y_test) as numpy arrays.

    Raises ValueError if test_size is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    X = np.asarray(X)
    y = np.asarray(y)
    stratify = np.asarray(stratify)

    # A private generator yields the same sequence as np.random.seed(random_state)
    # followed by np.random.permutation, without touching the global RNG state.
    rng = np.random.RandomState(random_state)

    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []
    for c in np.unique(stratify):
        class_indices = np.where(stratify == c)[0]
        n_c = len(class_indices)
        shuffled = class_indices[rng.permutation(n_c)]

        # Floor of the train share; the small tolerance stops float error from
        # dropping a sample (e.g. 10 * (1 - 0.9) evaluates to 0.9999999999999998).
        split_point = int(n_c * (1 - test_size) + 1e-9)

        X_train_parts.append(X[shuffled[:split_point]])
        X_test_parts.append(X[shuffled[split_point:]])
        y_train_parts.append(y[shuffled[:split_point]])
        y_test_parts.append(y[shuffled[split_point:]])

    X_train_final = np.concatenate(X_train_parts)
    X_test_final = np.concatenate(X_test_parts)
    y_train_final = np.concatenate(y_train_parts)
    y_test_final = np.concatenate(y_test_parts)

    # Shuffle the train part so the classes are interleaved
    train_indices = rng.permutation(len(X_train_final))
    return X_train_final[train_indices], X_test_final, y_train_final[train_indices], y_test_final

# Try loading the real data; fall back to a synthetic placeholder so the
# notebook still runs end to end without the CSV.
try:
    df = pd.read_csv('IMDB.csv')
    print("✅ Loaded real IMDB dataset.")
except FileNotFoundError:
    print("🚨 Real file not found. Using PLACEHOLDER data for demo.")
    n_samples = 500
    # First half positive ("wonderful story"), second half negative ("terrible movie")
    texts = [f"review_{i} wonderful story" if i < n_samples/2 else f"review_{i} terrible movie" for i in range(n_samples)]
    sentiments = ["positive"] * (n_samples // 2) + ["negative"] * (n_samples // 2)
    df = pd.DataFrame({'review': texts, 'sentiment': sentiments})

# Encode labels: 'positive' -> 1, anything else -> 0
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
X = df['review'].values
y = df['sentiment'].values

# Split 70% Train, 30% Temp
X_train_raw, X_temp, y_train, y_temp = manual_stratified_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
# Split Temp 50/50 -> 15% Val, 15% Test
X_val_raw, X_test_raw, y_val, y_test = manual_stratified_split(X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp)

# Assign to variables expected in Block 2
X_train = X_train_raw
X_val = X_val_raw

print(f"Data Split: Train={len(X_train)}, Val={len(X_val)}")

✅ Loaded real IMDB dataset.
Data Split: Train=35000, Val=7500


In [12]:
# --- Start Part (b): F1-Score vs. Max Features Search ---

# With the real dataset, switch to the larger grid below:
# feature_counts = [100, 500, 1000, 5000, 10000, 20000]

# Small grid, sized for the placeholder/demo data:
feature_counts = [10, 50, 100, 200]

alpha_fixed = 1.0
results_f1 = []

print(f"\n--- Running Experiment: F1 vs Max Features ---")

for max_feat in feature_counts:
    # 1. Learn the vocabulary on the training split only, then encode both splits with it
    vectorizer = CustomCountVectorizer(max_features=max_feat, preprocessor_func=tokenize_without_stopwords)
    vectorizer.fit(X_train)
    X_train_vec = vectorizer.transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # 2. Train with the fixed smoothing factor
    model = CustomNaiveBayes(alpha_fixed)
    model.fit(X_train_vec, y_train, vectorizer.feature_names)

    # 3. Score on the validation split and record the F1
    y_val_pred = model.predict(X_val_vec)
    metrics = calculate_metrics(y_val, y_val_pred)
    results_f1.append(metrics['F1-Score'])

    print(f"Max Features: {max_feat} | Vocab Size: {len(vectorizer.feature_names)} | F1-Score: {metrics['F1-Score']:.4f}")

print(f"\nFinal F1 Scores: {results_f1}")


--- Running Experiment: F1 vs Max Features ---
Max Features: 10 | Vocab Size: 10 | F1-Score: 0.5310
Max Features: 50 | Vocab Size: 50 | F1-Score: 0.6785
Max Features: 100 | Vocab Size: 100 | F1-Score: 0.7024
Max Features: 200 | Vocab Size: 200 | F1-Score: 0.7474

Final F1 Scores: [np.float64(0.5310066339775023), np.float64(0.6785036595283274), np.float64(0.7023583626999187), np.float64(0.747412286597661)]


In [13]:
# --- Part (b), continued: exploring larger feature counts ---

# Larger values, to see the effect of variance and to find the optimal point
feature_counts_large = [1000, 5000, 10000, 20000, 30000]

print(f"\n--- Running Experiment: Large Feature Counts ---")

for max_feat in feature_counts_large:
    # 1. Learn the vocabulary on the training split, then encode both splits with it
    vectorizer = CustomCountVectorizer(max_features=max_feat, preprocessor_func=tokenize_without_stopwords)
    vectorizer.fit(X_train)
    X_train_vec = vectorizer.transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # 2. Train with add-one smoothing
    model = CustomNaiveBayes(1.0)
    model.fit(X_train_vec, y_train, vectorizer.feature_names)

    # 3. Score on the validation split
    y_val_pred = model.predict(X_val_vec)
    metrics = calculate_metrics(y_val, y_val_pred)

    print(f"Max Features: {max_feat} | F1-Score: {metrics['F1-Score']:.4f}")


--- Running Experiment: Large Feature Counts ---
Max Features: 1000 | F1-Score: 0.8270
Max Features: 5000 | F1-Score: 0.8449
Max Features: 10000 | F1-Score: 0.8466
Max Features: 20000 | F1-Score: 0.8490
Max Features: 30000 | F1-Score: 0.8506


**Part c**

In [14]:
# --- Start Part (c): F1-Score vs. Alpha (Laplace Smoothing) ---

# 1. Hold max_features at the value chosen from Part (b)
optimal_max_features = 5000
vectorizer = CustomCountVectorizer(tokenize_without_stopwords, optimal_max_features)

# The features do not depend on alpha, so both splits are encoded once up front
vectorizer.fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# 2. Alpha grid: near-zero smoothing, the add-one default (1.0), and heavy smoothing
results_alpha = []
alpha_values = [0.0001, 0.01, 0.1, 1.0, 5.0, 10.0, 50.0, 100.0]

print(f"\n--- Running Experiment: F1 vs Alpha (fixed max_features={optimal_max_features}) ---")

for alpha in alpha_values:
    # Retrain on the same matrices; only the smoothing factor changes
    model = CustomNaiveBayes(alpha)
    model.fit(X_train_vec, y_train, vectorizer.feature_names)

    # Score on the validation split and record the F1
    y_val_pred = model.predict(X_val_vec)
    metrics = calculate_metrics(y_val, y_val_pred)
    results_alpha.append(metrics['F1-Score'])

    print(f"Alpha: {alpha} | F1-Score: {metrics['F1-Score']:.4f}")

print(f"\nFinal F1 Scores for Alphas: {results_alpha}")


--- Running Experiment: F1 vs Alpha (fixed max_features=5000) ---
Alpha: 0.0001 | F1-Score: 0.8457
Alpha: 0.01 | F1-Score: 0.8457
Alpha: 0.1 | F1-Score: 0.8458
Alpha: 1.0 | F1-Score: 0.8449
Alpha: 5.0 | F1-Score: 0.8443
Alpha: 10.0 | F1-Score: 0.8446
Alpha: 50.0 | F1-Score: 0.8410
Alpha: 100.0 | F1-Score: 0.8368

Final F1 Scores for Alphas: [np.float64(0.8456716019741229), np.float64(0.8456716019741229), np.float64(0.8457844183564568), np.float64(0.8449426207632772), np.float64(0.844284188034188), np.float64(0.8446407065435568), np.float64(0.841040852096535), np.float64(0.8367734926670288)]
