In [1]:
import pandas as pd
import re
import random
from collections import defaultdict
import math
import pickle

In [2]:
# STEP 1: Load Data
# The reviews file is tab-separated; later cells read its 'Review' and 'Liked' columns.
df = pd.read_csv(
    filepath_or_buffer='Restaurant_Reviews.tsv',
    sep='\t',
)

In [3]:
# STEP 2: Clean the text
def clean_text(text):
    """Turn a raw review string into a list of lowercase word tokens.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, accented letters) is replaced by a space, the result is
    lowercased, and it is split on whitespace.

    Args:
        text: The review as a string.

    Returns:
        list[str]: Lowercase alphabetic tokens, in order; empty if the input
        contains no ASCII letters.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return tokens

# Tokenize every review; each row of 'Cleaned' holds the list returned by clean_text.
df['Cleaned'] = df['Review'].map(clean_text)

In [4]:
# STEP 3: Split data into train and test (80/20)
# Pair each token list with its label, then shuffle with a fixed seed so the
# split is the same on every run.
data = [(tokens, liked) for tokens, liked in zip(df['Cleaned'], df['Liked'])]
random.seed(42)
random.shuffle(data)

# The first 80% of the shuffled pairs train the model; the rest is held out.
split_idx = int(0.8 * len(data))
train_data, test_data = data[:split_idx], data[split_idx:]

In [5]:
# STEP 4: Training - Count words and classes
# word_counts[c][w]: occurrences of word w in class-c reviews (0 = negative, 1 = positive).
word_counts = {sentiment: defaultdict(int) for sentiment in (0, 1)}
# class_counts[c]: number of training reviews labelled c.
class_counts = dict.fromkeys((0, 1), 0)
# total_words[c]: total number of tokens across class-c training reviews.
total_words = dict.fromkeys((0, 1), 0)

for tokens, sentiment in train_data:
    class_counts[sentiment] += 1
    # Every token adds exactly one to the class total, so add the review length at once.
    total_words[sentiment] += len(tokens)
    tally = word_counts[sentiment]
    for token in tokens:
        tally[token] += 1

In [6]:
# STEP 5: Vocabulary
# Distinct words seen in either class during training; the size is the
# Laplace-smoothing term added to each class's likelihood denominator.
vocab = {word for per_class in word_counts.values() for word in per_class}
vocab_size = len(vocab)

In [7]:
# Improved Naive Bayes prediction: returns label and probability
def predict_naive_bayes_granular(words):
    """Classify a tokenized review and estimate the probability it is positive.

    Each class is scored in log space as the log prior (the class's share of
    training documents) plus, for every token, the log of its Laplace-smoothed
    likelihood. Reads the module-level ``word_counts``, ``class_counts``,
    ``total_words`` and ``vocab_size``; none of them is modified.

    Args:
        words: Sequence of tokens (it is traversed once per class), e.g. the
            list produced by ``clean_text``.

    Returns:
        tuple: ``(label, prob_pos)``. ``label`` is 1 (positive) when the
        positive score is strictly greater, otherwise 0 (negative, including
        exact ties). ``prob_pos`` is the positive-class share after
        normalizing the two class scores, a float in [0, 1].
    """
    total_docs = sum(class_counts.values())
    scores = {}
    for label in (0, 1):
        label_counts = word_counts[label]
        # Same denominator for every token of this class, so compute it once.
        denominator = total_words[label] + vocab_size
        log_prob = math.log(class_counts[label] / total_docs)
        for word in words:
            # Use .get() rather than indexing: the per-class tables are
            # defaultdicts, and `table[word]` would silently insert every
            # unseen word, growing the trained model during prediction.
            word_freq = label_counts.get(word, 0) + 1  # Laplace smoothing
            log_prob += math.log(word_freq / denominator)
        scores[label] = log_prob

    # Two-class softmax; subtracting the max keeps exp() from underflowing
    # to 0/0 when both log scores are very negative (long reviews).
    max_log = max(scores.values())
    exp0 = math.exp(scores[0] - max_log)
    exp1 = math.exp(scores[1] - max_log)
    prob_pos = exp1 / (exp0 + exp1)

    label = 1 if scores[1] > scores[0] else 0
    return label, prob_pos

In [8]:
# STEP 6: Prediction function using Naive Bayes
def predict_naive_bayes(words):
    """Classify a tokenized review as positive (1) or negative (0).

    Each class is scored in log space as the log prior (the class's share of
    training documents) plus, for every token, the log of its Laplace-smoothed
    likelihood. Reads the module-level ``word_counts``, ``class_counts``,
    ``total_words`` and ``vocab_size``; none of them is modified.

    Args:
        words: Sequence of tokens (it is traversed once per class), e.g. the
            list produced by ``clean_text``.

    Returns:
        int: 1 when the positive score is strictly greater than the negative
        score, otherwise 0 (including exact ties).
    """
    total_docs = sum(class_counts.values())
    scores = {}
    for label in (0, 1):
        label_counts = word_counts[label]
        # Same denominator for every token of this class, so compute it once.
        denominator = total_words[label] + vocab_size
        log_prob = math.log(class_counts[label] / total_docs)
        for word in words:
            # Use .get() rather than indexing: the per-class tables are
            # defaultdicts, and `table[word]` would silently insert every
            # unseen word, growing the trained model during prediction.
            word_freq = label_counts.get(word, 0) + 1  # Laplace smoothing
            log_prob += math.log(word_freq / denominator)
        scores[label] = log_prob
    return 1 if scores[1] > scores[0] else 0

In [9]:
# Model evaluation with granular sentiment, confusion matrix, and classification report
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Score every held-out review once, keeping both the predicted label and the
# positive-class probability returned by the granular predictor.
results = [predict_naive_bayes_granular(words) for words, _ in test_data]
true_labels = [true_label for _, true_label in test_data]
pred_labels = [pred for pred, _ in results]
probs = [prob_pos for _, prob_pos in results]

# Accuracy is the fraction of positions where prediction and truth agree.
hits = np.asarray(pred_labels) == np.asarray(true_labels)
accuracy = hits.mean()
print(f"Accuracy: {accuracy:.2f}")

# Confusion matrix
cm = confusion_matrix(true_labels, pred_labels)
print("Confusion Matrix:\n", cm)

# Classification report
report = classification_report(true_labels, pred_labels, target_names=['Negative', 'Positive'])
print(report)

Accuracy: 0.77
Confusion Matrix:
 [[78 22]
 [25 75]]
              precision    recall  f1-score   support

    Negative       0.76      0.78      0.77       100
    Positive       0.77      0.75      0.76       100

    accuracy                           0.77       200
   macro avg       0.77      0.77      0.76       200
weighted avg       0.77      0.77      0.76       200



In [10]:
# STEP 7: Evaluate on test data
# Count the held-out reviews whose predicted label equals the true label.
correct = sum(
    1
    for words, true_label in test_data
    if predict_naive_bayes(words) == true_label
)

accuracy = correct / len(test_data)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.77


In [11]:
# Save model parameters for reuse
import pickle

# Persist the four values the predictors read: per-class word counts,
# per-class document counts, per-class token totals, and the vocabulary size.
with open('model_params.pkl', 'wb') as params_file:
    pickle.dump(
        (word_counts, class_counts, total_words, vocab_size),
        params_file,
    )
print("Model parameters saved to model_params.pkl")

Model parameters saved to model_params.pkl


In [12]:
import pickle

# Gather the trained counts into one tuple, in a fixed order:
# (word_counts, class_counts, total_words, vocab_size).
model_params = word_counts, class_counts, total_words, vocab_size

# Serialize the tuple to disk (binary mode, overwriting any existing file).
with open('model_params.pkl', 'wb') as out_file:
    pickle.dump(model_params, out_file)

print("✅ Model parameters saved to model_params.pkl")


✅ Model parameters saved to model_params.pkl


In [13]:
# Load model parameters from pickle
import pickle

# Read the pickled tuple back and rebind the four globals the predictors use.
# NOTE: pickle.load can run arbitrary code from the file; only load a
# model_params.pkl that this notebook (or another trusted source) produced.
with open('model_params.pkl', 'rb') as params_file:
    restored = pickle.load(params_file)
word_counts, class_counts, total_words, vocab_size = restored
print("Model parameters loaded from model_params.pkl")

Model parameters loaded from model_params.pkl
