In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np


reviews = pd.read_csv('/content/drive/MyDrive/rt_reviews.csv',encoding='latin-1')

reviews = reviews.sample(frac=1, random_state=42).reset_index(drop=True)

# Dividing the dataset
train_size = int(0.5 * len(reviews))
dev_size = int(0.25 * len(reviews))
train_data = reviews[:train_size]
dev_data = reviews[train_size:train_size+dev_size]
test_data = reviews[train_size+dev_size:]

vocab = {}
for review in train_data['Review']:
    words = review.lower().split()
    for word in words:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

# Remove words
vocab = {k:v for k,v in vocab.items() if v >= 5}

vocab_list = sorted(list(vocab.keys()))
vocab_size = len(vocab_list)
reverse_vocab = {word:i for i, word in enumerate(vocab_list)}


occurrence_prob = {}
for word, count in vocab.items():
    occurrence_prob[word] = count / len(train_data)

positive_data = train_data[train_data['Freshness'] == 'fresh']
negative_data = train_data[train_data['Freshness'] == 'rotten']
positive_reviews = ' '.join(positive_data['Review']).lower()
negative_reviews = ' '.join(negative_data['Review']).lower()

cond_prob_positive = {}
cond_prob_negative = {}
for word in vocab_list:
    count_positive = positive_reviews.count(word)
    count_negative = negative_reviews.count(word)
    cond_prob_positive[word] = (count_positive + 1) / (len(positive_data) + vocab_size)
    cond_prob_negative[word] = (count_negative + 1) / (len(negative_data) + vocab_size)

# accuracy
def predict(review):
    words = review.lower().split()
    positive_score = 0
    negative_score = 0
    for word in words:
        if word in vocab:
            positive_score += np.log(cond_prob_positive[word])
            negative_score += np.log(cond_prob_negative[word])
    positive_score += np.log(len(positive_data) / len(train_data))
    negative_score += np.log(len(negative_data) / len(train_data))
    if positive_score >= negative_score:
        return 'fresh'
    else:
        return 'rotten'

correct = 0
total = len(dev_data)
for _, row in dev_data.iterrows():
    predicted = predict(row['Review'])
    if predicted == row['Freshness']:
        correct += 1

accuracy = correct / total
print(f"Accuracy on dev dataset: {accuracy:.4f}")


Accuracy on dev dataset: 0.7884
