In [77]:


import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/rotten-tomatoes-reviews/rt_reviews.csv


In [78]:
#Load the dataset from kaggle
#default character encoding of utf-8 failed to decode the file, so used encoding='ISO-8859-1'
df = pd.read_csv("/kaggle/input/rotten-tomatoes-reviews/rt_reviews.csv", encoding='ISO-8859-1')

# Splitting the dataset into train,development,and test
train, dev, test = np.split(df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))])


In [79]:
df.head(10)

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...
5,rotten,"Gleeson goes the Hallmark Channel route, dama..."
6,fresh,It was the height of satire in 1976: dark as ...
7,rotten,"Everyone in ""The Comedian"" deserves a better ..."
8,rotten,Actor encourages grumpy Christians to embrace...
9,fresh,"Slight, contained, but ineffably soulful."


In [80]:
# Build a vocabulary as list
vocab = {}
for reviews in train['Review']:
    words = reviews.lower().split()
    for word in words:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

In [81]:
# Remove rare words from vocab if occurring is less than 5 times
vocab_filter = {k:v for k,v in vocab.items() if v>=5}

In [82]:
# Generate reverse index where indices are the integers from 0 to the number of words in the vocabulary minus 1
reverse_index = {word: i for i, word in enumerate(vocab_filter)}


In [83]:
# Calculates the number of documents in the training set
num_of_documents = len(train)
# Calculate the number of documents labeled as "fresh", and the number labeled as "rotten"
num_of_fresh = len(train[train['Freshness']=='fresh'])
num_of_rotten = len(train[train['Freshness']=='rotten'])
#Calculate the prior probabilities of a document fresh / rotten
pb_fresh = num_of_fresh / num_of_documents
pb_rotten = num_of_rotten / num_of_documents
#Count how many times each word appears in the "fresh" and "rotten" documents.
word_cnt_fresh = np.zeros(len(vocab_filter))
word_cnt_rotten = np.zeros(len(vocab_filter))
#Count the number of times word "fresh" repeats in document in train data
for reviews in train[train['Freshness']=='fresh']['Review']:
    words = reviews.lower().split()
    for word in words:
        if word in vocab_filter:
            word_cnt_fresh[reverse_index[word]] += 1
#Count the number of times word "rotten" repeats in document in train data            
for reviews in train[train['Freshness']=='rotten']['Review']:
    words = reviews.lower().split()
    for word in words:
        if word in vocab_filter:
            word_cnt_rotten[reverse_index[word]] += 1
#Perform laplace smoothing
pb_of_word_fresh = (word_cnt_fresh + 1) / (np.sum(word_cnt_fresh) + len(vocab_filter))
pb_of_word_rotten = (word_cnt_rotten + 1) / (np.sum(word_cnt_rotten) + len(vocab_filter))

In [84]:
#calculates the accuracy on the development dataset (dev) using the probability estimates computed earlier
import math
correct = 0
total = len(dev)

#calculates the probability that a review will be labeled as "fresh" or "rotten" by  looping over each row in the development dataset

for i, row in dev.iterrows():
    fresh_prob = math.log(pb_fresh)
    rotten_prob = math.log(pb_rotten)
#determine which class has the larger probability by comparing these probabilities

    words = row['Review'].lower().split()
    for word in words:
        if word in vocab_filter:
            fresh_prob += math.log(pb_of_word_fresh[reverse_index[word]])
            rotten_prob += math.log(pb_of_word_rotten[reverse_index[word]])

    if fresh_prob > rotten_prob:
        prediction = 'fresh'
    else:
        prediction = 'rotten'

    if prediction == row['Freshness']:
        correct += 1

accuracy = correct / total
print("Accuracy:" +str(accuracy))


Accuracy:0.7966666666666666


In [85]:
# Comparing different smoothing factors on development (dev) dataset
smoothing_factors = [0, 0.5, 1, 5]

for sf in smoothing_factors:
    word_cnt_fresh = np.zeros(len(vocab_filter))
    word_cnt_rotten = np.zeros(len(vocab_filter))

    for reviews in train[train['Freshness']=='fresh']['Review']:
        words = reviews.lower().split()
        for word in words:
            if word in vocab_filter:
                word_cnt_fresh[reverse_index[word]] += 1

    for reviews in train[train['Freshness']=='rotten']['Review']:
        words = reviews.lower().split()
        for word in words:
            if word in vocab_filter:
                word_cnt_rotten[reverse_index[word]] += 1

    pb_of_word_fresh = (word_cnt_fresh + sf) / (np.sum(word_cnt_fresh) + sf*len(vocab_filter))
    pb_of_word_rotten = (word_cnt_rotten + sf) / (np.sum(word_cnt_rotten) + sf*len(vocab_filter))

    correct = 0
    for i in range(len(dev)):
        fresh_prob = np.log(pb_fresh)
        rotten_prob = np.log(pb_rotten)
        review_words = dev.iloc[i]['Review'].lower().split()
        for word in review_words:
            if word in vocab_filter:
                fresh_prob += np.log(pb_of_word_fresh[reverse_index[word]])
                rotten_prob += np.log(pb_of_word_rotten[reverse_index[word]])
        predicted_freshness = 'fresh' if fresh_prob > rotten_prob else 'rotten'
        if predicted_freshness == dev.iloc[i]['Freshness']:
            correct += 1
    accuracy = correct / len(dev)
    print("Smoothing factor:", sf)
    print("Accuracy:", accuracy)



Smoothing factor: 0
Accuracy: 0.79315625
Smoothing factor: 0.5
Accuracy: 0.7961458333333333
Smoothing factor: 1
Accuracy: 0.7966666666666666
Smoothing factor: 5
Accuracy: 0.79471875


In [86]:
# Derive Top 10 words that predict each class fresh and rotten
#Only powerful words with a length of at least 10 are taken into consideration because the document contains a lot of very short, repetitive words with less significance.
top_words_fresh = sorted([(word, pb_of_word_fresh[reverse_index[word]]) for word in vocab_filter.keys() if len(word) >= 10], key=lambda x: x[1], reverse=True)[:10]
top_words_rotten = sorted([(word, pb_of_word_rotten[reverse_index[word]]) for word in vocab_filter.keys() if len(word) >= 10], key=lambda x: x[1], reverse=True)[:10]

print("Top 10 words that predict fresh reviews are:")
for word, p in top_words_fresh:
    print(f"Word: {word}, P(fresh|{word}): {p:.8f}")

print("Top 10 words that predict rotten reviews are:")
for word, p in top_words_rotten:
    print(f"Word: {word}, P(rotten|{word}): {p:.8f}")

Top 10 words that predict fresh reviews are:
Word: performance, P(fresh|performance): 0.00082815
Word: characters, P(fresh|characters): 0.00074779
Word: performances, P(fresh|performances): 0.00074567
Word: entertaining, P(fresh|entertaining): 0.00069854
Word: documentary, P(fresh|documentary): 0.00061726
Word: compelling, P(fresh|compelling): 0.00034655
Word: fascinating, P(fresh|fascinating): 0.00033356
Word: everything, P(fresh|everything): 0.00031301
Word: beautifully, P(fresh|beautifully): 0.00030818
Word: experience, P(fresh|experience): 0.00028431
Top 10 words that predict rotten reviews are:
Word: characters, P(rotten|characters): 0.00101604
Word: ultimately, P(rotten|ultimately): 0.00054700
Word: interesting, P(rotten|interesting): 0.00045887
Word: everything, P(rotten|everything): 0.00040863
Word: performance, P(rotten|performance): 0.00036518
Word: performances, P(rotten|performances): 0.00031279
Word: unfortunately,, P(rotten|unfortunately,): 0.00031279
Word: predictable, P

In [87]:
#From hyperparameter tuning in development (dev) dataset, smoothing_factor is considered optimal hyperparameter and optimal value for it is found to be 1
smoothing_factor = 1
correct = 0
total = len(test)

for i, row in test.iterrows():
    fresh_prob = math.log(pb_fresh)
    rotten_prob = math.log(pb_rotten)

    words = row['Review'].lower().split()
    
    #index = reverse_index.get(word, -1) is  used to find the word's index in the arrays. The word is skipped if it is not in reverse_index and index is set to -1.
    for word in words:
        if word in vocab_filter:
            index = reverse_index.get(word, -1)
            if index != -1:
                fresh_prob += math.log((word_cnt_fresh[index] + smoothing_factor) / (np.sum(word_cnt_fresh) + smoothing_factor * len(vocab_filter)))
                rotten_prob += math.log((word_cnt_rotten[index] + smoothing_factor) / (np.sum(word_cnt_rotten) + smoothing_factor * len(vocab_filter)))

    if fresh_prob > rotten_prob:
        prediction = 'fresh'
    else:
        prediction = 'rotten'

    if prediction == row['Freshness']:
        correct += 1

accuracy = correct / total
print("Final accuracy on test dataset:", accuracy)






Final accuracy on test dataset: 0.7992604166666667
