In [20]:
#imports
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from fractions import Fraction
from io import StringIO



In [21]:
movieReviews = pd.read_csv('cleaned_reviews.csv', sep = ',', header = None, names = ['review_score', 'review_content'])
movieReviews = movieReviews[~movieReviews['review_score'].str.contains('\.')]
print(movieReviews['review_content'].isnull().sum())
movieReviews = movieReviews.drop(0)
print (movieReviews.shape)
movieReviews.head()



0
(109542, 2)


Unnamed: 0,review_score,review_content
2,3/5,"Crammed with dragons, set-destroying fights an..."
3,2/4,"For what it is and for whom it is intended, it..."
4,2/5,Chris Columbus returns to his comfort zone for...
5,2/5,Although the standard allegorical bases for my...
6,3/5,You don't even have to be familiar with the fi...


In [22]:
# turn fractions into percentages

# Convert decimal strings to fractions
movieReviews['review_score'] = movieReviews['review_score'].apply(lambda x: Fraction(x))

# Convert fractions to percentages
movieReviews['review_score_percentage'] = movieReviews['review_score'] * 100

movieReviews.head()

Unnamed: 0,review_score,review_content,review_score_percentage
2,3/5,"Crammed with dragons, set-destroying fights an...",60
3,1/2,"For what it is and for whom it is intended, it...",50
4,2/5,Chris Columbus returns to his comfort zone for...,40
5,2/5,Although the standard allegorical bases for my...,40
6,3/5,You don't even have to be familiar with the fi...,60


In [23]:
# Define bins and labels
bins = [0, 19, 39, 59, 79, 100]
labels = [1, 2, 3, 4, 5]

# Create a new column 'review_label' based on the specified ranges
movieReviews['review_label'] = pd.cut(movieReviews['review_score_percentage'], bins=bins, labels=labels, include_lowest=True)


movieReviews.head()

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sydneychapman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
#finding the top 50 words for each rating

# Function to preprocess text (tokenization, lowercase, and removing stopwords)
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    return words

train_data, test_data = train_test_split(movieReviews, test_size=0.1, random_state=42)

# Display label distribution in training and test sets
print("Training Data Label Distribution:")
print(train_data['review_label'].value_counts(normalize=True))

print("\nTest Data Label Distribution:")
print(test_data['review_label'].value_counts(normalize=True))

# Get rid of punctuation and convert text to lowercase
def preprocess_text(text):
    text = text.str.replace('\W', ' ')  # Removes punctuation
    text = text.str.lower()  # Converts text to lowercase
    return text

train_data['review_content'] = preprocess_text(train_data['review_content'])
test_data['review_content'] = preprocess_text(test_data['review_content'])

# Tokenize and build vocabulary
all_words = ' '.join(train_data['review_content'])
all_words = word_tokenize(all_words)
vocabulary = set(all_words)


Training Data Label Distribution:
review_label
4    0.394028
3    0.289705
5    0.207651
2    0.098588
1    0.010028
Name: proportion, dtype: float64

Test Data Label Distribution:
review_label
4    0.388732
3    0.292119
5    0.210300
2    0.099626
1    0.009223
Name: proportion, dtype: float64


In [25]:
from collections import Counter
#Naive Bayes implementation

# Naive Bayes parameters
alpha = .06

# Initiate parameters
parameters_per_rating = {rating: Counter() for rating in range(1, 6)}

# Calculate parameters
for rating in range(1, 6):
    rating_data = train_data[train_data['review_label'] == rating]['review_content']
    n_rating = len(rating_data)
    
    # Count occurrences of each word in the entire column
    word_counts = Counter(' '.join(rating_data).split())
    
    for word in vocabulary:
        n_word_given_rating = word_counts[word]
        p_word_given_rating = (n_word_given_rating + alpha) / (n_rating + alpha * len(vocabulary))
        parameters_per_rating[rating][word] = p_word_given_rating

In [26]:
#classifier 
# Naive Bayes classifier
def predict_rating(review, parameters, n_rating):
    words = word_tokenize(review)
    
    # Initialize probabilities with prior probabilities
    probabilities = {rating: 1 for rating in parameters.keys()}
    
    for word in words:
        for rating, word_params in parameters.items():
            p_word_given_rating = word_params.get(word, 1 / (n_rating + alpha * len(vocabulary)))
            probabilities[rating] *= p_word_given_rating
    
    # Choose the rating with the highest probability
    predicted_rating = max(probabilities, key=probabilities.get)
    return predicted_rating

# Predict ratings on the test set
test_data['predicted_rating'] = test_data['review_content'].apply(lambda x: predict_rating(x, parameters_per_rating, len(train_data)))

# Remove rows with NaN values in 'review_label'
test_data = test_data.dropna(subset=['review_label', 'predicted_rating'])

# Evaluate the accuracy
accuracy = accuracy_score(test_data['review_label'], test_data['predicted_rating'])
print(f"\nAccuracy on the test set: {accuracy:.2%}")

# Display the first few rows of the test set with predictions
print("\nTest Data with Predicted Ratings:")
print(test_data[['review_label', 'predicted_rating', 'review_content']].head())



Accuracy on the test set: 52.55%

Test Data with Predicted Ratings:
       review_label  predicted_rating  \
95678             1                 3   
12419             4                 3   
111736            4                 4   
144623            4                 3   
105747            4                 3   

                                           review_content  
95678   this spaceship misfire manages to be overblown...  
12419   call it quirky or observational or shaggy, but...  
111736  there will never be another marilyn monroe or ...  
144623  is it a pointed cultural take or just a gleefu...  
105747  the twist itself particularly original. the wa...  


In [27]:
#testing a classifier
import re

def classify_movie_review_rating(review):
    '''
    review: a string
    '''
    review = re.sub('\W', ' ', review)
    review = review.lower().split()

    probabilities = {rating: 1 for rating in range(1, 6)}  # Assuming ratings are 1 to 5

    for word in review:
        for rating, word_params in parameters_per_rating.items():
            p_word_given_rating = word_params.get(word, alpha / (n_rating + alpha * len(vocabulary)))
            probabilities[rating] *= p_word_given_rating

    # Choose the rating with the highest probability
    predicted_rating = max(probabilities, key=probabilities.get)
    
    return predicted_rating

# Example usage with a movie review
example_review = "This movie was amazing! I loved it."
predicted_rating = classify_movie_review_rating(example_review)
print('Predicted Rating:', predicted_rating)

Predicted Rating: 4
