In [19]:
import pandas as pd
from collections import Counter
import nltk
nltk.download('punkt')  # Download NLTK resources
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from fractions import Fraction
import re
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ayushkhanal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayushkhanal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayushkhanal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Percentage bin edges and the 1-5 label attached to each bin.
# With include_lowest=True the first bin also contains its left edge (0).
bins = [0, 19, 39, 59, 79, 100]
labels = [1, 2, 3, 4, 5]

# Load the two-column CSV; header=None means every line of the file is read as data.
movieReviews = pd.read_csv(
    'cleaned_reviews.csv',
    sep=',',
    header=None,
    names=['review_score', 'review_content'],
)

# Keep only rows whose score string contains no '/', then remove the row with index label 0
# (presumably the file's own header line, read in as data because header=None -- verify).
# NOTE(review): the label distributions printed further down suggest that very few rows
# survive this filter. Confirm that excluding (rather than keeping) 'a/b'-style scores is
# intended, given that the next step parses the scores as fractions.
score_has_slash = movieReviews['review_score'].str.contains('/')
movieReviews = movieReviews[~score_has_slash].drop(0)

# Parse each score string into an exact Fraction and rescale it by 100.
movieReviews['review_score'] = movieReviews['review_score'].apply(Fraction)
movieReviews['review_score_percentage'] = movieReviews['review_score'] * 100

# Bucket the percentage into the 1-5 labels defined above.
movieReviews['review_label'] = pd.cut(
    movieReviews['review_score_percentage'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

def preprocess_text_and_tokenize(text):
    """Clean review text one review at a time.

    Each review is lowercased, has every non-word character replaced by a
    space, is tokenized with ``word_tokenize`` and lemmatized with
    ``WordNetLemmatizer``; tokens that are not purely alphabetic or that are
    English stop words are dropped, and the survivors are joined with single
    spaces.

    The result is produced per review. Collapsing all reviews into one string
    (and then assigning that string to a DataFrame column) would give every
    row identical text, so per-rating word counts could no longer differ by
    anything except the number of rows.

    Parameters
    ----------
    text : pandas.Series of str, or str
        The review(s) to clean.

    Returns
    -------
    pandas.Series of str, or str
        For a Series: a Series with the same index holding one cleaned string
        per review. For a single string: the cleaned string.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def _clean_review(sentence):
        # Replace punctuation/symbols with spaces and lowercase before tokenizing.
        sentence = re.sub(r'\W', ' ', sentence).lower()
        lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(sentence))
        # The stop-word check is applied to the lemma, not the raw token.
        return ' '.join(
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in stop_words
        )

    if isinstance(text, str):
        return _clean_review(text)
    return text.apply(_clean_review)




# Hold out 10% of the reviews for evaluation; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(movieReviews, test_size=0.1, random_state=42)

# Report the share of each label in both partitions.
for heading, partition in (
    ("Training Data Label Distribution:", train_data),
    ("\nTest Data Label Distribution:", test_data),
):
    print(heading)
    print(partition['review_label'].value_counts(normalize=True))

# Replace the raw review text with its preprocessed form, training partition first.
for partition in (train_data, test_data):
    partition['review_content'] = preprocess_text_and_tokenize(partition['review_content'])

# The vocabulary is the set of distinct tokens found in the preprocessed training text.
all_words = ' '.join(train_data['review_content'])
vocabulary = set(word_tokenize(all_words))


Training Data Label Distribution:
review_label
5    0.647059
1    0.352941
2    0.000000
3    0.000000
4    0.000000
Name: proportion, dtype: float64

Test Data Label Distribution:
review_label
5    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: proportion, dtype: float64


In [21]:

# Smoothing constant added to every word count.
# Note: although the names in this notebook say "Logistic Regression", the quantity
# estimated below is an additively smoothed per-rating word likelihood, i.e. the
# parameters of a multinomial Naive Bayes style model.
alpha = 0.06

# One word -> likelihood table per rating (1..5).
lr_parameters_per_rating = {rating: Counter() for rating in range(1, 6)}

for rating in range(1, 6):
    rating_data = train_data[train_data['review_label'] == rating]['review_content']
    # Number of reviews carrying this rating (not used in the likelihood below).
    n_rating = len(rating_data)

    # Occurrences of every token across all reviews with this rating.
    word_counts = Counter(' '.join(rating_data).split())

    # The smoothed likelihood must be normalised by the number of word tokens seen
    # for the rating, not by the number of reviews. Dividing by the review count
    # yields values that are not probabilities and can exceed 1 for frequent words.
    n_tokens_in_rating = sum(word_counts.values())
    smoothed_total = n_tokens_in_rating + alpha * len(vocabulary)

    for word in vocabulary:
        n_word_given_rating = word_counts[word]
        p_word_given_rating = (n_word_given_rating + alpha) / smoothed_total
        lr_parameters_per_rating[rating][word] = p_word_given_rating


# Logistic Regression classifier
import numpy as np

# Logistic Regression classifier
def lr_predict_rating(review, parameters, n_rating, alpha, vocabulary, lemmatizer, stop_words):
    """Return the rating whose summed log word-likelihood is highest for ``review``.

    Parameters
    ----------
    review : str
        Review text; it is tokenized with ``word_tokenize``.
    parameters : dict
        Maps each rating to a mapping of word -> likelihood.
    n_rating, alpha, vocabulary
        Used only to build the fallback likelihood
        ``alpha / (n_rating + alpha * len(vocabulary))`` for words that a
        rating's table has no entry for.
    lemmatizer
        Object with a ``lemmatize(word)`` method.
    stop_words
        Container of lowercase words to ignore.

    Returns
    -------
    The key of ``parameters`` with the largest score; on ties, the first such
    key in the dictionary's iteration order.
    """
    # Keep alphabetic, non-stop-word tokens and reduce them to lowercase lemmas.
    tokens = []
    for raw_token in word_tokenize(review):
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(lowered))

    # Every rating starts from log(1) == 0, i.e. no rating is favoured up front.
    scores = {}
    for rating in parameters.keys():
        scores[rating] = np.log(1)

    # Accumulate the log-likelihood of each token under each rating.
    for rating, word_params in parameters.items():
        for token in tokens:
            likelihood = word_params.get(token, alpha / (n_rating + alpha * len(vocabulary)))
            scores[rating] += np.log(likelihood)

    best_rating = max(scores, key=scores.get)
    return best_rating





# Helpers handed to the predictor for every test review.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def _rate_review(review_text):
    """Predict the rating of one test review using the notebook-level model state."""
    return lr_predict_rating(
        review_text,
        lr_parameters_per_rating,
        len(train_data),
        alpha,
        vocabulary,
        lemmatizer,
        stop_words,
    )


test_data['lr_predicted_rating'] = test_data['review_content'].apply(_rate_review)

# Rows lacking either the true label or the prediction cannot be scored, so drop them.
test_data = test_data.dropna(subset=['review_label', 'lr_predicted_rating'])

# Share of test reviews whose predicted rating equals the true label.
true_labels = test_data['review_label']
predicted_labels = test_data['lr_predicted_rating']
lr_accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nLogistic Regression Accuracy on the test set: {lr_accuracy:.2%}")

# Show a handful of test rows next to their predictions.
preview_columns = ['review_label', 'lr_predicted_rating', 'review_content']
print("\nTest Data with Predicted Ratings:")
print(test_data[preview_columns].head())





Logistic Regression Accuracy on the test set: 100.00%

Test Data with Predicted Ratings:
      review_label  lr_predicted_rating  \
57225            5                    5   

                                          review_content  
57225  thriller sobering enough graphic portrayal for...  


In [22]:

def classify_movie_review_rating(review, n_rating, alpha, vocabulary, lr_parameters_per_rating):
    """Pick the most likely rating for a raw review string.

    The review is stripped of non-word characters, lowercased and split on
    whitespace. Each rating is then scored by the sum of the log-likelihoods
    of the review's words. Scores are summed in log space because multiplying
    many small probabilities underflows to 0.0 on longer reviews, which makes
    every rating tie.

    Parameters
    ----------
    review : str
        Raw review text.
    n_rating, alpha, vocabulary
        Used only to build the fallback likelihood
        ``alpha / (n_rating + alpha * len(vocabulary))`` for words that a
        rating's table has no entry for.
    lr_parameters_per_rating : dict
        Maps each rating to a mapping of word -> likelihood.

    Returns
    -------
    The rating with the highest score; on ties, the first such rating in the
    dictionary's iteration order. If the dictionary is empty, ratings 1..5
    are assumed and 1 is returned.
    """
    import math  # local import so the function does not rely on another cell

    words = re.sub(r'\W', ' ', review).lower().split()

    # Score only the ratings that actually have parameters; a rating without a
    # table would otherwise keep its neutral starting score and always win.
    ratings = list(lr_parameters_per_rating) or list(range(1, 6))
    log_scores = {rating: 0.0 for rating in ratings}

    if words:
        unseen_probability = alpha / (n_rating + alpha * len(vocabulary))
        for rating, word_params in lr_parameters_per_rating.items():
            total = 0.0
            for word in words:
                p_word_given_rating = word_params.get(word, unseen_probability)
                # A zero likelihood rules the rating out instead of raising in log().
                total += math.log(p_word_given_rating) if p_word_given_rating > 0 else float('-inf')
            log_scores[rating] = total

    # Choose the rating with the highest log score
    predicted_rating = max(log_scores, key=log_scores.get)

    return predicted_rating

# Try the classifier on a few hand-written reviews and print each prediction.
for example_review in (
    "This movie was amazing! I loved it.",
    "This movie sucks",
    "This movie was ok",
):
    lr_predicted_rating = classify_movie_review_rating(
        example_review,
        len(train_data),
        alpha,
        vocabulary,
        lr_parameters_per_rating,
    )
    print('Logistic Regression Predicted Rating:', lr_predicted_rating)


Logistic Regression Predicted Rating: 5
Logistic Regression Predicted Rating: 5
Logistic Regression Predicted Rating: 5
