# In[1]:
# Imports
import math
from collections import Counter
from fractions import Fraction

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Function to preprocess text
def preprocess_text(text):
    """Normalise a review into a space-separated string of content words.

    The text is tokenised with NLTK's ``word_tokenize``. A token survives
    when it is purely alphabetic and, once lower-cased, is not an English
    stop word. After the first surviving token, later tokens must also
    satisfy ``str.islower()``.

    Args:
        text: The raw review text.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in word_tokenize(text):
        # Punctuation, numbers and mixed tokens such as "n't" are discarded.
        if not token.isalpha():
            continue

        lowered = token.lower()
        if lowered in english_stops:
            continue

        # The first surviving token is always kept. Because every token is
        # already lower-cased here, the islower() test only rejects
        # alphabetic tokens that contain no cased characters at all.
        if kept and not lowered.islower():
            continue

        kept.append(lowered)

    return ' '.join(kept)

# Function to get sentiment scores
# Shared analyzer instance, created on first use. Constructing a
# SentimentIntensityAnalyzer loads the VADER lexicon, so building one per
# review (as was done before) repeats that work for every row.
_SENTIMENT_ANALYZER = None


def get_sentiment_scores(text):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: The text to score.

    Returns:
        The dictionary produced by
        ``SentimentIntensityAnalyzer.polarity_scores``; callers in this file
        read its ``'compound'`` entry.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(text)

# Read data. With header=None and explicit names, the first line of the file
# is loaded as data row 0 (presumably the CSV's own header); it is dropped
# below.
movieReviews = pd.read_csv(
    'cleaned_reviews.csv',
    sep=',',
    header=None,
    names=['review_score', 'review_content'],
)

# Discard rows whose score contains a literal '.'. A raw string keeps the
# regex escape intact instead of relying on the invalid string escape '\.'.
has_decimal_point = movieReviews['review_score'].str.contains(r'\.')
movieReviews = movieReviews[~has_decimal_point]
movieReviews = movieReviews.drop(0)

# Parse the remaining score strings into exact fractions
# (Fraction accepts strings such as '3/5').
movieReviews['review_score'] = movieReviews['review_score'].apply(Fraction)

# Convert fractions to percentages. Scale while the value is still exact so
# that scores landing on a bin edge (e.g. 3/5 -> 60) are not shifted by float
# rounding, then store as float so the column has a numeric dtype for pd.cut.
movieReviews['review_score_percentage'] = (movieReviews['review_score'] * 100).astype(float)

# Define bins and labels
bins = [0, 20, 40, 60, 80, 100]
labels = [1, 2, 3, 4, 5]

# Create a new column 'review_label'; percentages outside [0, 100] become NaN
movieReviews['review_label'] = pd.cut(
    movieReviews['review_score_percentage'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Split data
train_data, test_data = train_test_split(movieReviews, test_size=0.1, random_state=42)

# Drop NaN values in the 'review_label' column in both training and test data.
# Explicit copies make the column assignments below act on independent frames.
train_data = train_data.dropna(subset=['review_label']).copy()
test_data = test_data.dropna(subset=['review_label']).copy()

# Apply sentiment analysis to the RAW review content. VADER uses punctuation,
# capitalisation and negation words; preprocess_text strips all of these
# (negations such as "not" are stop words), so scoring must happen first.
train_data['sentiment_scores'] = train_data['review_content'].apply(get_sentiment_scores)
test_data['sentiment_scores'] = test_data['review_content'].apply(get_sentiment_scores)

# Extract sentiment features
train_data['compound'] = train_data['sentiment_scores'].apply(lambda scores: scores['compound'])
test_data['compound'] = test_data['sentiment_scores'].apply(lambda scores: scores['compound'])

# Apply the preprocessing function to the review content
train_data['review_content'] = train_data['review_content'].apply(preprocess_text)
test_data['review_content'] = test_data['review_content'].apply(preprocess_text)

# TF-IDF Vectorization with parameter tuning
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=word_tokenize, max_features=500, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_data['review_content'])
X_test = vectorizer.transform(test_data['review_content'])

# Combine sentiment features with TF-IDF features. The dense TF-IDF frame must
# carry the same index as the source rows: pd.concat(axis=1) aligns on index,
# and a default 0..n-1 index would not match the shuffled index left by
# train_test_split, producing misaligned rows filled with NaN.
X_train_sentiment = pd.concat(
    [pd.DataFrame(X_train.toarray(), index=train_data.index), train_data['compound']],
    axis=1,
)
X_test_sentiment = pd.concat(
    [pd.DataFrame(X_test.toarray(), index=test_data.index), test_data['compound']],
    axis=1,
)

# Naive Bayes classifier using the most common 40 words of each rating plus a
# discretised sentiment feature.

# Naive Bayes parameters
alpha = .06  # additive smoothing constant; must be > 0 so no probability is 0
TOP_WORD_COUNT = 40
RATINGS = range(1, 6)

# Key under which a feature mapping carries the VADER compound score.
SENTIMENT_KEY = 'compound'

# Pseudo-words representing the sentiment bucket of a review. They contain
# underscores, so they cannot collide with the purely alphabetic tokens that
# preprocess_text emits.
SENTIMENT_TOKENS = (
    '__sentiment_negative__',
    '__sentiment_neutral__',
    '__sentiment_positive__',
)


def _sentiment_token(compound):
    """Map a VADER compound score to one of SENTIMENT_TOKENS.

    Uses the conventional VADER cut-offs: >= 0.05 is positive, <= -0.05 is
    negative, anything else (including NaN) is neutral.
    """
    if compound >= 0.05:
        return '__sentiment_positive__'
    if compound <= -0.05:
        return '__sentiment_negative__'
    return '__sentiment_neutral__'


def _review_features(row):
    """Build the feature mapping for one review row: word counts + compound score."""
    features = dict(Counter(row['review_content'].split()))
    features[SENTIMENT_KEY] = row['compound']
    return features


def predict_rating_top_words_sentiment(features, top_words, parameters, n_rating, priors=None):
    """Predict the most likely rating for one review with Naive Bayes.

    Scores are accumulated in log space so long reviews cannot underflow to
    zero. Only words that actually occur in the review contribute, each
    weighted by its occurrence count, and the review's sentiment bucket is
    treated as one additional categorical feature.

    Args:
        features: Mapping (dict or pandas Series) from word to its occurrence
            count in the review. Must also contain the VADER compound score
            under the key ``'compound'``.
        top_words: Words the model may use as evidence.
        parameters: Mapping ``rating -> {feature: P(feature | rating)}``,
            covering words and the sentiment pseudo-words.
        n_rating: Base count for the smoothed fallback probability
            ``alpha / (n_rating + alpha * len(top_words))`` used when a
            feature has no entry in ``parameters[rating]``.
        priors: Optional mapping ``rating -> P(rating)``. When omitted every
            rating gets the same prior.

    Returns:
        The key of ``parameters`` with the highest posterior score.

    Raises:
        ValueError: If ``parameters`` is empty.
    """
    # Probability assigned to a feature never observed for a rating; mirrors
    # the training formula with a zero count.
    unseen = alpha / (n_rating + alpha * len(top_words))
    sentiment_token = _sentiment_token(features[SENTIMENT_KEY])

    log_scores = {}
    for rating, word_params in parameters.items():
        log_score = 0.0 if priors is None else math.log(priors[rating])

        for word in top_words:
            # The compound score shares the mapping with the word counts, so
            # its key must never be read as a word.
            if word == SENTIMENT_KEY:
                continue
            count = features.get(word, 0)
            if count:
                log_score += count * math.log(word_params.get(word, unseen))

        log_score += math.log(word_params.get(sentiment_token, unseen))
        log_scores[rating] = log_score

    # Only real ratings are candidates; the sentiment score itself is never
    # a possible prediction.
    return max(log_scores, key=log_scores.get)


# Gather per-rating training statistics in a single pass over the ratings
word_counts_per_rating = {}
sentiment_counts_per_rating = {}
docs_per_rating = {}
top_words_per_rating = {}
for rating in RATINGS:
    rating_rows = train_data[train_data['review_label'] == rating]

    # Count occurrences of each word in the entire column
    word_counts = Counter(' '.join(rating_rows['review_content']).split())
    word_counts_per_rating[rating] = word_counts

    # Get the most common 40 words
    top_words_per_rating[rating] = [word for word, _ in word_counts.most_common(TOP_WORD_COUNT)]

    sentiment_counts_per_rating[rating] = Counter(
        _sentiment_token(score) for score in rating_rows['compound']
    )
    docs_per_rating[rating] = len(rating_rows)

# Every rating is scored over the same vocabulary: the union of all per-rating
# top words. Choosing the word list from a review's own label would leak the
# answer into the prediction.
vocabulary = sorted(
    {word for words in top_words_per_rating.values() for word in words} - {SENTIMENT_KEY}
)

# Initiate parameters
parameters_per_rating = {rating: Counter() for rating in RATINGS}
priors_per_rating = {}

n_train = len(train_data)
for rating in RATINGS:
    word_counts = word_counts_per_rating[rating]
    n_docs = docs_per_rating[rating]

    # Smoothed multinomial word probabilities: the denominator is the number of
    # vocabulary-word occurrences in this rating, so they sum to 1.
    n_vocab_tokens = sum(word_counts[word] for word in vocabulary)
    word_denominator = n_vocab_tokens + alpha * len(vocabulary)
    for word in vocabulary:
        parameters_per_rating[rating][word] = (word_counts[word] + alpha) / word_denominator

    # Smoothed probability of each sentiment bucket given the rating
    sentiment_denominator = n_docs + alpha * len(SENTIMENT_TOKENS)
    for token in SENTIMENT_TOKENS:
        parameters_per_rating[rating][token] = (
            sentiment_counts_per_rating[rating][token] + alpha
        ) / sentiment_denominator

    # Smoothed class prior
    priors_per_rating[rating] = (n_docs + alpha) / (n_train + alpha * len(RATINGS))

# Predict ratings on the test set using the most common 40 words and sentiment features
test_data['predicted_rating_top_words_sentiment'] = test_data.apply(
    lambda row: predict_rating_top_words_sentiment(
        _review_features(row),
        vocabulary,
        parameters_per_rating,
        n_train,
        priors=priors_per_rating,
    ),
    axis=1,
)

# Evaluate the accuracy (labels are categorical; compare as plain integers)
accuracy_top_words_sentiment = accuracy_score(
    test_data['review_label'].astype(int),
    test_data['predicted_rating_top_words_sentiment'],
)
print(f"\nAccuracy on the test set using top 40 words and sentiment: {accuracy_top_words_sentiment:.2%}")

# Display the first few rows of the test set with predictions using top words and sentiment
print("\nTest Data with Predicted Ratings using Top Words and Sentiment:")
print(test_data[['review_label', 'predicted_rating_top_words_sentiment', 'review_content', 'compound']].head())







# Cell output: KeyboardInterrupt