In [1]:
from fractions import Fraction
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Read the CSV file
movieReviews = pd.read_csv('cleaned_reviews.csv', sep=',', header=None, names=['review_score', 'review_content'])

# Preprocess the data
try:
    # Convert decimal strings to percentages
    movieReviews['review_score'] = movieReviews['review_score'].apply(lambda x: Fraction(x) * 100)
except ValueError as e:
    print(f"Error converting 'review_score' to percentages: {e}")
    print("Rows with invalid 'review_score':")
    print(movieReviews[pd.to_numeric(movieReviews['review_score'], errors='coerce').isnull()])

# Tokenize and remove stopwords
stopwords_list = set(stopwords.words('english'))
movieReviews['review_content'] = movieReviews['review_content'].apply(
    lambda x: ' '.join([word.lower() for word in word_tokenize(x) if word.isalpha() and word.lower() not in stopwords_list])
)

# Drop rows with NaN values in 'review_score'
movieReviews = movieReviews.dropna(subset=['review_score'])

# Train-test split
train_data, test_data = train_test_split(movieReviews, test_size=0.1, random_state=42)

# Use TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['review_content'])
X_test = vectorizer.transform(test_data['review_content'])

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB(alpha=0.06)
clf.fit(X_train, train_data['review_score'])

# Make predictions on the test set
test_data['predicted_rating'] = clf.predict(X_test)

# Convert 'predicted_rating' to percentages
test_data['predicted_rating'] = test_data['predicted_rating'] * 100

# Evaluate the accuracy
accuracy = accuracy_score(test_data['review_score'], test_data['predicted_rating'])
print(f"\nAccuracy on the test set: {accuracy:.2%}")

# Display the first few rows of the test set with predictions
print("\nTest Data with Predicted Ratings:")
print(test_data[['review_score', 'predicted_rating', 'review_content']].head())




Error converting 'review_score' to percentages: Invalid literal for Fraction: 'review_score'
Rows with invalid 'review_score':
        review_score                                     review_content
0       review_score                                     review_content
1              3.5/5  Percy Jackson isn't a great movie, but it's a ...
2                3/5  Crammed with dragons, set-destroying fights an...
3                2/4  For what it is and for whom it is intended, it...
4                2/5  Chris Columbus returns to his comfort zone for...
...              ...                                                ...
152300           4/5  There's a magical moment in every movie of qua...
152301           4/5  One of the film's great strengths is the trust...
152302           4/5  The film feels like a riposte to Trump-ism. It...
152303           4/5  The genius of the film lies in its combination...
152304         2.5/4  Students of such things in movies should appre...

[152218 