In [None]:
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

import pandas as pd

# Load the tab-separated review file (no header row): first column is the score,
# second column is the review text.
# NOTE(review): the path is an empty string, so read_csv cannot succeed until the
# real dataset location is filled in — TODO confirm where the file lives.
movieReviews = pd.read_csv('', sep = '\t', header = None, names = ['review_score', 'review_content'])
print (movieReviews.shape)
movieReviews.head()

# turn fractions into percentages
# SECURITY(review): eval() executes whatever Python expression each score cell
# contains. That is only acceptable for a trusted file; for untrusted data,
# parse the numerator/denominator explicitly instead.
movieReviews['review_score'] = movieReviews['review_score'].apply(lambda x: eval(x))
movieReviews['review_score_percentage'] = movieReviews['review_score'] * 100

bins = [0, 20, 40, 60, 80, 100]
# pd.cut requires exactly one label per interval, i.e. len(bins) - 1 labels.
# Six edges define five intervals, so the labels are derived from `bins`
# (0..4) rather than hard-coded; a six-entry label list makes pd.cut raise
# ValueError.
labels = list(range(len(bins) - 1))

# Create a new column 'review_label' based on the specified ranges.
# include_lowest=True puts a score of exactly 0 into the first bucket; values
# outside [0, 100] (or missing) receive a NaN label.
movieReviews['review_label'] = pd.cut(movieReviews['review_score_percentage'], bins=bins, labels=labels, include_lowest=True)



In [None]:
# Find the top 50 words for each rating

# Function to preprocess text (tokenization, lowercase, and removing stopwords)
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a set, building it only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = set(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Tokenize one review and return its lowercase, non-stopword, alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Tokens from ``word_tokenize(text)`` that are purely alphabetic and not
        English stopwords, lower-cased, in their original order.
    """
    # The stopword set is identical for every review, so it is fetched from a
    # cache instead of being rebuilt on each call (this function runs once per
    # row via DataFrame.apply).
    stop_words = _english_stop_words()

    kept = []
    for token in word_tokenize(text):
        # isalpha() is checked on the token as tokenized, before lower-casing.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

# Create a new column with preprocessed text.
# NOTE: this needs the tokenizing preprocess_text (str -> list of words); a later
# cell rebinds the same name to a Series-based cleaner, so run this cell first.
movieReviews['processed_text'] = movieReviews['review_content'].apply(preprocess_text)

# Create a Counter for each score
score_counters = {}

# Iterate over the candidate score labels
for score in range(6):
    # Token lists of every review carrying the current label
    token_lists = movieReviews.loc[movieReviews['review_label'] == score, 'processed_text']

    # Feed each review's tokens into the Counter one at a time. Summing the
    # Series of lists would concatenate them quadratically and would return the
    # integer 0 for a label with no reviews, which Counter() cannot consume.
    counter = Counter()
    for tokens in token_lists:
        counter.update(tokens)

    # Store the Counter in the dictionary (empty Counter if the label is unused)
    score_counters[score] = counter

# Get the top 50 words for each score
top_50_words_per_score = {score: counter.most_common(50) for score, counter in score_counters.items()}

# Display the top 50 words for each score
for score, top_words in top_50_words_per_score.items():
    print(f'Top 50 words for score {score}:')
    print(top_words)
    print('\n')

In [None]:
# 80/10/10 split: row offsets at which the validation and test partitions start
firstSplit = round(0.8 * len(movieReviews))
secondSplit = firstSplit + round(0.10 * len(movieReviews))

# Cut the frame in its current row order (no shuffling) into three consecutive
# pieces, each re-indexed from zero.
trainingData, validationData, testData = (
    movieReviews[start:stop].reset_index(drop=True)
    for start, stop in ((None, firstSplit), (firstSplit, secondSplit), (secondSplit, None))
)

# Sizes of the three partitions, one per line
print(trainingData.shape, validationData.shape, testData.shape, sep='\n')

# Relative label frequencies in the training and validation partitions
print(
    "Training Data Label Distribution:",
    trainingData['review_label'].value_counts(normalize=True),
    sep='\n',
)

print(
    "\nValidation Data Label Distribution:",
    validationData['review_label'].value_counts(normalize=True),
    sep='\n',
)

# Get rid of punctuation and convert text to lowercase
def preprocess_text(text):
    """Replace non-word characters with spaces and lower-case a Series of strings.

    NOTE(review): this rebinds the name ``preprocess_text``, which an earlier
    cell defines as a per-string tokenizer; after this cell runs, the earlier
    definition is no longer reachable under that name.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (accessed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        Series of the same length where every character matching ``\\W``
        (anything other than a letter, digit or underscore) is replaced by a
        single space and the result is lower-cased.
    """
    # regex=True is required: since pandas 2.0 str.replace treats the pattern as
    # a literal string by default, which would leave punctuation untouched.
    # The raw string avoids the invalid '\W' escape sequence.
    text = text.str.replace(r'\W', ' ', regex=True)  # Removes punctuation
    text = text.str.lower()  # Converts text to lowercase
    return text

# Clean the review text of every partition with the same routine,
# in the order training -> validation -> test.
for split in (trainingData, validationData, testData):
    split['review_content'] = preprocess_text(split['review_content'])

# Show the first three rows of the cleaned training partition
trainingData.head(3)