This project prompts the user for their thoughts on a movie they have seen, then estimates the rating that person would give the movie out of 5, with 5 being the best and 1 being the worst.

In [2]:
# Import the packages used throughout the notebook and load the critic-review CSV into a DataFrame
# CSV from https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset/
# The path below is relative, so the CSV has to sit in the notebook's working directory.
import pandas as pd
import math
import numpy as np
df = pd.read_csv('rotten_tomatoes_critic_reviews.csv')


In [3]:
#Separating necessary data from this project
#Converting letter grades to numerical grades

# Letter grade -> "x/5" fraction string. Within each letter the steps are
# plus = n, plain = n - 0.5, minus = n - 0.75, so the scale decreases strictly
# from 'A+' down to 'F-'. ('A-' used to map to 4.75/5, which ranked it above 'A'.)
# The spaced variants ('A  -', 'C  -') are typos that occur in the dataset.
LETTER_GRADE_TO_FRACTION = {
    'A+': '5/5',
    'A': '4.5/5',
    'A-': '4.25/5',
    'A  -': '4.25/5',
    'B+': '4/5',
    'B': '3.5/5',
    'B-': '3.25/5',
    'C+': '3/5',
    'C': '2.5/5',
    'C-': '2.25/5',
    'C  -': '2.25/5',
    'D+': '2/5',
    'D': '1.5/5',
    'D-': '1.25/5',
    'F+': '1/5',
    'F': '0.5/5',
    'F-': '0.25/5',
}

reviews = []
ratings = []
# Fields are read by position: index 5 is the score and index 7 is the review text
# (presumably the `review_score` / `review_content` columns -- confirm against the CSV header).
for review in df.itertuples(index=False):
    score, text = review[5], review[7]
    # Keep only rows where both fields are strings (this skips e.g. missing values).
    if isinstance(score, str) and isinstance(text, str):
        reviews.append(text)
        # Scores that are not letter grades (fractions, bare numbers) pass through unchanged.
        ratings.append(LETTER_GRADE_TO_FRACTION.get(score, score))

In [4]:
#Converting strings and fractions to floats for the LSTM
#Taking in to account errors and some typos that were in the dataset

def _rating_to_float(raw):
    """Convert one rating string to a float.

    "a/b" strings become a / b; a zero denominator is replaced by 100.
    Bare numbers are scaled by magnitude: below 10 they are divided by 10,
    from 10 up to and including 100 by 100, and anything larger by 1000.
    """
    if '/' in raw:
        numerator, denominator = raw.split("/")
        if float(denominator) == 0:
            denominator = 100
        return float(numerator) / float(denominator)
    value = float(raw)
    if value < 10:
        return value / 10
    if value <= 100:
        return value / 100
    return value / 1000

ratings_as_float = [_rating_to_float(entry) for entry in ratings]

In [5]:
#Load needed data into a dataframe
# One row per kept review: 'Review' holds the review text, 'Rating' the score converted to a float.
Data = pd.DataFrame({'Review':reviews,'Rating':ratings_as_float})


In [6]:
#Replace every character that is not a word character, whitespace or an apostrophe with a space

import re

# Anything other than word characters (letters, digits, underscore), whitespace and apostrophes.
_DISALLOWED_CHARS = re.compile(r'[^\w\s\']')
# A run of two or more space characters.
_SPACE_RUN = re.compile(r' + ')

def preprocess(text):
    """Normalise one review string before tokenisation.

    Disallowed characters are replaced by spaces, runs of spaces are collapsed
    to a single space, and the result is stripped and lower-cased.

    Args:
        text: the raw review string.

    Returns:
        The cleaned, lower-case string.
    """
    spaced = _DISALLOWED_CHARS.sub(' ', text)
    collapsed = _SPACE_RUN.sub(' ', spaced)
    return collapsed.strip().lower()

In [7]:
#Preprocess
# Run preprocess() over every review and store the cleaned text back in the 'Review' column.
Data['Review'] = Data['Review'].map(preprocess)

Fasttext Method

FastText is a scalable library from Meta that, as far as I know, is the simplest way to do text classification.
That being said, I want this project to output a percentage rather than sort reviews into categories, so I am leaving this section in only in case another user wants to look at it further.

In [7]:
#Preprocess specifically for fasttext
# Each line of the fastText training file is "__label__<rating> <review text>".
# The labelled text goes into its own column and Data['Rating'] stays numeric:
# the LSTM section below uses Data['Rating'] as its float regression target,
# and overwriting it here also made this cell non-idempotent (every re-run
# stacked another "__label__" prefix).
Data["labeled_review"] = "__label__" + Data['Rating'].astype(str) + " " + Data['Review']

In [29]:
#Prepare test and training
from sklearn.model_selection import train_test_split

# 70/30 split of the labelled reviews. The seed is fixed so the split, and the
# files exported from it, are identical on every run of the notebook.
train, test = train_test_split(Data, test_size=0.30, random_state=42)

In [30]:
#Export to csv
# Write only the 'labeled_review' column, with no index and no header row,
# so each output line is a single "__label__<rating> <text>" entry.
train.to_csv("review.train", columns=['labeled_review'], index=False, header=False)
test.to_csv("review.test", columns=['labeled_review'], index=False, header=False)

In [31]:
#Run fasttext model
import fasttext

# Train a supervised classifier on the exported training file, then score it on the test file.
# NOTE(review): the name `model` is reused for the Keras model in the LSTM section,
# so this fastText model is no longer reachable once that section has run.
model = fasttext.train_supervised("review.train")
# The recorded output below looks like (number of examples, precision, recall) -- confirm against the fastText docs.
model.test("review.test")

(37924, 0.25018457968568714, 0.25018457968568714)

In [40]:
#Predict
# Classify one hand-written review; the recorded output pairs the predicted
# "__label__<rating>" with a score (presumably its probability -- confirm).
model.predict('The action scenes were good and the story and characters were amazing')

(('__label__0.8',), array([0.18067279]))

LSTM Method

As this is an NLP task, I chose to use an LSTM. The target here is continuous, and LSTMs are more commonly used for discrete classification; however, since the ratings in my dataset are bounded between 0 and 1, combining an LSTM with dense layers that use sigmoid activation functions works.

In [13]:
# Import necessary packages

import tensorflow as tf
import tensorflow.keras as keras
from keras.models import Model, load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.regularizers import l1, l2
from keras.optimizers import Adam

In [14]:
#Create training and testing datasets
from sklearn.model_selection import train_test_split

# 70/30 split of the reviews. The seed is fixed so that the rows in `train` and
# `test` (and everything sliced from them later) are the same on every run.
train, test = train_test_split(Data, test_size=0.30, random_state=42)

In [15]:
#Get sentences and prepare to tokenize data
# Stack the train rows on top of the test rows; the later slicing of the padded
# array relies on this order (train first, then test).
all_sentences = pd.concat([train, test], axis=0)
# Tokenizer that splits on single spaces; it is fitted in the next cell.
tokenizer = Tokenizer(split=' ')

In [16]:
#Tokenize all data
# The vocabulary is fitted on train and test reviews together, so words that
# only occur in the test rows are part of the word index as well.
tokenizer.fit_on_texts(all_sentences['Review'])
# One list of token ids per review, in the same row order as all_sentences.
sequences = tokenizer.texts_to_sequences(all_sentences['Review'])
word_index = tokenizer.word_index
# Pad at the end of each sequence; no maxlen is given (the recorded run ended up with 60 columns).
data = sequence.pad_sequences(sequences, padding = 'post')

In [18]:
#Prepare to vectorize data
from gensim.models import FastText

In [19]:
#Building vocabulary to vectorize data
# gensim FastText model; min_count=1 keeps every token that appears at least once.
fasttext_model=FastText(alpha=0.025,window=5,min_count=1,workers=4)
# NOTE(review): `sequences` comes from tokenizer.texts_to_sequences, i.e. lists of
# integer token ids, while the embedding-matrix cell looks vectors up by word string
# (fasttext_model.wv[word]). It looks like this should be fed lists of word tokens
# instead, in both build_vocab and train -- confirm.
fasttext_model.build_vocab(sequences)

In [20]:
#Vectorizing data and saving model
# Train for 3 epochs on the same corpus the vocabulary was built from.
fasttext_model.train(sequences, total_examples=len(sequences), epochs=3)
# Persist the trained model to disk in the working directory.
fasttext_model.save("review_model.bin")

In [45]:
#Create embedding matrix based on vectorization
# One row per tokenizer index plus one extra row for index 0, which no word uses
# (the Embedding layer treats 0 as padding via mask_zero=True).
vocab_size = len(word_index) + 1
print(vocab_size)
# Must match output_dim=100 of the Embedding layer; assumes FastText was left at
# its default vector size -- confirm if the FastText settings change.
embedding_dim = 100
# Rows start out random; a word without a FastText vector keeps its random row.
embedding_matrix = np.random.random((vocab_size, embedding_dim))
for word, i in word_index.items():
    try:
        embedding_matrix[i, :] = fasttext_model.wv[word]
    except KeyError:
        # Previously a failed lookup fell through and wrote the previous word's
        # vector into this row (or raised NameError on the very first word).
        print(word, 'not found')

131662


In [119]:
#Prepare model
# Layer stack, in order:
#  - Input: one padded sequence of int32 token ids (length = width of `data`).
#  - Embedding: initialised from embedding_matrix; index 0 is masked as padding.
#  - Bidirectional LSTM: reads the sequence forwards and backwards, lightly
#    L1-regularised, returning the output for every timestep.
#  - Dropout, then Flatten so the per-timestep outputs can feed a dense layer.
#  - Masking on the flattened tensor.
#    NOTE(review): at this point the tensor is no longer a sequence, so this
#    layer may have little effect -- confirm whether it is needed.
#  - Two sigmoid dense layers; the last one emits a single value in (0, 1).
model = keras.Sequential([
    Input(shape=(len(data[0]),), dtype='int32'),
    Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], mask_zero=True),
    Bidirectional(LSTM(units=128, kernel_regularizer=l1(0.000001), return_sequences=True)),
    Dropout(rate=0.1),
    Flatten(),
    keras.layers.Masking(mask_value=0.),
    Dense(128, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])



In [121]:
#View model
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [123]:
#Prepare training and testing data
# `data` was padded from the train rows followed by the test rows, so the first
# len(train) rows are the training portion and everything after is the test portion.
n_train_rows = len(train)
train_data, test_data = data[:n_train_rows], data[n_train_rows:]
print(train_data.shape, test_data.shape)

(531096, 60) (227613, 60)


In [179]:
#Further data preparation
# Split the training rows three ways: 30% is held out as X_test / Y_test, and the
# remaining 70% is split again 70/30 into the fit set and the validation set.
# The seed is fixed so that re-running this cell reproduces the same partition;
# with an unseeded split, re-running it after the model has been fitted reshuffles
# the rows and puts samples the model was trained on into X_val / X_test.
SPLIT_SEED = 42
X_train_val, X_test, Y_train_val, Y_test = train_test_split(
    train_data, train["Rating"].values, test_size=0.3, random_state=SPLIT_SEED)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_val, Y_train_val, test_size=0.3, random_state=SPLIT_SEED)

In [125]:
#Shape consistency
# Cast the training targets to float32 and make them a column vector of shape (n, 1).
# Only Y_train is converted here; Y_val and Y_test are left as produced by the split.
Y_train = np.asarray(Y_train).astype('float32').reshape((-1,1))

In [126]:
#train model - don't want to use accuracy since this is not a classification problem
#I would run for more epochs but am limited in computing capabilities
adam = Adam(learning_rate=0.001)
# Mean absolute error is both the training loss and the reported metric.
# NOTE(review): the logged loss is slightly higher than the logged metric,
# presumably because the loss also includes the LSTM's L1 penalty -- confirm.
model.compile(loss='mean_absolute_error',optimizer=adam,metrics=['mean_absolute_error'])
# A single epoch with batches of 128, validated on the (X_val, Y_val) split.
model.fit(X_train, Y_train, batch_size=128, epochs=1, validation_data = (X_val, Y_val))



[1m2034/2034[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m493s[0m 240ms/step - loss: 0.1447 - mean_absolute_error: 0.1413 - val_loss: 0.1295 - val_mean_absolute_error: 0.1276


<keras.src.callbacks.history.History at 0x27970373590>

In [181]:
#checking predictions on test dataset
# X_test / Y_test are the held-out part of the `train` rows produced by the
# three-way split above, not the separate `test` frame.
model.evaluate(X_test,Y_test,batch_size=128)

[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 38ms/step - loss: 0.1237 - mean_absolute_error: 0.1218


[0.12406165152788162, 0.12215328216552734]

In [244]:
#Checking predictions on user specified review
def predict_rating(review_text):
    """Return the model's prediction for one free-text review.

    The text goes through the same steps as the training data: `preprocess`,
    the fitted `tokenizer`, then post-padding to the model's input width.

    Args:
        review_text: the raw review string.

    Returns:
        The array returned by `model.predict` for a batch of one review.
    """
    token_ids = tokenizer.texts_to_sequences([preprocess(review_text)])
    # Pad to the width of the training matrix instead of a hard-coded 60.
    # pad_sequences also truncates reviews longer than that width, where the
    # previous np.pad call raised ValueError on a negative pad width.
    padded = sequence.pad_sequences(token_ids, maxlen=data.shape[1], padding='post')
    return model.predict(padded)

predict_rating('I thought the movie was pretty bad, but I have definitely seen worse')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


array([[0.55941737]], dtype=float32)

In [141]:
# Predict a rating for every row of the held-out `test` frame; test_data holds the
# padded sequences for those rows in the same order, so the values line up row by row.
# NOTE(review): model.predict presumably returns an (n, 1) array here -- confirm that
# pandas stores it as a plain one-dimensional column.
test['Rating_Pred'] = model.predict(test_data)

[1m   1/7113[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m49:18[0m 416ms/step



[1m7113/7113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 14ms/step


In [183]:
import sklearn
# Import the metric explicitly: a bare `import sklearn` does not load the
# `sklearn.metrics` submodule by itself, so `sklearn.metrics.r2_score` only
# resolved because an earlier sklearn import happened to load it.
from sklearn.metrics import r2_score

# R^2 of the predicted ratings against the true ratings on the held-out `test` frame.
r2_score(test['Rating'].values, test['Rating_Pred'].values)

0.35461692111482224