This project prompts the user to give their thoughts on a movie they have seen. Then, it will provide an estimation of what that person would rate the movie out of 5, with 5 being the best and 1 being the worst.

In [None]:
# Imports necessary packages and load the dataframe
# CSV from https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset/
import pandas as pd
import math
import numpy as np
df = pd.read_csv('rotten_tomatoes_critic_reviews.csv')

In [None]:
#Separating necessary data from this project
#Converting letter grades to numerical grades

reviews = []
ratings = []
for review in df.itertuples(index=False):
    if (type(review[5]) == str and type(review[7]) == str):
        reviews.append(review[7])
        if review[5] == 'A+':
            ratings.append('5/5')
        elif review[5] == 'A':
            ratings.append('4.5/5')
        elif review[5] == 'A-':
            ratings.append('4.75/5')
        elif review[5] == 'A  -':
            ratings.append('4.75/5')
        elif review[5] == 'B+':
            ratings.append('4/5')
        elif review[5] == 'B':
            ratings.append('3.5/5')
        elif review[5] == 'B-':
            ratings.append('3.25/5')
        elif review[5] == 'C+':
            ratings.append('3/5')
        elif review[5] == 'C':
            ratings.append('2.5/5')
        elif review[5] == 'C-':
            ratings.append('2.25/5')
        elif review[5] == 'C  -':
            ratings.append('2.25/5')
        elif review[5] == 'D+':
            ratings.append('2/5')
        elif review[5] == 'D':
            ratings.append('1.5/5')
        elif review[5] == 'D-':
            ratings.append('1.25/5')
        elif review[5] == 'F+':
            ratings.append('1/5')
        elif review[5] == 'F':
            ratings.append('0.5/5')
        elif review[5] == 'F-':
            ratings.append('0.25/5')
        else:
            ratings.append(review[5])

In [None]:
#Converting strings and fractions to floats for the LSTM
#Taking in to account errors and some typos that were in the dataset

ratings_as_float= []
for i in ratings:
    if '/' in i:
        a , b = i.split("/")
        if float(b) == 0:
            b = 100
        to_float = float(a) / float(b)
        ratings_as_float.append(to_float)
    else:
        if float(i) < 10:
            ratings_as_float.append(float(i)/10)
        elif float(i) <= 100:
            ratings_as_float.append(float(i)/100)
        else:
            ratings_as_float.append(float(i)/1000)

In [None]:
#Load needed data into a dataframe
Data = pd.DataFrame({'Review':reviews,'Rating':ratings_as_float})

In [None]:
#Replace non-letter characters with space

import re

def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' + ', ' ',text)
    return text.strip().lower()

In [None]:
#Preprocess
Data['Review'] = Data['Review'].map(preprocess)

Fasttext Method

Note: This method only works for discrete classification
Here, we are trying to make predictions on a continuous set from 0 to 1, so rather than fasttext, other methods are preferred and are seen below.

In [None]:
#Preprocess specifically for fasttext
Data['Rating'] = "__label__" + (Data['Rating'].astype(str))
Data["labeled_review"] = Data['Rating'] + " " + Data['Review']

In [None]:
#Prepare test and training
from sklearn.model_selection import train_test_split

train, test = train_test_split(Data, test_size=0.30)

In [None]:
#Export to csv
train.to_csv("review.train", columns=['labeled_review'], index=False, header=False)
test.to_csv("review.test", columns=['labeled_review'], index=False, header=False)

In [None]:
#Run fasttext model
import fasttext

model = fasttext.train_supervised("review.train")
model.test("review.test")

(37924, 0.25018457968568714, 0.25018457968568714)

In [None]:
#Predict
model.predict('The action scenes were good and the story and characters were amazing')

(('__label__0.8',), array([0.18067279]))

LSTM Method

As this is an NLP task, I choose to use an LSTM. Even though it is continuous and LSTMs generally work best with discrete classification, since my dataset is bounded between 0 and 1, using a sigmoid activation function can make it work

In [None]:
# Install necessary libraries
import tensorflow as tf
import tensorflow.keras as keras
from keras.models import Model, load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.regularizers import l1, l2
from keras.optimizers import Adam

In [None]:
#Create training and testing datasets
from sklearn.model_selection import train_test_split

train, test = train_test_split(Data, test_size=0.30)

In [None]:
#Get sentences and prepare to tokenize data
all_sentences = pd.concat([train, test], axis=0)
tokenizer = Tokenizer(split=' ')

In [None]:
#Tokenize all data extracting important features

tokenizer.fit_on_texts(all_sentences['Review'])
sequences = tokenizer.texts_to_sequences(all_sentences['Review'])
word_index = tokenizer.word_index
data = sequence.pad_sequences(sequences, padding = 'post')

In [None]:
#Vectorize data
from gensim.models import FastText

In [None]:
#Building vocab to vectorize data
fasttext_model=FastText(alpha=0.025,window=5,min_count=1,workers=4)
fasttext_model.build_vocab(sequences)

In [None]:
#Vectorizing data
fasttext_model.train(sequences, total_examples=len(sequences), epochs=3)
fasttext_model.save("review_model.bin")

In [None]:
#Create embedding matrix based on vectorization
vocab_size = len(word_index) + 1 
print(vocab_size)
embedding_matrix = np.random.random((vocab_size, 100))
for word, i in word_index.items():
    try:
        embedding_vector = fasttext_model.wv[word]
    except:
        print(word, 'not found')
    if embedding_vector is not None:
        embedding_matrix[i, :] = embedding_vector

131662


Here I use an embedded layer for the vectorization
The bidirectional LSTM layer is used so that the model gains context both forwards and backwards in the text sequences

In [None]:
#building model
model = keras.Sequential()
model.add(Input(shape=(len(data[0]),), dtype='int32'))
model.add(Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(units = 128, kernel_regularizer=l1(0.000001), return_sequences = True)))
model.add(Dropout(rate = 0.1))
model.add(Dense(128, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
#Viewing model
model = build_model()
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 60)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 60, 100)           13166200  
                                                                 
 bidirectional_3 (Bidirecti  (None, 60, 256)           234496    
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 60, 256)           0         
                                                                 
 dense_6 (Dense)             (None, 60, 128)           32896     
                                                                 
 dense_7 (Dense)             (None, 60, 1)             129       
                                                           

In [None]:
#Checking shape and making train and test for the model
train_data = data[:train.shape[0]]
test_data = data[train.shape[0]:]
print(train_data.shape, test_data.shape)

(531096, 60) (227613, 60)


In [None]:
#Splitting data as necessary
X_train_val, X_test, Y_train_val, Y_test = train_test_split(train_data, train["Rating"].values, test_size=0.3)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.3)

In [None]:
#Restructuring dependent variable as necessary
Y_train = np.asarray(Y_train).astype('float32').reshape((-1,1))
Y_test = np.asarray(Y_test).astype('float32').reshape((-1,1))

Here, since it is a continuous problem, we want to use either mean absolute or mean squared error
I choose mean squared error because I want the training to punish outliers since some reviewers give unfair/unreasonable reviews
It should be noted that since this is a continuous problem, the standard accuracy metric isn't useful. Instead, later I'll use the R^2

In [None]:
#Setting optimizer and fitting the model
adam = Adam(learning_rate=0.001)
model.compile(loss='mean_sqared_error',optimizer=adam,metrics=['accuracy'])
history = model.fit(X_train, Y_train, batch_size=128, epochs=5, validation_data = (X_val, Y_val))



In [2]:
#Checking predictions on test dataset
model.evaluate(X_test,Y_test,batch_size=64)

In [None]:
#Checking predictions on user specified review
a = np.array(tokenizer.texts_to_sequences(['quite possibly the worst movie ever made']))
model.predict(a)

In [None]:
results = model.predict(test_data)
results_mean = []
i = 0
while i < len(test['Rating'].values):
    testings.append(results[i][0])
    i = i + 1


Here, the R^2 value in combination with the loss from before can be used to determine the effectiveness of the model

It is important that an R^2 approaching 1 may not be ideal as it could indicate overfitting.
Additionally, an R^2 of 0 indicates that the model always predicts the mean 

In [None]:
import sklearn
sklearn.metrics.r2_score(test['Rating'].values, testings)