This project prompts the user to give their thoughts on a movie they have seen. Then, it will provide an estimation of what that person would rate the movie out of 5, with 5 being the best and 1 being the worst.

In [1]:
# Imports necessary packages and load the dataframe
# CSV from https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset/
import pandas as pd
import math
import numpy as np
df = pd.read_csv('rotten_tomatoes_critic_reviews.csv')


In [2]:
#Separating necessary data from this project
#Converting letter grades to numerical grades

reviews = []
ratings = []
for review in df.itertuples(index=False):
    if (type(review[5]) == str and type(review[7]) == str):
        reviews.append(review[7])
        if review[5] == 'A+':
            ratings.append('5/5')
        elif review[5] == 'A':
            ratings.append('4.5/5')
        elif review[5] == 'A-':
            ratings.append('4.75/5')
        elif review[5] == 'A  -':
            ratings.append('4.75/5')
        elif review[5] == 'B+':
            ratings.append('4/5')
        elif review[5] == 'B':
            ratings.append('3.5/5')
        elif review[5] == 'B-':
            ratings.append('3.25/5')
        elif review[5] == 'C+':
            ratings.append('3/5')
        elif review[5] == 'C':
            ratings.append('2.5/5')
        elif review[5] == 'C-':
            ratings.append('2.25/5')
        elif review[5] == 'C  -':
            ratings.append('2.25/5')
        elif review[5] == 'D+':
            ratings.append('2/5')
        elif review[5] == 'D':
            ratings.append('1.5/5')
        elif review[5] == 'D-':
            ratings.append('1.25/5')
        elif review[5] == 'F+':
            ratings.append('1/5')
        elif review[5] == 'F':
            ratings.append('0.5/5')
        elif review[5] == 'F-':
            ratings.append('0.25/5')
        else:
            ratings.append(review[5])

In [3]:
#Converting strings and fractions to floats for the LSTM
#Taking in to account errors and some typos that were in the dataset

ratings_as_float= []
for i in ratings:
    if '/' in i:
        a , b = i.split("/")
        if float(b) == 0:
            b = 100
        to_float = float(a) / float(b)
        ratings_as_float.append(to_float)
    else:
        if float(i) < 10:
            ratings_as_float.append(float(i)/10)
        elif float(i) <= 100:
            ratings_as_float.append(float(i)/100)
        else:
            ratings_as_float.append(float(i)/1000)

In [4]:
#Load needed data into a dataframe
Data = pd.DataFrame({'Review':reviews,'Rating':ratings_as_float})


In [5]:
#Replace non-letter characters with space

import re

def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' + ', ' ',text)
    return text.strip().lower()

In [6]:
#Preprocess
Data['Review'] = Data['Review'].map(preprocess)

Fasttext Method

In [7]:
#Preprocess specifically for fasttext
Data['Rating'] = "__label__" + (Data['Rating'].astype(str))
Data["labeled_review"] = Data['Rating'] + " " + Data['Review']

In [29]:
#Prepare test and training
from sklearn.model_selection import train_test_split

train, test = train_test_split(Data, test_size=0.30)

In [30]:
#Export to csv
train.to_csv("review.train", columns=['labeled_review'], index=False, header=False)
test.to_csv("review.test", columns=['labeled_review'], index=False, header=False)

In [31]:
#Run fasttext model
import fasttext

model = fasttext.train_supervised("review.train")
model.test("review.test")

(37924, 0.25018457968568714, 0.25018457968568714)

In [40]:
#Predict
model.predict('The action scenes were good and the story and characters were amazing')

(('__label__0.8',), array([0.18067279]))

LSTM Method

In [7]:
from keras.preprocessing.text import Tokenizer

In [7]:
import tensorflow as tf

In [8]:
import tensorflow.keras as keras

In [9]:
from keras.models import Model, load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.regularizers import l1, l2
from keras.optimizers import Adam

In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(Data, test_size=0.30)

In [11]:
all_sentences = pd.concat([train, test], axis=0)
tokenizer = Tokenizer(split=' ')

In [12]:
tokenizer.fit_on_texts(all_sentences['Review'])
sequences = tokenizer.texts_to_sequences(all_sentences['Review'])
word_index = tokenizer.word_index
data = sequence.pad_sequences(sequences, padding = 'post')

In [16]:
from gensim.models import FastText

In [17]:
fasttext_model=FastText(alpha=0.025,window=5,min_count=1,workers=4)
fasttext_model.build_vocab(sequences)

In [18]:
fasttext_model.train(sequences, total_examples=len(sequences), epochs=3)
fasttext_model.save("review_model.bin")

In [19]:
vocab_size = len(word_index) + 1 
print(vocab_size)
embedding_matrix = np.random.random((vocab_size, 100))
for word, i in word_index.items():
    try:
        embedding_vector = fasttext_model.wv[word]
    except:
        print(word, 'not found')
    if embedding_vector is not None:
        embedding_matrix[i, :] = embedding_vector

131662


In [42]:
def build_model():
    sentence_indices = Input(shape=(60,), dtype='int32')
    
    embeddings = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix])(sentence_indices)      

    X = Bidirectional(LSTM(units = 128, kernel_regularizer=l1(0.000001), return_sequences = True))(embeddings)
    X = Dropout(rate = 0.2)(X)
    X = Dense(128, activation='relu')(X)
    X = Dense(1, activation='sigmoid')(X)
    
    model = Model(inputs=sentence_indices, outputs=X)
    
    return model

In [43]:
model = build_model()
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 60)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 60, 100)           13166200  
                                                                 
 bidirectional_3 (Bidirecti  (None, 60, 256)           234496    
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 60, 256)           0         
                                                                 
 dense_6 (Dense)             (None, 60, 128)           32896     
                                                                 
 dense_7 (Dense)             (None, 60, 1)             129       
                                                           

In [44]:
train_data = data[:train.shape[0]]
test_data = data[train.shape[0]:]
print(train_data.shape, test_data.shape)

(531096, 60) (227613, 60)


In [45]:
X_train_val, X_test, Y_train_val, Y_test = train_test_split(train_data, train["Rating"].values, test_size=0.3)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.3)

In [46]:
Y_train = np.asarray(Y_train).astype('float32').reshape((-1,1))
Y_test = np.asarray(Y_test).astype('float32').reshape((-1,1))

In [47]:
adam = Adam(learning_rate=0.001)
model.compile(loss='mean_absolute_error',optimizer=adam,metrics=['accuracy'])
history = model.fit(X_train, Y_train, batch_size=128, epochs=1, validation_data = (X_val, Y_val))



In [60]:
test['Rating_Pred'] = model.predict(test_data)



AssertionError: Shape of new values must be compatible with manager shape

Unnamed: 0,Review,Rating
101966,bad boys misses its chance at greatness but it...,0.750
452019,if early slasher films as their critics claime...,0.375
571797,it's bright and breezy from start to finish wi...,0.700
331805,no better or worse than 'truth or consequences...,0.500
253132,quite possibly the best godzilla film since th...,0.900
...,...,...
185881,declaration of war movingly chronicles a uniqu...,0.875
655128,leigh whannell employs some nifty fight sequen...,0.800
252273,in his feature film directorial debut john sla...,0.800
569659,spider man homecoming is smart and sincere and...,1.000


In [62]:
test

array([[  38,   99,   15, ...,    0,    0,    0],
       [ 587,   56,  936, ...,    0,    0,    0],
       [ 115, 1221,   13, ...,    0,    0,    0],
       ...,
       [  12,    2,  575, ...,    0,    0,    0],
       [  98,   13,    9, ...,    0,    0,    0],
       [ 296,  335,  296, ...,    0,    0,    0]])

In [65]:
a = tokenizer.texts_to_sequences(['quite possibly the worst movie ever made'])
model.predict(a)



array([[[0.39875183],
        [0.39765948],
        [0.4182627 ],
        [0.34570062],
        [0.39893106],
        [0.34730083],
        [0.32923663]]], dtype=float32)