This notebook uses deep learning (CNN, GRU, LSTM) to predict user ratings of movies based on user reviews. 

In [44]:
import pandas as pd
import numpy as np
import json
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

Data pre-processing.

In [47]:
def get_list_of_dicts(fname): 
    return [json.loads(i) for i in open(fname, "rt")]

raw_data = get_list_of_dicts("Amazon_Instant_Video_5.json")
data = pd.DataFrame(raw_data).loc[:, ["reviewerID", "reviewText", "asin", "overall"]]

data = data[:10000] # only 10 000 user reviews

data.head()

Unnamed: 0,reviewerID,reviewText,asin,overall
0,A11N155CW1UV02,I had big expectations because I love English ...,B000H00VBQ,2.0
1,A3BC8O2KCL29V2,I highly recommend this series. It is a must f...,B000H00VBQ,5.0
2,A60D5HQFOTSOM,This one is a real snoozer. Don't believe anyt...,B000H00VBQ,1.0
3,A1RJPIGRSNX4PW,Mysteries are interesting. The tension betwee...,B000H00VBQ,4.0
4,A16XRPF40679KG,"This show always is excellent, as far as briti...",B000H00VBQ,5.0


In [48]:
def add_user_reviews(x):
    ur = user_reviews.loc[x["reviewerID"]].drop(x["asin"]).values.tolist()
    mr = movie_reviews.loc[x["asin"]].drop(x["reviewerID"]).values.tolist()
    x["userReviews"] = " ".join(list(map(lambda x: x[0], ur)))
    x["movieReviews"] = " ".join(list(map(lambda x: x[0], mr)))
    return x

user_item_review = data.drop("reviewText", axis=1)
user_reviews = pd.pivot_table(data, index=["reviewerID", "asin"], aggfunc=lambda x: x).drop("overall", axis=1)  
movie_reviews = pd.pivot_table(data, index=["asin", "reviewerID"], aggfunc=lambda x: x).drop("overall", axis=1)

df = user_item_review.apply(add_user_reviews, axis=1)
df.head()

Unnamed: 0,reviewerID,asin,overall,userReviews,movieReviews
0,A11N155CW1UV02,B000H00VBQ,2.0,,"This show always is excellent, as far as briti..."
1,A3BC8O2KCL29V2,B000H00VBQ,5.0,,I had big expectations because I love English ...
2,A60D5HQFOTSOM,B000H00VBQ,1.0,I watched this a couple of weeks ago. There ar...,I had big expectations because I love English ...
3,A1RJPIGRSNX4PW,B000H00VBQ,4.0,"The acting was excellent. The acting, the rel...",I had big expectations because I love English ...
4,A16XRPF40679KG,B000H00VBQ,5.0,As many people said this show kept getting bet...,I had big expectations because I love English ...


In [None]:
# Train-test split 
test_size = 0.005

# get test_size percentage of users
unique_users = df.loc[:, "reviewerID"].unique()
users_size = len(unique_users)
test_idx = np.random.choice(users_size, size=int(users_size * test_size), replace=False)

# get test users
test_users = unique_users[test_idx]

# everyone else is a training user
train_users = np.delete(unique_users, test_idx)

test = df[df["reviewerID"].isin(test_users)]
train = df[df["reviewerID"].isin(train_users)]

unique_test_movies = test["asin"].unique()

# drop the movies that also appear in our test set. In order to be
# a true train/test split, we are forced to discard some data entirely
train = train.where(np.logical_not(train["asin"].isin(unique_test_movies))).dropna()

train.head()

Embed the reviews into GloVe word2vect model. 

The pre-trained GloVe model is downloadable at
https://nlp.stanford.edu/projects/glove/

In [None]:
import os.path
# functions to embed user reviews into the GloVe word2vect model
def init_embeddings_map(fname):
    with open(os.path.join("glove.6B", fname), encoding="utf8") as glove:
        return {l[0]: np.asarray(l[1:], dtype="float32") for l in
                [line.split() for line in glove]}

def get_embed_func(i_len, u_len, pad_value, embedding_map):
    def embed(row):
        sentence = row["userReviews"].split()[:u_len]
        reviews = list(map(lambda word: embedding_map.get(word)
            if word in embedding_map else pad_value, sentence))
        row["userReviews"] = reviews +[pad_value] * (u_len - len(reviews))
        sentence = row["movieReviews"].split()[:i_len]
        reviews = list(map(lambda word: embedding_map.get(word) if word in embedding_map else pad_value, sentence))
        row["movieReviews"] = reviews +[pad_value] * (i_len - len(reviews))
        return row
    return embed

print("Before Embedding")
emb_size = 50 #or 100, 200, 300
embedding_map = init_embeddings_map("glove.6b." + str(emb_size) + "d.10000.txt")
print("After Embedding")

print("Before Apply")
user_sizes = df.loc[:, "userReviews"].apply(lambda x: x.split()).apply(len)
item_sizes = df.loc[:, "movieReviews"].apply(lambda x: x.split()).apply(len)
print("After Apply")

u_ptile = 40
i_ptile = 15
u_len = int(np.percentile(user_sizes, u_ptile))
i_len = int(np.percentile(item_sizes, i_ptile))

print("Before Embedding Function")
embedding_fn = get_embed_func(i_len, u_len, np.array([0.0] * emb_size), embedding_map)
print("After Embedding Function")

print("Before Embedding Train")
train_embedded = train.apply(embedding_fn, axis=1)
test_embedded = test.apply(embedding_fn, axis=1)
print("After Embedding Train")

print(u_len, i_len) # size of input in deep neural networks, useful to set parameters
train_embedded.head()

Deep learning models

In [None]:
import tensorflow as tf
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Conv1D, GRU, LSTM, MaxPooling1D, Flatten
from keras.layers import Input, Dense, Dropout
from keras.layers import Add, Dot, Concatenate
import matplotlib.pyplot as plt

In [None]:
def cnn_tower(max_len, embedding_size, hidden_size, filters=4, kernel_size=10):
        input_layer = Input(shape=(max_len, embedding_size))
        tower = Conv1D(filters=filters, kernel_size=kernel_size, activation="tanh")(input_layer)
        tower = MaxPooling1D()(tower)
        tower = Conv1D(filters=filters, kernel_size=kernel_size, activation="tanh")(tower)
        tower = MaxPooling1D()(tower)
        tower = Flatten()(tower)
        tower = Dense(hidden_size, activation="relu")(tower)
        tower = Dropout(0.4)(tower)
        return input_layer, tower
    
def CNN_model(embedding_size, hidden_size, u_len, i_len):
    inputU, towerU = cnn_tower(u_len, embedding_size, hidden_size)
    inputM, towerM = cnn_tower(i_len, embedding_size, hidden_size)
    joined = Concatenate()([towerU, towerM])
    outNeuron = Dense(1)(joined)
    dotproduct = Dot(axes=1)([towerU, towerM])
    output_layer = Add()([outNeuron, dotproduct])
        
    model = Model(inputs=[inputU, inputM], outputs=[output_layer])
    return model

hidden_size = 64

model_cnn = CNN_model(emb_size, hidden_size, u_len, i_len)
model_cnn.compile(optimizer='Adam', loss='mse')
model_cnn.summary()

In [None]:
batch_size = 32
epochs = 20

user_reviews = np.array(list(train_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(train_embedded.loc[:, "movieReviews"]))

train_inputs = [user_reviews, movie_reviews]
train_outputs = train_embedded.loc[:, "overall"]

tensorboard = TensorBoard(log_dir="cnn_log")
earlystop = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint('cnn_weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', save_best_only=True)
train_history = model_cnn.fit(train_inputs, train_outputs, callbacks=[tensorboard, earlystop, checkpoint], 
                              validation_split=0.05, batch_size=batch_size, epochs=epochs)

model_cnn.save("cnn.h5")

In [None]:
plt.plot(model_cnn.history.history['loss'])
plt.plot(model_cnn.history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [None]:
def gru_tower(max_len, embedding_size, hidden_size, rnn_hidden_size, filters=2, kernel_size=8):
        input_layer = Input(shape=(max_len, embedding_size))
        tower = GRU(rnn_hidden_size, activation="tanh")(input_layer)
        tower = Dense(hidden_size, activation="relu")(tower)
        tower = Dropout(0.4)(tower)
        return input_layer, tower
    
def GRU_model(embedding_size, hidden_size, rnn_hidden_size, u_len, i_len):
    inputU, towerU = gru_tower(u_len, embedding_size, hidden_size, rnn_hidden_size)
    inputM, towerM = gru_tower(i_len, embedding_size, hidden_size, rnn_hidden_size)
    joined = Concatenate()([towerU, towerM])
    outNeuron = Dense(1)(joined)
    dotproduct = Dot(axes=1)([towerU, towerM])
    output_layer = Add()([outNeuron, dotproduct])
        
    model = Model(inputs=[inputU, inputM], outputs=[output_layer])
    return model

hidden_size = 64
rnn_hidden_size = 64

model_gru = GRU_model(emb_size, hidden_size, rnn_hidden_size, u_len, i_len)
model_gru.compile(optimizer='Adam', loss='mse')
model_gru.summary()

In [None]:
batch_size = 32
epochs = 20

tensorboard = TensorBoard(log_dir="gru_log")
earlystop = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint('gru_weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', save_best_only=True)
train_history = model_gru.fit(train_inputs, train_outputs, callbacks=[tensorboard, earlystop, checkpoint], 
                              validation_split=0.05, batch_size=batch_size, epochs=epochs)

model_gru.save("gru.h5")

In [None]:
plt.plot(model_gru.history.history['loss'])
plt.plot(model_gru.history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [None]:
# LSTM model
def lstm_tower(max_len, embedding_size, hidden_size, rnn_hidden_size, filters=2, kernel_size=8):
        input_layer = Input(shape=(max_len, embedding_size))
        tower = LSTM(rnn_hidden_size, activation="tanh")(input_layer)
        tower = Dense(hidden_size, activation="relu")(tower)
        tower = Dropout(0.4)(tower)
        return input_layer, tower
    
def LSTM_model(embedding_size, hidden_size, rnn_hidden_size, u_len, i_len):
    inputU, towerU = lstm_tower(u_len, embedding_size, hidden_size, rnn_hidden_size)
    inputM, towerM = lstm_tower(i_len, embedding_size, hidden_size, rnn_hidden_size)
    joined = Concatenate()([towerU, towerM])
    outNeuron = Dense(1)(joined)
    dotproduct = Dot(axes=1)([towerU, towerM])
    output_layer = Add()([outNeuron, dotproduct])
        
    model = Model(inputs=[inputU, inputM], outputs=[output_layer])
    return model


hidden_size = 64
rnn_hidden_size = 64

model_lstm = LSTM_model(emb_size, hidden_size, rnn_hidden_size, u_len, i_len)
model_lstm.compile(optimizer='Adam', loss='mse')
model_lstm.summary()

In [None]:
batch_size = 32
epochs = 30

tensorboard = TensorBoard(log_dir="lstm_log")
earlystop = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint('lstm_weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', save_best_only=True)
train_history = model_lstm.fit(train_inputs, train_outputs, callbacks=[tensorboard, earlystop, checkpoint], 
                              validation_split=0.05, batch_size=batch_size, epochs=epochs)

model_lstm.save("lstm.h5")

In [None]:
plt.plot(model_lstm.history.history['loss'])
plt.plot(model_lstm.history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

Prediction and errors

In [None]:
user_reviews = np.array(list(test_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(test_embedded.loc[:, "movieReviews"]))
test_inputs = [user_reviews, movie_reviews]

true_rating = np.array(list(test_embedded.loc[:, "overall"])).reshape((-1, 1))

predictions_cnn = model_cnn.predict(test_inputs)
predictions_gru = model_gru.predict(test_inputs)
predictions_lstm = model_lstm.predict(test_inputs)

error_cnn = np.square(predictions_cnn - true_rating)
print("Test MSE for CNN model :", np.average(error_cnn))

error_gru = np.square(predictions_gru - true_rating)
print("Test MSE for GRU model :", np.average(error_gru))

error_lstm = np.square(predictions_lstm - true_rating)
print("Test MSE for LSTM model :", np.average(error_lstm))

In [55]:
def get_movies(data):
    data_movies = data["asin"]
    return list(set(data_movies))

In [63]:
def best_recommendations(predict, movies):
    num_recommendations = 10
    dist = [(i+1, predict[i]) for i in range(len(predict))]
    dist = sorted(dist, key=lambda x:x[1])[::-1]
    for i in range(num_recommendations):
        print(f"Recommended movie: {movies[dist[i][0]]}, overall: {dist[i][1][0]}")
        

In [64]:
data_movies = get_movies(data)

print("CNN")
best_recommendations(predictions_cnn, data_movies)

print("GRU")
best_recommendations(predictions_gru, data_movies)

print("LSTM")
best_recommendations(predictions_lstm, data_movies)

CNN
Recommended movie: B0047G01H0, overall: 4.48649263381958
Recommended movie: B000IKP5AM, overall: 4.480325698852539
Recommended movie: B004RZMQCE, overall: 4.357715129852295
Recommended movie: B001ENLHX6, overall: 4.352227210998535
Recommended movie: B003ZHOWFY, overall: 4.350558280944824
Recommended movie: B000MVN8HE, overall: 4.312946319580078
Recommended movie: B001VT4L7W, overall: 4.311435699462891
Recommended movie: B0051HIK04, overall: 4.281321048736572
Recommended movie: B001CMQH5M, overall: 4.229222774505615
Recommended movie: B000ULZLYY, overall: 4.228974342346191
GRU
Recommended movie: B003ZHOWFY, overall: 5.022066593170166
Recommended movie: B001RPORJ2, overall: 4.871674537658691
Recommended movie: B0057UGEUS, overall: 4.8230133056640625
Recommended movie: B000ULZLYY, overall: 4.795764446258545
Recommended movie: B000MMX5E4, overall: 4.7846269607543945
Recommended movie: B003HIC3ZW, overall: 4.7346343994140625
Recommended movie: B004JM9BXM, overall: 4.7027506828308105
Rec

In [None]:
predictions_cnn  = np.round(predictions_cnn,1)
predictions_gru  = np.round(predictions_gru,1)
predictions_lstm = np.round(predictions_lstm,1)


predictions_cnn  = np.array([x[0] for x in predictions_cnn])
predictions_gru  = np.array([x[0] for x in predictions_gru])
predictions_lstm = np.array([x[0] for x in predictions_lstm])
true_rating =      np.array([x[0] for x in true_rating])

#Here we want movies to be ranked the same or better than the real rating
true_positives_cnn = 0
true_positives_gru= 0
true_positives_lstm = 0
false_positives_cnn = 0
false_positives_gru = 0
false_positives_lstm = 0

false_negatives_cnn   = 0
false_negatives_gru = 0
false_negatives_lstm = 0


for i in range(len(true_rating)):
    # Count TP
    if predictions_cnn[i] <= true_rating[i] + 0.2 and predictions_cnn[i] >= true_rating[i] - 0.2:
        true_positives_cnn +=1

    if predictions_lstm[i] <= true_rating[i] + 0.2 and predictions_lstm[i] >= true_rating[i] - 0.2:
        true_positives_gru +=1
    
    if predictions_gru[i] <= true_rating[i] + 0.2 and predictions_gru[i] >= true_rating[i] - 0.2:
        true_positives_lstm +=1

    # Count FP
    if  predictions_cnn[i] > true_rating[i] + 0.2:
       false_positives_cnn +=1
    if  predictions_gru[i] > true_rating[i] + 0.2:
       false_positives_gru +=1
    if  predictions_lstm[i] > true_rating[i] + 0.2:
       false_positives_lstm +=1

    # Count FN
    if  predictions_cnn[i]  <  true_rating[i] - 0.2:
        false_negatives_cnn  += 1
    if  predictions_cnn[i]  <  true_rating[i] - 0.2:
        false_negatives_gru  += 1
    if  predictions_cnn[i]  <  true_rating[i] - 0.2:
        false_negatives_lstm += 1


true_negatives = 0

def calc_recall(true_positives, false_negatives):
    return true_positives /( true_positives + false_negatives)

def calc_precision(true_positives, false_positives):
    return true_positives /( true_positives + false_positives)

def f2_score(recall , precision):
    return 5 * ( ( precision * recall ) / ((5 * precision) + recall ) )

recall_cnn = calc_recall(true_positives_cnn, false_negatives_cnn) 
precision_cnn = calc_precision(true_positives_cnn, false_positives_cnn)

cnn_f2_score =  f2_score(recall_cnn,precision_cnn)
print(cnn_f2_score)



recall_gru = calc_recall(true_positives_gru, false_negatives_gru) 
precision_gru = calc_precision(true_positives_gru, false_positives_gru)

gru_f2_score =  f2_score(recall_gru,precision_gru)
print(gru_f2_score)




recall_lstm = calc_recall(true_positives_lstm, false_negatives_lstm) 
precision_lstm = calc_precision(true_positives_lstm, false_positives_lstm)

lstm_f2_score =  f2_score(recall_lstm, precision_lstm)
print(lstm_f2_score)

The data is very sparse and we need to run more epochs, and also a larger dataset. So the resulting MSE's are not so satisfying. However, we can still compare them and draw some early conclusions.

1. RNN works better than CNN. A possible reason might be that reviews are sequential data.
2. LSTM works better than GRU. More epochs will lead to better performance.