In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import tensorflow as tf
from tensorflow import keras
# from tensorflow.keras.layers import Conv2D, MaxPooling2D,  Dropout, Dense, Activation, BatchNormalization, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.applications import vgg16, inception_v3, resnet50, mobilenet
from tensorflow.keras.models import load_model
from sklearn.metrics import confusion_matrix
import itertools

In [2]:
# import data
data_file = '../data/2018_movie_links.csv'

In [102]:
data_2018 = pd.read_csv(data_file)

In [103]:
data_2018 = data_2018.drop(['Unnamed: 0', 'verified', 'rank', 'also_buy', 'also_view', 'details'], axis=1)

In [104]:
# data_2018.head()

In [105]:
data_2018 = data_2018.rename(columns={'overall':'rating', 'asin':'movieID'})

In [256]:
reviewer_count = data_2018.groupby('reviewerID')['rating'].count()
product_count = data_2018.groupby('movieID')['rating'].count()
average_rating = data_2018.groupby('movieID')['rating'].mean()

In [257]:
# remove reviewers that has only one review.
data_2018_1 = data_2018.merge(reviewer_count, on='reviewerID')

In [258]:
data_2018_1 = data_2018_1.rename(columns={'rating_y':'reviewer_count', 'rating_x':'rating'})

In [259]:
data_2018_1 = data_2018_1.merge(product_count, on='movieID')

In [260]:
data_2018_1 = data_2018_1.rename(columns={'rating_y':'movie_count', 'rating_x':'rating'})

In [263]:
data_2018_1 = data_2018_1.merge(average_rating, on='movieID')

In [265]:
data_2018_1 = data_2018_1.rename(columns={'rating_y':'average_rating', 'rating_x':'rating'})

In [266]:
data_2018_1.head()

Unnamed: 0,rating,reviewTime,reviewerID,movieID,style,reviewerName,reviewText,summary,unixReviewTime,vote,category,title,main_cat,description,brand,price,links,reviewer_count,movie_count,average_rating
0,5.0,"04 8, 2018",A1CW3NLH9MBQRY,6303022901,{'Format:': ' DVD'},Sally Nunez,I really enjoyed this movie. Brings tears to m...,Five Stars,1523145600,,"['Movies & TV', 'Genre for Featured Categories...",The Joy Luck Club VHS,Movies & TV,['Produced by Academy Award(R)-winning filmmak...,Tamlyn Tomita,$3.28,https://www.amazon.com/product-reviews/6303022...,3,21,4.857143
1,5.0,"04 2, 2018",A1WK0IRZ08NX9X,6303022901,{'Format:': ' DVD'},Ammie28,"If you have never seen this movie, you may jus...",Heart Touching...,1522627200,,"['Movies & TV', 'Genre for Featured Categories...",The Joy Luck Club VHS,Movies & TV,['Produced by Academy Award(R)-winning filmmak...,Tamlyn Tomita,$3.28,https://www.amazon.com/product-reviews/6303022...,4,21,4.857143
2,5.0,"03 30, 2018",A2HGXJQCQTXE4E,6303022901,{'Format:': ' Blu-ray'},Pa nhia lee,awesome all time fave movie.,Five Stars,1522368000,,"['Movies & TV', 'Genre for Featured Categories...",The Joy Luck Club VHS,Movies & TV,['Produced by Academy Award(R)-winning filmmak...,Tamlyn Tomita,$3.28,https://www.amazon.com/product-reviews/6303022...,4,21,4.857143
3,5.0,"03 30, 2018",A14ASTA78EK120,6303022901,{'Format:': ' DVD'},lalush,The greatest movie ever!!!!!,Five Stars,1522368000,,"['Movies & TV', 'Genre for Featured Categories...",The Joy Luck Club VHS,Movies & TV,['Produced by Academy Award(R)-winning filmmak...,Tamlyn Tomita,$3.28,https://www.amazon.com/product-reviews/6303022...,3,21,4.857143
4,5.0,"03 29, 2018",A2EGT1RXKVOXTJ,6303022901,{'Format:': ' Amazon Video'},Karin,Love this movie,Five Stars,1522281600,,"['Movies & TV', 'Genre for Featured Categories...",The Joy Luck Club VHS,Movies & TV,['Produced by Academy Award(R)-winning filmmak...,Tamlyn Tomita,$3.28,https://www.amazon.com/product-reviews/6303022...,3,21,4.857143


In [267]:
data_2018_1 = data_2018_1[data_2018_1['reviewer_count']>1]
data_2018_1 = data_2018_1[data_2018_1['movie_count']>1]

In [268]:
data_2018_1.shape

(116700, 20)

In [269]:
data_2018_1.sort_values('movie_count').head()

Unnamed: 0,rating,reviewTime,reviewerID,movieID,style,reviewerName,reviewText,summary,unixReviewTime,vote,category,title,main_cat,description,brand,price,links,reviewer_count,movie_count,average_rating
143214,2.0,"04 21, 2018",A3AUTVJ6HA7OO0,B0002VER6A,{'Format:': ' DVD'},Keith M.,Listed with English Subtitles. It does not hav...,Two Stars,1524268800,,"['Movies & TV', 'Genre for Featured Categories...",La Mafia De Un Gallero,Movies & TV,"['Starring: Sebastian Ligarde, Eleazar Garcia ...",Socorro Albarran,$3.79,https://www.amazon.com/product-reviews/B0002VE...,12,2,2.0
175763,5.0,"04 1, 2018",AEGZZGMLHEZGX,B0019BI0W4,{'Format:': ' DVD'},Truent101,"Love this show, creepy!",Five Stars,1522540800,,"['Movies & TV', 'Boxed Sets', 'Documentary']",A Haunting Season 4,Movies & TV,"[""This 3DVD set of the fourth season of A Haun...",Haunting,$10.29,https://www.amazon.com/product-reviews/B0019BI...,2,2,5.0
175762,5.0,"03 31, 2018",A3K4MUNEA7T5Q1,B0019BI0W4,{'Format:': ' DVD'},Robyn Cano,I like that it tells a lot about what goes on ...,I like that it tells a lot about what goes on ...,1522454400,,"['Movies & TV', 'Boxed Sets', 'Documentary']",A Haunting Season 4,Movies & TV,"[""This 3DVD set of the fourth season of A Haun...",Haunting,$10.29,https://www.amazon.com/product-reviews/B0019BI...,2,2,5.0
155016,5.0,"05 16, 2018",A3RV0307UNWEEX,B00005JMMT,{'Format:': ' DVD'},Linda C.,great series. too bad they did not continue t...,Not the run of the mill cop/firefighter/crime ...,1526428800,,"['Movies & TV', 'Studio Specials', 'Lionsgate ...",Boomtown - Season One,Movies & TV,"['I\'d say a lot more about NBC, but I don\'t ...",Donnie Wahlberg,$19.98,https://www.amazon.com/product-reviews/B00005J...,5,2,5.0
129966,5.0,"01 30, 2018",A6IDZ37BVM6DJ,B00XZZMTOM,{'Format:': ' DVD'},george botelho,1,Five Stars,1517270400,,"['Movies & TV', 'Independently Distributed', '...",I Am Chris Farley,Movies & TV,"[""I Am Chris Farley is a documentary film that...",Christina Applegate,$5.45,https://www.amazon.com/product-reviews/B00XZZM...,16,2,5.0


## encode reviewerID and movieID to index

In [270]:
from sklearn.preprocessing import LabelEncoder
reviewer_enc = LabelEncoder()
data_2018_1['reviewer'] = reviewer_enc.fit_transform(data_2018_1['reviewerID'].astype(str).values)
n_reviewers = data_2018_1['reviewer'].nunique()
movie_enc = LabelEncoder()
data_2018_1['movie'] = movie_enc.fit_transform(data_2018_1['movieID'].astype(str).values)
n_movies = data_2018_1['movie'].nunique()
data_2018_1['rating'] = data_2018_1['rating'].values.astype(np.float32)
min_rating = min(data_2018_1['rating'])
max_rating = max(data_2018_1['rating'])
print(n_reviewers, n_movies)

35992 20298


In [271]:
X = data_2018_1[['reviewer','movie']].values
y = data_2018_1['rating'].values
X.shape, y.shape

((116700, 2), (116700,))

In [272]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((93360, 2), (23340, 2), (93360,), (23340,))

# Create the model

In [273]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Reshape, Dot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [274]:
n_factors = 50
X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

In [275]:
def RecommenderV4(n_reviewers, n_movies, n_factors, loss, opt, metrics):
    reviewer = Input(shape=(1,))
    r = Embedding(n_reviewers, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(reviewer)
    r = Reshape((n_factors,))(r)
    
    movie = Input(shape=(1,))
    m = Embedding(n_movies, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(movie)
    m = Reshape((n_factors,))(m)
    
    x = Dot(axes=1)([r, m])
    model = Model(inputs=[reviewer, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss=loss, optimizer=opt, metrics=metrics)
    return model

In [276]:
loss='mean_squared_error'
opt = Adam(lr=0.001)
n_factors=50
metrics=['mae']

model1 = RecommenderV4(n_reviewers, n_movies,n_factors, loss, opt, metrics)

In [206]:
## base model
history = model1.fit(x=X_train_array, 
                     y=y_train, 
                     batch_size=64, 
                     epochs=10,
                     verbose=1,
                     validation_data=(X_test_array, y_test))

Train on 93360 samples, validate on 23340 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [277]:
from tensorflow.keras.layers import Add, Activation, Lambda
class EmbeddingLayer:
    def __init__(self, n_items, n_factors):
        self.n_items = n_items
        self.n_factors = n_factors
    
    def __call__(self, x):
        x = Embedding(self.n_items, self.n_factors, embeddings_initializer='he_normal',
                      embeddings_regularizer=l2(1e-6))(x)
        x = Reshape((self.n_factors,))(x)
        return x
def RecommenderV2(n_reviewers, n_movies, n_factors, min_rating, max_rating):
    reviewer = Input(shape=(1,))
    r = EmbeddingLayer(n_reviewers, n_factors)(reviewer)
    rb = EmbeddingLayer(n_reviewers, 1)(reviewer)
    
    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    mb = EmbeddingLayer(n_movies, 1)(movie)
    x = Dot(axes=1)([r, m])
    x = Add()([x, rb, mb])
    x = Activation('sigmoid')(x)
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)
    model = Model(inputs=[reviewer, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mae'])
    return model

In [146]:
model = RecommenderV2(n_reviewers, n_movies, n_factors, min_rating, max_rating)
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 1, 50)        1799600     input_19[0][0]                   
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 1, 50)        1014900     input_20[0][0]                   
____________________________________________________________________________________________

In [147]:
history = model.fit(x=X_train_array, 
                    y=y_train, 
                    batch_size=64, 
                    epochs=10,
                    verbose=1, 
                    validation_data=(X_test_array, y_test))

Train on 93360 samples, validate on 23340 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Add layers to model as deep learning

In [278]:
from tensorflow.keras.layers import Concatenate, Dense, Dropout
def RecommenderNet(n_reviewers, n_movies, n_factors, min_rating, max_rating):
    reviewer = Input(shape=(1,))
    r = EmbeddingLayer(n_reviewers, n_factors)(reviewer)
    
    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    
    x = Concatenate()([r, m])
    x = Dropout(0.05)(x)
    
    x = Dense(10, kernel_initializer='he_normal')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    
    x = Dense(10, kernel_initializer='he_normal')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    
    x = Dense(1, kernel_initializer='he_normal')(x)
    x = Activation('sigmoid')(x)
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)
    model = Model(inputs=[reviewer, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [164]:
model2 = RecommenderNet(n_reviewers, n_movies, n_factors, min_rating, max_rating)
model2.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_35 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_36 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_36 (Embedding)        (None, 1, 50)        1799600     input_35[0][0]                   
__________________________________________________________________________________________________
embedding_37 (Embedding)        (None, 1, 50)        1014900     input_36[0][0]                   
___________________________________________________________________________________________

In [165]:
history = model.fit(x=X_train_array, 
                    y=y_train, 
                    batch_size=64, 
                    epochs=15,
                    verbose=1, 
                    validation_data=(X_test_array, y_test))

Train on 93360 samples, validate on 23340 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [203]:
from tensorflow.keras.layers import Concatenate, Dense, Dropout, Flatten
from tensorflow.keras import regularizers
def RecommenderNet_2(n_reviewers, n_movies, n_factors, min_rating, max_rating):
    reviewer = Input(shape=(1,))
    r = EmbeddingLayer(n_reviewers, n_factors)(reviewer)
    
    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    
    x = Concatenate()([r, m])
    x = Dropout(0.05)(x)
    
    x = Dense(10, kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(0.01))(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    
    x = Dense(10, kernel_initializer='he_normal')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    
    x = Dense(1, kernel_initializer='he_normal')(x)
    x = Activation('sigmoid')(x)
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)
    model = Model(inputs=[reviewer, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mae'])
    return model

In [202]:
model2 = RecommenderNet_2(n_reviewers, n_movies, n_factors, min_rating, max_rating)
model2.summary()

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_57 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_58 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_58 (Embedding)        (None, 1, 50)        1799600     input_57[0][0]                   
__________________________________________________________________________________________________
embedding_59 (Embedding)        (None, 1, 50)        1014900     input_58[0][0]                   
___________________________________________________________________________________________

In [181]:
history = model2.fit(x=X_train_array, 
                    y=y_train, 
                    batch_size=64, 
                    epochs=15,
                    verbose=1, 
                    validation_data=(X_test_array, y_test))

Train on 93360 samples, validate on 23340 samples
Epoch 1/15


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [204]:
model3 = RecommenderNet_2(n_reviewers, n_movies, n_factors, min_rating, max_rating)
model3.summary()

Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_59 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_60 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_60 (Embedding)        (None, 1, 50)        1799600     input_59[0][0]                   
__________________________________________________________________________________________________
embedding_61 (Embedding)        (None, 1, 50)        1014900     input_60[0][0]                   
___________________________________________________________________________________________

In [205]:
history = model3.fit(x=X_train_array, 
                    y=y_train, 
                    batch_size=64, 
                    epochs=5,
                    verbose=1, 
                    validation_data=(X_test_array, y_test))

Train on 93360 samples, validate on 23340 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [207]:
model3.save('final_model.h5')

# Cross validation

In [208]:
from tensorflow.keras.models import load_model
model_cv = load_model('final_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [220]:
model_cv.evaluate(X_test_array, y_test)



[1.5560174567329648, 0.7738759]

In [210]:
from sklearn.model_selection import StratifiedKFold
import numpy

In [212]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cvscores = []
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [216]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((93360, 2), (93360,), (23340, 2), (23340,))

In [232]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cvscores = []
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_array = [X_train[:, 0], X_train[:, 1]]
    X_test_array = [X_test[:, 0], X_test[:, 1]]
      # create model
    model_cv.fit(x=X_train_array, 
               y=y_train, 
               epochs=5, 
               verbose=1,
               validation_data=(X_test_array, y_test))
    #evaluate the model
    scores = model_cv.evaluate(x=X_test_array, y=y_test, verbose=1)
    print(model_cv.metrics_names[1], scores[1])
    cvscores.append(scores[1])


Train on 93360 samples, validate on 23340 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mae 0.57497174
Train on 93360 samples, validate on 23340 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mae 0.57323205
Train on 93360 samples, validate on 23340 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mae 0.5948964
Train on 93360 samples, validate on 23340 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mae 0.62406003
Train on 93360 samples, validate on 23340 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mae 0.6115439
(0.59574085, 0.019952081)


In [233]:
print((numpy.mean(cvscores), numpy.std(cvscores)))

(0.59574085, 0.019952081)


# Make recommendations

In [238]:
movie_data = np.array(list(set(data_2018_1.movie)))
user = np.array([10000 for i in range(len(movie_data))])
predictions = model_cv.predict([user, movie_data])
predictions = np.array([a[0] for a in predictions])
recommended_movie_ids = (-predictions).argsort()[:5]
print(recommended_movie_ids)
print(predictions[recommended_movie_ids])

[10927 11458 19433  5131  5000]
[5. 5. 5. 5. 5.]


In [280]:
recommend = data_2018_1[data_2018_1['movie'].isin(recommended_movie_ids)]
recommend = (recommend[['title', 'average_rating','category', 'description', 'price', 'links']]).drop_duplicates()
recommend

Unnamed: 0,title,average_rating,category,description,price,links
5687,The King's Speech,4.632353,"['Movies & TV', 'Blu-ray', 'Movies']",['After the death of his father King George V ...,$5.00,https://www.amazon.com/product-reviews/B003UES...
34940,Under the Tuscan Sun VHS,4.973684,"['Movies & TV', 'Genre for Featured Categories...","[""This is a nice condition VHS, ideal for coll...",$14.95,https://www.amazon.com/product-reviews/B0000VD...
37254,Guardians of the Galaxy Vol. 1,4.764706,"['Movies & TV', 'Blu-ray', 'Movies']","[""From Marvel the studio that brought you Marv...",$32.47,https://www.amazon.com/product-reviews/B01A9R6...
61883,School of Rock VHS,4.888889,"['Movies & TV', 'Paramount Home Entertainment'...",,$15.92,https://www.amazon.com/product-reviews/B00018Y...
64222,The Lost Valentine Hallmark Hall of Fame,4.945946,"['Movies & TV', 'Genre for Featured Categories...","[""During World War II, Navy Lt. Neil Thomas bi...",$8.99,https://www.amazon.com/product-reviews/B004LO0...


In [243]:
data_2018_1[data_2018_1['reviewer']==10000][['title', 'reviewer','rating','summary']].drop_duplicates()

Unnamed: 0,title,reviewer,rating,summary
75301,Lone Survivor [DVD],10000,5.0,Five Stars
78851,Fury 2014,10000,5.0,"very good movie, lots of action to keep your a..."
