In [66]:
# !pip install wget
import os
import os.path
import numpy as np
import pandas as pd
from math import sqrt
from heapq import nlargest
from tqdm import trange
from tqdm import tqdm
from scipy import stats
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
from keras.utils.vis_utils import plot_model
import random

In [2]:
from zipfile import ZipFile

In [3]:
file_name = "ml-100k.zip"

In [4]:
with ZipFile(file_name, 'r') as zip: 
    # printing all the contents of the zip file 
    zip.printdir() 
  
    # extracting all the files 
    zip.extractall() 

File Name                                             Modified             Size
ml-100k/                                       2016-01-29 14:26:28            0
ml-100k/allbut.pl                              2000-07-19 16:09:28          716
ml-100k/mku.sh                                 2000-07-19 16:09:28          643
ml-100k/README                                 2016-01-29 14:26:28         6750
ml-100k/u.data                                 2000-07-19 16:09:30      1979173
ml-100k/u.genre                                2000-07-19 16:09:30          202
ml-100k/u.info                                 2000-07-19 16:09:30           36
ml-100k/u.item                                 2000-07-19 16:09:30       236344
ml-100k/u.occupation                           2000-07-19 16:09:30          193
ml-100k/u.user                                 2000-07-19 16:09:30        22628
ml-100k/u1.base                                2001-03-08 12:33:08      1586544
ml-100k/u1.test                         

In [5]:
MOVIELENS_DIR = "ml-100k"

In [6]:
def getData(folder_path, file_name):
    fields = ['userID', 'itemID', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(folder_path, file_name), sep='\t', names=fields)
    return data

In [7]:
rating_df = getData(MOVIELENS_DIR, 'u.data')

In [9]:
rating_df

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [13]:
user_ids = rating_df["userID"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = rating_df["itemID"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
rating_df["user"] = rating_df["userID"].map(user2user_encoded)
rating_df["movie"] = rating_df["itemID"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
rating_df["rating"] = rating_df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(rating_df["rating"])
max_rating = max(rating_df["rating"])

In [16]:
rating_df = rating_df.sample(frac=1, random_state=1)
x = rating_df[["user", "movie"]].values

y = rating_df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

train_indices = int(0.9 * rating_df.shape[0])
x_train, x_val, y_train, y_val = (x[:train_indices],x[train_indices:],y[:train_indices],y[train_indices:])

In [28]:
EMBEDDING_SIZE = 50

class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6)
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6)
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        x = dot_user_movie + user_bias + movie_bias
        return tf.nn.sigmoid(x)

model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001))

In [29]:
history = model.fit(x=x_train,y=y_train,batch_size=64,epochs=5,verbose=1,validation_data=(x_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
model.summary()

Model: "recommender_net_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     multiple                  47150     
_________________________________________________________________
embedding_17 (Embedding)     multiple                  943       
_________________________________________________________________
embedding_18 (Embedding)     multiple                  84100     
_________________________________________________________________
embedding_19 (Embedding)     multiple                  1682      
Total params: 133,875
Trainable params: 133,875
Non-trainable params: 0
_________________________________________________________________


In [34]:
fieldsMovies = ['movieID', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL', 'unknown', 'action', 'adventure',
          'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'filmNoir', 'horror',
          'musical', 'mystery', 'romance','sciFi', 'thriller', 'war', 'western']

moviesDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u.item'), sep='|', names=fieldsMovies, encoding='latin-1')

In [77]:
user_id = rating_df.userID.sample(1, random_state=1).iloc[0]
movies_watched_by_user = rating_df[rating_df.userID == user_id]
movies_not_watched = moviesDF[
    ~moviesDF["movieID"].isin(movies_watched_by_user.itemID.values)
]["movieID"]
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)
ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print('movie recommendations for user %i:' %user_id)

recommended_movies = moviesDF[moviesDF["movieID"].isin(recommended_movie_ids)]
num = 0
print('')
for row in recommended_movies.itertuples():
    num+=1
    print('%i: %s' %(num,row.movieTitle))

movie recommendations for user 279:

1: Silence of the Lambs, The (1991)
2: Godfather, The (1972)
3: Citizen Kane (1941)
4: Schindler's List (1993)
5: Vertigo (1958)
6: Casablanca (1942)
7: Maltese Falcon, The (1941)
8: It's a Wonderful Life (1946)
9: African Queen, The (1951)
10: Lawrence of Arabia (1962)
