In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Model, Sequential
import matplotlib.pyplot as plt
import pickle

In [2]:
rating= pd.read_csv('ratings.csv')


In [3]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
#extracts ids and converts it into a list, encodes id to index in list for user and movies
user_ids = rating["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = rating["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
#adds encoded index columns to df
rating["user"] = rating["userId"].map(user2user_encoded)
rating["movie"] = rating["movieId"].map(movie2movie_encoded)

In [5]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,1
2,1,6,4.0,964982224,0,2
3,1,47,5.0,964983815,0,3
4,1,50,5.0,964982931,0,4


In [6]:
num_users = len(user2user_encoded)
num_movies = len(movie2movie_encoded)
rating["rating"] = rating["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(rating["rating"])
max_rating = max(rating["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating))

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [7]:
#Random Train-Test split
df = rating.sample(frac=1, random_state=42) # randomly shuffles dataset to ensure that model does not learn patterns specific to the data order
x = df[["user", "movie"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
# Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

In [8]:
x_train.shape



(90752, 2)

In [9]:
y_train.shape

(90752,)

In [10]:
#defining the model
embedding_size = 50

#vectors capture the underlying preferences of users and the characteristics of movies.
user_ips= layers.Input(shape=[1])
user_embedding = layers.Embedding(num_users,embedding_size)(user_ips)
# Embedding layer will create 610 vectors look-up table of 50 dimension each
user_vect= layers.Flatten()(user_embedding)

movie_ips= layers.Input(shape=[1])
movie_embedding = layers.Embedding(num_movies, embedding_size)(movie_ips)
movie_vect= layers.Flatten()(movie_embedding)

#calculates the dot product
prod = layers.dot(inputs=[user_vect, movie_vect],axes=1)

dense1= layers.Dense(150, activation='relu')(prod)
dense2= layers.Dense(50, activation='relu')(dense1)
dense3= layers.Dense(1,activation='relu')(dense2)

#compiling model
model = Model([user_ips, movie_ips], dense3)
model.compile(optimizer='adam',loss='mean_squared_error')

In [11]:
history = model.fit([x_train[:,0], x_train[:,1]], y_train, batch_size=64,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Save the Model in the native Keras format
model.save('movie_recommendation_model.h5')  # Replace with desired filename



  saving_api.save_model(


In [16]:
pred=model.predict([x_train[4:5,0], x_train[4:5,1]])
pred



array([[0.7826633]], dtype=float32)

In [17]:
movie_df = pd.read_csv('movies.csv')

In [18]:
user_id = df.userId.sample(1).iloc[0]
movies_watched_by_user = df[df.userId == user_id]
movies_not_watched = movie_df[ ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)]["movieId"]

In [19]:
user_id

561

In [20]:
movies_watched_by_user.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
86670,561,1278,3.5,1491094620,560,87
86688,561,1394,4.0,1491092758,560,1474
87049,561,106002,2.5,1491091525,560,3103
86999,561,79132,3.5,1491091990,560,244
86748,561,2403,4.5,1491092025,560,1863


In [21]:
## basically inner join between movies_not_watched and movie ids from df table (to be sure that the movie ids are not out of scope)
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)

In [22]:
movies_not_watched_index = [[movie2movie_encoded.get(x)] for x in movies_not_watched]  # get the indexes of the movies not watched by the user

In [23]:
user_encoder = user2user_encoded.get(user_id)
user_encoder

560

In [24]:
user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched_index))
user_movie_array ## 1st column is user_index and 2nd col is the list of movie indexes not watched by the user

array([[ 560, 5804],
       [ 560,    1],
       [ 560,  482],
       ...,
       [ 560, 3870],
       [ 560, 2989],
       [ 560, 7869]])

In [25]:

ratings = model.predict([user_movie_array[:,0],user_movie_array[:,1]]).flatten()
ratings



array([0.68421036, 0.5925073 , 0.5387807 , ..., 0.60830903, 0.67212945,
       0.5878357 ], dtype=float32)

In [28]:
top_ratings_indices = ratings.argsort()[-10:][::-1] # indices of highest 10 ratings

In [29]:
recommended_movie_ids = [movie_encoded2movie.get(movies_not_watched_index[x][0]) for x in top_ratings_indices]

In [30]:
print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

Showing recommendations for user: 561
Movies with high ratings from user
--------------------------------
Princess Bride, The (1987) : Action|Adventure|Comedy|Fantasy|Romance
Jaws (1975) : Action|Horror
Ghostbusters (a.k.a. Ghost Busters) (1984) : Action|Comedy|Sci-Fi
Predator (1987) : Action|Sci-Fi|Thriller
WALL·E (2008) : Adventure|Animation|Children|Romance|Sci-Fi


In [31]:
print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

--------------------------------
Top 10 movie recommendations
--------------------------------
Blade Runner (1982) : Action|Sci-Fi|Thriller
Monty Python's Life of Brian (1979) : Comedy
Evil Dead II (Dead by Dawn) (1987) : Action|Comedy|Fantasy|Horror
Gandhi (1982) : Drama
Pi (1998) : Drama|Sci-Fi|Thriller
Blazing Saddles (1974) : Comedy|Western
Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku no Tobira) (2001) : Action|Animation|Sci-Fi|Thriller
Children of Men (2006) : Action|Adventure|Drama|Sci-Fi|Thriller
Some Guy Who Kills People (2011) : Comedy|Thriller
Pride and Prejudice and Zombies (2016) : Comedy|Horror|Romance|Thriller
