In [5]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import pairwise_distances

In [6]:
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)

In [7]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [9]:
df = df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

In [10]:
embedding_dim = 32
hidden_units = [64, 32]

user_input = tf.keras.Input(shape=(1,), dtype=tf.int32, name='user_input')
movie_input = tf.keras.Input(shape=(1,), dtype=tf.int32, name='movie_input')

user_embedding = tf.keras.layers.Embedding(input_dim=num_users, output_dim=embedding_dim,name='user_embedding')(user_input)
movie_embedding = tf.keras.layers.Embedding(input_dim=num_movies, output_dim=embedding_dim)(movie_input)

user_embedding = tf.keras.layers.Flatten()(user_embedding)
movie_embedding = tf.keras.layers.Flatten()(movie_embedding)

concatenated = tf.keras.layers.Concatenate()([user_embedding, movie_embedding])

for units in hidden_units:
    concatenated = tf.keras.layers.Dense(units, activation='relu')(concatenated)

output = tf.keras.layers.Dense(1, activation='sigmoid')(concatenated)

model = tf.keras.Model(inputs=[user_input, movie_input], outputs=output)

model.compile(optimizer="Adam",loss="mse")

In [11]:
history = model.fit(
    [x_train[:, 0], x_train[:, 1]],
    y_train,
    validation_data=([x_val[:, 0], x_val[:, 1]], y_val),
    batch_size=64,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
user_embeddings = model.get_layer('user_embedding').get_weights()[0]

similarity_matrix = 1 - cosine_distances(user_embeddings)

In [13]:
similarity_matrix_pd = pd.DataFrame(similarity_matrix)
similarity_matrix_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,1.000000,0.225947,-0.769070,-0.205515,-0.189535,0.102655,-0.222066,0.096135,0.362717,-0.559246,...,0.720006,-0.251539,-0.242822,0.053190,-0.396200,0.290414,0.232280,-0.314602,-0.223891,0.699052
1,0.225947,1.000000,-0.332028,-0.187846,-0.091234,0.365738,-0.373242,-0.129939,-0.289996,0.222059,...,0.376428,-0.221044,-0.205119,-0.007603,0.172903,0.139593,0.485259,-0.272553,0.239804,-0.112188
2,-0.769070,-0.332028,1.000000,-0.101370,0.130735,-0.030238,0.335071,-0.346694,-0.027123,0.336903,...,-0.714929,0.182352,0.259883,-0.098996,0.182093,-0.213893,-0.214915,0.509538,0.105409,-0.421063
3,-0.205515,-0.187846,-0.101370,1.000000,0.187141,-0.426643,0.167918,0.329547,-0.077952,0.049050,...,-0.032692,-0.122625,0.339673,0.156016,0.039179,-0.076379,0.096193,-0.132524,-0.220047,-0.025277
4,-0.189535,-0.091234,0.130735,0.187141,1.000000,-0.408656,-0.046456,0.224678,-0.295336,-0.135134,...,-0.112030,-0.210297,0.632698,-0.218463,0.038634,0.091649,-0.113612,-0.269927,0.083578,0.131443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.290414,0.139593,-0.213893,-0.076379,0.091649,0.250862,-0.180549,-0.230092,0.378265,-0.327889,...,0.156175,0.019029,-0.047118,-0.191771,-0.040785,1.000000,-0.092524,-0.126006,0.038611,0.417340
606,0.232280,0.485259,-0.214915,0.096193,-0.113612,-0.017385,-0.127872,-0.035379,-0.215154,0.096215,...,0.438661,-0.329471,-0.065614,0.177347,-0.018688,-0.092524,1.000000,-0.064952,-0.164777,-0.010223
607,-0.314602,-0.272553,0.509538,-0.132524,-0.269927,0.239591,0.528065,-0.487909,0.181352,0.263709,...,-0.345564,0.201675,-0.086337,0.124913,0.347226,-0.126006,-0.064952,1.000000,-0.021313,-0.191431
608,-0.223891,0.239804,0.105409,-0.220047,0.083578,0.270878,-0.338530,0.234249,-0.347139,0.444078,...,0.000701,0.405256,0.200541,0.364551,0.648993,0.038611,-0.164777,-0.021313,1.000000,-0.134189


In [22]:
def top_n_similar_users(user_id,n,similarity_matrix):
    return np.argsort(np.delete(similarity_matrix[user_id - 1],user_id - 1))[-n:] + 1

In [23]:
def nth_similar_user(user_id,n,similarity_matrix):
     return np.argsort(np.delete(similarity_matrix[user_id - 1],user_id - 1))[-(n + 1)] + 1

In [26]:
rmse = 0
target_user_id = 1
n = 10
similar_users = top_n_similar_users(target_user_id,n,similarity_matrix)
movies_watched_by_user = df[df.userId == target_user_id]
df_user = movies_watched_by_user[['movieId', 'rating']]

for similar_user_id in similar_users:
    movies_watched_by_similar_user = df[df.userId == similar_user_id]
    df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]  
    corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']            
    for movie_id in corresponding_movie_ids:
        rmse += math.pow(df_user[df_user.movieId == movie_id].iloc[0]['rating'] - 
                         df_similar_user[df_similar_user.movieId == movie_id].iloc[0]['rating'],2)
    rmse = math.sqrt(rmse*1/n)
    print(str(rmse) + str(corresponding_movie_ids.shape))

2.715695122800054(88,)
1.5399901013577995(13,)
0.7609198447509304(3,)
1.5735602894312923(26,)
1.4941071008944202(20,)
1.448934336017144(25,)
0.3806487010377343(0,)
1.9008063736487664(10,)
0.8000503967656517(3,)
2.7440854650824136(61,)


In [30]:
tmp_user = []
tmp_similar_user = []

str_target = 'user ' + str(target_user_id)
str_similar = 'user ' + str(similar_users[0])

similar_user_id = similar_users[0]
movies_watched_by_similar_user = df[df.userId == similar_user_id]
df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]  
corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']

comparison_df = pd.DataFrame(index=corresponding_movie_ids,columns=[str_target,str_similar])

for i in corresponding_movie_ids:
    tmp_user.append(float(df[(df['movieId'] ==  i) & (df['userId'] == target_user_id)]['rating'].iloc[0]))
    tmp_similar_user.append(float(df[(df['movieId'] ==  i) & (df['userId'] == similar_user_id)]['rating'].iloc[0]))
    
comparison_df[str_target] = tmp_user
comparison_df[str_similar] = tmp_similar_user
comparison_df

Unnamed: 0_level_0,user 1,user 249
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2353,5.0,4.5
367,4.0,4.0
1408,3.0,4.0
3744,4.0,3.0
608,5.0,4.5
...,...,...
1270,5.0,4.5
3273,5.0,2.5
2797,4.0,4.0
1198,5.0,5.0


In [44]:
movie_df = pd.read_csv(movielens_dir / "movies.csv")

# Let us get a user and see the top recommendations.
movies_watched_by_user = df[df.userId == target_user_id]
movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(target_user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched))
users = user_movie_array[:, 0]
movies = user_movie_array[:, 1]
ratings = model.predict([users,movies]).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(target_user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

Showing recommendations for user: 1
Movies with high ratings from user
--------------------------------
Tommy Boy (1995) : Comedy
Tombstone (1993) : Action|Drama|Western
Three Caballeros, The (1945) : Animation|Children|Musical
Full Metal Jacket (1987) : Drama|War
Newton Boys, The (1998) : Crime|Drama
--------------------------------
Top 10 movie recommendations
--------------------------------
Secrets & Lies (1996) : Drama
Patton (1970) : Drama|War
Kolya (Kolja) (1996) : Comedy|Drama
Celebration, The (Festen) (1998) : Drama
Trial, The (Procès, Le) (1962) : Drama
Discreet Charm of the Bourgeoisie, The (Charme discret de la bourgeoisie, Le) (1972) : Comedy|Drama|Fantasy
Adam's Rib (1949) : Comedy|Romance
Dead Man's Shoes (2004) : Crime|Thriller
Band of Brothers (2001) : Action|Drama|War
Sherlock - A Study in Pink (2010) : Crime


In [177]:
rmse = 0

for similar_user_id in similar_users:
    df_user = movies_watched_by_user[['movieId', 'rating']]
    movies_watched_by_similar_user = df[df.userId == similar_user_id]
    df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]  
    corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']
    for movie_id in corresponding_movie_ids:
        rmse += math.pow(df_user[df_user.movieId == movie_id].iloc[0]['rating'] - 
                         df_similar_user[df_similar_user.movieId == movie_id].iloc[0]['rating'],2)
    rmse = math.sqrt(rmse*1/n)
    print(str(rmse) + str(corresponding_movie_ids.shape))

2.2135943621178655(21,)
4.729837146901761(73,)
5.536062112611289(58,)
2.480646329338612(31,)
1.0830810832684048(6,)
1.74163948862181(17,)
1.7955957086332608(41,)
2.4247803139384247(59,)
0.9578507354456864(8,)
4.875529209587874(145,)
