In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import pairwise_distances

In [2]:
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [5]:
df = df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

In [6]:
embedding_dim = 32
hidden_units = [64, 32]

user_input = tf.keras.Input(shape=(1,), dtype=tf.int32, name='user_input')
movie_input = tf.keras.Input(shape=(1,), dtype=tf.int32, name='movie_input')

user_embedding = tf.keras.layers.Embedding(input_dim=num_users, output_dim=embedding_dim,name='user_embedding')(user_input)
movie_embedding = tf.keras.layers.Embedding(input_dim=num_movies, output_dim=embedding_dim)(movie_input)

user_embedding = tf.keras.layers.Flatten()(user_embedding)
movie_embedding = tf.keras.layers.Flatten()(movie_embedding)

concatenated = tf.keras.layers.Concatenate()([user_embedding, movie_embedding])

for units in hidden_units:
    concatenated = tf.keras.layers.Dense(units, activation='relu')(concatenated)

output = tf.keras.layers.Dense(1, activation='sigmoid')(concatenated)

model = tf.keras.Model(inputs=[user_input, movie_input], outputs=output)

model.compile(optimizer="Adam",loss="mse")

In [7]:
history = model.fit(
    [x_train[:, 0], x_train[:, 1]],
    y_train,
    validation_data=([x_val[:, 0], x_val[:, 1]], y_val),
    batch_size=64,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
user_embeddings = model.get_layer('user_embedding').get_weights()[0]

similarity_matrix = 1 - cosine_distances(user_embeddings)

In [9]:
similarity_matrix_pd = pd.DataFrame(similarity_matrix)
similarity_matrix_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,1.000000,0.143581,-0.738277,-0.285062,-0.010556,-0.234115,-0.314939,0.240821,-0.000552,-0.321879,...,0.595977,-0.228224,-0.112262,-0.084042,-0.548538,-0.097333,0.354957,-0.456432,-0.565634,0.497229
1,0.143581,1.000000,-0.152356,0.070405,0.096554,0.052328,0.234954,-0.097398,-0.475009,0.349591,...,0.403217,-0.132030,-0.281786,0.296772,0.130641,-0.208861,0.062222,-0.392097,-0.147235,-0.003285
2,-0.738277,-0.152356,1.000000,0.371596,0.010429,0.093523,0.179522,-0.375454,0.017000,0.432381,...,-0.726684,0.056878,-0.031882,-0.128189,0.367006,-0.134183,-0.312046,0.295997,0.583984,-0.520096
3,-0.285062,0.070405,0.371596,1.000000,0.419168,-0.129285,0.485201,-0.257887,0.398928,-0.142566,...,-0.378046,-0.270470,-0.212484,-0.321349,0.074074,0.246859,-0.525457,-0.070195,0.082530,0.010848
4,-0.010556,0.096554,0.010429,0.419168,1.000000,0.004247,0.115516,-0.095719,0.300869,-0.090539,...,-0.126425,0.233881,-0.063960,0.035397,0.197020,0.552481,-0.430446,-0.228892,0.146035,0.181874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,-0.097333,-0.208861,-0.134183,0.246859,0.552481,0.161337,-0.048229,0.226113,0.414813,-0.212626,...,-0.193457,0.256181,-0.055319,0.121058,0.153550,1.000000,-0.378490,0.059524,0.155182,0.230360
606,0.354957,0.062222,-0.312046,-0.525457,-0.430446,0.167511,-0.448584,0.054585,-0.232753,0.113324,...,0.408652,0.074524,0.084662,-0.000135,-0.345567,-0.378490,1.000000,-0.094897,-0.100772,-0.217596
607,-0.456432,-0.392097,0.295997,-0.070195,-0.228892,0.283312,0.282718,-0.053120,0.056655,0.019500,...,-0.390694,-0.013419,0.384403,0.109313,0.145178,0.059524,-0.094897,1.000000,0.165452,-0.315494
608,-0.565634,-0.147235,0.583984,0.082530,0.146035,0.197078,-0.195916,-0.076822,-0.125595,0.462991,...,-0.419722,0.547661,-0.185671,0.241583,0.503989,0.155182,-0.100772,0.165452,1.000000,-0.417621


In [10]:
def top_n_similar_users(user_id,n,similarity_matrix):
    similar_user_ids = np.argsort(np.delete(similarity_matrix[user_id - 1],user_id - 1))[-n:] + 1
    movies_watched_by_user = df[df.userId == user_id]
    similarity_matrix_copy = np.copy()
    df_user = movies_watched_by_user[['movieId', 'rating']]
    for users in similar_user_id:
        movies_watched_by_similar_user = df[df.userId == similar_user_id]
        df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]
        corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']
        if(len(corresponding_movie_ids) < 10):
            np.delete
    return

In [37]:
def nth_similar_user(user_id,n,similarity_matrix):
     return np.argsort(np.delete(similarity_matrix[user_id - 1],user_id - 1))[-(n + 1)] + 1

In [38]:
top_n_similar_users(1,10,similarity_matrix)

array([388, 105, 504, 170, 522, 200,  51, 347,  52,  24], dtype=int64)

In [39]:
nth_similar_user(1,5,similarity_matrix)

522

In [18]:
rmse = 0
target_user_id = 1
n = 10
similar_users = top_n_similar_users(target_user_id,n,similarity_matrix)
movies_watched_by_user = df[df.userId == target_user_id]
df_user = movies_watched_by_user[['movieId', 'rating']]

for similar_user_id in similar_users:
    movies_watched_by_similar_user = df[df.userId == similar_user_id]
    df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]  
    corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']
    if(len(corresponding_movie_ids) < 10):
        while(len(corresponding_movie_ids) >= 10):
            
    for movie_id in corresponding_movie_ids:
        rmse += math.pow(df_user[df_user.movieId == movie_id].iloc[0]['rating'] - 
                         df_similar_user[df_similar_user.movieId == movie_id].iloc[0]['rating'],2)
    rmse = math.sqrt(rmse*1/n)
    print(str(rmse) + str(corresponding_movie_ids.shape))

0.31622776601683794(1,)
1.8252733429822734(35,)
0.7297447048785125(6,)
1.0358448100405055(12,)
2.535465338158668(44,)
2.4957056184205433(54,)
3.350309024827718(40,)
1.354633124680912(13,)
0.8862636811175844(8,)
1.4278047373894507(25,)


In [15]:
df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]

Unnamed: 0,movieId,rating
3904,50,4.0
3915,733,3.0
3935,2115,4.0
3944,2916,3.5
3912,552,3.5
3922,1265,4.5
3913,593,4.5
3942,2617,3.5
3908,316,3.5
3947,3578,4.0


In [17]:
float(df[(df['movieId'] ==  50) & (df['userId'] == similar_users[9])]['rating'].iloc[0])

IndexError: single positional indexer is out-of-bounds

In [25]:
similar_user_id = similar_users[0]
similar_user_id

388

In [24]:
tmp_user = []
tmp_similar_user = []
str_target = 'user ' + str(target_user_id)
str_similar = 'user ' + str(similar_users[0])
comparison_df = pd.DataFrame(index=corresponding_movie_ids,columns=[str_target,str_similar])
similar_user_id = similar_users[0]
movies_watched_by_similar_user = df[df.userId == similar_user_id]
df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]  
corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']


for i in corresponding_movie_ids:
    tmp_user.append(float(df[(df['movieId'] ==  i) & (df['userId'] == target_user_id)]['rating'].iloc[0]))
    tmp_similar_user.append(float(df[(df['movieId'] ==  i) & (df['userId'] == similar_user_id)]['rating'].iloc[0]))
comparison_df[str_target] = tmp_user
comparison_df[str_similar] = tmp_similar_user
comparison_df

Unnamed: 0_level_0,user 1,user 388
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1197,5.0,4.0


In [56]:
movie_df = pd.read_csv(movielens_dir / "movies.csv")

# Let us get a user and see the top recommendations.
movies_watched_by_user = df[df.userId == user_id]
movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)
ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

ValueError: in user code:

    File "C:\Users\debek\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 2341, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\debek\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 2327, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\debek\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 2315, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\debek\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 2283, in predict_step
        return self(x, training=False)
    File "C:\Users\debek\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\debek\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_5" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 2) dtype=int32>]


In [177]:
rmse = 0

for similar_user_id in similar_users:
    df_user = movies_watched_by_user[['movieId', 'rating']]
    movies_watched_by_similar_user = df[df.userId == similar_user_id]
    df_similar_user = movies_watched_by_similar_user[['movieId', 'rating']]  
    corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]['movieId']
    for movie_id in corresponding_movie_ids:
        rmse += math.pow(df_user[df_user.movieId == movie_id].iloc[0]['rating'] - 
                         df_similar_user[df_similar_user.movieId == movie_id].iloc[0]['rating'],2)
    rmse = math.sqrt(rmse*1/n)
    print(str(rmse) + str(corresponding_movie_ids.shape))

2.2135943621178655(21,)
4.729837146901761(73,)
5.536062112611289(58,)
2.480646329338612(31,)
1.0830810832684048(6,)
1.74163948862181(17,)
1.7955957086332608(41,)
2.4247803139384247(59,)
0.9578507354456864(8,)
4.875529209587874(145,)


In [178]:
corresponding_movie_ids = df_similar_user[df_similar_user.movieId.isin(df_user.movieId)]
corresponding_movie_ids

Unnamed: 0,movieId,rating
39677,2700,4.0
39755,3253,3.5
39409,1089,4.5
39340,590,4.0
39589,2096,3.0
...,...,...
39279,223,4.0
39341,592,3.0
39811,3703,4.0
39734,3034,3.5
