In [1]:
!pip install tensorflow_recommenders

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from typing import List, Union, Dict, Text
from google.colab import files
import io

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_recommenders
Successfully installed tensorflow_recommenders-0.7.3


In [2]:
movie = files.upload()
user = files.upload()
rating = files.upload()

Saving movies_cleaned.csv to movies_cleaned.csv


Saving users_cleaned.csv to users_cleaned.csv


Saving ratings_cleaned.csv to ratings_cleaned.csv


In [3]:
movies_df = pd.read_csv(io.BytesIO(movie['movies_cleaned.csv']))
users_df = pd.read_csv(io.BytesIO(user['users_cleaned.csv']))
ratings_df = pd.read_csv(io.BytesIO(rating['ratings_cleaned.csv']))

In [None]:
#movies_df = pd.read_csv('data/movies_cleaned.csv')
#users_df = pd.read_csv('data/users_cleaned.csv')
#ratings_df = pd.read_csv('data/ratings_cleaned.csv' )

In [4]:
# Merge all data into a single DataFrame
merged_df = pd.merge(ratings_df, users_df, on='userId')
merged_df = pd.merge(merged_df, movies_df, left_on='movieId', right_on='ml_movieId')

merged_df['userId']=merged_df['userId'].astype(str)
merged_df['Title']=merged_df['Title'].astype(str)

# Convert the merged DataFrame to a TensorFlow Dataset
ratings = tf.data.Dataset.from_tensor_slices(dict(merged_df))

In [5]:
# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_title": x["Title"],
    "user_id": x["userId"],
    "user_rating": x["rating"],
})
movies = tf.data.Dataset.from_tensor_slices(movies_df["Title"].unique())

In [6]:
# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

embedding_dimension = 32

user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

movie_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_movie_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])

In [19]:
class MovielensModel(tfrs.models.Model):
    def __init__(self, user_model, movie_model, rating_weight: float, retrieval_weight: float):
        super().__init__()

        self.movie_model = movie_model
        self.user_model = user_model

        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

        self.rating_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        self.retrieval_task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features):
        user_embeddings = self.user_model(features["user_id"])
        movie_embeddings = self.movie_model(features["movie_title"])

        return (
            user_embeddings,
            movie_embeddings,
            self.rating_model(tf.concat([user_embeddings, movie_embeddings], axis=1)),
        )

    def compute_loss(self, features, training=False):
        ratings = features.pop("user_rating")

        user_embeddings, movie_embeddings, rating_predictions = self(features)

        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        return (
            self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss
        )

    def recommend(self, user_id, k):
        user_id_tensor = tf.convert_to_tensor([user_id])
        user_embedding = self.user_model(user_id_tensor)
        movie_embeddings = self.movie_model(unique_movie_titles)
        scores = tf.linalg.matmul(user_embedding, movie_embeddings, transpose_b=True)
        scores = tf.reshape(scores, (1, -1))  # Reshape scores tensor
        top_k_movie_indices = tf.nn.top_k(scores, k=k).indices.numpy()[0]  # Update this line
        return [unique_movie_titles[i] for i in top_k_movie_indices]

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            "user_model": self.user_model,
            "movie_model": self.movie_model,
            "rating_weight": self.rating_weight,
            "retrieval_weight": self.retrieval_weight
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


In [8]:
def add_new_user_ratings(user_id: str, watched_movies: List[str], user_ratings: List[Union[int, float]]):
    # Declare global variables at the beginning of the function
    global ratings
    global unique_user_ids

    # Ensure the user is new
    assert user_id not in unique_user_ids, "The new user id already exists in the data."

    # Ensure the movies exist in the dataset
    for movie in watched_movies:
        assert movie in unique_movie_titles, f"The movie {movie} does not exist in the data."

    # Convert user_ratings to integers
    user_ratings = [int(rating) for rating in user_ratings]

    # Create new user data
    new_user_data = pd.DataFrame({
        'user_id': [user_id] * len(watched_movies),
        'movie_title': watched_movies,
        'user_rating': user_ratings,
    })

    # Convert the DataFrame to a TensorFlow Dataset
    new_ratings = tf.data.Dataset.from_tensor_slices(dict(new_user_data))

    # Update the global 'ratings' variable
    ratings = ratings.concatenate(new_ratings)

    # Update the global 'unique_user_ids' variable
    unique_user_ids = np.concatenate([unique_user_ids, np.array([user_id])])

In [9]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [22]:
# Define model checkpoints and early stopping
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="model_checkpoint",
    save_weights_only=True,
    monitor="val_loss",
    mode="min",
    save_best_only=True
)

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    mode="min",
    restore_best_weights=True
)

In [11]:
# Rating-focused model
rating_model = MovielensModel(user_model, movie_model, rating_weight=1.0, retrieval_weight=0.0)
rating_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
rating_model.fit(
    cached_train,
    epochs=10,
    validation_data=cached_test,
    callbacks=[checkpoint_callback, early_stopping_callback]
)
rating_metrics = rating_model.evaluate(cached_test, return_dict=True)
print(f"Retrieval top-100 accuracy: {rating_metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {rating_metrics['root_mean_squared_error']:.3f}.")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Retrieval top-100 accuracy: 0.053.
Ranking RMSE: 0.947.


In [12]:
# Retrieval-focused model
retrieval_model = MovielensModel(user_model, movie_model, rating_weight=0.0, retrieval_weight=1.0)
retrieval_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
retrieval_model.fit(
    cached_train,
    epochs=10,
    validation_data=cached_test,
    callbacks=[checkpoint_callback, early_stopping_callback]
)
retrieval_metrics = retrieval_model.evaluate(cached_test, return_dict=True)
print(f"Retrieval top-100 accuracy: {retrieval_metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {retrieval_metrics['root_mean_squared_error']:.3f}.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Retrieval top-100 accuracy: 0.555.
Ranking RMSE: 3.893.


In [23]:
# Balanced model
balanced_model = MovielensModel(user_model, movie_model, rating_weight=1.0, retrieval_weight=1.0)
balanced_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
balanced_model.fit(
    cached_train,
    epochs=10,
    validation_data=cached_test,
    callbacks=[checkpoint_callback, early_stopping_callback]
)
balanced_metrics = balanced_model.evaluate(cached_test, return_dict=True)
print(f"Retrieval top-100 accuracy: {balanced_metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {balanced_metrics['root_mean_squared_error']:.3f}.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Retrieval top-100 accuracy: 0.524.
Ranking RMSE: 1.231.


In [16]:
rating_model.save_weights('rating_model_weights')
retrieval_model.save_weights('retrieval_model_weights')
balanced_model.save_weights('balanced_model_weights')

In [27]:
new_model = MovielensModel(user_model, movie_model, rating_weight=1.0, retrieval_weight=1.0)
new_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Load the saved weights into the new model:
new_model.load_weights('balanced_model_weights')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fb42d518c10>

In [29]:
rec = new_model.recommend("42",k=5)
print(rec)

[b'Alien: Resurrection', b'Demolition Man', b'The Fifth Element', b'Star Trek VI: The Undiscovered Country', b'The Day the Earth Stood Still']


In [14]:
def get_unique_movie_title(movie_title):
    for index, unique_title in enumerate(unique_movie_titles):
        # Convert unique_title to string and remove the 'b' prefix and single quotes
        unique_title_str = str(unique_title)[2:-1]
        if movie_title == unique_title_str:
            return index
    return None


In [None]:
movie_title = '8 Seconds'
unique_title_index = get_unique_movie_title(movie_title)
if unique_title_index is not None:
    print(f"The corresponding unique title index for '{movie_title}' is {unique_title_index}.")
else:
    print(f"No matching unique title found for '{movie_title}'.")


The corresponding unique title index for '8 Seconds' is 20.


In [None]:
m1 = get_unique_movie_title('8 Seconds')
m2 = get_unique_movie_title('The Lion King')
m3 = get_unique_movie_title('The Shawshank Redemption')
m4 = get_unique_movie_title('The Godfather')
m5 = get_unique_movie_title('Toy Story')
m6 = get_unique_movie_title('The Matrix')
m7 = get_unique_movie_title('The Terminator')
m8 = get_unique_movie_title('Get Shorty')
m9 = get_unique_movie_title('The Silence of the Lambs')
m10 = get_unique_movie_title('The Usual Suspects')


# Add a new user's ratings
add_new_user_ratings(
    user_id="new_user_12",
    watched_movies=[
        unique_movie_titles[m1],
        unique_movie_titles[m2],
        unique_movie_titles[m3],
        unique_movie_titles[m4],
        unique_movie_titles[m5],
        unique_movie_titles[m6],
        unique_movie_titles[m7],
        unique_movie_titles[m8],
        unique_movie_titles[m9],
        unique_movie_titles[m10],
    ],
    user_ratings=[2,5,3.5,4,2,3,1.5,5,4.5,4 ]
)


In [None]:
def recommend_for_new_user(user_id, k):
    # If the user is new, recommend the top k most popular movies
    if user_id not in unique_user_ids:
        popular_movie_ids = ratings_df['movieId'].value_counts().index[:k]
        popular_movie_titles = movies_df[movies_df['movieId'].isin(popular_movie_ids)]['Title'].tolist()
        return popular_movie_titles

    # Otherwise, use the existing recommendation method
    else:
        recommendations = balanced_model.recommend(user_id, k)
        return [unique_movie_titles[i] for i in recommendations]


In [33]:
# Replace "42" and "Dances with Wolves (1990)" with valid user_id and movie_title from your data
user_id = "42"  # Replace with a valid user_id
recommendations = balanced_model.recommend(user_id, k=5)
print(f"Top 5 movie recommendations for user {user_id}: {recommendations}")

Top 5 movie recommendations for user 42: [b'Alien: Resurrection', b'Demolition Man', b'The Fifth Element', b'Star Trek VI: The Undiscovered Country', b'The Day the Earth Stood Still']


In [34]:
# Replace "42" and "Dances with Wolves (1990)" with valid user_id and movie_title from your data
user_id = "42"  # Replace with a valid user_id
recommendations1 = rating_model.recommend(user_id, k=5)
print(f"Top 5 movie recommendations for user {user_id}: {recommendations1}")


Top 5 movie recommendations for user 42: [b'Alien: Resurrection', b'Demolition Man', b'The Fifth Element', b'Star Trek VI: The Undiscovered Country', b'The Day the Earth Stood Still']


In [35]:
# Replace "42" and "Dances with Wolves (1990)" with valid user_id and movie_title from your data
user_id = "42"  # Replace with a valid user_id
recommendations2 = retrieval_model.recommend(user_id, k=5)
print(f"Top 5 movie recommendations for user {user_id}: {recommendations2}")


Top 5 movie recommendations for user 42: [b'Alien: Resurrection', b'Demolition Man', b'The Fifth Element', b'Star Trek VI: The Undiscovered Country', b'The Day the Earth Stood Still']
