# Importing libraries

In [None]:
!pip install tensorflow-recommenders

Collecting tensorflow-recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl.metadata (4.6 kB)
Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.7.3


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Loading and viewing the ratings dataset

In [None]:
# Colab-only: mount Google Drive so the CSV files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw user/movie ratings table and preview the first rows.
ratings_path = '/content/drive/MyDrive/Data sets/ratings.csv'
df = pd.read_csv(ratings_path)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


# Applying simple data preprocessing

In [None]:
# Drop the unused timestamp column and keep the first 1,000,000 ratings.
# Written as a single non-mutating expression so the cell can be re-run safely:
# errors='ignore' avoids the KeyError an in-place drop raises on a second run,
# and .iloc makes the positional row slice explicit.
df = df.drop(columns=['timestamp'], errors='ignore').iloc[:1000000]
df.shape

(1000000, 3)

In [None]:
# Show column dtypes, non-null counts and memory usage of the trimmed ratings frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   userId   1000000 non-null  int64  
 1   movieId  1000000 non-null  int64  
 2   rating   1000000 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 22.9 MB


In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0


In [None]:
# Frequency of each distinct rating value, most common first.
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
4.0,269794
3.0,208032
5.0,144849
3.5,122558
4.5,82503
2.0,66423
2.5,48573
1.0,30815
1.5,13609
0.5,12844


# Building the matrix factorization model

In [None]:
class MatrixFactorizationModel(tfrs.Model):
    """Plain matrix-factorization rating model.

    Every user id and every item id owns one learned embedding vector. The
    score of a (user, item) pair is the dot product of the two vectors, and
    the training loss is the mean squared error against the observed rating.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        """Create the user and item embedding tables.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embedding_dim: Length of each embedding vector.
        """
        super().__init__()
        self.user_embedding = tf.keras.layers.Embedding(num_users, embedding_dim)
        self.item_embedding = tf.keras.layers.Embedding(num_items, embedding_dim)

    def _pair_scores(self, features):
        """Return the user/item dot product for each row of ``features``.

        Reads the ``'userId'`` and ``'movieId'`` entries of ``features`` and
        uses them as indices into the two embedding tables.
        """
        user_vectors = self.user_embedding(features['userId'])
        item_vectors = self.item_embedding(features['movieId'])
        # Sum over the embedding axis -> one scalar score per (user, item) row.
        return tf.reduce_sum(tf.multiply(user_vectors, item_vectors), axis=1)

    def call(self, features, training=False):
        """Predict a score per row; ``training`` is accepted but not used."""
        return self._pair_scores(features)

    def compute_loss(self, features, training=False):
        """Mean squared error between predicted scores and ``features['rating']``.

        The ratings are cast to float32 before the subtraction.
        """
        targets = tf.cast(features['rating'], tf.float32)
        residuals = self._pair_scores(features) - targets
        return tf.reduce_mean(tf.square(residuals))

# Building training data

In [None]:
# One tensor per column, keyed by the feature names the model reads.
training_data = {
    column: tf.convert_to_tensor(df[column].to_numpy())
    for column in ('userId', 'movieId', 'rating')
}

# The shuffle buffer is sized from the data itself so the whole dataset is
# always shuffled, even if the row limit applied to `df` is changed later
# (a hard-coded 1000000 only matched the row count by coincidence).
dataset = (
    tf.data.Dataset.from_tensor_slices(training_data)
    .shuffle(buffer_size=len(df))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

training_data['userId'].shape

TensorShape([1000000])

In [None]:
# Raw ids are used directly as embedding indices, so each table needs
# (largest id + 1) rows.
embedding_dim = 10
num_users, num_items = (df[id_column].max() + 1 for id_column in ('userId', 'movieId'))
print(num_users, num_items)

10184 176272


# Model training

In [None]:
# Build the model, attach an Adam optimizer (learning rate 0.01) and train for
# five passes over the shuffled, batched dataset.
mf_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model = MatrixFactorizationModel(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=embedding_dim,
)
model.compile(optimizer=mf_optimizer)
model.fit(dataset, epochs=5)

Epoch 1/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 20ms/step - loss: 2.1647 - regularization_loss: 0.0000e+00 - total_loss: 2.1647
Epoch 2/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 19ms/step - loss: 1.3018 - regularization_loss: 0.0000e+00 - total_loss: 1.3018
Epoch 3/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 20ms/step - loss: 1.2310 - regularization_loss: 0.0000e+00 - total_loss: 1.2310
Epoch 4/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m613s[0m 19ms/step - loss: 1.2116 - regularization_loss: 0.0000e+00 - total_loss: 1.2116
Epoch 5/5
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m583s[0m 19ms/step - loss: 1.1946 - regularization_loss: 0.0000e+00 - total_loss: 1.1946


<keras.src.callbacks.history.History at 0x79e434796290>

# Loading the movies dataset

In [None]:
# Read only the three columns used downstream instead of loading every column
# and dropping the rest in place. This keeps the cell re-runnable and avoids
# dtype inference on unused columns (the saved output echoes this read_csv
# line, which looks like a pandas warning - presumably about mixed dtypes).
# `id` is read as text because some rows hold non-numeric values; it is
# converted to integers in the preprocessing step.
movies_df = pd.read_csv(
    '/content/drive/MyDrive/Data sets/movies_metadata.csv',
    usecols=['id', 'original_title', 'overview'],
    dtype={'id': str},
)

  movies_df = pd.read_csv('/content/drive/MyDrive/Data sets/movies_metadata.csv')


In [None]:
# Preview the first rows of the movie metadata.
movies_df.head()

Unnamed: 0,id,original_title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [None]:
# Show dtypes and non-null counts; `id` is still a non-numeric (object) column here.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


# Preprocessing the movies dataset

In [None]:
# Convert 'id' to numeric; values that cannot be parsed become NaN.
movie_ids = pd.to_numeric(movies_df['id'], errors='coerce')

# Keep only ids that are valid rows of the item embedding table. The table has
# `num_items` rows, so valid indices are 0 .. num_items - 1: the upper bound
# must be strict (`<`, not `<=`) and negative ids must be rejected as well.
in_vocabulary = movie_ids.notna() & (movie_ids >= 0) & (movie_ids < num_items)

# NOTE(review): this assumes movies_df['id'] uses the same id space as the
# ratings' 'movieId' column - confirm, otherwise an explicit id mapping is needed.
movies_df = (
    movies_df.loc[in_vocabulary]
    .assign(id=movie_ids[in_vocabulary].astype(int))
    .reset_index(drop=True)
)

movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35224 entries, 0 to 35223
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              35224 non-null  int64 
 1   original_title  35224 non-null  object
 2   overview        34667 non-null  object
dtypes: int64(1), object(2)
memory usage: 825.7+ KB


# Building prediction data

In [None]:
# Candidate (user, movie) pairs to score: one row per movie in movies_df.
# 'userId' is only a placeholder here (all zeros, same length as 'movieId');
# predict_ratings overwrites it with the requested user before predicting.
# Keeping both columns the same length means the dict is a valid model input
# and a valid DataFrame even before that happens.
prediction_data = {
    'userId': np.zeros(len(movies_df), dtype=np.int64),
    'movieId': tf.convert_to_tensor(movies_df['id'].astype(int).to_numpy()),
}

# Predict ratings function

In [None]:
def predict_ratings(userId, model = model, prediction_data= prediction_data):
  """Predict the rating `userId` would give to every candidate movie.

  Args:
    userId: Id of the user to score the candidates for.
    model: Trained model exposing `predict`; defaults to the notebook's model.
    prediction_data: Dict with a 'movieId' entry listing the candidate movies.
      Its 'userId' entry is overwritten in place with `userId` repeated once
      per candidate; recommend_movies reads the dict back after this call, so
      that side effect is kept on purpose.

  Returns:
    A float64 NumPy array with one predicted rating per candidate, clipped to
    the range [0, 5].
  """
  # Size the user column from the candidates actually passed in, not from the
  # global movies_df, so a custom prediction_data works as well.
  num_candidates = len(prediction_data['movieId'])
  prediction_data['userId'] = tf.convert_to_tensor(np.full(num_candidates, userId))

  raw_scores = model.predict(prediction_data)
  # Vectorised clip instead of a per-element Python loop.
  return np.clip(np.asarray(raw_scores, dtype=np.float64), 0, 5)

In [None]:
# Example: predicted ratings of user 5 for every movie in prediction_data.
predicted_ratings = predict_ratings(5)

[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [None]:
# Values are clipped to [0, 5] by predict_ratings.
print(predicted_ratings)

[3.28930259 5.         0.01849502 ... 0.01805472 5.         0.        ]


# A movie recommender based on the rating prediction function

In [None]:
def recommend_movies(userId, prediction_data= prediction_data, movies_df=movies_df):
  """Return the ten movies with the highest predicted rating for `userId`.

  Args:
    userId: Id of the user to recommend for.
    prediction_data: Dict with a 'movieId' entry listing the candidate movies;
      it is passed on to predict_ratings.
    movies_df: Movie metadata with 'id', 'original_title' and 'overview' columns.

  Returns:
    A DataFrame with columns 'original_title' and 'overview', ordered from the
    highest to the lowest predicted rating. Rows keep their movies_df index.
  """
  # Score the candidates that were actually passed in, not the module default.
  predicted_ratings = predict_ratings(userId, prediction_data=prediction_data)
  scored = pd.DataFrame({
      'movieId': np.asarray(prediction_data['movieId']),
      'predicted_ratings': predicted_ratings,
  })
  # Stable sort: ratings are clipped at 5, so ties are common; break them
  # deterministically by candidate order.
  top_ids = (
      scored.sort_values(by='predicted_ratings', ascending=False, kind='stable')
      ['movieId'].head(10).to_list()
  )
  # TODO: remove the movies the user has already watched to get the best out of the model.

  # Remember the rank of each recommended id (first occurrence wins).
  rank_of = {}
  for position, movie_id in enumerate(top_ids):
    rank_of.setdefault(movie_id, position)

  recommended_movies = movies_df[movies_df['id'].isin(top_ids)]
  # isin() returns rows in catalogue order; restore the ranking order.
  ranking = np.argsort(recommended_movies['id'].map(rank_of).to_numpy(), kind='stable')
  return recommended_movies.iloc[ranking][['original_title', 'overview']]


In [None]:
# Example: top-10 recommendations for user 6.
recommend_movies(6)

[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


Unnamed: 0,original_title,overview
2118,One Tough Cop,
2120,Die Siebtelbauern,In a small farming valley in Austria in the be...
2123,Children of a Lesser God,James is a new speech teacher at a school for ...
4781,Le Pacte des loups,"In 18th century France, the Chevalier de Frons..."
6391,Incident at Oglala,"On June 26, 1975, during a period of high tens..."
6398,Fire,"In a barren, arranged marriage to an amateur s..."
6402,My Boss's Daughter,When a young man agrees to housesit for his bo...
17439,Green Fire,"Set in the coffee fields of Colombia, Green Fi..."
17443,No Man of Her Own,"Babe Stewart, a card cheat who has to go on th..."
17451,De Dominee,Different solution new movie ;-) The Preacher ...
