# Item based recommender

Up to this point we have been giving recommendations based on users: we create a user embedding from their past ratings, and in front of a new user to recommend to, we look at how other users with similar embeddings rate the movies that the tested subject hasn't seen yet.

Now we want to explore a different approach: recommend movies based on similarities between the already seen movies. This involves creating a movie embedding system.

In [10]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
from keras.utils import to_categorical

from spotlight.datasets.movielens import get_movielens_dataset

In [2]:
dataset = get_movielens_dataset(variant='100K')
print(dataset)

<Interactions dataset (944 users x 1683 items x 100000 interactions)>


In [5]:
# Get metadata from movies
metadata = pd.read_csv('../datasets/movies_metadata_complet.csv')
metadata

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genres_name,avg_rating,min_rating,max_rating
0,1,Toy Story (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"['Animation', ""Children's"", 'Comedy']",3.878318,1.0,5.0
1,2,GoldenEye (1995),1995-01-01,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"['Action', 'Adventure', 'Thriller']",3.206107,1.0,5.0
2,3,Four Rooms (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,['Thriller'],3.033333,1.0,5.0
3,4,Get Shorty (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"['Action', 'Comedy', 'Drama']",3.550239,1.0,5.0
4,5,Copycat (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Copycat%20(1995),"['Crime', 'Drama', 'Thriller']",3.302325,1.0,5.0
...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),1998-02-06,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,['Drama'],1.000000,1.0,1.0
1678,1679,B. Monkey (1998),1998-02-06,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,"['Romance', 'Thriller']",3.000000,3.0,3.0
1679,1680,Sliding Doors (1998),1998-01-01,,http://us.imdb.com/Title?Sliding+Doors+(1998),"['Drama', 'Romance']",2.000000,2.0,2.0
1680,1681,You So Crazy (1994),1994-01-01,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,['Comedy'],3.000000,3.0,3.0


This movie metadata is quite limited. We could use more information about movies to generate better movie representations.

In [None]:
# Extract year from release date
metadata['release_year'] = pd.to_datetime(metadata['release_date']).dt.year

In [17]:
# Flatten the list of genres and fit the OneHotEncoder
genres_flat = [genre for sublist in metadata['genres_name'] for genre in sublist]
genres_flat = np.array(genres_flat).reshape(-1, 1)

# OneHotEncoder from sklearn
encoder = OneHotEncoder()
encoded_genres = encoder.fit_transform(genres_flat)

# Reshape the encoded genres to match the movie-wise format (i.e., multiple genres per movie)
genre_matrix = []
for genres in metadata['genres_name']:
    genre_row = np.zeros(len(encoder.categories_[0]))
    for genre in genres:
        genre_index = encoder.categories_[0].tolist().index(genre)
        genre_row[genre_index] = 1
    genre_matrix.append(genre_row)

genre_matrix = np.array(genre_matrix)

In [14]:
# Normalize ratings (avg_rating, min_rating, max_rating)
rating_columns = ['avg_rating', 'min_rating', 'max_rating']
scaler = MinMaxScaler()
scaled_ratings = scaler.fit_transform(metadata[rating_columns])

In [19]:
# Combine all preprocessed features
X = np.concatenate([scaled_ratings, metadata[['release_year']].values, genre_matrix], axis=1)

In [20]:
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [21]:
import tensorflow as tf
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    details = tf.config.experimental.get_device_details(gpus[0])
    print("GPU details: ", details)
    try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [38]:
# Autoencoder architecture
input_dim = X_train.shape[1]  # Number of features
encoding_dim = 10  # Dimension of the embedding

# Encoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)

# Decoder
decoded = Dense(input_dim, activation='linear')(encoded)

# Autoencoder model
autoencoder = Model(input_layer, decoded)

# Encoder model (to retrieve embeddings)
encoder = Model(input_layer, encoded)

# Compile the autoencoder with a lower learning rate
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
autoencoder.fit(X_train, X_train, epochs=50, batch_size=4, validation_data=(X_test, X_test))

# Getting the embeddings (encoded features)
embeddings = encoder.predict(X)

Epoch 1/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 720us/step - loss: nan - val_loss: nan
Epoch 2/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 532us/step - loss: nan - val_loss: nan
Epoch 3/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 543us/step - loss: nan - val_loss: nan
Epoch 4/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 528us/step - loss: nan - val_loss: nan
Epoch 5/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 548us/step - loss: nan - val_loss: nan
Epoch 6/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 562us/step - loss: nan - val_loss: nan
Epoch 7/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 512us/step - loss: nan - val_loss: nan
Epoch 8/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 507us/step - loss: nan - val_loss: nan
Epoch 9/50
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [39]:
# Check for NaN values in embeddings and handle them
if np.any(np.isnan(embeddings)):
    print("NaN values found in embeddings, handling...")
    # Remove rows with NaN values (or you could impute them)
    embeddings = embeddings[~np.isnan(embeddings).any(axis=1)]
    print(len(embeddings), 'movies left')

NaN values found in embeddings, handling...
0 movies left


In [26]:
import umap
import matplotlib.pyplot as plt

In [None]:
# Reduce dimensionality using UMAP
reducer = umap.UMAP(n_components=2)
umap_embeddings = reducer.fit_transform(embeddings)

# Plotting the UMAP
plt.figure(figsize=(8, 6))

# Define colors for genres
genre_colors = ['red', 'blue', 'green']  # You can expand this if you have more genres
genre_labels = metadata['genres_name']

# Create a color mapping based on genre
color_map = []
for genres in genre_labels:
    if 'Action' in genres:
        color_map.append('red')
    elif 'Drama' in genres:
        color_map.append('blue')
    elif 'Comedy' in genres:
        color_map.append('green')
    else:
        color_map.append('purple')

# Plot each point with its corresponding genre color
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=color_map, s=100, cmap='viridis')
plt.title('Movie Embeddings (UMAP)', fontsize=16)
plt.xlabel('UMAP Component 1', fontsize=14)
plt.ylabel('UMAP Component 2', fontsize=14)

# Show plot
plt.colorbar()
plt.show()