# Embedding demo

### simplified two-tower embedding

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example data
users = pd.DataFrame({
    'user_id': [1, 2, 3, 4],
    'user_age': [25, 34, 28, 22],
    'user_gender': ['M', 'F', 'M', 'F']
})

items = pd.DataFrame({
    'item_id': [1, 2, 3, 4],
    'movie_title': ['The Matrix', 'Titanic', 'The Godfather', 'The Shawshank Redemption'],
    'movie_genre': ['Sci-Fi', 'Romance', 'Crime', 'Drama']
})

ratings = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'item_id': [1, 2, 2, 3, 1, 4, 2, 4],
    'rating': [5, 3, 4, 5, 4, 5, 3, 4]
})

# Encode categorical features
user_gender_encoder = LabelEncoder()
users['user_gender'] = user_gender_encoder.fit_transform(users['user_gender'])

# Normalize numerical features
age_scaler = MinMaxScaler()
users['user_age'] = age_scaler.fit_transform(users[['user_age']])

# Encode movie genre
movie_genre_encoder = LabelEncoder()
items['movie_genre'] = movie_genre_encoder.fit_transform(items['movie_genre'])

# Prepare input data
user_input = ratings['user_id'].values
item_input = ratings['item_id'].values
ratings_input = ratings['rating'].values

# Example model architecture
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model

# User tower
user_input_layer = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=len(users)+1, output_dim=8)(user_input_layer)
user_vec = Flatten()(user_embedding)

# Item tower
item_input_layer = Input(shape=(1,), name='item_input')
item_embedding = Embedding(input_dim=len(items)+1, output_dim=8)(item_input_layer)
item_vec = Flatten()(item_embedding)

# Dot product to calculate similarity
dot_product = tf.reduce_sum(tf.multiply(user_vec, item_vec), axis=1)

# Final model
model = Model(inputs=[user_input_layer, item_input_layer], outputs=dot_product)
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit([user_input, item_input], ratings_input, epochs=10, batch_size=2)

# Summary of the model
model.summary()


Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2024-05-28 18:36:10.166251: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-28 18:36:10.166377: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


2024-05-28 18:36:10.702792: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-05-28 18:36:10.897086: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 8)         40          ['user_input[0][0]']             
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 8)         40          ['item_input[0][0]']        

In [8]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dot
from tensorflow.keras.models import Model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import zipfile
import urllib.request


# Example data
# ratings = pd.DataFrame({
#     'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
#     'item_id': [1, 2, 2, 3, 1, 4, 2, 4],
#     'rating': [5, 3, 4, 5, 4, 5, 3, 4]
# })

# # Prepare input data
# user_input = ratings['user_id'].values
# item_input = ratings['item_id'].values
# ratings_input = ratings['rating'].values

# # Model parameters
# num_users = ratings['user_id'].nunique()
# num_items = ratings['item_id'].nunique()


# Download and extract the dataset
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
urllib.request.urlretrieve(url, '../../../data/.local/ml-100k.zip')
with zipfile.ZipFile('../../../data/.local/ml-100k.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Load data into pandas DataFrame
ratings = pd.read_csv('../../../data/.local/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings.drop(columns=['timestamp'], inplace=True)

# Prepare input data
user_input = ratings['user_id'].values
item_input = ratings['item_id'].values
ratings_input = ratings['rating'].values

# Split data into training and testing sets
user_train, user_test, item_train, item_test, ratings_train, ratings_test = train_test_split(
    user_input, item_input, ratings_input, test_size=0.2, random_state=42)

num_users = ratings['user_id'].nunique()
num_items = ratings['item_id'].nunique()

embedding_dim = 8

# User tower
user_input_layer = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users+1, output_dim=embedding_dim, name='user_embedding')(user_input_layer)
user_vec = Flatten(name='user_flatten')(user_embedding)

# Item tower
item_input_layer = Input(shape=(1,), name='item_input')
item_embedding = Embedding(input_dim=num_items+1, output_dim=embedding_dim, name='item_embedding')(item_input_layer)
item_vec = Flatten(name='item_flatten')(item_embedding)

# Dot product to calculate similarity
dot_product = Dot(axes=1, name='dot_product')([user_vec, item_vec])

# Final model
model = Model(inputs=[user_input_layer, item_input_layer], outputs=dot_product)
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit([user_train, item_train], ratings_train, epochs=10, batch_size=64, verbose=1)


# Summary of the model
model.summary()

# Make predictions on the test set
predictions = model.predict([user_test, item_test])

# Evaluate the model
mse = mean_squared_error(ratings_test, predictions)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Epoch 1/10
   4/1250 [..............................] - ETA: 23s - loss: 13.9053 

2024-05-28 18:58:40.736097: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 8)         7552        ['user_input[0][0]']             
                                                                                                  
 item_embedding (Embedding)     (None, 1, 8)         13464       ['item_input[0][0]']      

2024-05-28 18:59:26.398761: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Mean Squared Error (MSE): 0.9120335695677148
Root Mean Squared Error (RMSE): 0.955004486674128


### simplified graph embedding

In [4]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dot
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import zipfile
import urllib.request

# Download and extract the dataset
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
urllib.request.urlretrieve(url, '../../../data/.local/ml-100k.zip')
with zipfile.ZipFile('../../../data/.local/ml-100k.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Load data into pandas DataFrame
ratings = pd.read_csv('../../../data/.local/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings.drop(columns=['timestamp'], inplace=True)

# Prepare input data
user_input = ratings['user_id'].values
item_input = ratings['item_id'].values
ratings_input = ratings['rating'].values

# Split data into training and testing sets
user_train, user_test, item_train, item_test, ratings_train, ratings_test = train_test_split(
    user_input, item_input, ratings_input, test_size=0.2, random_state=42)

# Model parameters
num_users = ratings['user_id'].nunique()
num_items = ratings['item_id'].nunique()
embedding_dim = 8

# User tower
user_input_layer = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users+1, output_dim=embedding_dim, name='user_embedding')(user_input_layer)
user_vec = Flatten(name='user_flatten')(user_embedding)

# Item tower
item_input_layer = Input(shape=(1,), name='item_input')
item_embedding = Embedding(input_dim=num_items+1, output_dim=embedding_dim, name='item_embedding')(item_input_layer)
item_vec = Flatten(name='item_flatten')(item_embedding)

# Dot product to calculate similarity
dot_product = Dot(axes=1, name='dot_product')([user_vec, item_vec])

# Final model
model = Model(inputs=[user_input_layer, item_input_layer], outputs=dot_product)
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit([user_train, item_train], ratings_train, epochs=10, batch_size=64, verbose=1)

# Make predictions on the test set
predictions = model.predict([user_test, item_test])

# Evaluate the model
mse = mean_squared_error(ratings_test, predictions)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Epoch 1/10
   5/1250 [..............................] - ETA: 16s - loss: 13.5537 

2024-05-28 18:43:18.351864: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
142/625 [=====>........................] - ETA: 0s

2024-05-28 18:44:02.913787: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Mean Squared Error (MSE): 0.9169867498032821
Root Mean Squared Error (RMSE): 0.9575942511331624


In [11]:
from tensorflow.keras.utils import plot_model
from IPython.display import Image

# Plot the model
plot_model(model, to_file='./two_tower_embedding_model.png', show_shapes=True, show_layer_names=True)

# Display the model plot
Image(filename='./two_tower_embedding_model.png')

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


FileNotFoundError: [Errno 2] No such file or directory: './two_tower_embedding_model.png'