In [3]:
import tensorflow as tf
import pprint
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


2025-06-15 14:41:00.871270: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-15 14:41:01.026434: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-15 14:41:01.026561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-15 14:41:01.045160: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-15 14:41:01.097811: I tensorflow/core/platform/cpu_feature_guar

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [1]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store()
mr = project.get_model_registry()

2025-06-17 21:45:59,340 INFO: Initializing external client
2025-06-17 21:45:59,343 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-06-17 21:46:00,921 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


## <span style="color:#ff5f27">🎯 Compute Candidate Embeddings </span>

computing candidate embeddings for all items in the training data.

First, we load the candidate model uploaded to the Hopsworks Model Registry in the previous notebook. 

In [3]:

# Load candidate model from Model Registry
model = mr.get_model(
    name="candidate_model",
    version=1,
)
model_path = model.download()



Downloading: 0.000%|          | 0/915 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/1426208 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 2 files)... 

Downloading: 0.000%|          | 0/481080 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 3 files)... 

Downloading: 0.000%|          | 0/56 elapsed<00:00 remaining<?

Downloading model artifact (2 dirs, 4 files)... DONE

In [4]:
candidate_model = tf.saved_model.load(model_path)

Next we compute the embeddings of all events


In [2]:
# Retrieve feature view
feature_view = fs.get_feature_view(
    name="event_retrieval",
    version=1,
)


In [6]:
# Load training data
train_df, val_df, test_df, _, _, _ = feature_view.train_validation_test_split(
    validation_size=0.1, 
    test_size=0.01,
    description='Event retrieval dataset splits',
)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (21.01s) 



In [7]:
train_df.head(3)

Unnamed: 0,interaction_id,user_id,user_city,age,user_interests,event_id,event_city,event_type,title
0,ZO876H,LE249D,Berlin,34,"food,travel",WI267T,Berlin,Immersive Experiences,Synergized content-based collaboration Immersi...
1,WY612T,OX425G,Toronto,30,"travel,music,fitness,literature",UX026V,Toronto,Education & Learning,Multi-lateral disintermediate task-force Educa...
3,EV784M,ID377X,New York,39,"literature,fitness",SN272D,New York,Education & Learning,Sharable 24hour concept Education & Learning i...


In [8]:
# Get the list of input features for the candidate model from the model schema
model_schema = model.model_schema['input_schema']['columnar_schema']
candidate_features = [feat['name'] for feat in model_schema]

# Select the candidate features from the training DataFrame
item_df = train_df[candidate_features]

# Drop duplicate rows based on the 'event_id' column to get unique candidate items
item_df.drop_duplicates(subset="event_id", inplace=True)

item_df.head(3)

Downloading: 0.000%|          | 0/499 elapsed<00:00 remaining<?

Unnamed: 0,event_id,event_city,event_type,title
0,WI267T,Berlin,Immersive Experiences,Synergized content-based collaboration Immersi...
1,UX026V,Toronto,Education & Learning,Multi-lateral disintermediate task-force Educa...
3,SN272D,New York,Education & Learning,Sharable 24hour concept Education & Learning i...


In [5]:
events_df = fs.get_feature_group("events", version=1).read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.34s) 


### Select the candidate features from the main events DataFrame

In [10]:
# Get the list of input features for the candidate model from the model schema
model_schema = model.model_schema['input_schema']['columnar_schema']
candidate_features = [feat['name'] for feat in model_schema]

# Select the candidate features from the maing events DataFrame
item_df_2 = events_df[candidate_features]

# Drop duplicate rows based on the 'event_id' column to get unique candidate items
item_df_2.drop_duplicates(subset="event_id", inplace=True)

item_df_2.head(3)

Downloading: 0.000%|          | 0/499 elapsed<00:00 remaining<?

Unnamed: 0,event_id,event_city,event_type,title
0,CT562P,Dubai,Food & Drink,Multi-lateral grid-enabled projection Food & D...
1,YH404F,São Paulo,Music & Concerts,Enhanced global success Music & Concerts in Sã...
2,DZ892D,Tokyo,Community & Causes,Synchronized contextually-based website Commun...


In [11]:
# Create a TensorFlow dataset from the item DataFrame
item_ds = tf.data.Dataset.from_tensor_slices(
    {col: item_df[col] for col in item_df_2})

# Compute embeddings for all candidate items using the candidate_model
candidate_embeddings = item_ds.batch(1024).map(
    lambda x: (x["event_id"], candidate_model(x)))

In [12]:

# Concatenate all article IDs and embeddings from the candidate_embeddings dataset
all_article_ids = tf.concat([batch[0] for batch in candidate_embeddings], axis=0)
all_embeddings = tf.concat([batch[1] for batch in candidate_embeddings], axis=0)

# Convert tensors to numpy arrays
all_article_ids_np = all_article_ids.numpy()
all_embeddings_np = all_embeddings.numpy()

# Convert numpy arrays to lists
items_ids_list = all_article_ids_np.tolist()
embeddings_list = all_embeddings_np.tolist()



In [13]:
# Create a DataFrame
data_emb = pd.DataFrame({
    'event_id': items_ids_list, 
    'embeddings': embeddings_list,
})
data_emb['event_id'] = data_emb['event_id'].str.decode('utf-8')

data_emb.head()

Unnamed: 0,event_id,embeddings
0,WI267T,"[4.5834342017769814e-05, 0.018815260380506516,..."
1,UX026V,"[5.860486999154091e-05, 0.019117526710033417, ..."
2,SN272D,"[-0.0007730326615273952, -0.000566591508686542..."
3,ET528T,"[-0.0007730326615273952, -0.000566591508686542..."
4,CD998L,"[-0.0007730326615273952, -0.000566591508686542..."


## <span style="color:#ff5f27">🪄 Feature Group Creation </span>

Create a feature group for the candidate embeddings.

First, create Embedding Index where we will specify the name of the embeddings feature and the embeddings length.
Then we attach this index to the FG.


In [14]:
from hsfs import embedding

# Create the Embedding Index
emb = embedding.EmbeddingIndex()

emb.add_embedding(
    "embeddings",                           # Embeddings feature name
    len(data_emb["embeddings"].iloc[0]),    # Embeddings length
)

# Get or create the 'candidate_embeddings_fg' feature group
candidate_embeddings_fg = fs.get_or_create_feature_group(
    name="candidate_embeddings_fg",
    embedding_index=emb,                    # Specify the Embedding Index
    primary_key=['event_id'],
    version=1,
    description='Embeddings for each event',
    online_enabled=True,
)

candidate_embeddings_fg.insert(data_emb)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1485285


Uploading Dataframe: 100.00% |██████████| Rows 9999/9999 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: candidate_embeddings_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/candidate_embeddings_fg_1_offline_fg_materialization/executions


(Job('candidate_embeddings_fg_1_offline_fg_materialization', 'SPARK'), None)


## <span style="color:#ff5f27">🪄 Feature View Creation </span>


In [15]:
# Get or create the 'candidate_embeddings' feature view
feature_view = fs.get_or_create_feature_view(
    name="candidate_embeddings",
    version=1,
    description='Embeddings of each event',
    query=candidate_embeddings_fg.select(["event_id"]),
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fv/candidate_embeddings/version/1
