In [None]:
import tensorflow as tf
import pprint
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


2025-05-21 12:58:09.948371: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-21 12:58:10.399901: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-21 12:58:10.399992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-21 12:58:10.481266: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-21 12:58:10.664174: I tensorflow/core/platform/cpu_feature_guar

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [5]:
import hopsworks

# Log in to Hopsworks; the returned project handle gives access to the services below.
project = hopsworks.login()

# Handles to the project's feature store and model registry, used by later cells.
fs = project.get_feature_store()
mr = project.get_model_registry()

2025-05-21 13:00:23,462 INFO: Initializing external client
2025-05-21 13:00:23,464 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-21 13:00:25,095 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


## <span style="color:#ff5f27">🎯 Compute Candidate Embeddings </span>

In this section, we compute candidate embeddings for all items in the training data.

First, load the candidate model uploaded to the Hopsworks Model Registry in the previous notebook. 

In [6]:

# Look up version 1 of the candidate model in the Model Registry.
model = mr.get_model(name="candidate_model", version=1)

# Download the model artifact; the returned path is what gets loaded in the next cell.
model_path = model.download()



Downloading: 0.000%|          | 0/919 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/2119072 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 2 files)... 

Downloading: 0.000%|          | 0/566304 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 3 files)... 

Downloading: 0.000%|          | 0/57 elapsed<00:00 remaining<?

Downloading model artifact (2 dirs, 4 files)... DONE

In [7]:
# Load the downloaded artifact as a TensorFlow SavedModel; it is later called on
# batches of candidate features to produce embeddings.
candidate_model = tf.saved_model.load(model_path)

Next, we compute the embeddings of all candidate events that were used to train the retrieval model.


In [8]:
# Fetch version 1 of the "event_retrieval" feature view from the feature store.
feature_view = fs.get_feature_view(name="event_retrieval", version=1)


In [9]:
# Split the feature view's data into train / validation / test DataFrames.
# The last three returned values are discarded here
# (presumably the label splits for each partition — TODO confirm).
train_df, val_df, test_df, _, _, _ = feature_view.train_validation_test_split(
    validation_size=0.1, 
    test_size=0.01,
    description='Event retrieval dataset splits',
)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (7.09s) 



In [10]:
# Preview the first three rows of the training split.
train_df.head(3)

Unnamed: 0,interaction_id,user_id,user_city,age,user_interests,event_id,title,event_type,event_city
0,4140ef03-afce-4fe5-a9a9-8e43fd45dfaa,LT819S,Sydney,26,sports literature cinema,OI841N,reengineered 3rdgeneration array food drink in...,Food & Drink,Tokyo
1,BI623H,UP287J,Mumbai,56,tech food travel,IO571I,balanced interactive artificial intelligence t...,Technology,Mumbai
2,08e342ae-f9b5-4358-9f41-cf89671b9ae2,RG723K,Mumbai,32,literature music,YW813D,visionary optimal monitoring education learnin...,Education & Learning,Paris


In [11]:
# Get the list of input features for the candidate model from the model schema
model_schema = model.model_schema['input_schema']['columnar_schema']
candidate_features = [feat['name'] for feat in model_schema]

# Select the candidate features from the training DataFrame and keep one row
# per 'event_id' to get the unique candidate items.
# The chained (non-inplace) form returns a new frame instead of mutating a
# selection of train_df, so the cell is idempotent on re-run and does not
# trigger pandas' SettingWithCopyWarning. The original row index is preserved.
item_df = (
    train_df[candidate_features]
    .drop_duplicates(subset="event_id")
)

item_df.head(3)

Downloading: 0.000%|          | 0/499 elapsed<00:00 remaining<?

Unnamed: 0,event_id,event_city,event_type,title
0,OI841N,Tokyo,Food & Drink,reengineered 3rdgeneration array food drink in...
1,IO571I,Mumbai,Technology,balanced interactive artificial intelligence t...
2,YW813D,Paris,Education & Learning,visionary optimal monitoring education learnin...


In [14]:
# Read version 1 of the "events" feature group into a DataFrame.
events_df = fs.get_feature_group("events", version=1).read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.66s) 


### Select the candidate features from the main events DataFrame

In [None]:
# Get the list of input features for the candidate model from the model schema
model_schema = model.model_schema['input_schema']['columnar_schema']
candidate_features = [feat['name'] for feat in model_schema]

# Select the candidate features from the main events DataFrame and keep one
# row per 'event_id' to get the unique candidate items.
# The chained (non-inplace) form returns a new frame instead of mutating a
# selection of events_df, so the cell is idempotent on re-run and does not
# trigger pandas' SettingWithCopyWarning. The original row index is preserved.
item_df_2 = (
    events_df[candidate_features]
    .drop_duplicates(subset="event_id")
)

item_df_2.head(3)

Downloading: 0.000%|          | 0/499 elapsed<00:00 remaining<?

Unnamed: 0,event_id,event_city,event_type,title
0,CA178V,Toronto,Arts & Culture,seamless leadingedge timeframe arts culture in...
1,TF630U,New York,Community & Causes,visionary maximized definition community cause...
2,DY324Y,New York,Business & Networking,upsized local function business networking in ...


In [18]:
# Create a TensorFlow dataset from the candidate items taken from the events
# feature group (item_df_2). Each dataset element is a dict mapping a feature
# name to its value.
# NOTE: both the column names and the values must come from the same frame;
# reading the values from the training-derived item_df would embed only the
# events seen in training instead of every event.
item_ds = tf.data.Dataset.from_tensor_slices(
    {col: item_df_2[col] for col in item_df_2.columns}
)

# Compute embeddings for all candidate items using the candidate_model.
# Batches of 1024 rows are mapped to (event_id batch, embedding batch) pairs.
candidate_embeddings = item_ds.batch(1024).map(
    lambda x: (x["event_id"], candidate_model(x)))

In [21]:

# Walk the candidate_embeddings dataset exactly once, collecting the event IDs
# and the embeddings of every batch. Iterating the dataset separately for the
# IDs and for the embeddings would run the candidate model over all batches twice.
id_batches, embedding_batches = [], []
for batch_ids, batch_embeddings in candidate_embeddings:
    id_batches.append(batch_ids)
    embedding_batches.append(batch_embeddings)

# Concatenate the per-batch tensors along the batch axis
all_article_ids = tf.concat(id_batches, axis=0)
all_embeddings = tf.concat(embedding_batches, axis=0)

# Convert tensors to numpy arrays
all_article_ids_np = all_article_ids.numpy()
all_embeddings_np = all_embeddings.numpy()

# Convert numpy arrays to lists
items_ids_list = all_article_ids_np.tolist()
embeddings_list = all_embeddings_np.tolist()



In [22]:
# Build a two-column table pairing each event ID with its embedding vector,
# decoding the ID column as UTF-8 along the way.
data_emb = pd.DataFrame(
    dict(event_id=items_ids_list, embeddings=embeddings_list)
).assign(
    event_id=lambda frame: frame['event_id'].str.decode('utf-8'),
)

data_emb.head()

Unnamed: 0,event_id,embeddings
0,CA178V,"[-0.0017099515534937382, 0.0013913651928305626..."
1,TF630U,"[-0.0017099515534937382, 0.0013913651928305626..."
2,DY324Y,"[-0.0017099515534937382, 0.0013913651928305626..."
3,JE935V,"[-0.0017099515534937382, 0.0013913651928305626..."
4,VM016I,"[-0.0017099515534937382, 0.0013913651928305626..."


## <span style="color:#ff5f27">🪄 Feature Group Creation </span>

Create a feature group for the candidate embeddings.

First, create an Embedding Index that specifies the name of the embeddings feature and the embedding length.
Then attach this index to the feature group (FG).


In [24]:
from hsfs import embedding

# The embedding length is read off the first stored vector.
embedding_dim = len(data_emb["embeddings"].iloc[0])

# Embedding Index describing the "embeddings" feature and its length
emb = embedding.EmbeddingIndex()
emb.add_embedding("embeddings", embedding_dim)

# Fetch the 'candidate_embeddings_fg' feature group, creating it if needed,
# with the Embedding Index attached and 'event_id' as primary key.
candidate_embeddings_fg = fs.get_or_create_feature_group(
    name="candidate_embeddings_fg",
    version=1,
    description='Embeddings for each event',
    primary_key=['event_id'],
    online_enabled=True,
    embedding_index=emb,
)

# Write the embeddings DataFrame into the feature group
candidate_embeddings_fg.insert(data_emb)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1479009


Uploading Dataframe: 100.00% |██████████| Rows 19989/19989 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: candidate_embeddings_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/candidate_embeddings_fg_1_offline_fg_materialization/executions


(Job('candidate_embeddings_fg_1_offline_fg_materialization', 'SPARK'), None)


## <span style="color:#ff5f27">🪄 Feature View Creation </span>


In [27]:
# Get or create the 'candidate_embeddings' feature view.
# Its query selects only the 'event_id' feature from the candidate embeddings
# feature group.
# NOTE(review): this rebinds `feature_view`, which earlier in the notebook
# referred to the "event_retrieval" feature view — re-running earlier cells
# after this one will use whichever binding was assigned last.
feature_view = fs.get_or_create_feature_view(
    name="candidate_embeddings",
    version=1,
    description='Embeddings of each event',
    query=candidate_embeddings_fg.select(["event_id"]),
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fv/candidate_embeddings/version/1
