## Setup

In [1]:
# Constants shared by the cells below. The trailing Colab form annotations stay on
# the assignment lines so each value remains editable as a form field.

# Hugging Face model id: loaded with AutoTokenizer/AutoModel and also handed to the
# benchmark scripts through their --plm / --plm_name options.
EMBEDDING_MODEL_NAME = "hyp1231/blair-roberta-base"     # @param {type: "string"}
# Short tag for this embedding model: names the saved weight file and is passed to
# the product-search scripts (--feat_name, and as the prefix of --suffix).
FEAT_NAME = "blair-base"     # @param {type: "string"}
# Base name (without ".pth") of the recommender checkpoint read from MODEL_SAVE_DIR.
REC_MODEL_NAME = "UniSRec"     # @param {type: "string"}

# Project folder; it lives under /content/drive, so Drive has to be mounted first.
PROJECT_ROOT_DIR = "/content/drive/MyDrive/Courses/ANLP/project" # @param {type:"string"}
# Directory the encoder weights are written to and recommender checkpoints are read from.
MODEL_SAVE_DIR = f"{PROJECT_ROOT_DIR}/AmazonReviews2023/model"  # @param {type:"string"}

In [2]:
# Mount Google Drive at /content/drive. PROJECT_ROOT_DIR points inside this mount,
# so this cell must run before any cell that reads or writes the project folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
# Install/upgrade `datasets` and `recbole` quietly; both appear in the tracebacks of
# the benchmark scripts invoked further down, so those scripts import them.
# TODO(review): neither version is pinned — pin them so reruns are reproducible.
%pip install -qU datasets
%pip install -qU recbole

## Load Models

In [4]:
# torch is used by later cells (no_grad, save); transformers supplies the loaders.
import torch
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer and the encoder identified by EMBEDDING_MODEL_NAME
# (the recorded output shows the files being downloaded from the Hugging Face Hub).
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

### Get Embeddings

In [5]:
# Toy retrieval check: one natural-language query and two product descriptions
# (a kitchen measure/strainer that fits the query, and an unrelated pair of gloves).
language_context = 'I need a product that can scoop, measure, and rinse grains without the need for multiple utensils and dishes. It would be great if the product has measurements inside and the ability to rinse and drain all in one. I just have to be careful not to pour too much accidentally.'
item_metadata = [
  'Talisman Designs 2-in-1 Measure Rinse & Strain | Holds up to 2 Cups | Food Strainer | Fruit Washing Basket | Strainer & Colander for Kitchen Sink | Dishwasher Safe - Dark Blue. The Measure Rinse & Strain by Talisman Designs is a 2-in-1 kitchen colander and strainer that will measure and rinse up to two cups. Great for any type of food from rice, grains, beans, fruit, vegetables, pasta and more. After measuring, fill with water and swirl to clean. Strain then pour into your pot, pan, or dish. The convenient size is easy to hold with one hand and is compact to fit into a kitchen cabinet or pantry. Dishwasher safe and food safe.',
  'FREETOO Airsoft Gloves Men Tactical Gloves for Hiking Cycling Climbing Outdoor Camping Sports (Not Support Screen Touch).'
]
# The query goes first, so row 0 of the embedding matrix is the query.
texts = [language_context, *item_metadata]

inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Inference only: keep the hidden state at sequence position 0 for every text, then
# scale each row to unit norm so that a dot product equals the cosine similarity.
with torch.no_grad():
    outputs = model(**inputs, return_dict=True)
    embeddings = outputs.last_hidden_state[:, 0]
    embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)

In [6]:
# Similarity of the query (row 0) to each item row; the rows were unit-normalised
# when they were computed, so a plain dot product is enough.
# Values from the recorded run: tensor(0.8564), then tensor(0.5741).
for item_embedding in embeddings[1:]:
    print(embeddings[0] @ item_embedding)

tensor(0.8564)
tensor(0.5741)


### Save model

In [7]:
# Persist the encoder weights to MODEL_SAVE_DIR/<FEAT_NAME>.pt.
import os

# torch.save does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# The weights are stored in a dict under the 'state_dict' key.
checkpoint = {
    'state_dict': model.state_dict(),
}
torch.save(checkpoint, f"{MODEL_SAVE_DIR}/{FEAT_NAME}.pt")

## Tasks

### Recommendations benchmark: [Amazon Reviews 2023](https://github.com/hyp1231/AmazonReviews2023/tree/main/benchmark_scripts)

In [8]:
# Root of the sequential-recommendation benchmark code; the cells below run
# dataset/process_amazon_2023.py and run.py from this folder.
REC_DIR = f"{PROJECT_ROOT_DIR}/AmazonReviews2023/seq_rec_results"

#### Load Dataset

In [9]:
!pip install -qU datasets
!pip install -qU recbole

In [10]:
# Run the dataset preprocessing script for the All_Beauty domain on cuda:0, passing
# EMBEDDING_MODEL_NAME as --plm. IPython expands `$NAME` from the notebook namespace
# before the command reaches the shell.
# NOTE(review): the recorded output below ends in a traceback raised inside
# load_dataset(), so this step did not complete in the saved run.
!python $REC_DIR/dataset/process_amazon_2023.py \
    --domain All_Beauty \
    --device cuda:0 \
    --plm $EMBEDDING_MODEL_NAME

README.md:   0% 0.00/19.7k [00:00<?, ?B/s]README.md: 100% 19.7k/19.7k [00:00<00:00, 56.7MB/s]
Traceback (most recent call last):
  File "/content/drive/MyDrive/Courses/ANLP/project/AmazonReviews2023/seq_rec_results/dataset/process_amazon_2023.py", line 140, in <module>
    datasets = load_dataset(
  File "/usr/local/lib/python3.10/dist-packages/datasets/load.py", line 2132, in load_dataset
    builder_instance = load_dataset_builder(
  File "/usr/local/lib/python3.10/dist-packages/datasets/load.py", line 1853, in load_dataset_builder
    dataset_module = dataset_module_factory(
  File "/usr/local/lib/python3.10/dist-packages/datasets/load.py", line 1645, in dataset_module_factory
    dataset_script_path = api.hf_hub_download(
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py", line 5645, in hf_hub_download
    return hf_h

#### Run recommendation

In [None]:
python {REC_DIR}/run.py \
    -m SASRec \
    -d All_Beauty \
    -p {MODEL_SAVE_DIR}/{REC_MODEL_NAME}.pth \
    --eval-only \
    --gpu_id=0

In [None]:
!python $REC_DIR/run.py \
    -m UniSRec \
    -d All_Beauty \
    -p {MODEL_SAVE_DIR}/{REC_MODEL_NAME}.pth \
    --eval-only \
    --gpu_id=0

2024-11-09 17:04:57.176367: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-09 17:04:57.196763: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-09 17:04:57.203373: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-09 17:04:57.217778: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Namespace(m='UniSRec', d='All_Beauty', p='/content/dr

### Product search benchmark: [Amazon-C4](https://github.com/hyp1231/AmazonReviews2023/blob/main/amazon-c4/README.md)

In [11]:
# Root of the product-search benchmark code; generate_emb.py and eval_search.py
# are run from this folder in the cells below.
SEARCH_DIR = f"{PROJECT_ROOT_DIR}/AmazonReviews2023/product_search_results"

In [12]:
# First generate dense query/item representations and cache them.
# Runs generate_emb.py for the Pet category, passing EMBEDDING_MODEL_NAME as
# --plm_name and FEAT_NAME as --feat_name.
!python $SEARCH_DIR/generate_emb.py --plm_name $EMBEDDING_MODEL_NAME --feat_name $FEAT_NAME --categories Pet

2024-11-15 21:24:00.858206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-15 21:24:00.894297: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-15 21:24:00.905197: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder

In [24]:
# Then evaluate the product search performance on the Pet category.
# --suffix is FEAT_NAME followed by "CLS"; --data_path is hard-coded to /content/cache.
# NOTE(review): presumably both have to match what generate_emb.py wrote — verify.
# NOTE(review): the recorded run stops with a traceback inside recbole's
# metric_info(), so the saved output contains no metrics.
!python $SEARCH_DIR/eval_search.py --suffix {FEAT_NAME}CLS --categories Pet --data_path /content/cache

2024-11-15 22:09:00.171686: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-15 22:09:00.192721: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-15 22:09:00.198996: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  0% 0/15 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/content/drive/MyDrive/Courses/ANLP/project/AmazonReviews2023/product_search_results/eval_search.py", line 159, in <module>
    ndcg = metric.metric_info(pos_index, pos_len)
  File "/usr/local/lib/python3.10/dist-packages/recbole/evaluator/metrics.py", line 190, in metric_info
    iranks = np.z