### Setup

In [1]:
import json
import os
import pickle

import torch
from transformers import DistilBertModel, DistilBertTokenizer
from tqdm import tqdm

In [2]:
# Setup
model_name = "distilbert-base-uncased"
model_class = DistilBertModel
tokenizer_class = DistilBertTokenizer

meta_data_file = "/root/research/ReviewDiff/preprocess/data/preprocessed/meta_data.json"
train_data_file = "/root/research/ReviewDiff/preprocess/data/preprocessed/train_data.json"

# Load files
with open(meta_data_file) as file:
    meta_data = json.load(file)

with open(train_data_file) as file:
    train_data = json.load(file)

### Processing

In [3]:
def generate_embeddings(model, tokenizer, sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()

In [4]:
def process_and_save(data, model_key, model, tokenizer, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    pickle_path = os.path.join(save_dir, f"{model_key}/embeddings_768.pckl")
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    
    with open(pickle_path, "wb") as pckl_file:
        for trajectory in tqdm(data, desc="Encoding trajectories"):
            attributes = [product["attribute"] for product in trajectory]
            embeddings = generate_embeddings(model, tokenizer, attributes)
            processed_trajectory = [
                {
                    "embedding": embeddings[i],
                    "review": product["review"],
                    "asin": product["asin"],
                }
                for i, product in enumerate(trajectory)
            ]
            pickle.dump(processed_trajectory, pckl_file)

    print(f"Saved embeddings to {pickle_path}.")

### Execution

In [5]:
# Prepare model
tokenizer = tokenizer_class.from_pretrained(model_name)
model = model_class.from_pretrained(model_name)
model.eval()

# Process
save_dir = "data/embeddings"
process_and_save(train_data[:5], model_name.split("-")[0], model, tokenizer, save_dir)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Encoding trajectories: 100%|██████████| 5/5 [00:23<00:00,  4.62s/it]

Saved embeddings to data/embeddings/distilbert/embeddings_768.pckl.





### Load and check

In [8]:
import pickle

def load_and_display_data(pickle_file):
    data = []
    with open(pickle_file, 'rb') as file:
        while True:
            try:
                trajectory = pickle.load(file)
                data.append(trajectory)
            except EOFError:
                break

    print(f"Loaded data from {pickle_file}")
    print(f"Total number of trajectories: {len(data)}")

    if data:
        print(f"\n[FIRST TRAJECTORY]")
        first_trajectory = data[0]
        for product in first_trajectory:
            print(f"ASIN: {product['asin']}")
            print(f"Review: {product['review']}")
            print(f"Embedding shape: {product['embedding'].shape}")
            print(f"Embedding sample (first 5 values): {product['embedding'][:5]}")
            print("---------------")

# ===============[ EXECUTE ]===============
pickle_file = "data/embeddings/distilbert/embeddings_768.pckl"
load_and_display_data(pickle_file)

Loaded data from data/embeddings/distilbert/embeddings_768.pckl
Total number of trajectories: 5

[FIRST TRAJECTORY]
ASIN: B000VZGTPY
Review: 5.0
Embedding shape: (768,)
Embedding sample (first 5 values): [-0.0578975   0.07030657  0.50867945  0.10296871  0.1382606 ]
---------------
ASIN: B000VZGTPY
Review: 5.0
Embedding shape: (768,)
Embedding sample (first 5 values): [-0.0578975   0.07030657  0.50867945  0.10296871  0.1382606 ]
---------------
ASIN: B003O6SJEQ
Review: 5.0
Embedding shape: (768,)
Embedding sample (first 5 values): [-0.13559039  0.06522012  0.67901325  0.09240711  0.16797067]
---------------
ASIN: B00BFI5SE4
Review: 5.0
Embedding shape: (768,)
Embedding sample (first 5 values): [-0.10564767 -0.02553836  0.62826556  0.06384551  0.07079843]
---------------
ASIN: B00GURJZZI
Review: 5.0
Embedding shape: (768,)
Embedding sample (first 5 values): [-0.19269682 -0.0229433   0.6225416   0.13620986  0.04851849]
---------------
ASIN: 7138258879
Review: 5.0
Embedding shape: (768,)
E