In [1]:
colab = False

# Default to the current working directory; when the `colab` flag is set,
# mount Google Drive and write outputs there instead.
save_path = "./"
if colab:
    from google.colab import drive

    drive.mount("/content/drive")
    save_path = "/content/drive/MyDrive/"

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

In [None]:
# TODO: load data frame
# Placeholder frame until real data is loaded: three sample books, each
# column's values repeated 50000 times (150000 rows in total).
df = pd.DataFrame(
    {
        column: values * 50000
        for column, values in (
            ("book_id", [1, 2, 3]),
            (
                "description",
                [
                    "A thrilling tale of adventure and mystery.",
                    "An introspective look into the life of an artist.",
                    "A historical account of ancient civilizations.",
                ],
            ),
        )
    }
)

In [None]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# always come from the same pretrained weights/vocabulary.
model_name = "microsoft/deberta-v3-base"
# use_fast=False requests the non-"fast" tokenizer implementation.
# NOTE(review): presumably chosen deliberately for this checkpoint — confirm
# before switching to the fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Model whose `last_hidden_state` output is pooled in the encoding cell below.
model = AutoModel.from_pretrained(model_name)

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model's parameters onto the selected device before inference.
model.to(device)
print(f"using device: {device}")

In [None]:
# Encode every description with the model in mini-batches and keep two
# fixed-size vectors per row:
#   * cls_embeddings  - hidden state at sequence position 0
#   * mean_embeddings - mean of the hidden states over non-padding tokens
# Both arrays are saved under `save_path` as .npy files.
batch_size = 32
descriptions = df["description"].tolist()

# Fail early with a clear message: np.concatenate further down would raise a
# less helpful ValueError if no batch was ever produced.
if not descriptions:
    raise ValueError("df['description'] is empty; nothing to encode")

cls_embeddings = []
mean_embeddings = []

model.eval()  # disable dropout so the embeddings are deterministic
with torch.no_grad():  # inference only; no autograd graph needed
    for i in tqdm(
        range(0, len(descriptions), batch_size), desc="Encoding descriptions"
    ):
        batch_desc = descriptions[i : i + batch_size]

        # Pad only up to the longest sequence in this batch (truncation still
        # caps it at 512 tokens). Padding every batch to 512 makes the model
        # process hundreds of pad positions for short descriptions; padded
        # positions are excluded through the attention mask either way.
        inputs = tokenizer(
            batch_desc,
            return_tensors="pt",
            add_special_tokens=True,
            truncation=True,
            padding=True,
            max_length=512,
        )

        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state

        # First position of every sequence.
        cls_emb = hidden[:, 0, :]

        # Masked mean pooling: zero out padding positions, sum over the
        # sequence axis, and divide by the number of real tokens. The mask is
        # kept with a trailing singleton axis and broadcast over the hidden
        # dimension instead of being expanded to the full hidden size.
        mask = inputs["attention_mask"].unsqueeze(-1).float()
        sum_emb = torch.sum(hidden * mask, dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_count = torch.clamp(mask.sum(dim=1), min=1e-9)
        mean_emb = sum_emb / token_count

        # Move results off the device right away so accelerator memory does
        # not grow with the number of batches.
        cls_embeddings.append(cls_emb.cpu().numpy())
        mean_embeddings.append(mean_emb.cpu().numpy())

# Stack the per-batch arrays into one array per pooling strategy.
cls_embeddings = np.concatenate(cls_embeddings, axis=0)
mean_embeddings = np.concatenate(mean_embeddings, axis=0)

np.save(save_path + "cls_embeddings.npy", cls_embeddings)
np.save(save_path + "mean_embeddings.npy", mean_embeddings)

print(cls_embeddings.shape)
print(mean_embeddings.shape)