### 0. Import libraries

In [1]:
import json
import torch
from transformers import AutoModel, AutoTokenizer


In [2]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # The torch.cuda.* queries below are only valid when a CUDA device exists,
    # so they are guarded; the name is looked up for the *current* device
    # rather than a hard-coded index 0.
    device_index = torch.cuda.current_device()
    print(f"Current device: {device_index} ({device}) -> {torch.cuda.get_device_name(device_index)}")
else:
    print(f"Current device: {device} (no CUDA device available)")

Current device: 0 (cuda) -> NVIDIA GeForce GTX 1050 Ti


### 1. Load the model and the tokenizer

In [3]:
# Candidate embedding checkpoints on the Hugging Face Hub.
model_paths = [
    'Alibaba-NLP/gte-base-en-v1.5',
    'Alibaba-NLP/gte-large-en-v1.5',
    'Alibaba-NLP/gte-Qwen2-1.5B-instruct',
]
# Short names of the datasets listed under "Data sources".
dataset_names = [
    "quora",
    "covid",
    "touche",
]

# Tokenizer and model are both taken from the first checkpoint in the list.
tokenizer = AutoTokenizer.from_pretrained(model_paths[0])
model = AutoModel.from_pretrained(model_paths[0], trust_remote_code=True)
# Move the weights to the selected device.
model = model.to(device)

Data sources: \
https://huggingface.co/datasets/mteb/quora \
https://huggingface.co/datasets/mteb/trec-covid \
https://huggingface.co/datasets/mteb/touche2020

In [4]:
def load_data(dataset_name, path, subset_type):
    """Read one JSON Lines subset of a dataset and flatten each record to text.

    Args:
        dataset_name: Value substituted into the ``{}`` placeholder of ``path``.
        path: Directory template with a ``{}`` placeholder. It is concatenated
            directly with the file name, so it must end with a path separator.
        subset_type: File stem of the subset; ``".jsonl"`` is appended to it.

    Returns:
        A list with one string per non-blank line of the file: the values of
        every field except ``"_id"``, in record order, joined with newlines,
        with leading and trailing newlines stripped.

    Raises:
        TypeError: If a record holds a non-string value outside ``"_id"``
            (raised by ``str.join``).
    """
    file_name = path.format(dataset_name) + f"{subset_type}.jsonl"

    data = []
    # Decode explicitly as UTF-8 instead of the platform-dependent default encoding.
    with open(file_name, "r", encoding="utf-8") as json_file:
        for line in json_file:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            record = json.loads(line)
            fields = [value for key, value in record.items() if key != "_id"]
            data.append("\n".join(fields).strip("\n"))

    return data


In [5]:
# Directory template; "{}" is filled with the dataset name inside load_data.
path = "../data/datasets/{}/"
# Flattened text of every record in the Quora corpus subset.
data = load_data(dataset_name="quora", path=path, subset_type="corpus")

### 2. Embed each input text using a batch size of 1

In [33]:
# Inference only: disable autograd so no computation graph is built or kept
# alive for each forward pass (saves GPU memory and time).
with torch.no_grad():
    for text in data[:3]:
        # tokenize one text at a time (batch of 1)
        batch_dict = tokenizer([text], max_length=8192, padding=True, truncation=True, return_tensors='pt').to(device)

        # get embeddings: hidden state at sequence position 0 of the single
        # batch item, copied to host memory as a NumPy array
        outputs = model(**batch_dict)
        embeddings = outputs.last_hidden_state[:, 0][0].cpu().numpy()