In [None]:
import numpy as np
import os
import torch
from datasets import load_dataset , load_from_disk
from transformers import Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from registry import *

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from google.colab import drive

# Directory inside the Colab runtime where Google Drive gets attached.
_DRIVE_MOUNT_POINT = '/content/drive'

# Attach Google Drive to the runtime at the mount point above.
drive.mount(_DRIVE_MOUNT_POINT)

In [None]:
# Pick the compute device once: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def compute_embeddings(examples, model):
    """Encode both text columns of a batch and return them as float32 arrays.

    The text pair is read from ``sentence1``/``sentence2`` when both columns
    are present, otherwise from ``question1``/``question2``.

    Args:
        examples: Mapping from column name to a batch of values. The selected
            columns are passed unchanged to ``model.encode``.
        model: Object exposing ``encode(texts, convert_to_numpy=True)`` whose
            return value supports ``.astype`` (e.g. a NumPy array).

    Returns:
        dict: ``{'embeddings1': ..., 'embeddings2': ...}`` holding the
        encodings of the first and second text column, cast to ``np.float32``.

    Raises:
        KeyError: If neither column pair is completely present in
            ``examples``.
    """
    # Supported column layouts, in order of preference.
    column_pairs = (("sentence1", "sentence2"), ("question1", "question2"))
    for first_col, second_col in column_pairs:
        if first_col in examples and second_col in examples:
            break
    else:
        # Fail with a message naming every accepted layout instead of the
        # bare KeyError('question1') a blind fallback lookup would produce.
        expected = " or ".join(f"{a!r}/{b!r}" for a, b in column_pairs)
        raise KeyError(
            f"compute_embeddings expects columns {expected}; "
            f"got {sorted(examples)}"
        )

    # Encode each side of the pair separately (no tokenization happens here;
    # that is left to the model's own encode implementation).
    embeddings_A = model.encode(examples[first_col], convert_to_numpy=True)
    embeddings_B = model.encode(examples[second_col], convert_to_numpy=True)

    # Return the new columns, normalised to float32.
    return {
        'embeddings1': embeddings_A.astype(np.float32),
        'embeddings2': embeddings_B.astype(np.float32),
    }

In [None]:
# @title Binary datasets
# For every benchmark dataset, compute sentence-pair embeddings with every
# model and save one copy per (dataset, model) combination.
for dataset_name in benchmark_datasets:
    # Output directory for this dataset; built once and reused for every model.
    path = os.path.join(DATASETS, f"{dataset_name}")
    os.makedirs(path, exist_ok=True)

    dataset_id = benchmark_datasets[dataset_name]
    if dataset_name in ("SICK", "SR"):
        # These two entries are read back with load_from_disk.
        dataset = load_from_disk(dataset_id)
    else:
        # The registry entry is unpacked into load_dataset's positional args.
        dataset = load_dataset(*dataset_id)
    print(f"Dataset: {dataset_name}")

    for model_name in model_dict:
        print(f"Model: {model_name}")
        model_id = model_dict[model_name]
        model = SentenceTransformer(model_id, device=device)
        # Map over the untouched `dataset` and bind the result to a separate
        # name: rebinding `dataset` here would feed this model's embedding
        # columns into the next model's pass, which would then re-read and
        # rewrite them for nothing.
        dataset_with_embeddings = dataset.map(
            lambda x: compute_embeddings(x, model),
            batched=True,
            batch_size=32,
            load_from_cache_file=False,  # always recompute, never reuse a cache
        )
        # Save the dataset with embeddings
        dataset_with_embeddings.save_to_disk(
            os.path.join(path, f"{dataset_name}_{model_name}")
        )

In [None]:
# @title VISLA dataset
def _load_visla_tsv(tsv_path):
    """Load one tab-separated file through the ``csv`` dataset builder."""
    return load_dataset(
        'csv',
        data_files=tsv_path,
        delimiter='\t',
    )


# Locations of the two VISLA splits under VISLA_DATA.
generic_path = os.path.join(VISLA_DATA, "Generic_VISLA.tsv")
Spatial_path = os.path.join(VISLA_DATA, "Spatial_VISLA.tsv")

# Load the generic file first, then the spatial one.
Generic_VISLA = _load_visla_tsv(generic_path)
Spatial_VISLA = _load_visla_tsv(Spatial_path)