In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, load_from_disk , Dataset, DatasetDict
from sentence_transformers import SentenceTransformer
import time

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the REPO path used by this notebook lives under that mount point.
drive.mount('/content/drive')

In [None]:
# Root of the project folder on the mounted Drive.
REPO = "/content/drive/MyDrive/00-github/sentence-embedding-sensitivity"

# Data directory inside the project, and the sub-folder used for saved datasets.
DATA = os.path.join(REPO, "Data")
DATASETS_SAVE_PATH = os.path.join(DATA, "visla_datasets")

# VISLA source files (.tsv), one per subset, both under <REPO>/VISLA.
GEN_DATA, SPA_DATA = (
    os.path.join(REPO, "VISLA", tsv_name)
    for tsv_name in ("Generic_VISLA.tsv", "Spatial_VISLA.tsv")
)

In [None]:
# Short alias -> model identifier for each embedding model used in this notebook.
# Built from ordered pairs so the iteration order is explicit: the first five
# entries are bare model names, the last three are "<organisation>/<model>" ids.
model_dict = dict(
    [
        ("par-dis-roberta", "paraphrase-distilroberta-base-v1"),
        ("roberta-base-v3", "msmarco-roberta-base-v3"),
        ("par-mpnet", "paraphrase-mpnet-base-v2"),
        ("par-xlm-r", "paraphrase-xlm-r-multilingual-v1"),
        ("labse", "LaBSE"),
        ("e5-base", "intfloat/e5-base-v2"),
        ("gte-base", "thenlper/gte-base"),
        ("bge-base-v15", "BAAI/bge-base-en-v1.5"),
    ]
)

In [None]:
# @title make dataset folder
# Create the dataset output directory, including any missing parent folders;
# exist_ok=True makes this cell safe to re-run when the directory is already there.
os.makedirs(DATASETS_SAVE_PATH, exist_ok=True)

In [None]:
# @title load generic dataset
# Read the generic VISLA file (GEN_DATA, a .tsv) as tab-separated text into a DataFrame.
generic_df = pd.read_csv(GEN_DATA, sep="\t")

In [None]:
# @title  Fast GPU modes
# Pick the compute device and derive the encoding settings from it:
#   device    - "cuda" when torch can see a GPU, otherwise "cpu"
#   gpu_name  - device name reported by torch, or "CPU"
#   ENCODE_BS - batch size (1024 on A100, 256 on other GPUs, 64 on CPU)
#   AMP_DTYPE - reduced-precision dtype for the GPU, None on CPU
#               (presumably consumed by an autocast context later - confirm at the call site)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # Allow TF32 for float32 matmuls and cuDNN ops.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision("high")
    gpu_name = torch.cuda.get_device_name(0)
    ENCODE_BS = 1024 if "A100" in gpu_name else 256
    # Native bfloat16 needs compute capability 8.0+ (Ampere and newer). Checking
    # the capability instead of the "A100" name also covers other bf16-capable
    # GPUs, while older cards still get float16.
    AMP_DTYPE = (
        torch.bfloat16
        if torch.cuda.get_device_capability(0)[0] >= 8
        else torch.float16
    )
else:
    gpu_name = "CPU"
    ENCODE_BS = 64
    AMP_DTYPE = None

print(f"Running on {gpu_name}, batch_size={ENCODE_BS}, dtype={AMP_DTYPE}")