In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset, load_from_disk
from sentence_transformers import SentenceTransformer

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that files kept on Drive are
# reachable from this runtime's filesystem.
drive.mount('/content/drive')

In [None]:
# Project layout on the mounted Google Drive.
# REPO is absolute on purpose: Drive is mounted at /content/drive, and a
# relative "content/drive/..." would be resolved against the current working
# directory instead of the mount point.
REPO = "/content/drive/MyDrive/00-github/sentence-embedding-sensitivity"
DATA = os.path.join(REPO, "Data")               # root of all data artefacts
DATASETS = os.path.join(DATA, "datasets")
SICK_DATA = os.path.join(DATA, "sick_dataset")  # referenced by the "SICK" benchmark entry
SR_DATA = os.path.join(DATA, "sr_dataset")      # referenced by the "SR" benchmark entry
VISLA_DATA = os.path.join(DATA, "VISLA")

# Short alias -> model identifier for each embedding model under study.
# NOTE(review): presumably these identifiers are handed to
# SentenceTransformer(...) later in the notebook — usage is not shown here.
model_dict = dict(
    par_dis_roberta="paraphrase-distilroberta-base-v1",
    roberta_base_v3="msmarco-roberta-base-v3",
    par_mpnet="paraphrase-mpnet-base-v2",
    par_xlm_r="paraphrase-xlm-r-multilingual-v1",
    labse="LaBSE",
    e5_base="intfloat/e5-base-v2",
    gte_base="thenlper/gte-base",
    bge_base_v15="BAAI/bge-base-en-v1.5",
)

# Benchmark name -> where to find it. Two kinds of value are used:
#   * a (dataset, config) tuple — presumably arguments for
#     datasets.load_dataset (TODO confirm against the loading code);
#   * a plain string path to a local copy under DATA — presumably read with
#     load_from_disk (TODO confirm).
_hub_benchmarks = {
    "MRPC": ("glue", "mrpc"),
    "QQP": ("glue", "qqp"),
    "PAWS": ("paws", "labeled_final"),
    "STS-B": ("glue", "stsb"),
}
_local_benchmarks = {
    "SICK": SICK_DATA,
    "SR": SR_DATA,
}
# Merge order keeps the original key order: hub entries first, then local ones.
benchmark_datasets = {**_hub_benchmarks, **_local_benchmarks}

In [None]:
# @title  Fast GPU modes
# Choose the compute device and the encode batch size / autocast dtype that
# go with it. Exposes: device, gpu_name, ENCODE_BS, AMP_DTYPE.
if torch.cuda.is_available():
    device = "cuda"
    # Allow TF32 for CUDA matmul and cuDNN ops, and request "high" float32
    # matmul precision.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision("high")
    gpu_name = torch.cuda.get_device_name(0)
    # A100 gets the larger batch and bfloat16; every other GPU gets float16.
    _is_a100 = "A100" in gpu_name
    ENCODE_BS, AMP_DTYPE = (512, torch.bfloat16) if _is_a100 else (256, torch.float16)
else:
    device = "cpu"
    # CPU fallback: small batch, no reduced-precision dtype.
    gpu_name, ENCODE_BS, AMP_DTYPE = "CPU", 64, None

print(f"Running on {gpu_name}, batch_size={ENCODE_BS}, dtype={AMP_DTYPE}")