In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset, load_from_disk , Dataset, DatasetDict
from sentence_transformers import SentenceTransformer
import time

In [None]:
# Google Drive holds the repository and its data, so it has to be mounted into
# the Colab runtime before any of the paths configured further down resolve.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# --- Paths -----------------------------------------------------------------
# Everything lives under the repository checkout on the mounted Drive.
REPO = "/content/drive/MyDrive/00-github/sentence-embedding-sensitivity"
DATA = os.path.join(REPO, "Data")
DATASETS, SICK_DATA, SR_DATA, VISLA_DATA = (
    os.path.join(DATA, subdir)
    for subdir in ("datasets", "sick_dataset", "sr_dataset", "VISLA")
)

# --- Models ----------------------------------------------------------------
# Short tag (used in output directory names) -> identifier handed to
# SentenceTransformer.
model_dict = {
    "par_dis_roberta": "paraphrase-distilroberta-base-v1",
    "roberta_base_v3": "msmarco-roberta-base-v3",
    "par_mpnet":       "paraphrase-mpnet-base-v2",
    "par_xlm_r":       "paraphrase-xlm-r-multilingual-v1",
    "labse":           "LaBSE",
    "e5_base":         "intfloat/e5-base-v2",
    "gte_base":        "thenlper/gte-base",
    "bge_base_v15":    "BAAI/bge-base-en-v1.5",
}

# --- Datasets --------------------------------------------------------------
# Benchmark name -> where to get it. A tuple is unpacked into `load_dataset`;
# a plain string is a directory passed to `load_from_disk`.
benchmark_datasets = {
    "MRPC": ("glue", "mrpc"),
    "QQP":  ("glue", "qqp"),
    "PAWS": ("paws", "labeled_final"),
    "STS":  ("glue", "stsb"),
    "SICK": SICK_DATA,
    "SR":   SR_DATA,
}

In [None]:
# @title  Fast GPU modes
# Pick the compute device, then derive the encoding batch size and the
# reduced-precision dtype from it.
if torch.cuda.is_available():
    device = "cuda"
    # Permit TF32 for CUDA matmuls and cuDNN ops.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision("high")
    gpu_name = torch.cuda.get_device_name(0)
    # An A100 gets the larger batch and bfloat16; any other GPU gets float16.
    ENCODE_BS, AMP_DTYPE = (
        (1024, torch.bfloat16) if "A100" in gpu_name else (256, torch.float16)
    )
else:
    device = "cpu"
    gpu_name = "CPU"
    ENCODE_BS, AMP_DTYPE = 64, None

print(f"Running on {gpu_name}, batch_size={ENCODE_BS}, dtype={AMP_DTYPE}")

In [None]:
# Ensure the output directory for the embedded datasets is present; an
# already-existing directory is left as it is.
os.makedirs(name=DATASETS, exist_ok=True)

In [None]:
# Benchmark name -> name of the split that the embedding loop encodes next to
# "train" and stores under the key "test" in the saved DatasetDict.
test_val = dict(
    MRPC="test",
    QQP="validation",
    PAWS="test",
    STS="validation",
    SICK="test",
    SR="test",
)

In [None]:
# @title  Main loop to embed datasets
# For every benchmark dataset and every model: encode both sentences of each
# pair on the "train" split and on the evaluation split named in `test_val`,
# then save embeddings + labels as a DatasetDict with "train" and "test" splits
# under DATASETS/<dataset_name>_<model_name>.


def _pick_text_columns(column_names):
    """Return the names of the two text columns of a sentence-pair split.

    Args:
        column_names: Column names available in the split.

    Returns:
        ("sentence1", "sentence2") when both are present, otherwise
        ("question1", "question2").

    Raises:
        KeyError: If neither pair of columns is fully present, so an
            unexpected schema fails before any model is run on it.
    """
    for pair in (("sentence1", "sentence2"), ("question1", "question2")):
        if all(col in column_names for col in pair):
            return pair
    raise KeyError(
        f"No known sentence-pair columns among {list(column_names)}"
    )


def _encode_side(model, texts, side):
    """Encode one side of the sentence pairs and print how long it took.

    Args:
        model: Object exposing the `encode` method used below (here a
            SentenceTransformer).
        texts: The sentences to encode.
        side: Label used in the timing message ("A" or "B").

    Returns:
        Whatever `model.encode` returns with `convert_to_numpy=True`.
    """
    start = time.perf_counter()
    embeddings = model.encode(
        texts,
        batch_size=ENCODE_BS,
        convert_to_numpy=True,
        normalize_embeddings=False,
        show_progress_bar=True,
    )
    print(f"Encoded side {side} in {time.perf_counter() - start:.1f} seconds")
    return embeddings


for dataset_name, dataset_id in benchmark_datasets.items():
    print(f"\n=== Dataset: {dataset_name} ===")
    start = time.perf_counter()
    # A tuple holds the arguments for `load_dataset`; a string is a local path.
    if isinstance(dataset_id, tuple):
        dataset = load_dataset(*dataset_id)
        source = "HuggingFace"
    else:
        dataset = load_from_disk(dataset_id)
        source = "disk"
    print(f"Loaded {dataset_name} from {source} in {time.perf_counter() - start:.1f} seconds")

    splits = ["train", test_val[dataset_name]]

    # Load each model and encode
    for model_name, model_id in model_dict.items():
        print(f"--- Model: {model_name} ---")

        # Load the model once per (dataset, model) pair; it does not depend on
        # the split, so reloading it inside the split loop is wasted work.
        start = time.perf_counter()
        model = SentenceTransformer(model_id, device=device)
        print(f"Loaded model in {time.perf_counter() - start:.1f} seconds")

        # Best-effort cast of the underlying transformer to lower precision.
        # A failure is reported instead of being swallowed, and the model then
        # simply keeps the precision it was loaded with.
        if device == "cuda":
            start = time.perf_counter()
            try:
                model._first_module().auto_model.to(dtype=AMP_DTYPE, device=device)
            except Exception as exc:
                print(
                    f"Warning: could not cast {model_name} to {AMP_DTYPE}; "
                    f"keeping loaded precision ({exc!r})"
                )
            else:
                print(f"Moved model to GPU in {time.perf_counter() - start:.1f} seconds")

        dataset_dict = {}
        for split in splits:
            split_data = dataset[split]

            # Choose text columns
            colA, colB = _pick_text_columns(split_data.column_names)
            print(f"Processing split: {split}, columns: {colA}, {colB}")

            # Pull texts as plain lists so the encoder and `Dataset.from_dict`
            # receive ordinary Python sequences whatever column type the
            # installed `datasets` version returns.
            start = time.perf_counter()
            texts_a = list(split_data[colA])
            texts_b = list(split_data[colB])
            print(f"Pulled {len(texts_a)} texts from dataset in {time.perf_counter() - start:.1f} seconds")

            # Encode both sides under the same inference/autocast settings.
            with torch.inference_mode():
                with torch.autocast("cuda", dtype=AMP_DTYPE, enabled=(device == "cuda")):
                    embA = _encode_side(model, texts_a, "A")
                    embB = _encode_side(model, texts_b, "B")

            # The evaluation split is always stored as "test", whatever its
            # name is in the source dataset.
            out_split = "train" if split == "train" else "test"
            dataset_dict[out_split] = Dataset.from_dict({
                "embedding1": embA,
                "embedding2": embB,
                "label": list(split_data["label"]),
            })

        # Save to disk
        save_path = os.path.join(DATASETS, f"{dataset_name}_{model_name}")
        DatasetDict(dataset_dict).save_to_disk(save_path)
        print(f"Saved: {save_path}")

        # Clean up before the next model is loaded
        del model
        if device == "cuda":
            torch.cuda.empty_cache()