In [1]:

import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pickle
from tqdm import tqdm
from pathlib import Path
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Number of texts handed to the encoder per call.
batch_size = 512

print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# Load the sentence-embedding model.
# `device=` is passed to the constructor so the weights are placed on the
# selected device at construction time, instead of being loaded on whatever
# device the library auto-selects and then moved with `.to(device)`.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device=device)

In [3]:
def save_pickle(path: Path, arr: np.ndarray) -> None:
    """Pickle ``arr`` to ``path`` as one plain float32 ndarray.

    Args:
        path: Destination file. Missing parent directories are created.
        arr: Array (or array-like, e.g. a list of rows) to store. Callers are
            expected to pass a 2-D matrix; dimensionality is not checked here.

    The data is cast to float32 before writing. An input that is already a
    float32 ndarray is written as-is, without a second in-memory copy.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # asarray only copies when a dtype/type conversion is actually required,
    # unlike astype(), which always allocates a new array.
    data = np.asarray(arr, dtype=np.float32)
    with path.open("wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)


In [4]:
def encode_batch(texts: list[str], expected_dim: int = 384) -> np.ndarray:
    """Encode a batch of texts into L2-normalised sentence embeddings.

    Relies on the notebook-level ``model``, ``batch_size`` and ``device``.

    Args:
        texts: Strings to embed. May be empty.
        expected_dim: Number of columns the encoder output must have
            (defaults to 384, the width this notebook checks for).

    Returns:
        Array of shape ``(len(texts), expected_dim)``. For empty input a
        ``(0, expected_dim)`` float32 array is returned without calling the
        model.

    Raises:
        ValueError: If the encoder output is not a 2-D array with exactly
            ``len(texts)`` rows and ``expected_dim`` columns.
    """
    n_texts = len(texts)
    if n_texts == 0:
        # Nothing to encode; hand back a well-formed empty matrix so callers
        # can still stack / save it like any other batch.
        return np.empty((0, expected_dim), dtype=np.float32)

    vec = model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,   # output rows are already L2-normalised
        show_progress_bar=False,
        device=device,
    )
    # Sanity-check before returning: one row per input text, expected width.
    if vec.ndim != 2 or vec.shape != (n_texts, expected_dim):
        raise ValueError(f"expected ({n_texts},{expected_dim}), got {vec.shape}")
    return vec

In [5]:
# Encode the "prompt" column of every ./datasets/processed_chunk_*.csv and
# write the embeddings next to it under the same stem with a .pickle suffix.
embed_dim = 384  # embedding width this notebook expects from the model

for csv_path in sorted(Path("./datasets").glob("processed_chunk_*.csv")):
    df = pd.read_csv(csv_path, usecols=["prompt"])
    # NOTE(review): astype(str) turns missing prompts (NaN) into the literal
    # text "nan", which is then embedded like any other prompt — confirm that
    # this is intended.
    prompts = df["prompt"].astype(str).tolist()

    # Encode in mini-batches, keeping file order so row k of the output
    # corresponds to row k of the CSV.
    batches = []
    for start in tqdm(range(0, len(prompts), batch_size), desc=f"encode {csv_path.name}"):
        batches.append(encode_batch(prompts[start:start + batch_size]))

    if batches:
        embeddings = np.vstack(batches)      # final shape (N, embed_dim)
    else:
        # A CSV with no rows yields no batches; np.vstack([]) would raise.
        embeddings = np.empty((0, embed_dim), dtype=np.float32)

    # Final checks — catch mistakes *before* anything is written.
    assert embeddings.ndim == 2 and embeddings.shape[1] == embed_dim, embeddings.shape
    assert embeddings.shape[0] == len(prompts), (embeddings.shape, len(prompts))

    # Derive the output name from the input file rather than from a loop
    # counter: the glob is sorted lexicographically, so a counter would pair
    # e.g. processed_chunk_10.csv with index 2 (and any missing chunk would
    # shift every later index).
    out_pickle = csv_path.with_suffix(".pickle")
    save_pickle(out_pickle, embeddings)
    print(f"✅  wrote {embeddings.shape[0]:,} rows → {out_pickle}")

encode processed_chunk_1.csv: 100%|██████████| 654/654 [47:18<00:00,  4.34s/it]


✅  wrote 334,379 rows → datasets\processed_chunk_1.pickle


encode processed_chunk_2.csv: 100%|██████████| 672/672 [48:39<00:00,  4.34s/it]


✅  wrote 344,031 rows → datasets\processed_chunk_2.pickle


encode processed_chunk_3.csv: 100%|██████████| 683/683 [49:30<00:00,  4.35s/it]


✅  wrote 349,345 rows → datasets\processed_chunk_3.pickle


encode processed_chunk_4.csv: 100%|██████████| 686/686 [49:53<00:00,  4.36s/it]


✅  wrote 350,909 rows → datasets\processed_chunk_4.pickle


encode processed_chunk_5.csv: 100%|██████████| 686/686 [49:54<00:00,  4.36s/it]


✅  wrote 351,036 rows → datasets\processed_chunk_5.pickle


In [6]:
# NOTE(review): no cell shown here assigns `chunk`; presumably this clears a
# reference left over from an earlier version of the notebook — confirm it is
# still needed.
chunk = None

In [7]:
# Rebind `embeddings` so the notebook no longer holds a reference to the last
# matrix built by the encoding loop (that loop already wrote it to disk).
embeddings = None