In [1]:
import pickle
from tqdm import tqdm
import numpy as np
from transformers import CLIPModel, CLIPProcessor
from sentence_transformers import SentenceTransformer
from usearch.index import Index
import torch
import csv


def load_model(model_name):
    """Load the specified model and, for CLIP checkpoints, its processor.

    Args:
        model_name: Model identifier. Names containing "clip"
            (case-insensitive) are loaded with ``CLIPModel`` /
            ``CLIPProcessor``; every other name is loaded with
            ``SentenceTransformer``.

    Returns:
        A ``(model, processor)`` tuple. ``processor`` is ``None`` for
        SentenceTransformer models. The model is moved to the GPU when CUDA
        is available and stays on the CPU otherwise.
    """
    # Fall back to the CPU instead of crashing on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if "clip" in model_name.lower():
        model = CLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)
        return model.to(device), processor

    # Anything that is not CLIP is treated as a SentenceTransformer (SBERT) model.
    model = SentenceTransformer(model_name)
    return model.to(device), None

def txt2vec_batch(texts, model, processor):
    """Convert a batch of texts to embedding vectors.

    Args:
        texts: List of strings to embed.
        model: A CLIP model (used together with ``processor``) or, when
            ``processor`` is ``None``, a model exposing ``encode``.
        processor: The CLIP processor, or ``None`` for SentenceTransformer
            models.

    Returns:
        The embeddings as a NumPy array.
    """
    if processor is None:  # SentenceTransformer
        return model.encode(texts, convert_to_numpy=True, batch_size=32)

    # CLIP: tokenize, then run the text tower without tracking gradients.
    inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True)
    # Follow the model's own device rather than assuming CUDA, so the same
    # code works for models kept on the CPU.
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    return outputs.cpu().numpy()

def get_embedding_size(model, processor):
    """Return the embedding dimensionality by encoding a one-item probe batch.

    The size is read from the last axis of the vector produced for the
    probe text ``"test"``.
    """
    probe = ["test"]
    if not processor:  # SentenceTransformer
        probe_vec = model.encode(probe, convert_to_numpy=True)
    else:  # CLIP
        probe_vec = txt2vec_batch(probe, model, processor)
    return probe_vec.shape[-1]

def build_text_db(model_name, txt_file, index_path, batch_size=32):
    """Build a vector database from a text file.

    Each line of ``txt_file`` (stripped of surrounding whitespace) is
    embedded and added to a cosine-metric index under its zero-based line
    position. The serialized index and the position-to-text mapping are
    pickled together as a ``(index_data, idx2word)`` tuple.

    Args:
        model_name: Model identifier passed to ``load_model``.
        txt_file: Path of the UTF-8 text file, one entry per line.
        index_path: Destination path of the pickle file.
        batch_size: Number of lines embedded per model call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make the batching loop silently do nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Loading model {model_name}...")
    model, processor = load_model(model_name)

    print("Determining embedding size...")
    embedding_size = get_embedding_size(model, processor)

    print(f"Reading text from {txt_file}...")
    with open(txt_file, "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f]

    index = Index(ndim=embedding_size, metric="cos")  # Dynamically use the correct size
    # Keys are line positions, so the lookup table is just the enumerated lines.
    idx2word = dict(enumerate(texts))

    print("Generating embeddings and building the index in batches...")
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        batch_vectors = txt2vec_batch(batch_texts, model, processor)

        # Add the whole batch in one call instead of one vector at a time.
        keys = np.arange(start, start + len(batch_texts), dtype=np.uint64)
        index.add(keys, batch_vectors)

    print(f"Saving index to {index_path}...")
    with open(index_path, "wb") as f:
        # NOTE(review): relies on Index.save() with no path returning the
        # serialized index data — confirm for the installed usearch version.
        pickle.dump((index.save(), idx2word), f)

def build_text_csv(model_name, txt_file, csv_path, batch_size=32):
    """Write a CSV of text entries and their embeddings from a text file.

    The CSV starts with a header row ``word, dim_0, dim_1, ...`` and is
    followed by one row per line of ``txt_file``: the stripped line text and
    then the components of its embedding vector.
    """
    print(f"Loading model {model_name}...")
    model, processor = load_model(model_name)

    print("Determining embedding size...")
    n_dims = get_embedding_size(model, processor)

    print(f"Reading text from {txt_file}...")
    with open(txt_file, "r", encoding="utf-8") as src:
        words = [raw.strip() for raw in src.readlines()]

    print("Generating embeddings and saving to CSV...")
    with open(csv_path, "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        # Header: the label column followed by one column per dimension.
        writer.writerow(["word", *(f"dim_{d}" for d in range(n_dims))])

        for start in tqdm(range(0, len(words), batch_size)):
            chunk = words[start:start + batch_size]
            vectors = txt2vec_batch(chunk, model, processor)
            # One row per entry: its text, then the vector components.
            writer.writerows(
                [word, *vec.tolist()] for word, vec in zip(chunk, vectors)
            )

# Checkpoints to index: three CLIP variants and one sentence-transformers model.
model_names = [
    "openai/clip-vit-base-patch32",
    "openai/clip-vit-large-patch14",
    "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
    "sentence-transformers/all-MiniLM-L6-v2",
]
# Input file read by build_text_db, one entry per line.
txt_file = "../output/word_list.txt"

# Build one pickled index per model, named after the part of the model
# identifier that follows the last "/".
for model_name in model_names:
    short_name = model_name.rsplit("/", 1)[-1]
    build_text_db(model_name, txt_file, f"index_500words_2_{short_name}.pkl")

  from .autonotebook import tqdm as notebook_tqdm


Loading model openai/clip-vit-base-patch32...
Determining embedding size...
Reading text from ../output/word_list.txt...
Generating embeddings and building the index in batches...


100%|██████████| 21/21 [00:00<00:00, 51.01it/s]


Saving index to index_500words_2_clip-vit-base-patch32.pkl...
Loading model openai/clip-vit-large-patch14...
Determining embedding size...
Reading text from ../output/word_list.txt...
Generating embeddings and building the index in batches...


100%|██████████| 21/21 [00:00<00:00, 42.77it/s]


Saving index to index_500words_2_clip-vit-large-patch14.pkl...
Loading model laion/CLIP-ViT-H-14-laion2B-s32B-b79K...
Determining embedding size...
Reading text from ../output/word_list.txt...
Generating embeddings and building the index in batches...


100%|██████████| 21/21 [00:00<00:00, 28.43it/s]


Saving index to index_500words_2_CLIP-ViT-H-14-laion2B-s32B-b79K.pkl...
Loading model sentence-transformers/all-MiniLM-L6-v2...
Determining embedding size...
Reading text from ../output/word_list.txt...
Generating embeddings and building the index in batches...


100%|██████████| 21/21 [00:00<00:00, 57.38it/s]

Saving index to index_500words_2_all-MiniLM-L6-v2.pkl...



