In [None]:
# from sentence_transformers import SentenceTransformer
# import toons

# with open('legal_taxonomy.toon', 'r', encoding='utf-8') as f:
#     toon_text = f.read()

# taxonomy = toons.loads(toon_text)

# model = SentenceTransformer('all-MiniLM-L12-v2')

No sentence-transformers model found with name google/bigbird-roberta-base. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

In [1]:
from sentence_transformers import SentenceTransformer
import toons
import numpy as np
from tqdm import tqdm # Added for a progress bar

# --- Configuration ---
# Everything a reader might want to tune lives here.
MODEL_NAME = 'BAAI/bge-small-en-v1.5'
MAX_TOKENS = 512 # Token budget per chunk; used as chunk_text's default limit
TOON_INPUT = 'legal_taxonomy.toon'
TOON_OUTPUT = 'legal_taxonomy_with_embeddings.toon'

# --- Load Model and Tokenizer ---
# No explicit device is passed: SentenceTransformer then selects an available
# accelerator on its own and otherwise runs on CPU. Hard-coding 'cuda' made
# the notebook fail on machines without a usable CUDA device.
model = SentenceTransformer(MODEL_NAME)
# The model's own tokenizer is reused for chunking so token counts match what
# the model sees at encode time.
tokenizer = model.tokenizer

# --- Helper Function for Chunking ---
def chunk_text(text, max_tokens=MAX_TOKENS):
    """Split ``text`` into pieces that fit within ``max_tokens`` tokens each.

    The text is tokenized once with the module-level ``tokenizer`` (without
    special tokens), sliced into consecutive windows, and each window is
    decoded back to a string.

    The window size is ``max_tokens`` minus the number of special tokens the
    tokenizer adds to a single sequence. Those special tokens are added again
    when a chunk is later encoded, so slicing at the full ``max_tokens`` would
    push every full-size chunk over the limit and its tail would be truncated.

    Args:
        text: The string to split. May be empty or ``None``.
        max_tokens: Total token budget per chunk, special tokens included.

    Returns:
        A non-empty list of strings. Empty input (or input that tokenizes to
        nothing) yields ``[""]`` so callers always have something to encode.

    Raises:
        ValueError: If ``max_tokens`` leaves no room for content once the
            special tokens are accounted for.
    """
    if not text:
        return [""]  # Handle empty descriptions/keywords

    # Reserve room for the special tokens that get re-added at encode time.
    content_budget = max_tokens - tokenizer.num_special_tokens_to_add(pair=False)
    if content_budget <= 0:
        raise ValueError(
            f"max_tokens={max_tokens} leaves no room for content tokens"
        )

    tokens = tokenizer.encode(text, add_special_tokens=False)
    if not tokens:
        return [""]  # Handle cases where tokenizer returns empty

    # NOTE(review): decode -> re-encode is not guaranteed to reproduce the
    # exact same token count (e.g. a window starting mid-word); confirm this
    # is acceptable for texts close to the limit.
    return [
        tokenizer.decode(tokens[start:start + content_budget])
        for start in range(0, len(tokens), content_budget)
    ]

# --- Main Embedding Function (Modified) ---
def add_embeddings_to_taxonomy(taxonomy, model):
    """Attach an embedding vector to every category in ``taxonomy``.

    For each category the description and keywords are joined into one
    string, split with ``chunk_text``, encoded chunk by chunk, and the chunk
    vectors are averaged (unweighted) into a single vector.

    Args:
        taxonomy: Mapping of category name to a dict of category fields.
            ``'Description'`` and the first key starting with ``'Keywords'``
            are read; missing or ``None`` values are treated as empty text.
        model: Object exposing ``encode(list_of_str, show_progress_bar=...)``
            that returns one vector per input string.

    Returns:
        The same ``taxonomy`` object, mutated in place: each category dict
        gains an ``'embeddings'`` entry holding a plain list of floats.
    """
    print("Embedding taxonomy categories...")
    # Use tqdm for a nice progress bar
    for category_data in tqdm(taxonomy.values()):

        # 1. Combine Description and Keywords
        desc = category_data.get('Description')
        # A None description must not be embedded as the literal text "None".
        desc_text = '' if desc is None else str(desc)

        # Find the keyword field (handles 'Keywords' or 'Keywords_1', etc.).
        # Only the first matching key is used, as before.
        kw_field = next(
            (k for k in category_data
             if isinstance(k, str) and k.startswith('Keywords')),
            None,
        )
        keywords_value = category_data.get(kw_field) if kw_field else None

        if keywords_value is None:
            keywords_text = ''
        elif isinstance(keywords_value, (list, tuple)):
            # str() each item so non-string keywords do not break the join.
            keywords_text = ' '.join(
                str(kw) for kw in keywords_value if kw is not None
            )
        else:
            keywords_text = str(keywords_value)

        text_to_embed = f"{desc_text} {keywords_text}".strip()

        # 2. Apply Chunking Strategy
        chunks = chunk_text(text_to_embed)

        # 3. Encode chunks
        chunk_embs = model.encode(chunks, show_progress_bar=False)

        # 4. Average embeddings. With a single chunk the mean is that
        #    chunk's vector, so no special case is needed.
        final_embedding = np.mean(np.vstack(chunk_embs), axis=0)

        # 5. Store the final embedding
        category_data['embeddings'] = final_embedding.tolist()

    return taxonomy

# --- Execution ---
# Read the source taxonomy file and parse its TOON text.
print(f"Loading taxonomy from {TOON_INPUT}...")
with open(TOON_INPUT, mode='r', encoding='utf-8') as source_file:
    toon_text = source_file.read()
taxonomy = toons.loads(toon_text)

# Add an 'embeddings' entry to every category.
taxonomy_with_embeddings = add_embeddings_to_taxonomy(taxonomy, model)

# Serialise the enriched taxonomy and write it to the output file.
print(f"\nSaving updated taxonomy to {TOON_OUTPUT}...")
updated_toon_text = toons.dumps(taxonomy_with_embeddings)
with open(TOON_OUTPUT, mode='w', encoding='utf-8') as target_file:
    target_file.write(updated_toon_text)

print("✅ Done.")

  from .autonotebook import tqdm as notebook_tqdm


Loading taxonomy from legal_taxonomy.toon...
Embedding taxonomy categories...


100%|██████████| 66/66 [00:02<00:00, 30.04it/s]



Saving updated taxonomy to legal_taxonomy_with_embeddings.toon...
✅ Done.


In [None]:
# def add_embeddings_to_taxonomy(taxonomy, model):
    
#     for category_name, category_data in taxonomy.items():

#         desc = category_data.get('Description', '')

#         kw_field = next((k for k in category_data.keys() if k.startswith('Keywords')), None)
#         keywords_value = category_data.get(kw_field, '') if kw_field else ''
        
#         if isinstance(keywords_value, list):
#             keywords_text = ' '.join(keywords_value)
#         else:
#             keywords_text = str(keywords_value)

#         text_to_embed = f"{desc} {keywords_text}".strip()

#         embedding = model.encode(text_to_embed)

#         category_data['embeddings'] = embedding.tolist()

#     return taxonomy

# taxonomy_with_embeddings = add_embeddings_to_taxonomy(taxonomy, model)

In [None]:
# updated_toon_text = toons.dumps(taxonomy_with_embeddings)

# with open('legal_taxonomy_with_embeddings.toon', 'w', encoding='utf-8') as f:
#     f.write(updated_toon_text)