In [10]:
import toons
import math
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

# Input TOON file holding the case records, and the path the embedded copy is written to.
TOON_INPUT = "9700cases.toon"
TOON_OUTPUT = "9700cases_embedded.toon"

# Record fields that are embedded; each one is looked up by name on every record.
COLUMNS_TO_EMBED = [
    "Issue",
    "Precedent_Analysis",
    "Analysis_of_the_law",
    "Fact",
    "Respondents_Argument",
    "Petitioners_Argument",
    "Courts_Reasoning",
    "Conclusion",
]

# Read the whole document as UTF-8 text first, then parse it in a single call.
with open(TOON_INPUT, mode="r", encoding="utf-8") as toon_file:
    toon_text = toon_file.read()

records = toons.loads(toon_text)
print("Loaded:", len(records), "cases")

Loaded: 9760 cases


In [11]:
# No explicit `device` is passed: SentenceTransformer then selects an available
# accelerator on its own and falls back to CPU, so this cell also runs on machines
# without CUDA. The chosen device is printed so a slow CPU fallback is noticed early.
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
print("Model Loaded....")
print("Device:", model.device)

# Upper bound on tokens per chunk; 512 is the maximum sequence length the tokenizer
# reports for this model.
MAX_TOKENS = 512
# Number of chunks handed to the model per forward pass.
BATCH_SIZE = 64

tokenizer = model.tokenizer

def chunk_text(text, max_tokens=MAX_TOKENS):
    """Split ``text`` into strings that each fit into one model input window.

    The text is tokenized once without special tokens, cut into consecutive
    slices, and every slice is decoded back to a string. Because the returned
    strings are tokenized again when they are embedded, each slice leaves room
    for the special tokens the tokenizer adds to a single sequence; otherwise a
    full-size chunk would exceed ``max_tokens`` and lose its tail to truncation.

    Args:
        text: The text to split. Falsy values (``None``, ``""``) are treated as
            empty text.
        max_tokens: Maximum length of one chunk in tokens, special tokens
            included.

    Returns:
        A non-empty list of strings. Empty input, or input that tokenizes to
        nothing (e.g. whitespace only), yields ``[""]`` so callers always have
        at least one chunk to embed.

    Raises:
        ValueError: If ``max_tokens`` leaves no room for content tokens.
    """
    if not text:
        return [""]

    # Content budget per chunk = window size minus the special tokens added on re-encoding.
    content_budget = max_tokens - tokenizer.num_special_tokens_to_add(pair=False)
    if content_budget <= 0:
        raise ValueError(
            f"max_tokens={max_tokens} leaves no room for content after special tokens"
        )

    tokens = tokenizer.encode(text, add_special_tokens=False)
    if not tokens:
        # Nothing survived tokenization; keep the "at least one chunk" contract.
        return [""]

    return [
        tokenizer.decode(tokens[start:start + content_budget])
        for start in range(0, len(tokens), content_budget)
    ]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model Loaded....


In [12]:
# def embed_batch(text_list):
#     embeddings = []

#     for i in range(0, len(text_list), BATCH_SIZE):
#         batch = text_list[i:i + BATCH_SIZE]
#         emb = model.encode(batch, show_progress_bar=False)
#         embeddings.extend(emb)

#     return embeddings

In [13]:
def _embed_field(text):
    """Embed one field value and return the vector as a plain Python list.

    The text is split with ``chunk_text`` and all chunks are encoded together.
    A single chunk's vector is returned as-is; several chunk vectors are
    averaged element-wise into one vector.
    """
    vectors = model.encode(
        chunk_text(text),
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
    )
    if len(vectors) == 1:
        return vectors[0].tolist()
    return np.mean(np.vstack(vectors), axis=0).tolist()


all_rows_embeddings = []
print("\nEmbedding rows…")

for rec in tqdm(records):
    # One "<column>_embedding" entry per configured column; missing columns embed "".
    all_rows_embeddings.append(
        {col + "_embedding": _embed_field(rec.get(col, "")) for col in COLUMNS_TO_EMBED}
    )


Embedding rows…


  0%|          | 0/9760 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2562 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 9760/9760 [1:08:56<00:00,  2.36it/s] 


In [15]:
# zip() stops at the shorter sequence, so a partial or stale embedding run would
# silently leave some records without embeddings in the saved file. Fail loudly
# before anything is mutated or written.
if len(all_rows_embeddings) != len(records):
    raise ValueError(
        f"Expected {len(records)} embedding rows, found {len(all_rows_embeddings)}; "
        "re-run the embedding cell before saving."
    )

# Merge each row's "<column>_embedding" entries into its record (in place).
for rec, emb in zip(records, all_rows_embeddings):
    rec.update(emb)

toon_out = toons.dumps(records)

with open(TOON_OUTPUT, "w", encoding="utf-8") as f:
    f.write(toon_out)

print("\nSaved embedded TOON file →", TOON_OUTPUT)


Saved embedded TOON file → 9700cases_embedded.toon
