In [None]:
#cell 1
import os
import random
import warnings
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses, InputExample
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
from torch.utils.data.sampler import Sampler  # custom batch sampler

In [None]:
#cell 2
# Helpers + MINIMAL preprocessing rules for training pairs
import re, unicodedata
import warnings
from torch.utils.data.sampler import Sampler
import random

# Silence two noisy warning sources for the whole process. Each call prepends an
# "ignore" rule to the global filter list, so the order of these calls decides
# which rule is consulted first.
# 1) Every SyntaxWarning, whatever its message or originating module.
warnings.filterwarnings("ignore", category=SyntaxWarning)
# 2) Any warning whose text starts with this message (the argument is treated as
#    a regular expression matched against the beginning of the warning text).
warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True")

# ---------- Normalization ----------
def _norm_text(s):
    """Unicode-normalize (NFKC) and strip whitespace; return None if empty."""
    if s is None:
        return None
    if not isinstance(s, str):
        s = str(s)
    s = unicodedata.normalize("NFKC", s).strip()
    return s if s else None

def _nz(s, default="unknown"):
    """Return the normalized text of ``s``, or ``default`` when nothing is left."""
    normalized = _norm_text(s)
    if normalized is None:
        return default
    return normalized

def _truncate(text: str, max_chars: int = 400) -> str:
    if not text:
        return ""
    return text[:max_chars]

# ---------- Heuristics to drop ONLY useless anchors ----------
_NA_LIKE = {"na", "n/a", "null", "none", "nan", "[na]", "[none]"}

_num_re = re.compile(r"^[+-]?(\d+(\.\d+)?|\.\d+)$")         # pure int/float
_date_re = re.compile(r"^\d{4}-\d{2}-\d{2}$")               # YYYY-MM-DD

def _is_pure_number(s: str) -> bool:
    """Tell whether the whole of ``s`` is a bare integer or decimal number."""
    return _num_re.fullmatch(s) is not None

def _is_date_like(s: str) -> bool:
    """Tell whether the whole of ``s`` has the ``YYYY-MM-DD`` digit shape."""
    return _date_re.fullmatch(s) is not None

def _is_na_like(s: str) -> bool:
    """Tell whether ``s``, compared case-insensitively, is a known NA placeholder."""
    lowered = s.lower()
    return lowered in _NA_LIKE

def should_keep_example(author_term: str, ontology_id: str, label: str) -> bool:
    """
    Decide whether a row is usable as a training/evaluation pair.

    Anchors are kept EXACTLY as authored, but a row is rejected when:
      - the mapping is incomplete: ``ontology_id`` or ``label`` is missing, empty,
        or consists only of whitespace;
      - the anchor is clearly a placeholder: empty/NA-like, a pure number, or a
        ``YYYY-MM-DD`` date.
    Everything else (IDs like 'VLMC_6', 'Donor 2_Jej', etc.) is kept.

    Args:
        author_term: The author-provided term that becomes the anchor text.
        ontology_id: Identifier of the mapped ontology term.
        label: Label of the mapped ontology term.

    Returns:
        ``True`` when the row should be kept, ``False`` otherwise.
    """
    if not ontology_id or not label:
        return False
    # Truthiness alone accepts whitespace-only strings, which normalize to nothing
    # and would yield a positive text with an empty label.
    if _norm_text(ontology_id) is None or _norm_text(label) is None:
        return False
    anchor = _norm_text(author_term)
    if anchor is None:
        return False
    is_placeholder = (
        _is_na_like(anchor) or _is_pure_number(anchor) or _is_date_like(anchor)
    )
    return not is_placeholder

# ---------- Formatting functions (with normalization) ----------
def make_anchor(field_type: str, author_term: str, tissue: str, organism: str):
    """Build the anchor (query) text for one row.

    Returns:
        ``(anchor, prefix)`` where ``prefix`` is the normalized ``field_type``
        (``"cell_type"`` when it is missing or empty) that is later used to group
        rows into batches. Missing term/tissue/organism values fall back to the
        default placeholder of ``_nz``.
    """
    prefix = _norm_text(field_type) or "cell_type"
    segments = (
        f"{prefix}: {_nz(author_term)}",
        f"tissue: {_nz(tissue)}",
        f"organism: {_nz(organism)}",
    )
    return "; ".join(segments), prefix

def make_positive(label: str, synonyms: dict, definition: str) -> str:
    """Build the positive (document) text for an ontology term.

    The text is ``label: ...`` followed, when present, by ``synonyms: ...`` and
    ``definition: ...``, joined with ``"; "``. The ontology id/CURIE is deliberately
    NOT part of the text.

    Args:
        label: Term label; a missing label is rendered as an empty string.
        synonyms: Mapping with optional ``exact``/``narrow``/``broad``/``related``
            lists; ``None`` is treated as empty.
        definition: Term definition; only its first 400 characters are used.
    """
    by_kind = synonyms or {}
    kept = []
    # Fixed priority order: exact -> narrow -> broad -> related.
    for kind in ("exact", "narrow", "broad", "related"):
        for raw in by_kind.get(kind, []) or []:
            cleaned = _norm_text(raw)
            if cleaned:
                kept.append(cleaned)

    synonym_text = " | ".join(kept[:50])  # at most 50 synonyms overall
    definition_text = _truncate(_norm_text(definition) or "", max_chars=400)

    pieces = [f"label: {_nz(label, default='')}"]
    optional = (
        (synonym_text, f"synonyms: {synonym_text}"),
        (definition_text, f"definition: {definition_text}"),
    )
    # Sections with no content are left out entirely.
    pieces.extend(rendered for value, rendered in optional if value)
    return "; ".join(pieces)

# ---------- Convenience: build a pair ONLY if it passes filters ----------
def build_pair_if_valid(field_type, author_term, tissue, organism, label, synonyms, definition, curie):
    """
    Build one training pair, provided the row passes ``should_keep_example``.

    Returns:
        ``(anchor, positive, curie, field_prefix)`` for a row that is kept, or
        ``None`` for a row that is filtered out.
    """
    if should_keep_example(author_term, curie, label):
        anchor, field_prefix = make_anchor(field_type, author_term, tissue, organism)
        return anchor, make_positive(label, synonyms, definition), curie, field_prefix
    return None

# ---------- De-duplicate identical (anchor, ontology_id) ----------
def dedup_by_anchor_id(anchors, positives, ids, fields):
    """Drop rows whose ``(anchor, ontology id)`` pair has already been seen.

    The four inputs are parallel sequences. The first occurrence of each pair is
    kept and the input order is preserved.

    Returns:
        A tuple of four lists: anchors, positives, ids and fields of the kept rows.
    """
    first_rows = {}
    for anchor, positive, ontology_id, field in zip(anchors, positives, ids, fields):
        # setdefault stores only the first row for a key; dicts keep insertion order.
        first_rows.setdefault((anchor, ontology_id), (anchor, positive, ontology_id, field))
    rows = list(first_rows.values())
    return tuple([row[column] for row in rows] for column in range(4))

# ---------- Sampler that produces an index SEQUENCE ----------
# Ensures: (1) each batch contains a single field_type, (2) no duplicate ontology_id within a batch.
class FieldUniqueOntologySampler(Sampler):
    """Sampler that yields a flat index sequence made of pre-assembled batches.

    The sequence is meant to be cut into consecutive chunks of ``batch_size`` by a
    ``DataLoader(sampler=..., batch_size=...)``. Every full chunk satisfies:
      (1) all of its indices share one field type, and
      (2) no ontology id occurs twice in it.

    An index whose ontology id is already present in the batch being assembled is
    deferred to a later batch of the same field instead of being discarded. Indices
    that cannot be placed in any full batch are left out when ``drop_last`` is true.
    When ``drop_last`` is false, the partial batch of each field (unique ontology
    ids only) is appended AFTER all full batches, so full batches stay aligned to
    the chunk boundaries; that trailing region may mix field types.

    The order is computed once in ``__init__`` from ``seed`` and replayed unchanged
    on every iteration.

    Args:
        ontology_ids: Ontology id of each dataset item (hashable).
        field_types: Field type of each dataset item; same length as ``ontology_ids``.
        batch_size: Number of indices per batch; must be positive.
        drop_last: Whether indices that do not fit a full batch are omitted.
        seed: Seed of the private random generator used for shuffling.

    Raises:
        ValueError: If ``batch_size`` is not positive or the inputs differ in length.
    """

    def __init__(self, ontology_ids, field_types, batch_size: int, drop_last: bool = True, seed: int = 7):
        self.ontology_ids = list(ontology_ids)
        self.field_types = list(field_types)
        self.batch_size = int(batch_size)
        self.drop_last = drop_last
        self.seed = int(seed)

        if self.batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        if len(self.ontology_ids) != len(self.field_types):
            raise ValueError(
                "ontology_ids and field_types must have the same length "
                f"({len(self.ontology_ids)} != {len(self.field_types)})"
            )

        # Group item indices by field type.
        rng = random.Random(self.seed)
        field_to_idxs = {}
        for i, ft in enumerate(self.field_types):
            field_to_idxs.setdefault(ft, []).append(i)

        fields = list(field_to_idxs)
        rng.shuffle(fields)

        order = []   # concatenation of full batches, field by field
        tail = []    # partial batches, emitted only after every full batch
        for ft in fields:
            idxs = field_to_idxs[ft][:]
            rng.shuffle(idxs)
            full, leftover = self._pack_field(idxs)
            order.extend(full)
            tail.extend(leftover)

        if not self.drop_last:
            # Placing partial batches last keeps every full batch aligned with the
            # fixed-size chunks the DataLoader cuts out of this sequence.
            order.extend(tail)
        self._order = order

    def _pack_field(self, idxs):
        """Greedily pack the indices of one field into batches with unique ontology ids.

        Returns ``(full, leftover)``: ``full`` is the concatenation of all complete
        batches, ``leftover`` is the final incomplete batch (unique ontology ids).
        """
        full = []
        pending = idxs
        while pending:
            batch, seen, deferred = [], set(), []
            filled = False
            for i in pending:
                oid = self.ontology_ids[i]
                if oid in seen:
                    # Same ontology id already in this batch: retry in a later pass.
                    deferred.append(i)
                    continue
                batch.append(i)
                seen.add(oid)
                if len(batch) == self.batch_size:
                    full.extend(batch)
                    batch, seen = [], set()
                    filled = True
            if not filled:
                # Fewer distinct ontology ids than batch_size remain: no further
                # full batch is possible for this field.
                return full, batch
            pending = batch + deferred
        return full, []

    def __iter__(self):
        # Replay the precomputed sequence.
        return iter(self._order)

    def __len__(self):
        # Number of indices yielded (not the number of batches).
        return len(self._order)

In [None]:
#cell 3
HF_DATASET = "pankajrajdeo/bond-czi-benchmark"

def build_pairs(split: str):
    """Load one split of ``HF_DATASET`` and turn its rows into training pairs.

    For every mapping attribute the value under ``mapping["resolved"]`` is used,
    falling back to ``mapping["original"]`` when the resolved value is missing or
    empty. Rows rejected by ``build_pair_if_valid`` are skipped, and duplicate
    ``(anchor, ontology id)`` rows are removed at the end.

    Returns:
        ``(anchors, positives, curies, field_prefixes)`` as four parallel lists.
    """
    def pick(key, primary, fallback):
        # Resolved value first; the original mapping is the fallback.
        return primary.get(key) or fallback.get(key)

    kept = []
    for row in load_dataset(HF_DATASET, split=split):
        mapping = row.get("mapping") or {}
        resolved = mapping.get("resolved") or {}
        original = mapping.get("original") or {}

        pair = build_pair_if_valid(
            row.get("field_type"),
            row.get("author_term"),
            row.get("tissue"),
            row.get("organism"),
            pick("label", resolved, original),
            pick("synonyms", resolved, original) or {},
            pick("definition", resolved, original),
            pick("ontology_id", resolved, original),
        )
        if not pair:
            continue
        kept.append(pair)

    # Transpose the kept rows into four parallel columns (all empty if nothing was kept).
    columns = list(zip(*kept)) or [(), (), (), ()]
    return dedup_by_anchor_id(*columns)


# Build the anchor / positive / ontology-id / field-prefix columns for each split
train_anchors, train_positives, train_ids, train_fields = build_pairs("train")
val_anchors, val_positives, val_ids, val_fields = build_pairs("validation")
test_anchors, test_positives, test_ids, test_fields = build_pairs("test")

print("Train pairs:", len(train_anchors))
print("Validation pairs:", len(val_anchors))
print("Test pairs:", len(test_anchors))

# One InputExample per (anchor, positive) training pair
train_examples = [
    InputExample(texts=list(pair)) for pair in zip(train_anchors, train_positives)
]

# Batch size (T4-friendly): 16, or the whole training set when it is smaller
batch_size = min(16, len(train_examples))

# Index SEQUENCE sampler: single field type and unique ontology ids per batch
sampler = FieldUniqueOntologySampler(
    train_ids,
    train_fields,
    batch_size,
    drop_last=True,
    seed=7,
)

from torch.utils.data import DataLoader

# The sampler already fixes the visiting order, so the loader only has to cut the
# sequence into consecutive chunks of `batch_size` (hence sampler=, not
# batch_sampler=, and shuffle=False).
train_loader = DataLoader(
    train_examples,
    batch_size=batch_size,
    shuffle=False,
    sampler=sampler,
    drop_last=True,
)

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


Train pairs: 58352
Validation pairs: 3224
Test pairs: 3163


In [None]:
#cell 4

from collections import defaultdict
from sentence_transformers.evaluation import InformationRetrievalEvaluator

def build_ir(split_anchors, split_positives, split_ids, name: str):
    """
    Build an information-retrieval evaluator for one split.

    Queries are the anchor strings (``q0``, ``q1``, ...). The corpus holds ONE
    document per distinct ``(ontology_id, positive text)`` pair, and each query is
    relevant to every document variant that shares its ontology_id.

    Storing one document per row instead would add an identical copy of the same
    text for every row mapped to the same ontology term. Those copies crowd the
    top-k results and distort precision/recall@k.

    Args:
        split_anchors: Anchor (query) strings of the split.
        split_positives: Positive (document) strings, parallel to ``split_ids``.
        split_ids: Ontology id of each row; rows with a falsy id add no document
            and get an empty relevant set.
        name: Name passed to the evaluator (used in its reported metric names).

    Returns:
        A configured ``InformationRetrievalEvaluator``.
    """
    queries = {f"q{i}": anchor for i, anchor in enumerate(split_anchors)}

    corpus = {}
    # ontology_id -> {positive text: doc_id}; distinct texts stay distinct variants.
    variants_by_pid = {}
    for pid, positive in zip(split_ids, split_positives):
        if not pid:
            continue
        variants = variants_by_pid.setdefault(pid, {})
        if positive in variants:
            continue  # byte-identical document already in the corpus
        doc_id = f"{pid}__{len(variants)}"
        variants[positive] = doc_id
        corpus[doc_id] = positive

    relevant = {}
    for i, pid in enumerate(split_ids):
        variants = variants_by_pid.get(pid) if pid else None
        # A fresh set per query, so no two queries share a mutable object.
        relevant[f"q{i}"] = set(variants.values()) if variants else set()

    return InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant,
        name=name,
        show_progress_bar=True,
    )

# Retrieval evaluator over the validation split
val_evaluator = build_ir(
    split_anchors=val_anchors,
    split_positives=val_positives,
    split_ids=val_ids,
    name="bond_benchmark_val",
)


In [None]:
#cell 5
from sentence_transformers import SentenceTransformer, losses

# Load the `pankajrajdeo/bond-embed-v1` checkpoint and attach an in-batch-negatives
# ranking loss (similarity scores are multiplied by `scale` before the softmax).
model = SentenceTransformer(model_name_or_path="pankajrajdeo/bond-embed-v1")
loss = losses.MultipleNegativesRankingLoss(model, scale=32.0)

In [None]:
#cell 6
output_dir = "mnrl-bond-embed-checkpoints"
os.makedirs(output_dir, exist_ok=True)

# Collate batches with SentenceTransformers' smart batching (not torch's default_collate)
train_loader.collate_fn = model.smart_batching_collate

steps_per_epoch = len(train_loader)
warmup_steps = int(0.1 * steps_per_epoch)  # 10% of the batches of ONE epoch

# NOTE(review): the Trainer-style metrics table this cell prints suggests
# sentence-transformers >= 3. There `fit()` appears to be a compatibility wrapper
# that may rebuild the batches itself - confirm that the batch composition produced
# by FieldUniqueOntologySampler actually reaches the loss.
fit_kwargs = dict(
    train_objectives=[(train_loader, loss)],   # train_loader (not compat_loader)
    epochs=5,
    warmup_steps=warmup_steps,
    optimizer_params={"lr": 2e-5},
    evaluator=val_evaluator,
    evaluation_steps=50,                       # evaluate/log every 50 steps
    save_best_model=True,
    output_path=output_dir,
    use_amp=True,                              # automatic mixed precision
    checkpoint_path=output_dir,
    checkpoint_save_total_limit=5,
    checkpoint_save_steps=steps_per_epoch,     # one checkpoint per epoch
)
model.fit(**fit_kwargs)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Bond Benchmark Val Cosine Accuracy@1,Bond Benchmark Val Cosine Accuracy@3,Bond Benchmark Val Cosine Accuracy@5,Bond Benchmark Val Cosine Accuracy@10,Bond Benchmark Val Cosine Precision@1,Bond Benchmark Val Cosine Precision@3,Bond Benchmark Val Cosine Precision@5,Bond Benchmark Val Cosine Precision@10,Bond Benchmark Val Cosine Recall@1,Bond Benchmark Val Cosine Recall@3,Bond Benchmark Val Cosine Recall@5,Bond Benchmark Val Cosine Recall@10,Bond Benchmark Val Cosine Ndcg@10,Bond Benchmark Val Cosine Mrr@10,Bond Benchmark Val Cosine Map@100
50,No log,No log,0.130583,0.173697,0.19665,0.243486,0.130583,0.130376,0.125186,0.112996,0.046482,0.100972,0.134837,0.188858,0.184859,0.160635,0.222709
100,No log,No log,0.126861,0.170596,0.19603,0.247829,0.126861,0.126861,0.121774,0.111352,0.047234,0.100781,0.13724,0.198335,0.185297,0.158355,0.227558
150,No log,No log,0.139268,0.188586,0.223015,0.269541,0.139268,0.142266,0.138027,0.125589,0.049242,0.109362,0.150593,0.214629,0.204453,0.174692,0.251127
200,No log,No log,0.181452,0.231079,0.26768,0.311725,0.181452,0.180831,0.174007,0.159553,0.059271,0.126708,0.169007,0.232746,0.246868,0.217735,0.292928
250,No log,No log,0.187655,0.233871,0.26799,0.316377,0.187655,0.183002,0.173759,0.158127,0.0614,0.132411,0.174659,0.242962,0.250429,0.22196,0.299646
300,No log,No log,0.170596,0.223325,0.259615,0.314206,0.170596,0.170285,0.16402,0.150993,0.059347,0.127774,0.1708,0.238631,0.240983,0.209127,0.295679
350,No log,No log,0.187035,0.238524,0.274194,0.326613,0.187035,0.187552,0.17866,0.16281,0.060174,0.134368,0.177525,0.248907,0.255261,0.224884,0.305717
400,No log,No log,0.203474,0.252481,0.28598,0.345844,0.203474,0.202647,0.193921,0.181638,0.059488,0.132477,0.177033,0.253466,0.272058,0.240746,0.321839
450,No log,No log,0.21464,0.259926,0.295906,0.353288,0.21464,0.20885,0.199069,0.183499,0.064024,0.139528,0.184933,0.256968,0.278748,0.249906,0.331726
500,1.469900,No log,0.203784,0.254653,0.297457,0.361042,0.203784,0.199442,0.194541,0.185236,0.063983,0.135529,0.181696,0.261088,0.278713,0.243998,0.330888


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.78s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.83s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.95s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.15s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.39s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.48s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.77s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.84s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.67s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.89s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.58s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.30s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.36s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.19s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.13s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.19s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.85s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.90s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.86s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.87s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.92s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.88s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.91s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.95s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.85s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.91s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.86s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.92s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.92s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.88s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.86s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.91s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.82s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.32s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.29s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.30s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.57s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.46s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.29s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.32s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.35s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.18s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.13s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.24s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.39s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.50s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.57s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.32s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.36s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.26s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.24s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.35s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.17s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.29s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.28s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.37s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.57s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.73s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.59s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.64s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.57s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.41s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.61s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.84s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.17s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.97s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.51s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.94s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.97s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.96s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  3.00s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.02s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.30s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.60s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.90s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.20s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.19s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.17s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.19s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.96s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.12s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.14s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.65s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  4.00s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  4.00s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.15s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.12s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.12s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.22s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.33s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.53s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.96s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.14s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.05s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.14s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.17s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.20s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.29s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.32s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.49s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.35s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.19s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  4.00s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.17s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.19s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.34s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.35s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.48s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.21s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.16s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.34s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.17s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.16s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.47s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.26s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.85s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.21s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.39s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.30s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.62s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.51s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.38s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.66s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.37s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.45s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.36s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.03s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.98s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.16s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.33s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.44s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.47s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.37s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.33s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.44s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.40s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.51s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.46s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.93s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.29s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.49s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.21s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.35s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.43s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.30s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.20s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.00s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.16s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.00s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.04s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.93s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.59s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.31s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.28s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.80s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.93s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.40s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.51s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.44s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.28s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.53s/it]


In [None]:
# Persist the trained model to a directory relative to the current working directory.
# NOTE(review): `model` is not defined in this cell; it has to come from an earlier
# cell, so this only works after that cell has run — confirm on a fresh kernel.
final_dir = "mnrl-bond-embed-final"
model.save(final_dir)
print("Final model saved to:", final_dir)

In [None]:
#@title Cell 2: Imports & global config
import os, re, unicodedata, json, random, warnings
from typing import List, Dict, Tuple, Optional
import pandas as pd
from datasets import load_dataset
from collections import defaultdict

import torch
from torch.utils.data import DataLoader
from torch.utils.data.sampler import Sampler

from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Silence every warning whose category is UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# Silence warnings by message: `message` is a regex matched against the start of the warning text.
warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True")

# ---- I/O config ----
USE_LOCAL_JSONL = True  # <- set True to use the hydrated JSONL you generated

# Directory containing the hydrated benchmark JSONL splits. The default is the
# original machine-specific location; set the BOND_BENCHMARK_DATA_DIR environment
# variable to point at another copy of the data without editing this cell.
BENCHMARK_DATA_DIR = os.environ.get(
    "BOND_BENCHMARK_DATA_DIR",
    "/Users/rajlq7/Downloads/Terms/BOND/Miscellaneous/CellxGene_Benchmark/Miscellaneous/CellxGene_Benchmark/benchmark_data",
)
JSONL_TRAIN = f"{BENCHMARK_DATA_DIR}/bond_czi_benchmark_data_hydrated_train.jsonl"
JSONL_DEV   = f"{BENCHMARK_DATA_DIR}/bond_czi_benchmark_data_hydrated_dev.jsonl"
JSONL_TEST  = f"{BENCHMARK_DATA_DIR}/bond_czi_benchmark_data_hydrated_test.jsonl"

HF_DATASET  = "pankajrajdeo/bond-czi-benchmark"  # fallback (expects same schema logically)

# ---- training config ----
BASE_MODEL        = "pankajrajdeo/bond-embed-v1"   # keep your base; swap if you want
EPOCHS            = 5
LR                = 2e-5
BATCH_SIZE        = 16         # effective batch; see sampler below
WARMUP_RATIO      = 0.1
OUTPUT_DIR        = "mnrl-bond-embed-checkpoints"
FINAL_DIR         = "mnrl-bond-embed-final"

# Positive text composition caps (just for safety)
MAX_SYNS_PER_BUCKET = 50  # per synonym scope (exact / narrow / broad / related)
MAX_DEF_CHARS       = 400
MAX_POS_PER_ANCHOR  = 6   # cap how many positive variants we duplicate per anchor


In [None]:
#@title Cell 3: Text helpers (light placeholder filtering + formatting)
# Lower-cased strings treated as "no value" placeholders; compared against `s.lower()`.
_NA_LIKE = {"na","n/a","null","none","nan","[na]","[none]"}
# Optionally signed integer or decimal with nothing else around it, e.g. "12", "-3.5", ".5".
_num_re = re.compile(r"^[+-]?(\d+(\.\d+)?|\.\d+)$")
# Date shape YYYY-MM-DD (digit counts only; calendar validity is not checked).
_date_re = re.compile(r"^\d{4}-\d{2}-\d{2}$")

def _norm_text(s: Optional[str]) -> Optional[str]:
    if s is None:
        return None
    if not isinstance(s, str):
        s = str(s)
    s = unicodedata.normalize("NFKC", s).strip()
    return s if s else None

def _nz(s, default="unknown") -> str:
    """Normalize `s` with `_norm_text`; return `default` when that yields None."""
    normalized = _norm_text(s)
    if normalized is None:
        return default
    return normalized

def _is_pure_number(s: str) -> bool:
    """True when the whole string matches `_num_re` (a bare int/float literal)."""
    return _num_re.fullmatch(s) is not None

def _is_date_like(s: str) -> bool:
    """True when the whole string matches `_date_re` (YYYY-MM-DD digit pattern)."""
    return _date_re.fullmatch(s) is not None

def should_keep_example(author_term: str, ontology_id: str, label: str) -> bool:
    """
    Decide whether a row is usable as a training anchor.

    A row is rejected when:
      - `ontology_id` or `label` is falsy (no usable mapping), or
      - the normalized `author_term` is empty, an NA-like placeholder,
        a bare number, or a YYYY-MM-DD date.
    Everything else is kept.
    """
    has_mapping = bool(ontology_id) and bool(label)
    if not has_mapping:
        return False

    term = _norm_text(author_term)
    if term is None:
        return False

    is_placeholder = (
        term.lower() in _NA_LIKE
        or _is_pure_number(term)
        or _is_date_like(term)
    )
    return not is_placeholder

def make_anchor(field_type: str, author_term: str, tissue: str, organism: str):
    """Build the anchor text and its batching group key.

    Returns a 2-tuple:
      - anchor text: "<prefix>: <term>; tissue: <tissue>; organism: <organism>"
      - group key:   "<prefix>|<organism>"
    `prefix` is the normalized `field_type`, or "cell_type" when that is empty.
    The other parts go through `_nz`, so missing values use its default.
    """
    prefix = _norm_text(field_type) or "cell_type"
    term_text = _nz(author_term)
    tissue_text = _nz(tissue)
    organism_text = _nz(organism)

    anchor_text = "; ".join([
        f"{prefix}: {term_text}",
        f"tissue: {tissue_text}",
        f"organism: {organism_text}",
    ])
    group_key = f"{prefix}|{organism_text}"
    return anchor_text, group_key

def _truncate(text: str, max_chars: int = 400) -> str:
    if not text:
        return ""
    return text[:max_chars]

def _cap_syns(lst: List[str], cap:int=50) -> List[str]:
    """Normalize, de-duplicate and cap a list of synonyms.

    Each entry is normalized with `_norm_text`; empty entries are dropped and
    duplicates are removed case-insensitively, keeping the first spelling seen.
    Input order is preserved.

    Args:
        lst: Synonym strings; None or an empty list yields [].
        cap: Maximum number of synonyms to return. A value <= 0 yields [].

    Returns:
        At most `cap` normalized, unique synonyms.
    """
    out: List[str] = []
    seen = set()
    for s in lst or []:
        # Check the cap before appending so that cap <= 0 returns nothing
        # (checking only after the append let one item through).
        if len(out) >= cap:
            break
        t = _norm_text(s)
        if not t:
            continue
        k = t.lower()
        if k in seen:
            continue
        seen.add(k)
        out.append(t)
    return out

def compose_positive_text(label: str, synonyms: Dict[str, List[str]], definition: str) -> str:
    """Render an ontology term as a single positive-text string.

    Layout: "label: <label>[; synonyms: a | b | ...][; definition: <text>]".
    Synonyms are taken per scope in the order exact, narrow, broad, related,
    each scope capped at MAX_SYNS_PER_BUCKET; the definition is truncated to
    MAX_DEF_CHARS. Empty synonym/definition sections are omitted. Ontology ids
    are never part of the text.
    """
    buckets = synonyms or {}
    merged: List[str] = []
    for scope in ("exact", "narrow", "broad", "related"):
        merged.extend(_cap_syns(buckets.get(scope, []), MAX_SYNS_PER_BUCKET))

    short_def = _truncate(_norm_text(definition) or "", MAX_DEF_CHARS)

    segments = [f"label: {_nz(label, default='')}"]
    if merged:
        segments.append("synonyms: " + " | ".join(merged))
    if short_def:
        segments.append("definition: " + short_def)
    return "; ".join(segments)


In [None]:
#@title Cell 4: Load dataset (prefer LOCAL JSONL from your hydrator)
def load_split(split_name: str):
    """Load one dataset split.

    With USE_LOCAL_JSONL set, reads the JSONL file configured for
    `split_name` ("train", "validation", "dev" or "test"); any other name
    raises KeyError. Otherwise loads the split from HF_DATASET, requesting
    "dev" when "validation" is asked for.
    """
    if not USE_LOCAL_JSONL:
        # Fallback — expects the HF dataset to expose equivalent flattened fields
        hub_aliases = {"validation": "dev"}
        hub_split = hub_aliases.get(split_name, split_name)
        return load_dataset(HF_DATASET, split=hub_split)

    local_files = {
        "train": JSONL_TRAIN,
        "validation": JSONL_DEV,
        "dev": JSONL_DEV,
        "test": JSONL_TEST,
    }
    jsonl_path = local_files[split_name]
    # A single JSONL file is exposed by the json loader under the "train" split.
    return load_dataset("json", data_files=jsonl_path, split="train")

# Load the three splits; "validation" resolves to the dev file/split inside load_split.
train_ds = load_split("train")
val_ds   = load_split("validation")
test_ds  = load_split("test")

# Last expression of the cell: shows the (train, validation, test) sizes as a quick sanity check.
len(train_ds), len(val_ds), len(test_ds)


In [None]:
#@title Cell 5: Build training pairs with one-to-many & obsolete→replaced handling
def row_get(r: dict, key: str, default=None):
    """Like `r.get(key, default)`, except an empty list also yields `default`.

    Other falsy values stored under `key` (None, "", 0) are returned as-is.
    """
    value = r.get(key, default)
    is_empty_list = isinstance(value, list) and not value
    return default if is_empty_list else value

def collect_positive_variants_from_row(r: dict) -> Tuple[str, List[str]]:
    """
    Return (pid, positive_text_variants[]) for one dataset row.

    pid is `resolved_ontology_id` when present, else `original_ontology_id`,
    else "". Variants are built in this order:
      1. the resolved term text, when a resolved id exists;
      2. the original term text, when an original id exists and the original
         is flagged obsolete or its label differs from the resolved label.
    Variants that are identical after lower-casing and whitespace collapsing
    are dropped, and the list is capped at MAX_POS_PER_ANCHOR.
    """
    scopes = ("exact", "narrow", "broad", "related")

    def _term_bundle(prefix: str):
        # (label, {scope: synonyms}, definition) read from the `<prefix>_*` columns.
        label = row_get(r, f"{prefix}_label", "")
        definition = row_get(r, f"{prefix}_definition", "")
        syns = {
            scope: row_get(r, f"{prefix}_synonyms_{scope}", []) or []
            for scope in scopes
        }
        return label, syns, definition

    original_id = row_get(r, "original_ontology_id", "")
    resolved_id = row_get(r, "resolved_ontology_id", "")
    pid = resolved_id or original_id or ""

    original_label, original_syns, original_def = _term_bundle("original")
    obsolete_flag = int(row_get(r, "original_is_obsolete", 0) or 0)
    resolved_label, resolved_syns, resolved_def = _term_bundle("resolved")

    candidates = []
    if resolved_id:
        candidates.append(compose_positive_text(resolved_label, resolved_syns, resolved_def))
    if original_id:
        # Keep the original wording too when it is obsolete or labelled differently.
        if obsolete_flag == 1 or (original_label and original_label != resolved_label):
            candidates.append(compose_positive_text(original_label, original_syns, original_def))

    # First occurrence wins; dict insertion order keeps the variant order stable.
    unique_by_key = {}
    for text in candidates:
        key = re.sub(r"\s+", " ", (text or "").lower()).strip()
        if key and key not in unique_by_key:
            unique_by_key[key] = text
    return pid, list(unique_by_key.values())[:MAX_POS_PER_ANCHOR]

def build_pairs_from_split(ds) -> Tuple[List[str], List[str], List[str], List[str]]:
    """
    Convert one dataset split into index-aligned training lists.

    Returns:
      anchors[], positives[], pids[], group_keys[]

    An anchor is repeated once for every positive variant of its row, and an
    exact (anchor, pid, positive) repeat is emitted only once; positives are
    compared lower-cased with runs of whitespace collapsed.
    group_key is the second value returned by make_anchor() and is intended to
    keep batches homogeneous (described by the author as f"{field_type}|{organism}").
    """
    out_anchors, out_positives, out_pids, out_groups = [], [], [], []
    emitted = set()  # (anchor, pid, normalized positive) triples already kept

    for row in ds:
        field_type = row_get(row, "field_type", "")
        author_term = row_get(row, "author_term", "")
        tissue = row_get(row, "tissue", "")
        organism = row_get(row, "organism", "")

        pid, variants = collect_positive_variants_from_row(row)

        # One anchor per row; it is paired with every positive variant below.
        anchor, group_key = make_anchor(field_type, author_term, tissue, organism)

        # Light validity guard: prefer the resolved label, else the original one.
        keep_label = row_get(row, "resolved_label", "") or row_get(row, "original_label", "")
        if not should_keep_example(author_term, pid, keep_label):
            continue

        if not variants:
            # Fallback: compose a single positive from whichever bundle has data.
            synonyms = row_get(row, "resolved_synonyms_exact", []) or row_get(row, "original_synonyms_exact", []) or []
            definition = row_get(row, "resolved_definition", "") or row_get(row, "original_definition", "") or ""
            label = row_get(row, "resolved_label", "") or row_get(row, "original_label", "") or ""
            if label:
                variants = [compose_positive_text(label, {"exact": synonyms}, definition)]
            else:
                variants = []

        for text in variants:
            triple = (anchor, pid, re.sub(r"\s+", " ", text.lower()).strip())
            if triple in emitted:
                continue
            emitted.add(triple)
            out_anchors.append(anchor)
            out_positives.append(text)
            out_pids.append(pid)
            out_groups.append(group_key)

    return out_anchors, out_positives, out_pids, out_groups

# Materialize the (anchor, positive, pid, group_key) lists for each split.
train_A, train_P, train_pid, train_group = build_pairs_from_split(train_ds)
val_A, val_P, val_pid, val_group = build_pairs_from_split(val_ds)
test_A, test_P, test_pid, test_group = build_pairs_from_split(test_ds)

print("Train pairs:", len(train_A), "Val pairs:", len(val_A), "Test pairs:", len(test_A))

# Quick diagnostics: distinct non-empty pids and distinct anchor strings in train.
print("Unique resolved IDs (train):", len({pid for pid in train_pid if pid}))
print("Unique anchors (train):", len(set(train_A)))


In [None]:
#@title Cell 6: Batch sampler — homogeneous group + unique pid per batch
class GroupUniquePidSampler(Sampler):
    """
    Yields indices in an order such that, read in consecutive runs of `batch_size`:
      1) Each batch contains a single group_key (e.g., "cell_type|Homo sapiens").
      2) Each batch has unique pid (resolved_ontology_id) — prevents false negatives.

    The full order is computed once in __init__ from a seeded RNG, so every
    iteration replays the same sequence.

    An index whose pid is already present in the batch being filled is deferred
    and retried in a later batch of the same group rather than discarded. Indices
    are left out only when the group cannot form another full batch of distinct
    pids (and, with drop_last=False, a single trailing partial batch is emitted).

    NOTE: indices are yielded one at a time. Batch boundaries therefore line up
    only when the consumer chunks the stream with the same batch_size and every
    emitted run is full (drop_last=True). With drop_last=False a short trailing
    run per group shifts the boundaries of everything that follows.

    Args:
        pids: pid of each example (parallel to `groups`).
        groups: group key of each example.
        batch_size: number of indices per batch.
        drop_last: if True, a group's trailing partial batch is not emitted.
        seed: seed for the private random.Random used for shuffling.
    """
    def __init__(self, pids: List[str], groups: List[str], batch_size: int, drop_last: bool = True, seed: int = 7):
        self.pids   = list(pids)
        self.groups = list(groups)
        self.batch  = int(batch_size)
        self.drop_last = drop_last
        self.seed   = int(seed)

        rng = random.Random(self.seed)
        group_to_idxs = defaultdict(list)
        for i, g in enumerate(self.groups):
            group_to_idxs[g].append(i)

        group_names = list(group_to_idxs.keys())
        rng.shuffle(group_names)

        order = []
        for g in group_names:
            remaining = group_to_idxs[g][:]
            rng.shuffle(remaining)
            tail = []
            while remaining:
                batch, seen_pids, deferred = [], set(), []
                made_full_batch = False
                for i in remaining:
                    pid = self.pids[i]
                    if pid in seen_pids:
                        # Same pid already in this batch: retry in a later batch
                        # instead of losing the example.
                        deferred.append(i)
                        continue
                    batch.append(i)
                    seen_pids.add(pid)
                    if len(batch) == self.batch:
                        order.extend(batch)
                        made_full_batch = True
                        batch, seen_pids = [], set()
                tail = batch
                # Another pass can only help if something was deferred, and it is
                # only guaranteed to terminate if this pass consumed a full batch.
                # A pass without a full batch already holds one index per distinct
                # pid, so no further full batch exists.
                if made_full_batch and deferred:
                    remaining = deferred + batch
                else:
                    break
            if not self.drop_last and tail:
                order.extend(tail)
        self._order = order

    def __iter__(self):
        """Iterate over the precomputed flat index order."""
        return iter(self._order)

    def __len__(self):
        """Number of indices that will be yielded (not the number of batches)."""
        return len(self._order)


In [None]:
#@title Cell 7: Build datasets, loaders, evaluator
from sentence_transformers import SentenceTransformer

# One InputExample per (anchor, positive) training pair
train_examples = [
    InputExample(texts=[anchor_text, positive_text])
    for anchor_text, positive_text in zip(train_A, train_P)
]

# Honour the configured batch size, capped at the number of examples (never below 1)
batch_size = min(BATCH_SIZE, max(len(train_examples), 1))

# The sampler emits a flat index stream laid out in batch_size-long runs;
# the DataLoader below re-chunks that stream with the same batch_size.
sampler = GroupUniquePidSampler(
    train_pid,
    train_group,
    batch_size,
    drop_last=True,
    seed=7,
)

model = SentenceTransformer(BASE_MODEL)

# The model's own collate function is supplied at construction time.
train_loader = DataLoader(
    train_examples,
    batch_size=batch_size,
    shuffle=False,
    sampler=sampler,
    drop_last=True,
    collate_fn=model.smart_batching_collate,
)

# ---- Evaluator: group *all* variants sharing the same resolved ID ----
def build_ir_evaluator(anchors, positives, pids, name: str):
    """
    Build an InformationRetrievalEvaluator from aligned pair lists.

    Queries are the anchors (one per pair, ids "q0", "q1", ...). The corpus
    holds each distinct (pid, positive text) exactly once, and every query is
    marked relevant to all corpus documents that share its pid.

    Identical positive texts for the same pid are collapsed into one document:
    keeping one copy per pair would re-encode the same text many times on each
    evaluation and let copies of a single term occupy several top-k slots.

    Args:
        anchors: query texts.
        positives: positive text of each pair (parallel to `anchors`).
        pids: ontology id of each pair; pairs with an empty pid contribute no
            corpus document and end up with an empty relevant set.
        name: evaluator name passed through to InformationRetrievalEvaluator.

    Returns:
        The configured InformationRetrievalEvaluator.
    """
    queries = {f"q{i}": a for i, a in enumerate(anchors)}

    corpus = {}
    pid_to_docids = defaultdict(set)
    seen_docs = set()  # (pid, text) pairs already present in the corpus

    for i, (pid, pos) in enumerate(zip(pids, positives)):
        if not pid:
            # Skip unknown pids in corpus; the query still exists but has zero relevant docs
            continue
        doc_key = (pid, pos)
        if doc_key in seen_docs:
            continue
        seen_docs.add(doc_key)
        # The index of the first occurrence keeps doc ids unique within a pid.
        doc_id = f"{pid}__{i}"
        corpus[doc_id] = pos
        pid_to_docids[pid].add(doc_id)

    # .get() avoids inserting empty entries into the defaultdict for unknown pids.
    relevant = {f"q{i}": pid_to_docids.get(pid, set()) for i, pid in enumerate(pids)}

    return InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant,
        name=name,
        show_progress_bar=True
    )

# Validation evaluator built from the validation pairs; it is passed to model.fit() in the training cell.
val_evaluator = build_ir_evaluator(val_A, val_P, val_pid, name="bond_benchmark_val")

# len(train_loader) is the number of batches the DataLoader will produce per pass.
print("Train batches:", len(train_loader), "Batch size:", batch_size)


In [None]:
#@title Cell 8: Train with MultipleNegativesRankingLoss
from sentence_transformers import losses
import math, os

# Make sure the output directory exists before training writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# In-batch-negatives ranking loss over the (anchor, positive) pairs; scale is fixed at 32.0.
loss = losses.MultipleNegativesRankingLoss(model=model, scale=32.0)

# Warmup length as a fraction of the batches in train_loader.
# NOTE(review): this uses len(train_loader) (one pass over the loader), not
# EPOCHS * len(train_loader) — confirm WARMUP_RATIO is meant relative to a single epoch.
warmup_steps = int(WARMUP_RATIO * max(len(train_loader), 1))

model.fit(
    train_objectives=[(train_loader, loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    optimizer_params={"lr": LR},
    evaluator=val_evaluator,
    evaluation_steps=50,            # run val_evaluator every 50 training steps
    save_best_model=True,
    output_path=OUTPUT_DIR,
    use_amp=True,                   # automatic mixed precision
    checkpoint_path=OUTPUT_DIR,     # checkpoints share the directory used for output_path
    checkpoint_save_total_limit=5,
    checkpoint_save_steps=len(train_loader)  # one checkpoint per pass over train_loader
)

# Save the model as it stands after the last step to FINAL_DIR (separate from OUTPUT_DIR).
model.save(FINAL_DIR)
print("✅ Final model saved to:", FINAL_DIR)


In [None]:
#@title Cell 9 (optional): Quick offline sanity check on the test split
from sentence_transformers import util

def quick_ir_eval(anchors, positives, pids, k: int = 10):
    """
    Print Hit@k for the module-level `model` on aligned pair lists.

    Each anchor is a query; the corpus holds every distinct (pid, positive text)
    once. A query counts as a hit when any of its k most cosine-similar corpus
    documents shares its pid. Pairs with an empty pid are neither indexed nor scored.

    Identical positive texts for the same pid are collapsed into one document so
    that copies of a single term cannot fill several of the k slots.

    Args:
        anchors: query texts.
        positives: positive text of each pair (parallel to `anchors`).
        pids: ontology id of each pair.
        k: cut-off rank; capped at the corpus size.

    Returns:
        None. The result (or the reason nothing could be evaluated) is printed.
    """
    # Build the corpus grouped by pid, one document per distinct (pid, text).
    corpus = {}
    pid_to_docids = defaultdict(set)
    seen_docs = set()
    for i, (pid, pos) in enumerate(zip(pids, positives)):
        if not pid:
            continue
        doc_key = (pid, pos)
        if doc_key in seen_docs:
            continue
        seen_docs.add(doc_key)
        did = f"{pid}__{i}"
        corpus[did] = pos
        pid_to_docids[pid].add(did)

    # Both early exits happen before any encoding work is spent.
    if not corpus:
        print("No corpus docs found for test split.")
        return

    eval_rows = [i for i, pid in enumerate(pids) if pid and pid in pid_to_docids]
    denom = len(eval_rows)
    if denom == 0:
        print("No evaluable queries in test split.")
        return

    c_ids = list(corpus.keys())
    c_texts = list(corpus.values())

    q_emb = model.encode(anchors, batch_size=64, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
    c_emb = model.encode(c_texts, batch_size=64, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)

    # Row i of `scores` holds the similarities of query i to every corpus document.
    scores = util.cos_sim(q_emb, c_emb)
    top_n = min(k, scores.size(1))

    hits_at_k = 0
    for i in eval_rows:
        topk = torch.topk(scores[i], top_n).indices.tolist()
        gold_set = pid_to_docids[pids[i]]
        if any(c_ids[j] in gold_set for j in topk):
            hits_at_k += 1

    print(f"Test Hit@{k}: {hits_at_k/denom:.3f}  ({hits_at_k}/{denom})")

# Print Hit@10 for the in-memory `model` on the test pairs.
quick_ir_eval(test_A, test_P, test_pid, k=10)
