In [1]:
GLINER_TO_CANON = {
    # ---- People & Orgs ----
    "person": "PERSON",
    "name": "PERSON",
    "first_name": "PERSON",
    "last_name": "PERSON",
    "organization": "ORG",
    "organisation": "ORG",
    "company": "ORG",
    "org": "ORG",

    # ---- Contact ----
    "email": "EMAIL_ADDRESS",
    "email_address": "EMAIL_ADDRESS",
    "phone": "UK_PHONE_NUMBER",
    "phone_number": "UK_PHONE_NUMBER",
    "mobile": "UK_PHONE_NUMBER",

    # ---- Network ----
    "ip": "IP_ADDRESS",
    "ip_address": "IP_ADDRESS",

    # ---- Dates ----
    "date": "DATE",
    "date_time": "DATE",
    "datetime": "DATE",

    # ---- Geography (privacy-safe) ----
    # Treat full address as its own PII type when available, otherwise fall back to
    # postcode/street/city/etc as LOCATION/UK_POSTCODE.
    "address": "UK_ADDRESS",
    "street_address": "UK_ADDRESS",
    "full_address": "UK_ADDRESS",

    "location": "LOCATION",
    "city": "LOCATION",
    "town": "LOCATION",
    "state": "LOCATION",
    "province": "LOCATION",
    "region": "LOCATION",
    "country": "LOCATION",
    "place": "LOCATION",

    # ---- UK specific ----
    "postcode": "UK_POSTCODE",
    "uk_postcode": "UK_POSTCODE",

    # ---- Banking ----
    "uk_iban": "UK_IBAN",
    "iban": "UK_IBAN",
    "sort_code": "UK_SORT_CODE",
    "uk_sort_code": "UK_SORT_CODE",
    "account_number": "UK_ACCOUNT_NUMBER",
    "uk_account_number": "UK_ACCOUNT_NUMBER",

    # ---- Cards ----
    "credit_card_number": "CREDIT_CARD_NUMBER",
    "card_number": "CREDIT_CARD_NUMBER",
    "card_expiry": "CARD_EXPIRY",
    "expiry": "CARD_EXPIRY",
    "expiration_date": "CARD_EXPIRY",

    # ---- IDs ----
    "transaction_id": "TRANSACTION_ID",
    "support_ticket_number": "SUPPORT_TICKET_NUMBER",
    "session_id": "SESSION_ID",
    "customer_reference": "CUSTOMER_REFERENCE",
    "account_id": "ACCOUNT_ID",
    "internal_id": "INTERNAL_ID",
}
CANON_LABELS = {
    # Identity
    "PERSON",
    "ORG",

    # Contact
    "EMAIL_ADDRESS",
    "UK_PHONE_NUMBER",
    "IP_ADDRESS",

    # Time
    "DATE",
    "DATE_OF_BIRTH",

    # Geography (privacy-safe)
    "LOCATION",   
    "UK_POSTCODE", 
    "UK_ADDRESS", 

    # Banking
    "UK_SORT_CODE",
    "UK_ACCOUNT_NUMBER",
    "UK_IBAN",

    # Cards
    "CREDIT_CARD_NUMBER",
    "CARD_EXPIRY",

    # IDs
    "TRANSACTION_ID",
    "CUSTOMER_REFERENCE",
    "SESSION_ID",
    "SUPPORT_TICKET_NUMBER",
    "ACCOUNT_ID",
    "INTERNAL_ID",
}


In [2]:

from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
import re

from difflib import SequenceMatcher
PRIORITY = {
    "UK_IBAN": 120,
    "CREDIT_CARD_NUMBER": 115,
    "UK_SORT_CODE": 110,
    "UK_ACCOUNT_NUMBER": 108,
    "CARD_EXPIRY": 105,

    "EMAIL_ADDRESS": 95,
    "IP_ADDRESS": 95,
    "UK_PHONE_NUMBER": 92,

    "UK_ADDRESS": 88,
    "UK_POSTCODE": 85,

    "TRANSACTION_ID": 75,
    "SUPPORT_TICKET_NUMBER": 74,
    "SESSION_ID": 73,
    "CUSTOMER_REFERENCE": 72,
    "ACCOUNT_ID": 71,
    "INTERNAL_ID": 70,

    "DATE_OF_BIRTH": 55,
    "DATE": 50,
    "PERSON": 40,
    "ORG": 35,
}
ALLOWED_CANON = set(CANON_LABELS)

# --- Regex backstops for common misses (landlines, IPv4, postcodes, address lines) ---
_IPV4_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
_UK_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.IGNORECASE)
_UK_LANDLINE_RE = re.compile(r"(?<!\w)0\d{2,4}\s?\d{3,4}\s?\d{3,4}(?!\w)")
_UK_MOBILE_INTL_RE = re.compile(r"(?<!\w)\+44\s?7\d{3}\s?\d{6}(?!\w)")

def _is_valid_ipv4(s: str) -> bool:
    parts = s.split(".")
    if len(parts) != 4:
        return False
    try:
        nums = [int(p) for p in parts]
    except Exception:
        return False
    return all(0 <= n <= 255 for n in nums)

def _expand_to_address_line(text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
    """
    If the span (usually a postcode) sits inside a line that looks like a full address,
    expand to the full line boundaries.
    """
    line_start = text.rfind("\n", 0, start)
    line_start = 0 if line_start == -1 else line_start + 1
    line_end = text.find("\n", end)
    line_end = len(text) if line_end == -1 else line_end

    raw_line = text[line_start:line_end]
    line = raw_line.strip()
    if not line:
        return None

    # Heuristics: commas + digits + some street/country-ish token => likely address line
    lc = line.lower()
    if line.count(",") < 2:
        return None
    if not re.search(r"\d", line):
        return None
    if not any(tok in lc for tok in ["street", "st", "road", "rd", "avenue", "ave", "lane", "ln", "drive", "dr", "flat", "apt", "apartment", "unit", "uk", "united kingdom", "london"]):
        return None

    # Map back to exact offsets excluding surrounding whitespace
    left_ws = len(raw_line) - len(raw_line.lstrip())
    right_ws = len(raw_line) - len(raw_line.rstrip())
    return line_start + left_ws, line_end - right_ws

def extract_regex_spans(text: str) -> List[Dict[str, Any]]:
    spans: List[Dict[str, Any]] = []

    # UK phones (landline + +44 mobile)
    for m in _UK_LANDLINE_RE.finditer(text):
        spans.append({
            "start": m.start(),
            "end": m.end(),
            "label": "UK_PHONE_NUMBER",
            "score": 0.99,
            "source": "regex",
            "original": text[m.start():m.end()],
        })
    for m in _UK_MOBILE_INTL_RE.finditer(text):
        spans.append({
            "start": m.start(),
            "end": m.end(),
            "label": "UK_PHONE_NUMBER",
            "score": 0.99,
            "source": "regex",
            "original": text[m.start():m.end()],
        })

    # IPv4
    for m in _IPV4_RE.finditer(text):
        s = m.group(0)
        if not _is_valid_ipv4(s):
            continue
        spans.append({
            "start": m.start(),
            "end": m.end(),
            "label": "IP_ADDRESS",
            "score": 0.99,
            "source": "regex",
            "original": s,
        })

    # UK postcode (+ optional full-address line expansion)
    for m in _UK_POSTCODE_RE.finditer(text):
        spans.append({
            "start": m.start(),
            "end": m.end(),
            "label": "UK_POSTCODE",
            "score": 0.99,
            "source": "regex",
            "original": text[m.start():m.end()],
        })
        expanded = _expand_to_address_line(text, m.start(), m.end())
        if expanded:
            a_start, a_end = expanded
            spans.append({
                "start": a_start,
                "end": a_end,
                "label": "UK_ADDRESS",
                "score": 0.97,
                "source": "regex",
                "original": text[a_start:a_end],
            })

    return spans

def norm_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip())

def norm_general(s: str) -> str:
    s = (s or "").lower().strip()
    return re.sub(r"[^a-z0-9]+", "", s)

def norm_digits(s: str) -> str:
    return re.sub(r"\D+", "", s or "")

def norm_sort_code(s: str) -> str:
    # 20-45-67, 204567 -> 204567
    return re.sub(r"[^0-9]+", "", (s or ""))

def norm_iban(s: str) -> str:
    return re.sub(r"\s+", "", (s or "").upper().strip())

def norm_phone(s: str) -> str:
    # Keep digits and plus, remove spaces/punct
    s = (s or "").strip()
    s = re.sub(r"[^\d+]+", "", s)
    return s

def normalise_for_key(label: str, value: str) -> str:
    v = (value or "").strip()
    if label == "UK_SORT_CODE":
        return norm_sort_code(v)
    if label == "UK_ACCOUNT_NUMBER":
        return norm_digits(v)
    if label == "UK_IBAN":
        return norm_iban(v)
    if label == "CREDIT_CARD_NUMBER":
        return norm_digits(v)
    if label == "CARD_EXPIRY":
        return norm_general(v)  # "08/27" -> "0827"
    if label == "UK_PHONE_NUMBER":
        return norm_phone(v)
    if label in {"EMAIL_ADDRESS", "IP_ADDRESS"}:
        return norm_general(v)
    return norm_spaces(v).lower()
def luhn_check(number: str) -> bool:
    digits = re.sub(r"\D+", "", number)
    if len(digits) < 13:
        return False
    total = 0
    alt = False
    for ch in digits[::-1]:
        d = ord(ch) - 48
        if alt:
            d *= 2
            if d > 9:
                d -= 9
        total += d
        alt = not alt
    return total % 10 == 0

def iban_mod97(iban: str) -> bool:
    """
    Basic IBAN mod-97 validation.
    """
    s = re.sub(r"\s+", "", iban).upper()
    if len(s) < 15:
        return False
    # Move first 4 chars to end
    rearr = s[4:] + s[:4]
    # Convert letters to numbers A=10..Z=35
    converted = ""
    for ch in rearr:
        if ch.isdigit():
            converted += ch
        elif "A" <= ch <= "Z":
            converted += str(ord(ch) - ord("A") + 10)
        else:
            return False
    # mod 97 in chunks
    remainder = 0
    for i in range(0, len(converted), 9):
        chunk = str(remainder) + converted[i:i+9]
        remainder = int(chunk) % 97
    return remainder == 1
def apply_validators_and_adjust_score(label: str, value: str, base_score: float) -> float:
    """
    Boost/penalise confidence based on validators.
    """
    score = float(base_score)

    if label == "CREDIT_CARD_NUMBER":
        if luhn_check(value):
            score = min(1.0, score + 0.08)
        else:
            score = max(0.0, score - 0.15)

    if label == "UK_IBAN":
        if iban_mod97(value):
            score = min(1.0, score + 0.08)
        else:
            score = max(0.0, score - 0.20)

    if label == "UK_SORT_CODE":
        if len(norm_sort_code(value)) == 6:
            score = min(1.0, score + 0.03)
        else:
            score = max(0.0, score - 0.10)

    if label == "UK_ACCOUNT_NUMBER":
        if len(norm_digits(value)) == 8:
            score = min(1.0, score + 0.02)
        else:
            score = max(0.0, score - 0.10)

    return score


def assign_tags_and_mask(
    text: str,
    spans: List[Dict[str, Any]],
) -> Tuple[str, Dict[str, str], Dict[str, float], List[Dict[str, Any]]]:
    """
    Input spans must already be overlap-resolved and filtered.
    Adds:
      span["tag"]
    Returns:
      masked_text, mapping(tag->original), scores(tag->confidence), spans(with tag)
    """
    counters = defaultdict(int)
    value_to_tag: Dict[Tuple[str, str], str] = {}   # (label, norm_value) -> tag
    mapping: Dict[str, str] = {}
    scores: Dict[str, float] = {}

    # assign tags
    for s in spans:
        label = s["label"]
        original = s["original"]
        key = (label, normalise_for_key(label, original))

        if key in value_to_tag:
            tag = value_to_tag[key]
        else:
            counters[label] += 1
            tag = f"[{label}_{counters[label]}]"
            value_to_tag[key] = tag
            mapping[tag] = original

        # score: keep max across occurrences; adjust with validators
        adj = apply_validators_and_adjust_score(label, original, float(s.get("score", 0.0)))
        scores[tag] = max(scores.get(tag, 0.0), adj)
        s["tag"] = tag
        s["score"] = adj

    # replace from back to front
    masked_text = text
    for s in sorted(spans, key=lambda x: x["start"], reverse=True):
        masked_text = masked_text[:s["start"]] + s["tag"] + masked_text[s["end"]:]  # type: ignore

    return masked_text, mapping, scores, spans

def resolve_overlaps_spans(
    spans: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    spans elements:
      {start,end,label,score,source,original}
    """
    spans_sorted = sorted(
        spans,
        key=lambda s: (
            -PRIORITY.get(s["label"], 0),
            -(s["end"] - s["start"]),
            -float(s.get("score", 0.0)),
            int(s["start"]),
        )
    )

    kept: List[Dict[str, Any]] = []
    for s in spans_sorted:
        overlaps = False
        for k in kept:
            if not (s["end"] <= k["start"] or s["start"] >= k["end"]):
                overlaps = True
                break
        if not overlaps:
            kept.append(s)

    return sorted(kept, key=lambda s: s["start"])

def mask_with_gliner(
    text: str,
    model_name_or_obj: Any,
    labels: Optional[List[str]] = None,
    threshold: float = 0.5,
) -> Tuple[str, Dict[str, str], Dict[str, float], List[Dict[str, Any]]]:
    """
    Supports:
    - model_name_or_obj: either a GLiNER instance or a model name string

    Expected GLiNER output varies by version; we handle common patterns:
    - list of dicts with keys: start, end, label, score
    """
    try:
        from gliner import GLiNER
    except Exception as e:
        raise RuntimeError("GLiNER not installed. Run: pip install gliner") from e

    if isinstance(model_name_or_obj, str):
        gliner = GLiNER.from_pretrained(model_name_or_obj)
    else:
        gliner = model_name_or_obj

    # If you don't pass labels, use canonical-ish ones (edit for your project)
    if labels is None:
        labels = [
            "person", "organization",
            "email_address", "phone_number", "ip_address",
            "date",
            "address", "street_address",
            "location",
            "postcode",
            "uk_iban", "sort_code", "account_number",
            "credit_card_number", "card_expiry",
            "transaction_id", "support_ticket_number",
            "session_id", "customer_reference", "account_id",
]
    preds = gliner.predict_entities(text, labels, threshold=threshold)

    spans: List[Dict[str, Any]] = []
    for p in preds:
        raw_label = str(p.get("label", "")).strip()
        canon = GLINER_TO_CANON.get(raw_label.lower(), GLINER_TO_CANON.get(raw_label, None))
        if not canon:
            continue
        if canon not in ALLOWED_CANON:
            continue

        start = int(p["start"])
        end = int(p["end"])
        original = text[start:end]
        spans.append({
            "start": start,
            "end": end,
            "label": canon,
            "score": float(p.get("score", 0.0)),
            "source": "gliner",
            "original": original,
})

    # Add deterministic regex spans to cover common GLiNER misses
    spans.extend(extract_regex_spans(text))

    spans = resolve_overlaps_spans(spans)
    return assign_tags_and_mask(text, spans)

In [3]:
# Keep a stable reference to the original regex extractor (v1) before adding more patterns
# This prevents recursion if we later override `extract_regex_spans`.
extract_regex_spans_v1 = extract_regex_spans

In [4]:
# --- Extra regex backstops: email + dates (incl DOB) ---
import datetime

_EMAIL_RE = re.compile(
    r"(?<![\w.+-])([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})(?![\w.+-])",
    re.IGNORECASE,
 )

_DOB_CONTEXT_RE = re.compile(r"\bDOB\b|\bD\.?O\.?B\.?\b|date\s+of\s+birth|\bborn\b", re.IGNORECASE)

# 12 March 1990 / 17 February 1989 etc.
_DATE_TEXT_RE = re.compile(
    r"\b(?P<day>\d{1,2})\s+"
    r"(?P<mon>jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|"
    r"jul(?:y)?|aug(?:ust)?|sep(?:t|tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+"
    r"(?P<year>\d{4})\b",
    re.IGNORECASE,
 )

# 21/12/2025, 21-12-2025, 21.12.2025 (kept strict: requires year)
_DATE_NUMERIC_RE = re.compile(
    r"\b(?P<day>\d{1,2})[\/\-.](?P<mon>\d{1,2})[\/\-.](?P<year>\d{4})\b"
 )

_MONTH_MAP = {
    "jan": 1, "january": 1,
    "feb": 2, "february": 2,
    "mar": 3, "march": 3,
    "apr": 4, "april": 4,
    "may": 5,
    "jun": 6, "june": 6,
    "jul": 7, "july": 7,
    "aug": 8, "august": 8,
    "sep": 9, "sept": 9, "september": 9,
    "oct": 10, "october": 10,
    "nov": 11, "november": 11,
    "dec": 12, "december": 12,
}

def _is_valid_date_parts(year: int, month: int, day: int) -> bool:
    if year < 1900 or year > 2100:
        return False
    try:
        datetime.date(year, month, day)
    except Exception:
        return False
    return True

def _date_label_for_match(text: str, start: int, end: int) -> str:
    # If "DOB" appears nearby, treat it as DATE_OF_BIRTH; else DATE
    left = text[max(0, start - 40):start]
    right = text[end:min(len(text), end + 25)]
    ctx = left + " " + right
    return "DATE_OF_BIRTH" if _DOB_CONTEXT_RE.search(ctx) else "DATE"

def extract_regex_spans_v2(text: str) -> List[Dict[str, Any]]:
    # Start with existing regex extraction (phones, IPs, postcodes, address expansion)
    # IMPORTANT: use v1 reference to avoid recursion
    spans = extract_regex_spans_v1(text)

    # Emails
    for m in _EMAIL_RE.finditer(text):
        email = m.group(1)
        spans.append({
            "start": m.start(1),
            "end": m.end(1),
            "label": "EMAIL_ADDRESS",
            "score": 0.99,
            "source": "regex",
            "original": email,
        })

    # Dates (month name)
    for m in _DATE_TEXT_RE.finditer(text):
        day = int(m.group("day"))
        mon_raw = m.group("mon").lower()
        month = _MONTH_MAP.get(mon_raw, _MONTH_MAP.get(mon_raw[:3], 0))
        year = int(m.group("year"))
        if month and _is_valid_date_parts(year, month, day):
            start, end = m.start(), m.end()
            spans.append({
                "start": start,
                "end": end,
                "label": _date_label_for_match(text, start, end),
                "score": 0.97,
                "source": "regex",
                "original": text[start:end],
            })

    # Dates (numeric with year)
    for m in _DATE_NUMERIC_RE.finditer(text):
        day = int(m.group("day"))
        month = int(m.group("mon"))
        year = int(m.group("year"))
        if _is_valid_date_parts(year, month, day):
            start, end = m.start(), m.end()
            spans.append({
                "start": start,
                "end": end,
                "label": _date_label_for_match(text, start, end),
                "score": 0.97,
                "source": "regex",
                "original": text[start:end],
            })

    # Re-resolve overlaps after adding more spans
    spans = resolve_overlaps_spans(spans)
    return spans

# Make mask_with_gliner() pick up the newer extractor
extract_regex_spans = extract_regex_spans_v2

In [5]:


def _norm(s: str) -> str:
    s = (s or "").lower().strip()
    return re.sub(r"[^a-z0-9]+", "", s)

def _sim(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

def _match_score(exp: str, found: str) -> float:
    """
    Returns a match score in [0,1]:
    - exact => 1.0
    - containment (>=6 chars) => 0.95
    - else SequenceMatcher similarity
    """
    e = _norm(exp)
    f = _norm(found)
    if not e or not f:
        return 0.0
    if e == f:
        return 1.0
    shorter, longer = (e, f) if len(e) <= len(f) else (f, e)
    if len(shorter) >= 6 and shorter in longer:
        return 0.95
    return _sim(e, f)

def build_found_typed(mapping: dict) -> dict:
    """
    mapping: { "[LABEL_1]": "original value", ... }
    returns: { "LABEL": {values...} }
    """
    out = defaultdict(set)
    for tag, val in mapping.items():
        label = tag.strip("[]").rsplit("_", 1)[0]
        out[label].add(val)
    return out
def score_run_typed(mapping: dict, expected_typed: dict, sim_threshold: float = 0.88) -> dict:
    """
    Computes:
      - recall (% expected items matched to some found item, lenient)
      - type_accuracy (% of matched expected items with correct predicted type)
      - overall (= recall * type_accuracy)
      - false_positives_total (count of found items not matched to ANY expected)
      - false_positives_by_type (dict label -> count)
    """
    found_typed = build_found_typed(mapping)

    # Flatten expected (type,value)
    expected_pairs = [(label, v) for label, vals in expected_typed.items() for v in vals]
    total_expected = len(expected_pairs)

    # Flatten found (type,value)
    found_pairs = [(label, v) for label, vals in found_typed.items() for v in vals]

    matched_found = set()
    matched_expected = set()
    correct_type_hits = 0

    # Reserved matching: each found can match at most one expected
    for i, (exp_label, exp_val) in enumerate(expected_pairs):
        best_j = None
        best_score = -1.0
        best_type_ok = False

        for j, (found_label, found_val) in enumerate(found_pairs):
            if j in matched_found:
                continue
            score = _match_score(exp_val, found_val)
            if score > best_score:
                best_score = score
                best_j = j
                best_type_ok = (found_label == exp_label)

        if best_j is not None and best_score >= sim_threshold:
            matched_expected.add(i)
            matched_found.add(best_j)
            if best_type_ok:
                correct_type_hits += 1

    recall = len(matched_expected) / max(1, total_expected)
    type_acc = (correct_type_hits / len(matched_expected)) if matched_expected else 0.0
    overall = recall * type_acc

    # False positives = found pairs not used in any match
    fp_by_type = defaultdict(int)
    fp_total = 0
    for j, (found_label, found_val) in enumerate(found_pairs):
        if j not in matched_found:
            fp_total += 1
            fp_by_type[found_label] += 1

    return {
        "recall": recall * 100.0,
        "type_accuracy": type_acc * 100.0,
        "overall": overall * 100.0,
        "found_count": len(found_pairs),
        "expected_count": total_expected,
        "false_positives_total": fp_total,
        "false_positives_by_type": dict(fp_by_type),
    }

In [6]:
# GLiNER
def print_scores(name: str, s: dict, show_fp_types: bool = True, top_k: int = 12):
    print(
        f"{name:<10} | "
        f"Recall: {s['recall']:.1f}% | "
        f"TypeAcc: {s['type_accuracy']:.1f}% | "
        f"Overall: {s['overall']:.1f}% | "
        f"FP: {s['false_positives_total']} "
        f"(found {s['found_count']}, expected {s['expected_count']})"
    )
    if show_fp_types and s["false_positives_total"] > 0:
        fp = s["false_positives_by_type"]
        # sort by count desc, then label
        items = sorted(fp.items(), key=lambda kv: (-kv[1], kv[0]))[:top_k]
        fp_str = ", ".join([f"{k}:{v}" for k, v in items])
        print(f"   FP by type: {fp_str}")


In [14]:
import json
from pathlib import Path
from typing import Any, Dict, List, Tuple

def load_tests_bundle(path: str | Path = "tests.json") -> Dict[str, Any]:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Could not find {p.resolve()}")
    with p.open("r", encoding="utf-8") as f:
        bundle = json.load(f)
    if not isinstance(bundle, dict) or "tests" not in bundle:
        raise ValueError(f"Unexpected format in {p}")
    return bundle

def expected_typed_to_sets(expected_typed: Dict[str, List[str]]) -> Dict[str, set]:
    # stored as JSON lists; scorer works with any iterable, but sets are convenient
    out: Dict[str, set] = {}
    for label, values in (expected_typed or {}).items():
        out[label] = set(values or [])
    return out

def run_tests_from_json(
    model_name: str = "urchade/gliner_multi_pii-v1",
    path: str | Path = "tests.json",
    threshold: float = 0.5,
    show_masked_text: bool = False,
    limit: int | None = None,
) -> List[Tuple[str, dict, str]]:
    bundle = load_tests_bundle(path)
    tests = bundle.get("tests", [])
    if not isinstance(tests, list):
        raise ValueError("bundle['tests'] must be a list")

    try:
        from gliner import GLiNER
    except RuntimeError as e:
        msg = str(e)
        if "TORCH_LIBRARY" in msg and "prims" in msg:
            raise RuntimeError(
                "PyTorch is in a corrupted import state (duplicate prims registration). "
                "In VS Code: restart the notebook kernel, then run cells 1→6 again. "
                "Also avoid installing/upgrading torch while the kernel is running."
            ) from e
        raise
    model = GLiNER.from_pretrained(model_name)

    if limit is not None:
        tests = tests[: int(limit)]

    results: List[Tuple[str, dict, str]] = []
    print(f"Loaded {len(tests)} tests from {Path(path).resolve()}")
    print(f"Model: {model_name} | threshold={threshold}")
    print("-")

    for t in tests:
        test_id = str(t.get("id", "<no-id>"))
        text = str(t.get("text", ""))
        expected_typed_raw = t.get("expected_typed", {}) or {}
        expected_typed = expected_typed_to_sets(expected_typed_raw)

        masked_text, mapping, scores, spans = mask_with_gliner(
            text=text,
            model_name_or_obj=model,
            threshold=threshold,
        )
        s = score_run_typed(mapping=mapping, expected_typed=expected_typed)
        print_scores(test_id, s)
        if show_masked_text:
            print("Masked text:")
            print(masked_text)
            print("-")
        results.append((test_id, s, masked_text,mapping))

    # Simple aggregate (macro average)
    if results:
        avg = {
            "recall": sum(r[1]["recall"] for r in results) / len(results),
            "type_accuracy": sum(r[1]["type_accuracy"] for r in results) / len(results),
            "overall": sum(r[1]["overall"] for r in results) / len(results),
            "false_positives_total": sum(r[1]["false_positives_total"] for r in results),
            "found_count": sum(r[1]["found_count"] for r in results),
            "expected_count": sum(r[1]["expected_count"] for r in results),
            "false_positives_by_type": {},
        }
        print("=")
        print_scores("AVG", avg, show_fp_types=False)

    return results

# Run all saved tests
_results = run_tests_from_json(path="tests.json", threshold=0.1, show_masked_text=False)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Loaded 14 tests from G:\Proxyon\Test\tests.json
Model: urchade/gliner_multi_pii-v1 | threshold=0.1
-
sample_1   | Recall: 100.0% | TypeAcc: 92.3% | Overall: 92.3% | FP: 0 (found 26, expected 26)
sample_2   | Recall: 96.2% | TypeAcc: 88.0% | Overall: 84.6% | FP: 0 (found 25, expected 26)
sample_3   | Recall: 92.3% | TypeAcc: 95.8% | Overall: 88.5% | FP: 1 (found 25, expected 26)
   FP by type: CARD_EXPIRY:1
sample_3   | Recall: 92.3% | TypeAcc: 95.8% | Overall: 88.5% | FP: 1 (found 25, expected 26)
   FP by type: CARD_EXPIRY:1
sample_4   | Recall: 83.3% | TypeAcc: 85.0% | Overall: 70.8% | FP: 0 (found 20, expected 24)
sample_5   | Recall: 88.9% | TypeAcc: 81.2% | Overall: 72.2% | FP: 1 (found 17, expected 18)
   FP by type: ORG:1
sample_6   | Recall: 80.0% | TypeAcc: 93.8% | Overall: 75.0% | FP: 0 (found 16, expected 20)
sample_7   | Recall: 88.2% | TypeAcc: 100.0% | Overall: 88.2% | FP: 0 (found 15, expected 17)
sample_8   | Recall: 82.4% | TypeAcc: 100.0% | Overall: 82.4% | FP: 1 (fou

In [15]:
tests = load_tests_bundle(path="tests.json").get("tests", [])


In [17]:
test_num = 13
print(tests[test_num]["text"])#
print(20 * "---- Masked ----")
print(_results[test_num][3])
print(_results[test_num][2])

QUEUE: Onboarding
agent SB

Applicant name: George A. Patel
DOB 01/06/1990

Employer org: Westmoor Consulting Group

Address:
21 Orchard Way, Reading RG2 9QF

Email george.patel@westmoor-group.co.uk
Phone 0118 992 7744

IP logged 51.140.77.9
Session sess_113a9cfe

Account verification via £1 test card
Card 4000 1234 5678 9010 exp 04/28

Refs
TRX-2026-01-16-441002
CUST-REF-UK-440771
SUP-440771
ACC-2200441
INT-EE120

Welcome email sent 16 January 2026
---- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked -------- Masked ----
{'[PERSON_1]': 'George A. Patel', '[DATE_OF_BIRTH_1]': '01/06/1990', '[ORG_1]': 'Westmoor Consulting Group', '[UK_ADDRESS_1]': '21 Orchard Way, Reading RG2 9QF', '[EMAIL_ADDRESS_1]': 'george.patel@westmoor-group.co.uk', '[UK_

In [10]:
for test in _results[3:]:
    print(test[2])
    print(10*"-----")

Case note (KYC + Support) created by [PERSON_1]. DOB confirmed as [DATE_OF_BIRTH_1].

Primary contact email: [EMAIL_ADDRESS_1]; backup: [EMAIL_ADDRESS_2].

Customer reachable on [UK_PHONE_NUMBER_1] (mobile) and [UK_PHONE_NUMBER_2] (desk).
Alt number seen in legacy CRM: [UK_IBAN_1].

Address verified via proof of residence:
[UK_ADDRESS_1]

Login activity: current IP [IP_ADDRESS_1]; prior IP [IP_ADDRESS_2].

Bank details for refunds: bank [ORG_1], sort code [UK_SORT_CODE_1], account number [UK_ACCOUNT_NUMBER_1].
IBAN used for international transfers: [UK_IBAN_2].

Payment method update: Visa 4242 4242 4242 4242 and Mastercard [CREDIT_CARD_NUMBER_1] stored; expiry [CARD_EXPIRY_1].
Note: customer mentioned “card ending [CARD_EXPIRY_2]” during the call.

Timeline: verification completed on [DATE_1].
Transaction refs: [TRANSACTION_ID_1] and [TRANSACTION_ID_2].

Internal refs include ACCOUNT_ID [ACCOUNT_ID_1], CUSTOMER_REF [CUSTOMER_REFERENCE_1], session [SESSION_ID_1], and internal id INT-UK