# we will be using this notebook to generate our meta knowledge to extract our dictionary for the rule based meta explainer 

## Meta explainer  

we will take in the features.txt, then use our deature_mask.npy to select our features 

collect info about our selected variable 

Map each feature to an explanation 

save jason

### Meta_explnation_v2

In [None]:
############################################
# Imports & Paths
############################################
import os, json, joblib, re, time, functools, warnings
from pathlib import Path

import numpy as np
import requests

# ---------- chemistry toolkits ----------
from mordred import Calculator, descriptors
try:
    from rdkit.Chem import Descriptors as RD_DESC
except ImportError:
    RD_DESC = None
    warnings.warn("RDKit not found – RDKit fallback disabled")

############################################
# Repo-relative paths
############################################
DATA_DIR    = Path("tox21_lightgb_pipeline")
FEATURE_TXT = DATA_DIR / "Data_v6/processed/feature_names.txt"
MASK_PKL    = DATA_DIR / "models/v7/feature_masks.pkl"
MASK_NPY    = DATA_DIR / "Data_v6/processed/label_mask.npy"
SAVE_JSON   = DATA_DIR / "Data_v6/meta_explainer/meta_explanations.json"
SAVE_JSON.parent.mkdir(parents=True, exist_ok=True)

############################################
# Load feature names & mask
############################################
with open(FEATURE_TXT) as f:
    feature_names = [ln.strip() for ln in f]
num_features = len(feature_names)

if MASK_PKL.exists():
    masks    = joblib.load(MASK_PKL)               # dict[label → list(indices)]
    all_idx  = {i for idx_list in masks.values() for i in idx_list}
elif MASK_NPY.exists():
    mask_arr = np.load(MASK_NPY, allow_pickle=True)
    all_idx  = set(np.where(mask_arr.any(axis=0))[0])
else:
    raise FileNotFoundError("No feature-mask file found.")

print(f"📝 {num_features} features total – {len(all_idx)} occur in any label-specific model")

############################################
# Build a Mordred → docstring map
############################################
MORDRED_DOCS: dict[str, str] = {}

def _first_sentence(txt: str, max_len: int = 160) -> str:
    """Return first sentence, optionally truncated to max_len."""
    sent = re.split(r"(?<=\.)\s", txt.strip())[0]
    return (sent if len(sent) <= max_len
            else sent[: max_len - 1].rstrip() + "…")

calc_all = Calculator(descriptors, ignore_3D=True)

for desc_obj in calc_all.descriptors:
    raw_name = str(desc_obj)            # e.g. 'piPC10', 'JGI9', 'ATS8m'
    base_cls = type(desc_obj)
    doc      = (base_cls.__doc__ or "").strip()
    if not doc:
        continue

    line = _first_sentence(doc)
    if len(line.split()) < 4:           # ignore one-word junk
        continue

    # Store the exact name (piPC10) and, if applicable, its base prefix (piPC)
    MORDRED_DOCS[raw_name] = line

    m = re.match(r"^([A-Za-z_]+)\d+$", raw_name)
    if m:                               # 'piPC10' → 'piPC'
        base = m.group(1)
        MORDRED_DOCS.setdefault(base, line)

print(f"🔍 Collected doc-strings for {len(MORDRED_DOCS):,} Mordred descriptors")

############################################
# Helper look-ups
############################################
def mordred_lookup(term: str) -> str | None:
    """Exact or numeric-suffix match against Mordred docs."""
    if term in MORDRED_DOCS:
        return MORDRED_DOCS[term]

    m = re.match(r"^([A-Za-z_]+)(\d+)$", term)
    if m and m.group(1) in MORDRED_DOCS:
        base, order = m.groups()
        return f"{MORDRED_DOCS[base]} — order {order}"
    return None


def rdkit_lookup(term: str) -> str | None:
    if RD_DESC is None:
        return None
    term_low = term.lower()
    for name, fn in RD_DESC.descList:
        if name.lower() == term_low or term_low.startswith(name.lower()):
            doc = (fn.__doc__ or "").strip()
            sent = _first_sentence(doc)
            return sent if len(sent.split()) > 4 else None
    return None


PUBCHEM_API = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/descriptor/JSON?descriptor={}"

@functools.lru_cache(maxsize=1024)
def pubchem_lookup(term: str, retries: int = 2) -> str | None:
    safe = requests.utils.quote(term)
    url  = PUBCHEM_API.format(safe)
    for _ in range(retries):
        try:
            r = requests.get(url, timeout=4)
            if r.status_code == 200:
                js   = r.json()
                desc = js.get("Information", [{}])[0].get("Description", "")
                sent = _first_sentence(desc)
                return sent if len(sent.split()) > 4 else None
            if r.status_code == 404:
                return None
        except requests.exceptions.RequestException:
            time.sleep(0.5)
    return None


WIKI_API = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"

@functools.lru_cache(maxsize=1024)
def wikipedia_lookup(term: str, retries: int = 2) -> str | None:
    safe = requests.utils.quote(term.replace(" ", "_"))
    url  = WIKI_API.format(safe)
    for _ in range(retries):
        try:
            r = requests.get(url, timeout=4)
            if r.status_code == 200:
                summary = r.json().get("extract", "")
                sent    = _first_sentence(summary)
                return sent if len(sent.split()) > 4 else None
            if r.status_code == 404:
                return None
        except requests.exceptions.RequestException:
            time.sleep(0.5)
    return None

############################################
# Build the meta-explanation map
############################################
meta_expl     = {}
SOURCE_ORDER  = (
    ("Mordred",   mordred_lookup),
    ("RDKit",     rdkit_lookup),
    ("PubChem",   pubchem_lookup),
    ("Wikipedia", wikipedia_lookup),
)

SOURCE_LIST = ", ".join(name for name, _ in SOURCE_ORDER)

for idx in sorted(all_idx):
    feat   = feature_names[idx]
    expl   = None

    for src_name, fn in SOURCE_ORDER:
        expl = fn(feat)
        if expl:
            meta_expl[feat] = expl
            break

    if expl is None:
        meta_expl[feat] = (
            f"no information about this feature and looked at these sources: {SOURCE_LIST}"
        )

print(f"🧬 Generated explanations for {len(meta_expl)} descriptors")

############################################
# Save & quick preview
############################################
with open(SAVE_JSON, "w") as f:
    json.dump(meta_expl, f, indent=2)

print(f"✅ Saved → {SAVE_JSON}")
for k, v in list(meta_expl.items())[:15]:
    print(f" - {k}: {v}")


📝 1613 features total – 1086 occur in any label-specific model
🔍 Collected doc-strings for 682 Mordred descriptors
🧬 Generated explanations for 1086 descriptors
✅ Saved → tox21_lightgb_pipeline\Data_v6\meta_explainer\meta_explanations.json
 - nAcid: acidic group count descriptor.
 - nBase: basic group count descriptor.
 - SpMax_A: no information about this feature and looked at these sources: Mordred, RDKit, PubChem, Wikipedia
 - SpDiam_A: no information about this feature and looked at these sources: Mordred, RDKit, PubChem, Wikipedia
 - SpMAD_A: no information about this feature and looked at these sources: Mordred, RDKit, PubChem, Wikipedia
 - VE1_A: no information about this feature and looked at these sources: Mordred, RDKit, PubChem, Wikipedia
 - VE2_A: no information about this feature and looked at these sources: Mordred, RDKit, PubChem, Wikipedia
 - VE3_A: no information about this feature and looked at these sources: Mordred, RDKit, PubChem, Wikipedia
 - VR1_A: no information

### Check script

so for our initial output, we got 67% of the description (missing 32%), so we are going to save them adn try to find another way to get thoses descriptions

In [18]:
import json
from pathlib import Path

# Path to the explanations file you just generated
JSON_PATH = Path("tox21_lightgb_pipeline/Data_v6/meta_explainer/meta_explanations.json")

with open(JSON_PATH) as f:
    meta = json.load(f)

# Detect the fallback phrase (prefix-match is enough)
missing_keys = [k for k, v in meta.items()
                if v.startswith("no information about this feature")]

total   = len(meta)
missing = len(missing_keys)
print(f"Missing descriptions: {missing} / {total}  ({missing/total:.1%})")

# Optional: show a few example names
if missing_keys:
    print("first few missing:", ", ".join(missing_keys[:10]))


Missing descriptions: 734 / 1086  (67.6%)
first few missing: SpMax_A, SpDiam_A, SpMAD_A, VE1_A, VE2_A, VE3_A, VR1_A, VR2_A, VR3_A, nBridgehead


### to find the feature for the missing ones 

In [19]:
import json, re, html, requests, time
from pathlib import Path

# --------------------------------------------------
# 1. load existing explanations
# --------------------------------------------------
JSON_IN  = Path("tox21_lightgb_pipeline/Data_v6/meta_explainer/meta_explanations.json")
JSON_OUT = JSON_IN.with_stem(JSON_IN.stem + "_v2")   # meta_explanations_v2.json

with open(JSON_IN) as f:
    meta = json.load(f)

missing = [k for k, v in meta.items()
           if v.startswith("no information about this feature")]

print(f"🔍  pass-2 will retry {len(missing)} still-missing descriptors")

if not missing:
    print("✅ nothing to do – everything already annotated")
    raise SystemExit

# --------------------------------------------------
# 2. pull the master Mordred descriptor table once
# --------------------------------------------------
DOC_URL = "https://mordred-descriptor.github.io/documentation/master/descriptors.html"
print("⏳  downloading Mordred descriptor list ...")
html_text = requests.get(DOC_URL, timeout=15).text

# strip all HTML tags (quick-and-dirty is fine here)
plain = re.sub("<[^>]+>", "", html_text)           # keep only text
plain = html.unescape(plain)                       # un-escape &nbsp; etc.
lines = [ln.strip() for ln in plain.splitlines() if ln.strip()]

# build {descriptor → description} map from the flat text table
mordred_table = {}
for i, ln in enumerate(lines[:-1]):
    # descriptor names have no spaces and are followed two lines later
    if re.fullmatch(r"[A-Za-z0-9_]+", ln) and not ln[0].isdigit():
        desc = lines[i + 1]                        # the next line is the description
        # heuristic: accept descriptions containing ≥3 words
        if len(desc.split()) >= 3:
            mordred_table[ln] = desc

print(f"📜  scraped {len(mordred_table):,} descriptor rows from online doc")

# --------------------------------------------------
# 3. fill any gaps we can
# --------------------------------------------------
fixed = 0
for feat in missing:
    if feat in mordred_table:
        meta[feat] = mordred_table[feat] + " — scraped from Mordred docs"
        fixed += 1
    else:
        # try base-name match for variants like "SpMax_A" -> "SpMax"
        base = feat.split("_", 1)[0]
        if base in mordred_table:
            meta[feat] = (mordred_table[base] +
                          f" — weighting variant ({feat[len(base)+1:]})")
            fixed += 1

print(f"✅  recovered {fixed} additional descriptions; "
      f"{len(missing) - fixed} still missing")

# --------------------------------------------------
# 4. save the augmented JSON
# --------------------------------------------------
with open(JSON_OUT, "w") as f:
    json.dump(meta, f, indent=2)

print(f"💾  wrote updated file   →  {JSON_OUT}")


🔍  pass-2 will retry 734 still-missing descriptors
⏳  downloading Mordred descriptor list ...
📜  scraped 1,453 descriptor rows from online doc
✅  recovered 635 additional descriptions; 99 still missing
💾  wrote updated file   →  tox21_lightgb_pipeline\Data_v6\meta_explainer\meta_explanations_v2.json


### diagnostics 

So first we need to understand the style of our descriptions, then we can post process them

In [8]:

import json, re, textwrap
from collections import Counter, defaultdict
from pathlib import Path

# ───────────────────────────────────────────────
# 0.  paths  (edit if you keep the file elsewhere)
# ───────────────────────────────────────────────
META = Path("tox21_lightgb_pipeline/Data_v6/meta_explainer/meta_explanations_plain.json")

# ───────────────────────────────────────────────
# 1.  existing maps you already have
#     (copy-paste the latest versions here ↓)
# ───────────────────────────────────────────────
PROPERTY_CODE = {
    "a": "atomic mass",
    "p": "polarizability",
    "i": "ionisation potential",
    "s": "sigma electronegativity",
    "e": "electronegativity",
    "u": "van-der-Waals volume",
}

OVERRIDES = {"piPC", "JGI", "BIC", "RPCG"}     # just the *prefixes* you’ve handled

JARGON_MAP = {
    r"\bautocorrelation\b": "similarity pattern",
    r"\blipophilicity\b":   "greasiness",
    r"\bpolar\b":           "water-attracting",
    r"\bhydrophobicity\b":  "water-repelling",
    r"\btopological\b":     "structural",
    r"\beigenvalue\b":      "mathematical value",
    r"\bpermeability\b":    "ability to pass through membranes",
    r"\bπ\b":               "pi-bond",
    r"\bconjugated\b":      "alternating-double-bond",
    r"\bdescriptor\b":      "numeric property",
}

# ───────────────────────────────────────────────
# 2.  load explanations
# ───────────────────────────────────────────────
with open(META) as f:
    meta = json.load(f)

# ───────────────────────────────────────────────
# 3.  basic counts
# ───────────────────────────────────────────────
missing = {k for k, v in meta.items()
           if v.lower().startswith("no information about this feature")}
scraped_generic = {k for k, v in meta.items()
                   if "scraped from mordred docs" in v.lower()}

print("┌────────────────────────────┐")
print(f"│  Missing completely : {len(missing):>4} │")
print(f"│  Generic ‘scraped’ : {len(scraped_generic):>4} │")
print("└────────────────────────────┘\n")

# ───────────────────────────────────────────────
# 4.  property-code letters not yet mapped
#     (AATS4p → code ‘p’; if ‘p’ absent from PROPERTY_CODE, flag it)
# ───────────────────────────────────────────────
prop_pat = re.compile(r"^(?:AATS|ATS|AATSC|ATSC|MATS|GATS)(\d+)([a-z])$", re.I)
missing_props = Counter()

for feat in meta:
    m = prop_pat.match(feat)
    if m:
        code = m.group(2).lower()
        if code not in PROPERTY_CODE:
            missing_props[code] += 1

if missing_props:
    print("⚙️  Property-code letters *not* in PROPERTY_CODE:")
    for code, n in missing_props.most_common():
        print(f"   • {code!s:<2}  ({n} features)")
    print()
else:
    print("✅ All AATS/ATS property codes are covered.\n")

# ───────────────────────────────────────────────
# 5.  descriptor families worth an OVERRIDE
# ───────────────────────────────────────────────
family_pat = re.compile(r"^([A-Z]{3,})(\d*)$")
candidates = Counter()

for feat, expl in meta.items():
    m = family_pat.match(feat)
    if not m:
        continue
    base = m.group(1)
    if base not in OVERRIDES and base.isupper():
        # heuristic: if explanation still contains the bare family name, flag it
        if base.lower() in expl.lower():
            candidates[base] += 1

if candidates:
    print("💡 Families you may want to add to OVERRIDES:")
    for base, n in candidates.most_common(10):
        print(f"   • {base:<6}  ({n} variants)")
    print()
else:
    print("✅ No obvious new OVERRIDE families needed.\n")

# ───────────────────────────────────────────────
# 6.  frequent jargon not yet in JARGON_MAP
# ───────────────────────────────────────────────
word_counter = Counter()
token_pat = re.compile(r"[A-Za-z]{7,}")

for expl in meta.values():
    for w in token_pat.findall(expl):
        wl = w.lower()
        # ignore if already translated
        if any(re.search(pat, wl, flags=re.I) for pat in JARGON_MAP):
            continue
        word_counter[wl] += 1

common_jargon = [w for w, c in word_counter.most_common(20) if c >= 3]

if common_jargon:
    print("📝 Frequent jargon words not in JARGON_MAP (count ≥ 3):")
    print(textwrap.fill(", ".join(common_jargon), width=80, subsequent_indent="    "))
else:
    print("✅ No high-frequency jargon left un-mapped.")


┌────────────────────────────┐
│  Missing completely :   99 │
│  Generic ‘scraped’ :  615 │
└────────────────────────────┘

⚙️  Property-code letters *not* in PROPERTY_CODE:
   • z   (52 features)
   • m   (51 features)
   • v   (51 features)
   • d   (49 features)
   • c   (34 features)

✅ No obvious new OVERRIDE families needed.

📝 Frequent jargon words not in JARGON_MAP (count ≥ 3):
mordred, scraped, numeric, property, information, feature, sources, pubchem,
    wikipedia, structural, pattern, similarity, structure, surface, descriptors,
    contribution, content, baryszmatrix, distance, ringcount


### Post-processing

making sure it is in plain english and easy to follow

In [9]:
import json, re
from pathlib import Path

# ──────────────────────────────────────────
# 1. paths
# ──────────────────────────────────────────
ROOT = Path("tox21_lightgb_pipeline/Data_v6/meta_explainer")
IN   = ROOT / "meta_explanations_v2.json"
OUT  = ROOT / "meta_explanations_plain.json"

# ──────────────────────────────────────────
# 2. load source JSON
# ──────────────────────────────────────────
with open(IN) as f:
    meta = json.load(f)

# ──────────────────────────────────────────
# 3. helper maps
# ──────────────────────────────────────────
PROPERTY_CODE = {
    # original
    "a": "atomic mass",
    "p": "polarizability",
    "i": "ionisation potential",
    "s": "sigma electronegativity",
    "e": "electronegativity",
    "u": "van der Waals volume",
    # newly added
    "z": "atomic number",
    "m": "atomic mass",
    "v": "van der Waals volume",
    "d": "electron density",
    "c": "atomic charge",
}

JARGON_MAP = {
    # scientific jargon
    r"\bautocorrelation\b": "similarity pattern",
    r"\blipophilicity\b":   "greasiness",
    r"\bpolar\b":           "water-attracting",
    r"\bhydrophobicity\b":  "water-repelling",
    r"\btopological\b":     "structural",
    r"\beigenvalue\b":      "mathematical value",
    r"\bpermeability\b":    "ability to pass membranes",
    r"\bπ\b":               "pi-bond (double-bond)",
    r"\bconjugated\b":      "alternating-double-bond",
    r"\bdescriptor\b":      "numeric property",
    # frequent filler / boiler-plate → delete
    r"\bmordred\b":         "",
    r"\bscraped\b":         "",
    r"\bnumeric\b":         "",
    r"\bproperty\b":        "",
    r"\binformation\b":     "",
    r"\bfeature\b":         "",
    r"\bsources?\b":        "",
    r"\bpubchem\b":         "",
    r"\bwikipedia\b":       "",
    # misc often-repeated words (keep or shorten)
    r"\bpattern\b":         "pattern",
    r"\bsimilarity\b":      "similarity",
    r"\bstructure\b":       "molecule",
    r"\bsurface\b":         "surface",
    r"\bdescriptors?\b":    "descriptors",
    r"\bcontribution\b":    "contribution",
    r"\bcontent\b":         "content",
    r"\bbaryszmatrix\b":    "Barysz matrix",
    r"\bdistance\b":        "distance",
    r"\bringcount\b":       "ring count",
}

OVERRIDES = {
    "piPC": lambda n: (f"Count of π-electron paths exactly {n} bonds long "
                       "(log-scaled). High values flag long conjugated chains."),
    "JGI":  lambda n: (f"Average structural charge difference between atoms "
                       f"{n} bonds apart. Gauges charge distribution."),
    "BIC":  lambda n: (f"Bonding-information content at distance {n}. "
                       "Higher = more complex bonding pattern."),
    "RPCG": lambda _: ("Relative positive charge: most positive atom’s charge "
                       "divided by total positive charge in the molecule."),
}

# ──────────────────────────────────────────
# 4. family-specific decoders  (AATS / ATS …)
# ──────────────────────────────────────────
AATS_PAT = re.compile(r"^(AATS|ATS|AATSC|ATSC|MATS|GATS)(\d+)([a-z])$", re.I)

def decode_aats(name: str) -> str | None:
    """
    Turn ‘AATS4p’ into 'Average similarity pattern of polarizability at lag 4'.
    Returns None if the name does not match the regex.
    """
    m = AATS_PAT.match(name)
    if not m:
        return None
    base, lag, code = m.groups()
    prop = PROPERTY_CODE.get(code.lower(), f"property {code}")
    family = "Average" if base[0] == "A" else "Centered"
    return (f"{family} similarity pattern of {prop} at lag {lag}")

# ──────────────────────────────────────────
# 5. misc cleaning helpers
# ──────────────────────────────────────────
SCRAPE_PAT = re.compile(r"—\s*(?:scraped|from)\s+(?:mordred|pubchem|wikipedia).*",
                        flags=re.I)

def apply_overrides(name: str) -> str | None:
    m = re.match(r"^([A-Za-z]+)(\d*)$", name)
    if not m:
        return None
    base, order = m.groups()
    if base in OVERRIDES:
        return OVERRIDES[base](order) if order else OVERRIDES[base]("")
    return None

def simplify(text: str) -> str:
    # drop scrape notes & parameter blobs
    text = SCRAPE_PAT.sub("", text)
    text = re.sub(r"\([^)]*\)", "", text)            # (…) blocks
    text = re.sub(r"—\s*order\s*\d+", "", text)      # — order N
    # jargon replacements
    for pat, repl in JARGON_MAP.items():
        text = re.sub(pat, repl, text, flags=re.I)
    # tidy whitespace / dashes
    text = re.sub(r"\s{2,}", " ", text).strip(" —")
    return text[0].upper() + text[1:] if text else text

# ──────────────────────────────────────────
# 6. build the plain-English dict
# ──────────────────────────────────────────
plain_meta = {}
for name, sentence in meta.items():
    # 1) family-specific decoder
    decoded = decode_aats(name)
    if decoded:
        plain_meta[name] = decoded
        continue
    # 2) hard overrides
    override = apply_overrides(name)
    if override:
        plain_meta[name] = override
        continue
    # 3) general simplifier
    plain_meta[name] = simplify(sentence)

# ──────────────────────────────────────────
# 7. save
# ──────────────────────────────────────────
with open(OUT, "w") as f:
    json.dump(plain_meta, f, indent=2)

print(f"✅  Plain-English explanations written → {OUT}")


✅  Plain-English explanations written → tox21_lightgb_pipeline\Data_v6\meta_explainer\meta_explanations_plain.json


In [5]:
from pathlib import Path, PurePath
import json

plain_path = Path("tox21_lightgb_pipeline/Data_v6/meta_explainer/meta_explanations_plain.json")
with open(plain_path) as f:
    plain_meta = json.load(f)

print(plain_meta["AATS5i"])


AATS — scraped from Mordred docs


## SMART explainer  

In [1]:
import json, re, time, functools, requests
from pathlib import Path

# ─────────────────────────────────────────────
# 1. paths
# ─────────────────────────────────────────────
ROOT = Path("tox21_lightgb_pipeline/Data_v6/meta_explainer")
IN   = ROOT / "smarts_rules_final.json"
OUT  = ROOT / "smarts_rules_final_v2.json"

# ─────────────────────────────────────────────
# 2. load existing rules
# ─────────────────────────────────────────────
with open(IN) as f:
    rules_by_label: dict = json.load(f)

label_cols = [
    "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER",
    "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5",
    "SR-HSE", "SR-MMP", "SR-p53",
]

# ─────────────────────────────────────────────
# 3. add starter kits for empty endpoints
# ─────────────────────────────────────────────
GENERIC_RULES = [
    {"name": "Nitro group",        "smarts": "[NX3](=O)=O",
     "explanation": "",            "severity": "high"},
    {"name": "Epoxide",            "smarts": "[C;r3]1[O;r3][C;r3]1",
     "explanation": "",            "severity": "high"},
    {"name": "Alkyl halide",       "smarts": "[CX4][F,Cl,Br,I]",
     "explanation": "",            "severity": "medium"},
    {"name": "Aldehyde",           "smarts": "[CX3H1](=O)[#6]",
     "explanation": "",            "severity": "medium"},
]

for lab in label_cols:
    rules_by_label.setdefault(lab, []).extend([])   # ensure key exists
    if not rules_by_label[lab]:                     # insert generic set
        rules_by_label[lab] = [r.copy() for r in GENERIC_RULES]

# ─────────────────────────────────────────────
# 4. helpers to fetch a one-liner
# ─────────────────────────────────────────────
WIKI_API = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
PUBCHEM  = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/description/JSON"

@functools.lru_cache(maxsize=256)
def wiki_sentence(term: str) -> tuple[str | None, str | None]:
    safe = requests.utils.quote(term.replace(" ", "_"))
    url  = WIKI_API.format(safe)
    try:
        r = requests.get(url, timeout=5)
        if r.status_code == 200:
            text = r.json().get("extract", "")
            sent = re.split(r"(?<=\.)\s", text.strip())[0]
            return (sent if len(sent.split()) > 4 else None, url)
    except requests.exceptions.RequestException:
        pass
    return (None, None)

@functools.lru_cache(maxsize=256)
def pubchem_sentence(term: str) -> tuple[str | None, str | None]:
    safe = requests.utils.quote(term)
    url  = PUBCHEM.format(safe)
    try:
        r = requests.get(url, timeout=5)
        if r.status_code == 200:
            tree = r.json()
            desc = tree["InformationList"]["Information"][0].get("Description", "")
            sent = re.split(r"(?<=\.)\s", desc.strip())[0]
            return (sent if len(sent.split()) > 4 else None, url)
    except requests.exceptions.RequestException:
        pass
    return (None, None)

# ─────────────────────────────────────────────
# 5. iterate and enrich
# ─────────────────────────────────────────────
updated = 0
for lab, rules in rules_by_label.items():
    for rule in rules:
        # add severity if missing
        rule.setdefault("severity", "medium")

        # skip if explanation already >10 words
        if rule.get("explanation") and len(rule["explanation"].split()) > 10:
            continue

        sent, ref = wiki_sentence(rule["name"])
        if not sent:
            sent, ref = pubchem_sentence(rule["name"])

        if sent:
            rule["explanation"] = sent
            rule["ref"] = ref
            updated += 1
        else:
            rule.setdefault(
                "explanation",
                "No concise public description found (checked Wikipedia & PubChem).",
            )

print(f"📝  Updated {updated} explanations; saving enriched file …")

# ─────────────────────────────────────────────
# 6. write out
# ─────────────────────────────────────────────
with open(OUT, "w") as f:
    json.dump(rules_by_label, f, indent=2)

print(f"✅  Wrote →  {OUT}")


📝  Updated 57 explanations; saving enriched file …
✅  Wrote →  tox21_lightgb_pipeline\Data_v6\meta_explainer\smarts_rules_final_v2.json
