# we will be using this notebook to generate our meta knowledge to extract our dictionary for the rule based meta explainer 

## Meta explainer  

In [4]:
import os
import json
import joblib

# === Paths ===
FEATURE_PATH = "tox21_lightgb_pipeline/Data_v6/processed/feature_names.txt"
MASK_PATH = "tox21_lightgb_pipeline/models/v7/feature_masks.pkl"
SAVE_DIR = "tox21_lightgb_pipeline/Data_v6/rule_based_meta"
os.makedirs(SAVE_DIR, exist_ok=True)

# === Load feature names and feature masks ===
with open(FEATURE_PATH, "r") as f:
    feature_names = [line.strip() for line in f.readlines()]

feature_masks = joblib.load(MASK_PATH)

# === Define rule-based biological explanations ===
RULES = {
    "HybRatio": "Hybridization ratio — reflects degree of unsaturation or sp2 content",
    "SLogP": "Lipophilicity — affects membrane permeability and solubility",
    "LogP": "Lipophilicity — linked to hydrophobicity and membrane crossing",
    "XLogP": "Lipophilicity — alternative LogP calculation method",
    "TopoPSA": "Topological polar surface area — predicts absorption, BBB penetration",
    "TPSA": "Polar surface area — affects bioavailability and permeability",
    "nRot": "Rotatable bonds — influence molecular flexibility and entropy",
    "RotatableBond": "Molecular flexibility — associated with conformational change",
    "Ring": "Ring count — indicates cyclic motifs, common in drug scaffolds",
    "nRing": "Ring systems — often indicate structural rigidity or pharmacophoric elements",
    "HeavyAtom": "Heavy atom count — correlates with molecular size and complexity",
    "MolWt": "Molecular weight — affects pharmacokinetics and distribution",
    "ExactMolWt": "Exact molecular mass — relevant for high-resolution mass spectrometry",
    "Chi": "Kier's shape index — reflects branching and molecular topology",
    "Kappa": "Molecular shape indices — measures of flexibility and cyclicity",
    "PEOE_VSA": "Partial Charge Surface Area — surface-weighted electronic properties",
    "EState_VSA": "Electrostatic surface area — relates to electrostatic interactions",
    "SMR_VSA": "Molar refractivity surface area — tied to dispersion and polarizability",
    "VSA": "Van der Waals surface area — size and shape-based contribution",
    "nHBDon": "Hydrogen bond donors — influence protein-ligand interactions",
    "nHBAcc": "Hydrogen bond acceptors — key for binding site recognition",
    "HBD": "Hydrogen bond donor — impacts solubility and biological activity",
    "HBA": "Hydrogen bond acceptor — affects drug-receptor affinity",
    "Electronegativity": "Atomic electronegativity — governs reactivity and polarity",
    "ETA": "Electron-topological state index — shape, branching, and reactivity",
    "ETA_alpha": "ETA alpha — captures electronegativity and polarizability effects",
    "AATS": "Average autocorrelation — captures pairwise interactions across distances",
    "ATSC": "Centered autocorrelation — emphasizes central atom contributions",
    "MATS": "Moran autocorrelation — weighted by physicochemical properties",
    "GATS": "Geary autocorrelation — sensitive to molecular symmetry",
    "ZMIC": "Molecular information content — quantifies complexity and diversity",
    "BCUT": "BCUT descriptors — eigenvalue-based diversity of atoms",
    "BertzCT": "Molecular complexity index — encodes graph-theoretic size",
    "LabuteASA": "Approximate surface area — useful for passive membrane permeability estimation",
    "BalabanJ": "Balaban's index — encodes topological complexity of the molecular graph",
    "FpDensityMorgan": "Fingerprint density — indicates molecular fingerprint sparsity/density",
    "MaxPartialCharge": "Maximum atomic partial charge — influences polarity and reactivity",
    "MinPartialCharge": "Minimum atomic partial charge — influences reactivity and interaction",
    "FractionCSP3": "Fraction of sp3 carbon atoms — relates to saturation and 3D structure",
    "NumAromaticRings": "Aromatic ring count — often linked to receptor interaction",
    "NumAliphaticRings": "Aliphatic ring count — can indicate flexibility and metabolism",
    "NumHeteroatoms": "Heteroatom count — includes N, O, S; related to polarity and H-bonding"
}

# === Collect all features used across all models ===
used_features = set()
for indices in feature_masks.values():
    used_features.update(indices)

# === Map used feature indices → names ===
used_feature_names = {feature_names[i] for i in used_features}

# === Match used feature names to explanations ===
meta_explanation_dict = {}
for feat in used_feature_names:
    for keyword, explanation in RULES.items():
        if keyword.lower() in feat.lower():
            meta_explanation_dict[feat] = explanation
            break
    else:
        meta_explanation_dict[feat] = "miscellaneous descriptor — no mapped biological role found"

# === Save to JSON ===
output_path = os.path.join(SAVE_DIR, "meta_explanations.json")
with open(output_path, "w") as f:
    json.dump(meta_explanation_dict, f, indent=2)

# === Preview sample ===
print("✅ Meta-explanation dictionary (used features only) created and saved.")
print("🔍 Sample mappings:")
for k, v in list(meta_explanation_dict.items())[:10]:
    print(f" - {k}: {v}")


✅ Meta-explanation dictionary (used features only) created and saved.
🔍 Sample mappings:
 - piPC3: miscellaneous descriptor — no mapped biological role found
 - Xc-5dv: miscellaneous descriptor — no mapped biological role found
 - JGI5: miscellaneous descriptor — no mapped biological role found
 - MATS5c: Moran autocorrelation — weighted by physicochemical properties
 - MINssCH2: miscellaneous descriptor — no mapped biological role found
 - AATS4s: Average autocorrelation — captures pairwise interactions across distances
 - ETA_shape_x: Electron-topological state index — shape, branching, and reactivity
 - MDEN-11: miscellaneous descriptor — no mapped biological role found
 - ATSC3dv: Centered autocorrelation — emphasizes central atom contributions
 - Xpc-5dv: miscellaneous descriptor — no mapped biological role found


## SMART explainer  

In [1]:
import os
import json
from rdkit import Chem

# === Output path ===
output_path = "tox21_lightgb_pipeline/Data_v6/meta_explainer/smarts_rules.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# === Define toxicophores ===
tox_rules = {
    "Aromatic amine": {
        "smarts": "[NX3][cR]",
        "explanation": "mutagenicity and DNA interaction"
    },
    "Nitro group": {
        "smarts": "[NX3](=O)=O",
        "explanation": "oxidative stress and mutagenicity"
    },
    "Sulfonamide": {
        "smarts": "S(=O)(=O)[NX3]",
        "explanation": "allergic responses and metabolic activation"
    },
    "Phenol": {
        "smarts": "c[OH]",
        "explanation": "redox cycling and toxicity via reactive oxygen species"
    },
    "Halogenated benzene": {
        "smarts": "c[Cl,Br,F,I]",
        "explanation": "endocrine disruption and liver toxicity"
    },
    "Carboxylic acid": {
        "smarts": "C(=O)[OH]",
        "explanation": "irritancy and phase I metabolism"
    },
    "Aldehyde": {
        "smarts": "[CX3H1](=O)[#6]",
        "explanation": "cross-linking with DNA/proteins, tissue damage"
    },
    "Ketone": {
        "smarts": "[#6][CX3](=O)[#6]",
        "explanation": "involved in redox cycling and electrophilic reactivity"
    },
    "Primary amine": {
        "smarts": "[NX3;H2][#6]",
        "explanation": "forms DNA adducts, mutagenic potential"
    },
    "Thiol": {
        "smarts": "[#16H]",
        "explanation": "strong nucleophile, binds metals and proteins"
    }
}

# === Validate SMARTS patterns ===
validated_rules = {}
for name, info in tox_rules.items():
    smarts = info["smarts"]
    if Chem.MolFromSmarts(smarts):
        validated_rules[name] = info
    else:
        print(f"❌ Invalid SMARTS: {name} → {smarts}")

# === Save JSON ===
with open(output_path, "w") as f:
    json.dump(validated_rules, f, indent=2)

print(f"✅ Saved {len(validated_rules)} validated SMARTS rules to:")
print(f"📁 {output_path}")


✅ Saved 10 validated SMARTS rules to:
📁 tox21_lightgb_pipeline/Data_v6/meta_explainer/smarts_rules.json
