In [3]:
# ADR-only evaluation with MedDRA gold — notebook library

import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Set, Dict

@dataclass(frozen=True)
class Span:
    """A labelled character range taken from one T-line of an ``.ann`` file.

    Instances are frozen, hence hashable, which lets the parsers de-duplicate
    them with ``set``.
    """
    label: str  # entity type; the parsers below always store "ADR"
    start: int  # offset where the range begins
    end: int    # offset where the range stops; parsers only keep end > start

RANGE_RE = re.compile(r"(\d+)\s+(\d+)")  # matches "start end" pairs

# The offsets are the trailing run of "start end" pairs (";"-separated when
# there are several). Anchoring at the end of the string keeps numeric tokens
# that precede the offsets (for instance extra codes in the label part) from
# being paired up with a real offset.
_TRAILING_OFFSETS_RE = re.compile(r"\b\d+\s+\d+(?:\s*;\s*\d+\s+\d+)*\s*$")

def _iter_t_lines(path: Path):
    """Yield ``[tid, head, text]`` for every line of ``path`` that starts with "T".

    Lines are read as UTF-8 and stripped. Anything that does not start with
    "T" or has fewer than three tab-separated fields is skipped. The split is
    capped at three fields so a tab inside the covered text stays in ``text``
    instead of producing extra fields that break the callers' 3-way unpack.
    """
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line.startswith("T"):  # also false for an empty line
            continue
        parts = line.split("\t", 2)
        if len(parts) == 3:
            yield parts  # [tid, head, text]

def _adr_spans(offsets_part: str) -> List[Span]:
    """Turn the part of a head that follows the label into ADR spans.

    Only the trailing offset run is parsed; each "start end" pair becomes its
    own span, and pairs with ``end <= start`` are dropped.
    """
    found = _TRAILING_OFFSETS_RE.search(offsets_part)
    if found is None:
        return []
    spans: List[Span] = []
    for m in RANGE_RE.finditer(found.group(0)):
        s, e = int(m.group(1)), int(m.group(2))
        if e > s:
            spans.append(Span("ADR", s, e))
    return spans

def parse_meddra_gold_as_adr(path: Path) -> List[Span]:
    """Treat every T-line in MedDRA gold as ADR; split multi-ranges into separate spans.

    The first whitespace-separated token of the head (a MedDRA code or 'ADR')
    is treated as the label and ignored. Heads without anything after the
    label contribute no span.

    Returns:
        Unique spans sorted by ``(start, end)``.
    """
    out: List[Span] = []
    for _, head, _ in _iter_t_lines(path):
        fields = head.split(None, 1)  # [label, rest]; tolerant of odd spacing
        if len(fields) < 2:
            continue  # empty head or label only: no offsets to read
        out.extend(_adr_spans(fields[1]))
    return sorted(set(out), key=lambda x: (x.start, x.end))

def parse_pred_adr_only(path: Path) -> List[Span]:
    """From predictions, keep only ADR-labeled T-lines (case-insensitive); split multi-ranges.

    The label is the first whitespace-separated token of the head. It is
    lower-cased and stripped of non-alphanumeric characters before being
    compared with "adr". Heads that are empty or carry no offsets are skipped.

    Returns:
        Unique spans sorted by ``(start, end)``.
    """
    out: List[Span] = []
    for _, head, _ in _iter_t_lines(path):
        fields = head.split(None, 1)  # [label, rest]
        if len(fields) < 2:
            continue
        label = fields[0]
        if re.sub(r"[^a-z0-9]+", "", label.lower()) != "adr":
            continue
        out.extend(_adr_spans(fields[1]))
    return sorted(set(out), key=lambda x: (x.start, x.end))

def to_set(spans: List[Span]) -> Set[Tuple[str, int, int]]:
    """Collapse spans into a set of plain ``(label, start, end)`` triples."""
    triples: Set[Tuple[str, int, int]] = set()
    for span in spans:
        triples.add((span.label, span.start, span.end))
    return triples

def prf1(tp: int, fp: int, fn: int):
    """Return ``(precision, recall, f1)`` computed from raw counts.

    Any ratio whose denominator is zero is reported as 0.0 rather than raising.
    """
    predicted = tp + fp
    if predicted:
        precision = tp / predicted
    else:
        precision = 0.0

    expected = tp + fn
    if expected:
        recall = tp / expected
    else:
        recall = 0.0

    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1

def overlap_len(a: Tuple[int, int], b: Tuple[int, int]) -> int:
    """Number of positions shared by ranges ``a`` and ``b``; 0 when they do not intersect."""
    latest_start = max(a[0], b[0])
    earliest_end = min(a[1], b[1])
    shared = earliest_end - latest_start
    if shared > 0:
        return shared
    return 0

def _count_overlap_matches(
    gold_ranges: List[Tuple[int, int]],
    pred_ranges: List[Tuple[int, int]],
) -> int:
    """Size of the largest one-to-one pairing of overlapping (pred, gold) ranges.

    A greedy "first overlapping gold wins" pass can undercount when gold
    ranges nest: an early prediction may take the only gold range a later
    prediction could have used. Re-pairing along augmenting paths avoids that
    while still using each gold range at most once.
    """
    # For every prediction, the indices of the gold ranges it overlaps.
    candidates = [
        [gi for gi, g in enumerate(gold_ranges) if overlap_len(p, g) > 0]
        for p in pred_ranges
    ]
    paired_pred: Dict[int, int] = {}  # gold index -> prediction index paired with it

    def _try_pair(pi: int, visited: Set[int]) -> bool:
        for gi in candidates[pi]:
            if gi in visited:
                continue
            visited.add(gi)
            # Take a free gold range, or move its current partner elsewhere.
            if gi not in paired_pred or _try_pair(paired_pred[gi], visited):
                paired_pred[gi] = pi
                return True
        return False

    matched = 0
    for pi in range(len(pred_ranges)):
        if _try_pair(pi, set()):
            matched += 1
    return matched

def evaluate_file_meddra_adr_only(
    cadec_root: Path,     # folder that CONTAINS 'meddra' and 'text'
    predicted_dir: Path,  # folder with your predicted .ann
    file_basename: str,   # e.g. "ARTHROTEC.24"
    match: str = "strict",
    show_entities: bool = True,
) -> Dict[str, float]:
    """Score the predicted ADR spans of one file against its MedDRA gold spans.

    Args:
        cadec_root: Directory holding the ``meddra`` and ``text`` sub-folders.
        predicted_dir: Directory holding the predicted ``.ann`` files.
        file_basename: File name without extension, shared by gold/pred/text.
        match: ``"strict"`` counts a prediction as correct only when its
            ``(start, end)`` equals a gold span; ``"overlap"`` counts it as
            correct when it shares at least one position with a gold span,
            each gold span being used at most once.
        show_entities: When True, print every gold and predicted span together
            with the text it covers. The metrics line is printed regardless.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and the raw ``tp``,
        ``fp``, ``fn`` counts.

    Raises:
        ValueError: ``match`` is neither ``"strict"`` nor ``"overlap"``.
        FileNotFoundError: The gold, predicted or text file does not exist.
    """
    # Fail fast on typos: previously any other value silently ran overlap mode.
    if match not in ("strict", "overlap"):
        raise ValueError(f"Unknown match mode {match!r}; expected 'strict' or 'overlap'.")

    gold_ann = cadec_root / "meddra" / f"{file_basename}.ann"
    txt_path = cadec_root / "text" / f"{file_basename}.txt"
    pred_ann = predicted_dir / f"{file_basename}.ann"

    if not gold_ann.exists(): raise FileNotFoundError(f"Gold not found: {gold_ann}")
    if not pred_ann.exists(): raise FileNotFoundError(f"Pred not found: {pred_ann}")
    if not txt_path.exists():  raise FileNotFoundError(f"Text not found: {txt_path}")

    gold = parse_meddra_gold_as_adr(gold_ann)
    pred = parse_pred_adr_only(pred_ann)

    if show_entities:
        # The raw text is only needed to display the covered strings.
        raw = txt_path.read_text(encoding="utf-8")
        print(f"\n===== {file_basename} =====")
        print("\n--- Ground Truth ADR Entities (MedDRA) ---")
        for s in gold:
            print(("ADR", s.start, s.end, raw[s.start:s.end].replace("\n", " ")))
        print("\n--- Predicted ADR Entities ---")
        for s in pred:
            print(("ADR", s.start, s.end, raw[s.start:s.end].replace("\n", " ")))

    if match == "strict":
        gset, pset = to_set(gold), to_set(pred)
        tp = len(gset & pset)
        fp = len(pset - gset)
        fn = len(gset - pset)
    else:
        tp = _count_overlap_matches(
            [(g.start, g.end) for g in gold],
            [(p.start, p.end) for p in pred],
        )
        fp = len(pred) - tp
        fn = len(gold) - tp

    P, R, F = prf1(tp, fp, fn)
    print(f"\n--- Metrics ({match}) ---  TP:{tp} FP:{fp} FN:{fn}  |  P:{P:.2f} R:{R:.2f} F1:{F:.2f}")
    return {"precision": P, "recall": R, "f1": F, "tp": tp, "fp": fp, "fn": fn}

def evaluate_all_overlap_meddra_adr(
    cadec_root: Path,
    predicted_dir: Path,
    match: str = "strict",
    show_entities_each: bool = False,
) -> Dict[str, float]:
    """Score every basename that has an ``.ann`` file in both meddra/ and predicted/.

    Each shared file is evaluated with ``evaluate_file_meddra_adr_only``.
    Micro scores are computed from the summed TP/FP/FN counts; macro scores
    are the plain average of the per-file precision, recall and F1.

    Raises:
        RuntimeError: No basename is present in both directories.
    """
    gold_dir = cadec_root / "meddra"
    gold_names = {ann.stem for ann in gold_dir.glob("*.ann")}
    pred_names = {ann.stem for ann in predicted_dir.glob("*.ann")}
    shared = sorted(gold_names & pred_names)
    if len(shared) == 0:
        raise RuntimeError(f"No overlap between {gold_dir} and {predicted_dir}.")

    print(f"Evaluating {len(shared)} files (intersection of meddra/ and predicted/)")
    per_file = []
    for base in shared:
        per_file.append(
            evaluate_file_meddra_adr_only(
                cadec_root=cadec_root,
                predicted_dir=predicted_dir,
                file_basename=base,
                match=match,
                show_entities=show_entities_each,
            )
        )

    # Running totals, accumulated file by file in sorted-basename order.
    total_tp = total_fp = total_fn = 0
    sum_p = sum_r = sum_f = 0.0
    for scores in per_file:
        total_tp += scores["tp"]
        total_fp += scores["fp"]
        total_fn += scores["fn"]
        sum_p += scores["precision"]
        sum_r += scores["recall"]
        sum_f += scores["f1"]

    micro_p, micro_r, micro_f = prf1(total_tp, total_fp, total_fn)
    count = len(shared)
    macro_p = sum_p / count
    macro_r = sum_r / count
    macro_f = sum_f / count

    print("\n========== ADR-only (MedDRA) SUMMARY ==========")
    print(f"Micro  — P:{micro_p:.3f} R:{micro_r:.3f} F1:{micro_f:.3f}   (TP:{total_tp} FP:{total_fp} FN:{total_fn})")
    print(f"Macro  — P:{macro_p:.3f} R:{macro_r:.3f} F1:{macro_f:.3f}   (avg over {count} files)")

    summary = {
        "P_micro": micro_p, "R_micro": micro_r, "F1_micro": micro_f,
        "P_macro": macro_p, "R_macro": macro_r, "F1_macro": macro_f,
        "TP": total_tp, "FP": total_fp, "FN": total_fn, "n_files": count,
    }
    return summary


In [11]:
import os
from pathlib import Path

# set paths
# Both locations can be overridden with environment variables so the notebook
# runs on another machine without editing this cell; the defaults are the
# original local paths.
cadec_root = Path(os.environ.get(
    "CADEC_ROOT",
    "/Users/anjalikulkarni/Desktop/Assignment1/CADEC-lPWNPfjE-/data/cadec",
))  # contains 'meddra' and 'text'
predicted_dir = Path(os.environ.get(
    "CADEC_PREDICTED_DIR",
    "/Users/anjalikulkarni/Desktop/Assignment1/predicted",
))

# A) evaluate ONE file (same style as task 3, but ADR-only & meddra)
# The result is bound to a real name rather than `_`, which IPython also uses
# for the last cell output.
single_file_metrics = evaluate_file_meddra_adr_only(
    cadec_root=cadec_root,
    predicted_dir=predicted_dir,
    file_basename="ARTHROTEC.25",   # <-- change to a basename present in BOTH meddra/ and predicted/
    match="strict",                 # or "overlap"
    show_entities=True,             # set False to only show metrics
)


===== ARTHROTEC.25 =====

--- Ground Truth ADR Entities (MedDRA) ---
('ADR', 0, 6, 'nausea')
('ADR', 7, 21, 'extreme hunger')
('ADR', 22, 34, 'stomach pain')
('ADR', 36, 50, 'abdominal pain')
('ADR', 52, 68, 'vaginal bleeding')
('ADR', 69, 75, 'cramps')

--- Predicted ADR Entities ---
('ADR', 0, 6, 'nausea')
('ADR', 7, 21, 'extreme hunger')
('ADR', 22, 34, 'stomach pain')
('ADR', 36, 50, 'abdominal pain')
('ADR', 52, 68, 'vaginal bleeding')
('ADR', 69, 75, 'cramps')

--- Metrics (strict) ---  TP:6 FP:0 FN:0  |  P:1.00 R:1.00 F1:1.00
