# Triage rules baseline

Prototype notebook for rule-based clinical NLP triage.

- Loads `data/lexicon_redflags.csv`
- Loads `data/notes_synthetic.csv`
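- Applies simple rule-based scoring: counts the distinct lexicon terms found in each note (0 hits → `low`, 1 hit → `intermediate`, 2 or more → `high`)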
- Exports `outputs/predictions.csv`


In [None]:
from pathlib import Path

import pandas as pd

lexicon = pd.read_csv("../data/lexicon_redflags.csv")
notes = pd.read_csv("../data/notes_synthetic.csv")

# Build the search vocabulary: drop missing entries, trim whitespace and
# lower-case each term. Collapsing into a set means a term listed under
# several categories is only counted once per note.
normalized_terms = lexicon["term"].dropna().astype(str).str.strip().str.lower()
# Blank terms (empty after stripping) are discarded; sorting keeps the
# vocabulary order deterministic.
terms = sorted(set(normalized_terms.tolist()) - {""})

def count_hits(text: object, lexicon_terms=None) -> int:
    """Count how many distinct lexicon terms occur in a note.

    Matching is a case-insensitive substring test, so each term contributes
    at most one hit regardless of how often it appears in the text.

    Args:
        text: Scalar note text. Non-string values are coerced with ``str``;
            missing values (``None``, NaN, ``pd.NA``) yield 0 hits.
        lexicon_terms: Optional iterable of lower-cased terms to look for.
            Defaults to the module-level ``terms`` list.

    Returns:
        The number of terms found in ``text``.
    """
    # A missing note would otherwise be stringified to "nan"/"none" and
    # could spuriously match short terms such as "no" or "an".
    if pd.isna(text):
        return 0
    if lexicon_terms is None:
        lexicon_terms = terms
    haystack = str(text).lower()
    return sum(1 for term in lexicon_terms if term in haystack)

def predict_label_from_hits(hits: int) -> str:
    """Translate a red-flag hit count into a triage label.

    Args:
        hits: Number of lexicon terms found in a note.

    Returns:
        ``"intermediate"`` for exactly one hit, ``"high"`` for two or more,
        and ``"low"`` for anything else.
    """
    if hits == 1:
        return "intermediate"
    return "high" if hits >= 2 else "low"

# Score every note, then map its hit count to a triage label.
notes["hits_count"] = notes["text"].apply(count_hits)
notes["predicted_label"] = notes["hits_count"].apply(predict_label_from_hits)

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
output_path = Path("../outputs/predictions.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

notes[["id", "text", "entity", "hits_count", "predicted_label"]].to_csv(
    output_path, index=False
)

print("Saved outputs/predictions.csv")
