<a href="https://colab.research.google.com/github/GabrielWarner/DL4H-finalproject/blob/main/notebook/baseline_ce_no_weights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install -q --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -q "transformers==4.44.2" "datasets>=2.20.0" "evaluate==0.4.2" pandas matplotlib tqdm

[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive; every project path below lives under it.
drive.mount('/content/drive')

# Project folders on the mounted drive: data, checkpoints, logs and figures.
DATA_DIR  = "/content/drive/MyDrive/DL4H_data/mimic"
CKPT_DIR  = "/content/drive/MyDrive/DL4H_data/ckpt"
LOGS_DIR  = "/content/drive/MyDrive/DL4H_data/logs"
FIGS_DIR  = "/content/drive/MyDrive/DL4H_data/figs"

# Create whichever folders are missing; existing ones are left untouched.
for project_dir in (DATA_DIR, CKPT_DIR, LOGS_DIR, FIGS_DIR):
    os.makedirs(project_dir, exist_ok=True)

In [None]:
import os

# Point the Hugging Face cache (HF_HOME) at a folder on Drive and make sure it exists.
# This runs before `datasets` / `transformers` are imported in the cells below.
hf_cache_dir = "/content/drive/MyDrive/DL4H_data/hf_cache"
os.environ["HF_HOME"] = hf_cache_dir
os.makedirs(hf_cache_dir, exist_ok=True)

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Load the report dataset from the Hugging Face Hub and show its structure.
ds = load_dataset("itsanmolgupta/mimic-cxr-dataset")
print(ds)

# One row per report; `report_id` is simply the row position in the "train" split.
train_split = ds["train"]
df = pd.DataFrame({
    "report_id": np.arange(len(train_split)),
    "findings": train_split["findings"],
    "impression": train_split["impression"],
})

# Report text = impression followed by findings, with missing sections treated as "".
impression_text = df["impression"].fillna("").astype(str).str.strip()
findings_text = df["findings"].fillna("").astype(str).str.strip()
df["report_text"] = (impression_text + " " + findings_text).str.strip()

# drop empty reports (re-indexing so the frame has a clean 0..n-1 index afterwards)
has_text = df["report_text"].str.len() > 0
df = df.loc[has_text].reset_index(drop=True)
print("Reports after filtering:", len(df))
df.head(3)

In [None]:
# Report-level 80/10/10 train/val/test split from a seeded permutation.
# Splitting by report (not by sentence) keeps all sentences of a report in one split.
n = len(df)
i_tr, i_va = int(0.8*n), int(0.9*n)
rng = np.random.default_rng(42)
perm = rng.permutation(n)

# `.loc` treats the permuted positions as index labels, so `df` must carry a
# default 0..n-1 index (it does after the reset_index in the loading cell).
train_idx, val_idx, _ = np.split(perm, [i_tr, i_va])
df["split"] = "test"
df.loc[train_idx, "split"] = "train"
df.loc[val_idx, "split"] = "val"

df["split"].value_counts()

In [None]:
import re
from tqdm import tqdm

def sent_tokenize(text: str):
    """Split report text into sentence-like pieces.

    A boundary is a run of whitespace that follows '.', '?' or '!', or any run
    of newlines. Each piece is stripped and empty pieces are dropped.

    Returns:
        list[str]: the non-empty, stripped pieces in their original order.
    """
    sentences = []
    for piece in re.split(r'(?<=[\.\?\!])\s+|\n+', text):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

# Keyword vocabularies for the rule-based (weak) sentence labeler.
ABN_TERMS = [
    "pneumonia","consolidation","edema","effusion","atelectasis","pneumothorax",
    "fracture","opacity","lesion","mass","enlarged","cardiomegaly","infiltrate",
    "hemorrhage","emphysema","fibrosis","collapse","airspace","air-fluid","pleural",
    "mediastinal widening","hyperinflation","interstitial","ground-glass"
]
NORM_PHRASES = [
    "no acute cardiopulmonary process","no acute cardiopulmonary disease",
    "no acute process","no acute abnormality","no acute findings","no focal consolidation",
    "no pleural effusion","no pneumothorax","heart size is normal","lungs are clear",
    "no acute osseous abnormality"
]
# NOTE(review): these hedging markers are compiled into `unc_re` below, but
# `weak_label` does not consult them. Confirm whether they were meant to drive
# the "uncertain" label before wiring them in, since that would change the
# label distribution of every split.
UNCERTAIN_MARKERS = [
    "cannot exclude","question of","possible","may represent","suggest","probable",
    "likely","suspicious for"," ?"," ? "
]

# Abnormal terms match as whole words only (so plurals such as "effusions" do
# not match "effusion"); normal/uncertain phrases match as plain substrings.
# All three patterns are case-insensitive.
abn_re  = re.compile(r"\b(" + "|".join(re.escape(w) for w in ABN_TERMS) + r")\b", re.I)
norm_re = re.compile("|".join(re.escape(p) for p in NORM_PHRASES), re.I)
unc_re  = re.compile("|".join(re.escape(p) for p in UNCERTAIN_MARKERS), re.I)

def weak_label(s: str) -> str:
    """Assign a weak label to one sentence using keyword rules.

    Precedence is abnormal > normal > uncertain:
      * any whole-word hit from ABN_TERMS        -> "abnormal"
      * otherwise any phrase from NORM_PHRASES   -> "normal"
      * otherwise (including blank input)        -> "uncertain"

    Because abnormal terms win, negated mentions that contain such a term
    (e.g. "No pneumothorax.") are labeled "abnormal", and "uncertain" is in
    effect the catch-all for sentences with no keyword hit.
    NOTE(review): confirm that this precedence is the intended labeling scheme.

    Args:
        s: sentence text; surrounding whitespace is ignored.

    Returns:
        One of "abnormal", "normal" or "uncertain".
    """
    s = s.strip()
    if not s:
        return "uncertain"
    if abn_re.search(s):
        return "abnormal"
    if norm_re.search(s):
        return "normal"
    return "uncertain"

In [None]:
# Explode every report into one row per sentence, carrying the report's split
# along so that all sentences of a report stay in the same split.
rows = []
report_fields = zip(df["report_id"], df["report_text"], df["split"])
for report_id, report_text, split_name in tqdm(report_fields, total=len(df)):
    rows.extend(
        {
            "report_id": int(report_id),
            "sentence_id": j,
            "text": sent,
            "label": weak_label(sent),
            "split": split_name,
        }
        # Fragments shorter than 3 characters are skipped but still consume a
        # sentence_id, so ids within a report may have gaps.
        for j, sent in enumerate(sent_tokenize(report_text))
        if len(sent) >= 3
    )

sent_df = pd.DataFrame(rows)
print("Total sentences:", len(sent_df))
sent_df.head(5)

In [None]:
def dist(df):
    """Return the share of each value in `df["label"]` as a dict, rounded to 3 decimals."""
    shares = df["label"].value_counts(normalize=True)
    return shares.round(3).to_dict()

# Label mix over all sentences, then separately for each split.
print("ALL:", dist(sent_df))
for split_name in ("train", "val", "test"):
    in_split = sent_df["split"] == split_name
    print(split_name, dist(sent_df[in_split]))

In [None]:
import os
import pandas as pd

# Write one CSV per split with the columns the training cells expect.
export_cols = ["report_id","sentence_id","text","label"]
for split in ("train", "val", "test"):
    out_path = f"{DATA_DIR}/{split}.csv"
    out = sent_df.loc[sent_df["split"]==split, export_cols].reset_index(drop=True)
    out.to_csv(out_path, index=False)
    print(split, len(out), "->", out_path)

# Small fixed-seed samples (at most 1000 rows each), drawn from the CSVs just
# written rather than from the in-memory frames.
for split in ("train", "val", "test"):
    path = f"{DATA_DIR}/{split}.csv"
    df_split = pd.read_csv(path)
    mini = df_split.sample(n=min(1000, len(df_split)), random_state=1)
    mini.to_csv(f"{DATA_DIR}/{split}_mini.csv", index=False)
print("Wrote mini splits.")

In [None]:
import pandas as pd

def label_dist(path):
    """Summarise a split CSV.

    Returns a dict with "rows" (number of data rows) and "dist" (share of each
    value in the `label` column, rounded to 3 decimals).
    """
    labels = pd.read_csv(path)["label"]
    shares = labels.value_counts(normalize=True).round(3)
    return {"rows": len(labels), "dist": shares.to_dict()}

# Provenance line plus row count / label mix for each exported split.
split_summary = {
    "source": "HF itsanmolgupta/mimic-cxr-dataset + weak sentence labels (rule-based)",
}
for split_name in ("train", "val", "test"):
    split_summary[split_name] = label_dist(f"{DATA_DIR}/{split_name}.csv")
print(split_summary)

In [None]:
import torch, platform
import transformers, datasets, evaluate


def _count_csv_rows(path):
    """Return the number of lines in `path` minus one for the header.

    The file is opened in a `with` block so the handle is closed right away,
    and read as UTF-8, which is pandas' default encoding when writing CSVs.
    This counts physical lines, so it equals the row count only when no field
    embeds a line break.
    """
    with open(path, encoding="utf-8") as fh:
        return sum(1 for _ in fh) - 1


# Record the runtime environment and the size of the exported splits.
print("CUDA:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("evaluate:", evaluate.__version__)
print("Python:", platform.python_version())
print("CSV sizes:",
      {s: _count_csv_rows(f"{DATA_DIR}/{s}.csv") for s in ["train","val","test"]})

In [None]:
# --- config ---
# Class names; a label's position in this list is its integer id.
LABELS = ["normal","abnormal","uncertain"]
label2id = dict(zip(LABELS, range(len(LABELS))))
id2label = dict(enumerate(LABELS))

# Hugging Face model id of the encoder that gets fine-tuned.
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"

# Tokenizer max_length, per-device batch sizes, optimiser settings and seed.
MAX_LEN = 128
BATCH_TRAIN, BATCH_EVAL = 32, 64
LR = 2e-5
EPOCHS = 3
SEED = 42

In [None]:
import pandas as pd, os
def need(path):
    """Load a split CSV after checking that it is usable.

    Asserts that `path` exists and that the loaded frame contains the columns
    report_id, sentence_id, text and label (extra columns are allowed).

    Returns:
        pandas.DataFrame: the CSV contents.

    Raises:
        AssertionError: if the file is missing or a required column is absent.
    """
    assert os.path.exists(path), f"Missing {path}"
    required = {"report_id","sentence_id","text","label"}
    frame = pd.read_csv(path)
    assert required <= set(frame.columns), f"{path} must have columns {required}"
    return frame

# Load the three sentence-level splits written by the export cell.
train_df = need(f"{DATA_DIR}/train.csv")
val_df   = need(f"{DATA_DIR}/val.csv")
test_df  = need(f"{DATA_DIR}/test.csv")

# Encode string labels as integer class ids in a new `labels` column.
# `.map` turns any label missing from `label2id` (or an empty cell) into NaN,
# which would only surface later as an obscure training error, so fail here
# with the offending values instead.
for split_name, d in (("train", train_df), ("val", val_df), ("test", test_df)):
    d["labels"] = d["label"].map(label2id)
    unknown = d.loc[d["labels"].isna(), "label"].unique().tolist()
    assert not unknown, f"{split_name}.csv contains labels outside {list(label2id)}: {unknown}"
    d["labels"] = d["labels"].astype("int64")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
# Tokenizer that matches the checkpoint being fine-tuned.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

def to_ds(df):
    """Wrap the `text` and `labels` columns of a split frame in a `datasets.Dataset`."""
    model_columns = df[["text","labels"]].reset_index(drop=True)
    return Dataset.from_pandas(model_columns)

train_ds = to_ds(train_df)
val_ds = to_ds(val_df)
test_ds = to_ds(test_df)

def tokenize(batch):
    """Tokenize a batch's `text` entries, padding and truncating each to MAX_LEN tokens."""
    return tok(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )

# Tokenize each split in batches and expose only the model inputs and labels
# as torch tensors.
torch_columns = ("input_ids","attention_mask","labels")
train_tok, val_tok, test_tok = (
    split_ds.map(tokenize, batched=True).with_format("torch", columns=list(torch_columns))
    for split_ds in (train_ds, val_ds, test_ds)
)

In [None]:
import numpy as np, evaluate
# Metric implementations from the `evaluate` library; "multiclass" selects the
# multi-class configuration of ROC AUC.
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")
metric_auc = evaluate.load("roc_auc","multiclass")

def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and (best effort) one-vs-rest macro ROC AUC.

    Args:
        eval_pred: a `(logits, labels)` pair as handed over by the Trainer;
            `logits` holds one score per class for every example.

    Returns:
        dict with "accuracy" and "f1_macro", plus "auc_ovr" when ROC AUC could
        be computed.

    The multiclass ROC AUC implementation needs per-example probabilities that
    sum to 1, so the logits are passed through a softmax first. Feeding raw
    logits makes the metric raise, and since that error used to be swallowed
    silently, "auc_ovr" was never reported. AUC stays best-effort (it can
    still fail, e.g. when a class is absent from `labels`), but a failure now
    produces a warning instead of vanishing.
    """
    import warnings

    logits, labels = eval_pred
    logits = np.asarray(logits)
    preds = np.argmax(logits, axis=-1)
    out = {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

    # Numerically stable softmax in float64 so each row sums to 1 within the
    # tolerance the metric checks, even if the logits arrive in half precision.
    shifted = logits.astype(np.float64) - logits.max(axis=-1, keepdims=True)
    probs = np.exp(shifted)
    probs /= probs.sum(axis=-1, keepdims=True)

    try:
        out["auc_ovr"] = metric_auc.compute(
            prediction_scores=probs, references=labels, multi_class="ovr", average="macro"
        )["roc_auc"]
    except Exception as err:
        warnings.warn(f"auc_ovr omitted: {err}")
    return out

In [None]:
import torch, numpy as np, time, json
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
import os
# Seed the random number generators before the model is instantiated.
set_seed(SEED)

# Encoder with a sequence-classification head. The number of classes is taken
# from the label mapping rather than hard-coded, so it cannot drift from the
# config cell.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Each run gets its own timestamped checkpoint folder under CKPT_DIR.
run_name = f"ce_no_weights_{MODEL_NAME.split('/')[-1]}_{int(time.time())}"
out_dir  = f"{CKPT_DIR}/{run_name}"
os.makedirs(out_dir, exist_ok=True)

# Training configuration for the Hugging Face Trainer.
args = TrainingArguments(
    # Checkpointing: one checkpoint per epoch inside the run folder.
    output_dir=out_dir,
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_safetensors=False,
    # Optimisation.
    learning_rate=LR,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    seed=SEED,
    # Evaluation each epoch; the checkpoint with the highest macro-F1 is reloaded at the end.
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Logging.
    logging_steps=50,
    report_to="none",
)

# Baseline run: the stock `Trainer` is used here instead of the class-weighted
# `WeightedTrainer` variant (the `ce_no_weights` configuration named in `run_name`).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)

In [None]:

# Fine-tune, then score the test split with the model left in the trainer
# (the best checkpoint, since load_best_model_at_end is enabled).
train_out = trainer.train()

# The returned metric names carry the "eval_" prefix, which the plotting cell relies on.
test_metrics = trainer.evaluate(eval_dataset=test_tok)
test_metrics

In [None]:
import os, json, pandas as pd, matplotlib.pyplot as plt

os.makedirs(FIGS_DIR, exist_ok=True)

# Persist the test metrics next to the run's checkpoints, as JSON and as a one-row CSV.
with open(os.path.join(out_dir, "test_metrics.json"), "w") as f:
    json.dump(test_metrics, f, indent=2)
pd.DataFrame([test_metrics]).to_csv(os.path.join(out_dir, "test_metrics.csv"), index=False)

# Bar chart of the headline metrics, skipping any that were not reported.
keys = [k for k in ["eval_accuracy","eval_f1_macro"] if k in test_metrics]
vals = [test_metrics[k] for k in keys]
fig, ax = plt.subplots()
ax.bar(keys, vals)
for i, v in enumerate(vals):
    ax.text(i, v, f"{v:.4f}", ha="center", va="bottom")
# The em dash is written as an escape: the literal character had been
# mis-encoded and rendered as "â€”" in the figure title.
ax.set_title("Bio_ClinicalBERT (no class weights) \u2014 Test")
fig_path = os.path.join(FIGS_DIR, f"{os.path.basename(out_dir)}_metrics.png")
fig.savefig(fig_path, bbox_inches="tight")
plt.show()

{"out_dir": out_dir, "fig": fig_path}

In [None]:
import numpy as np, pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Per-class report and confusion matrix on the test split.
preds = trainer.predict(test_tok)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=-1)

# Class ids in the order of LABELS from the config cell (not redefined here).
# Passing them explicitly keeps `target_names` aligned even if some class never
# appears in y_true or y_pred; without `labels` the report raises in that case.
# The confusion matrix uses the same ids, so both outputs list the same classes.
label_ids = list(range(len(LABELS)))
print(classification_report(y_true, y_pred, labels=label_ids, target_names=LABELS, digits=3))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
cm_df = pd.DataFrame(cm, index=[f"true_{l}" for l in LABELS],
                         columns=[f"pred_{l}" for l in LABELS])
cm_df

In [None]:
import os, matplotlib.pyplot as plt

logs = trainer.state.log_history

# Split the trainer's log records into training points (records with "loss"
# but no "eval_loss") and validation points (records with "eval_loss"); only
# records that carry an "epoch" value are used.
train_points = [
    (rec["epoch"], rec["loss"])
    for rec in logs
    if "loss" in rec and "epoch" in rec and "eval_loss" not in rec
]
val_points = [
    (rec["epoch"], rec["eval_loss"])
    for rec in logs
    if "eval_loss" in rec and "epoch" in rec
]

train_epochs = [epoch for epoch, _ in train_points]
train_losses = [loss for _, loss in train_points]
val_epochs = [epoch for epoch, _ in val_points]
val_losses = [loss for _, loss in val_points]

print("Train loss points:", len(train_losses))
print("Val loss points:", len(val_losses))

# Both curves on one axis, saved under FIGS_DIR with a fixed file name.
fig, ax = plt.subplots()
ax.plot(train_epochs, train_losses, label="Train Loss")
ax.plot(val_epochs, val_losses, label="Val Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss")
ax.legend()

loss_fig_path = os.path.join(FIGS_DIR, "baseline_loss_curve.png")
fig.savefig(loss_fig_path, bbox_inches="tight")
plt.show()

loss_fig_path