<a href="https://colab.research.google.com/github/GabrielWarner/DL4H-finalproject/blob/main/notebook/baseline_ce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
%pip install -q --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -q "transformers==4.44.2" "datasets>=2.20.0" "evaluate==0.4.2" pandas matplotlib tqdm

In [15]:
from google.colab import drive
drive.mount('/content/drive')

DATA_DIR  = "/content/drive/MyDrive/DL4H_data/mimic"
CKPT_DIR  = "/content/drive/MyDrive/DL4H_data/ckpt"
LOGS_DIR  = "/content/drive/MyDrive/DL4H_data/logs"
FIGS_DIR  = "/content/drive/MyDrive/DL4H_data/figs"

import os
for p in [DATA_DIR, CKPT_DIR, LOGS_DIR, FIGS_DIR]:
    os.makedirs(p, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import os
os.environ["HF_HOME"] = "/content/drive/MyDrive/DL4H_data/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

In [17]:
from datasets import load_dataset
import pandas as pd
import numpy as np

ds = load_dataset("itsanmolgupta/mimic-cxr-dataset")
print(ds)

df = pd.DataFrame({
    "report_id": np.arange(len(ds["train"])),
    "findings": ds["train"]["findings"],
    "impression": ds["train"]["impression"],
})
df["report_text"] = (
    df["impression"].fillna("").astype(str).str.strip() + " " +
    df["findings"].fillna("").astype(str).str.strip()
).str.strip()

# drop empty reports
df = df[df["report_text"].str.len() > 0].reset_index(drop=True)
print("Reports after filtering:", len(df))
df.head(3)

DatasetDict({
    train: Dataset({
        features: ['image', 'findings', 'impression'],
        num_rows: 30633
    })
})
Reports after filtering: 30633


Unnamed: 0,report_id,findings,impression,report_text
0,0,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.,No acute cardiopulmonary process. The lungs ar...
1,1,Lung volumes remain low. There are innumerable...,Low lung volumes and mild pulmonary vascular c...,Low lung volumes and mild pulmonary vascular c...
2,2,Lung volumes are low. This results in crowding...,Innumerable pulmonary metastases. Possible mil...,Innumerable pulmonary metastases. Possible mil...


In [18]:
rng = np.random.default_rng(42)
perm = rng.permutation(len(df))
n = len(df)
i_tr = int(0.8*n); i_va = int(0.9*n)

df["split"] = "test"
df.loc[perm[:i_tr], "split"] = "train"
df.loc[perm[i_tr:i_va], "split"] = "val"

df["split"].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,24506
test,3064
val,3063


In [19]:
import re
from tqdm import tqdm

def sent_tokenize(text: str):
    parts = re.split(r'(?<=[\.\?\!])\s+|\n+', text)
    return [s.strip() for s in parts if s and s.strip()]

ABN_TERMS = [
    "pneumonia","consolidation","edema","effusion","atelectasis","pneumothorax",
    "fracture","opacity","lesion","mass","enlarged","cardiomegaly","infiltrate",
    "hemorrhage","emphysema","fibrosis","collapse","airspace","air-fluid","pleural",
    "mediastinal widening","hyperinflation","interstitial","ground-glass"
]
NORM_PHRASES = [
    "no acute cardiopulmonary process","no acute cardiopulmonary disease",
    "no acute process","no acute abnormality","no acute findings","no focal consolidation",
    "no pleural effusion","no pneumothorax","heart size is normal","lungs are clear",
    "no acute osseous abnormality"
]
UNCERTAIN_MARKERS = [
    "cannot exclude","question of","possible","may represent","suggest","probable",
    "likely","suspicious for"," ?"," ? "
]

abn_re  = re.compile(r"\b(" + "|".join(re.escape(w) for w in ABN_TERMS) + r")\b", re.I)
norm_re = re.compile("|".join(re.escape(p) for p in NORM_PHRASES), re.I)
unc_re  = re.compile("|".join(re.escape(p) for p in UNCERTAIN_MARKERS), re.I)

def weak_label(s: str) -> str:
    s = s.strip()
    if not s:
        return "uncertain"
    has_abn  = bool(abn_re.search(s))
    has_norm = bool(norm_re.search(s))
    has_unc  = bool(unc_re.search(s))
    if has_abn: return "abnormal"
    if has_norm and not has_abn: return "normal"
    return "uncertain"

In [20]:
rows = []
for idx, r in tqdm(df.iterrows(), total=len(df)):
    sents = sent_tokenize(r["report_text"])
    for j, sent in enumerate(sents):
        if len(sent) < 3:
            continue
        rows.append({
            "report_id": int(r["report_id"]),
            "sentence_id": j,
            "text": sent,
            "label": weak_label(sent),
            "split": r["split"],
        })

sent_df = pd.DataFrame(rows)
print("Total sentences:", len(sent_df))
sent_df.head(5)

100%|██████████| 30633/30633 [00:09<00:00, 3120.32it/s]


Total sentences: 230355


Unnamed: 0,report_id,sentence_id,text,label,split
0,0,0,No acute cardiopulmonary process.,normal,train
1,0,1,"The lungs are clear of focal consolidation, pl...",abnormal,train
2,0,2,The heart size is normal.,normal,train
3,0,3,The mediastinal contours are normal.,uncertain,train
4,0,4,Multiple surgical clips project over the left ...,uncertain,train


In [21]:
def dist(df):
    return df["label"].value_counts(normalize=True).round(3).to_dict()

print("ALL:", dist(sent_df))
for sp in ["train","val","test"]:
    print(sp, dist(sent_df[sent_df["split"]==sp]))

ALL: {'uncertain': 0.517, 'abnormal': 0.455, 'normal': 0.028}
train {'uncertain': 0.517, 'abnormal': 0.456, 'normal': 0.028}
val {'uncertain': 0.522, 'abnormal': 0.45, 'normal': 0.028}
test {'uncertain': 0.518, 'abnormal': 0.453, 'normal': 0.028}


In [22]:
import os
for split in ["train","val","test"]:
    out = (sent_df[sent_df["split"]==split]
           [["report_id","sentence_id","text","label"]]
           .reset_index(drop=True))
    out_path = f"{DATA_DIR}/{split}.csv"
    out.to_csv(out_path, index=False)
    print(split, len(out), "->", out_path)

import pandas as pd
for split in ["train","val","test"]:
    path = f"{DATA_DIR}/{split}.csv"
    df_split = pd.read_csv(path)
    df_split.sample(n=min(1000, len(df_split)), random_state=1).to_csv(
        f"{DATA_DIR}/{split}_mini.csv", index=False
    )
print("Wrote mini splits.")

train 183888 -> /content/drive/MyDrive/DL4H_data/mimic/train.csv
val 23232 -> /content/drive/MyDrive/DL4H_data/mimic/val.csv
test 23235 -> /content/drive/MyDrive/DL4H_data/mimic/test.csv
Wrote mini splits.


In [None]:
import pandas as pd

def label_dist(path):
    df = pd.read_csv(path)
    return {"rows": len(df), "dist": df["label"].value_counts(normalize=True).round(3).to_dict()}

print({
    "source": "HF itsanmolgupta/mimic-cxr-dataset + weak sentence labels (rule-based)",
    "train": label_dist(f"{DATA_DIR}/train.csv"),
    "val":   label_dist(f"{DATA_DIR}/val.csv"),
    "test":  label_dist(f"{DATA_DIR}/test.csv"),
})

In [23]:
import torch, platform
print("CUDA:", torch.cuda.is_available())
if torch.cuda.is_available(): print("GPU:", torch.cuda.get_device_name(0))
import transformers, datasets, evaluate
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("evaluate:", evaluate.__version__)
print("Python:", platform.python_version())
print("CSV sizes:",
      {s: sum(1 for _ in open(f"{DATA_DIR}/{s}.csv"))-1 for s in ["train","val","test"]})

CUDA: True
GPU: Tesla T4
transformers: 4.44.2
datasets: 4.0.0
evaluate: 0.4.2
Python: 3.12.12
CSV sizes: {'train': 183888, 'val': 23232, 'test': 23235}
