In [15]:
import os, re
from pathlib import Path
import pandas as pd

IM_DIR = Path("data/images_normalized")
PROJ_CSV = Path("data/indiana_projections.csv")
REP_CSV  = Path("data/indiana_reports.csv")
OUT_CSV  = Path("data/pairs.csv")

projections = pd.read_csv(PROJ_CSV)
reports = pd.read_csv(REP_CSV)[["uid","findings","impression"]]  # <-- only what we need

# Frontal only, prefer -1001, keep existing files
frontals = projections[projections["projection"].str.lower() == "frontal"].copy()
frontals["image_path"] = (IM_DIR / frontals["filename"]).astype(str)
frontals = frontals[frontals["image_path"].apply(os.path.exists)].copy()
frontals["pref1001"] = frontals["filename"].str.contains(r"-1001", na=False)
frontals = frontals.sort_values(by=["uid","pref1001","filename"], ascending=[True,False,True])
frontals_one = frontals.drop_duplicates(subset="uid", keep="first").copy()

df = pd.merge(frontals_one[["uid","image_path"]], reports, on="uid", how="inner")

def clean_text(findings, impression):
    parts = [x for x in (findings, impression) if isinstance(x, str)]
    text = " ".join(parts)
    text = re.sub(r"X{2,}", "", text)
    text = re.sub(r"\s+", " ", text).strip(" .;,-\n\t")
    return text

df["report"] = df.apply(lambda r: clean_text(r.get("findings",""), r.get("impression","")), axis=1)
df = df[df["report"].fillna("").str.len() >= 15].copy()

df = df.rename(columns={"image_path":"image"})[["image","report"]]
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=False)
print(f"Wrote {len(df)} rows to {OUT_CSV}")

Wrote 3666 rows to data/pairs.csv


In [17]:
pd.read_csv('data/pairs.csv')

Unnamed: 0,image,report
0,data/images_normalized/1_IM-0001-4001.dcm.png,The cardiac silhouette and mediastinum size ar...
1,data/images_normalized/2_IM-0652-1001.dcm.png,Borderline cardiomegaly. Midline sternotomy . ...
2,data/images_normalized/3_IM-1384-1001.dcm.png,"No displaced rib fractures, pneumothorax, or p..."
3,data/images_normalized/4_IM-2050-1001.dcm.png,There are diffuse bilateral interstitial and a...
4,data/images_normalized/5_IM-2117-1003002.dcm.png,The cardiomediastinal silhouette and pulmonary...
...,...,...
3661,data/images_normalized/3995_IM-2046-1001.dcm.png,The cardiomediastinal silhouette and pulmonary...
3662,data/images_normalized/3996_IM-2047-1001.dcm.png,The lungs are clear. Heart size is normal. No ...
3663,data/images_normalized/3997_IM-2048-1001.dcm.png,"Heart size within normal limits. Small, nodula..."
3664,data/images_normalized/3998_IM-2048-1001.dcm.png,Heart size is normal and the lungs are clear
