In [None]:
import gzip
import pandas as pd
import requests
from pathlib import Path
from getpass import getpass

# Root URL under which the per-patient-group folders (p10/, p11/, ...) are
# requested.
# NOTE(review): confirm that individual report files are actually served
# under this URL; raise_for_status() below fails loudly if they are not.
BASE = "https://physionet.org/files/mimic-cxr/2.1.0/files"

# The study list's `path` values look like "files/p10/p10000032/s50414267.txt".
# BASE already ends in "/files", so this leading component is stripped before
# filtering and before building URLs / local paths.
LIST_PREFIX = "files/"

# Seconds to wait on the server for a single request before giving up, so a
# stalled connection cannot hang the cell forever.
REQUEST_TIMEOUT = 60

username = input("PhysioNet username: ")
password = getpass("PhysioNet password: ")

session = requests.Session()
session.auth = (username, password)

# Load metadata
df = pd.read_csv("cxr-study-list.csv.gz")

# Filter p10 only. Paths are normalised first so that both "files/p10/..."
# and a bare "p10/..." are recognised.
rel_paths = df["path"].map(
    lambda p: p[len(LIST_PREFIX):] if p.startswith(LIST_PREFIX) else p
)
paths = rel_paths[rel_paths.str.startswith("p10/")]
if paths.empty:
    # An empty selection would otherwise make the loop below a silent no-op.
    raise ValueError("No p10 paths found in cxr-study-list.csv.gz")

out_root = Path("mimic_cxr_p10")
out_root.mkdir(exist_ok=True)

for rel in paths:
    url = f"{BASE}/{rel}"
    local_path = out_root / rel

    # Mirror the remote folder layout below out_root.
    local_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {rel}")
    with session.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
        # Check the status before opening the file so an HTTP error does not
        # leave an empty file behind.
        r.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in r.iter_content(8192):
                f.write(chunk)


PhysioNet password:  ········


In [6]:
# Inspect the study list; note that the displayed `path` values begin with "files/".
df

Unnamed: 0,subject_id,study_id,path
0,10000032,50414267,files/p10/p10000032/s50414267.txt
1,10000032,53189527,files/p10/p10000032/s53189527.txt
2,10000032,53911762,files/p10/p10000032/s53911762.txt
3,10000032,56699142,files/p10/p10000032/s56699142.txt
4,10000764,57375967,files/p10/p10000764/s57375967.txt
...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt
227831,19999733,57132437,files/p19/p19999733/s57132437.txt
227832,19999987,55368167,files/p19/p19999987/s55368167.txt
227833,19999987,58621812,files/p19/p19999987/s58621812.txt


In [8]:
import zipfile
import os
from pathlib import Path

# Archive to read and the directory the reports are extracted into.
ZIP_PATH = "mimic-cxr-reports.zip"
OUTPUT_DIR = Path("mimic_cxr_reports")

def extract_mimic_cxr_reports(zip_path, output_dir):
    """
    Extract only the .txt members of a ZIP archive into ``output_dir``.

    Every archive member whose name ends in ".txt" (case-insensitive) is
    written to ``output_dir / <member name>``, so the archive's internal
    directory layout is reproduced below ``output_dir``. All other members
    are ignored. Progress is reported with ``print``.

    Parameters
    ----------
    zip_path : str or path-like
        Location of the ZIP archive.
    output_dir : str or pathlib.Path
        Destination directory; created (including parents) if missing.

    Raises
    ------
    FileNotFoundError, zipfile.BadZipFile
        Propagated from ``zipfile.ZipFile`` when the archive cannot be opened.
    """
    import shutil  # local import: keeps this function self-contained

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Resolved once; used to verify that every target stays inside output_dir.
    safe_root = output_dir.resolve()

    with zipfile.ZipFile(zip_path, 'r') as zf:
        members = zf.namelist()

        print(f"Total items in ZIP: {len(members)}")

        txt_files = [m for m in members if m.lower().endswith(".txt")]

        print(f"Found {len(txt_files)} .txt report files.")

        for member in txt_files:
            out_path = output_dir / member

            # Member names come from the archive and may contain ".." or be
            # absolute; refuse anything that would land outside output_dir.
            try:
                out_path.resolve().relative_to(safe_root)
            except ValueError:
                print(f"Skipping unsafe archive member: {member}")
                continue

            out_path.parent.mkdir(parents=True, exist_ok=True)

            # Stream the member instead of loading it fully into memory.
            with zf.open(member) as src, open(out_path, "wb") as dst:
                shutil.copyfileobj(src, dst)

        print(f"Extraction complete. Reports saved under: {output_dir}")

# Extract the .txt reports from ZIP_PATH into OUTPUT_DIR.
extract_mimic_cxr_reports(ZIP_PATH, OUTPUT_DIR)


Total items in ZIP: 293234
Found 227835 .txt report files.
Extraction complete. Reports saved under: mimic_cxr_reports


In [9]:
from pathlib import Path

root = Path("mimic_cxr_reports")  # directory the extraction step wrote into

# Gather the report files of both patient groups from their glob patterns.
p10_files, p11_files = (
    list(root.glob(pattern))
    for pattern in ("files/p10/**/*.txt", "files/p11/**/*.txt")
)

print("p10 report count:", len(p10_files))
print("p11 report count:", len(p11_files))


p10 report count: 22197
p11 report count: 23358


In [14]:
import pandas as pd
from pathlib import Path
import re

# Restrict this pass to the p10 patient group.
ROOT = Path("mimic_cxr_reports/files/p10")

# Every .txt file at any depth below ROOT.
report_paths = [*ROOT.glob("**/*.txt")]
print("p10 report count:", len(report_paths))

def parse_report_text(text):
    """Extract the FINDINGS and IMPRESSION sections of a report.

    Section headers are matched case-insensitively.

    Parameters
    ----------
    text : str
        Full text of one report.

    Returns
    -------
    tuple of (str, str)
        ``(findings, impression)``, each stripped of surrounding whitespace.
        A section that is not present is returned as ``""``.
    """
    # FINDINGS runs up to a following line that starts (after optional
    # whitespace) with "IMPRESSION:", or to the end of the text. The
    # lookahead with \Z means a report without a trailing newline still
    # matches, and no greedy \s* follows the header, so an empty FINDINGS
    # section cannot swallow the IMPRESSION that comes right after it.
    f_match = re.search(r"FINDINGS:(.*?)(?=\n\s*IMPRESSION:|\Z)", text, re.S | re.I)
    # IMPRESSION runs from its header to the end of the text.
    i_match = re.search(r"IMPRESSION:\s*(.*)", text, re.S | re.I)

    findings = f_match.group(1).strip() if f_match else ""
    impression = i_match.group(1).strip() if i_match else ""
    return findings, impression

# One record per report file. The IDs are taken from the file location,
# e.g. mimic_cxr_reports/files/p10/p10000032/s50414267.txt
#   -> subject_id "p10000032" (parent folder), study_id "s50414267" (file name
#      without its 4-character ".txt" suffix).
rows = []

for path in report_paths:
    findings, impression = parse_report_text(path.read_text(errors="ignore"))
    rows.append(
        {
            "subject_id": path.parent.name,
            "study_id": path.name[:-4],
            "findings": findings,
            "impression": impression,
        }
    )

df_p10 = pd.DataFrame(rows)
df_p10.head()


p10 report count: 22197


Unnamed: 0,subject_id,study_id,findings,impression
0,p10000032,s50414267,"There is no focal consolidation, pleural effus...",No acute cardiopulmonary process.
1,p10000032,s53189527,"The cardiac, mediastinal and hilar contours ar...",No acute cardiopulmonary abnormality.
2,p10000032,s53911762,Single frontal view of the chest provided.\n \...,No acute intrathoracic process.
3,p10000032,s56699142,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.
4,p10000764,s57375967,PA and lateral views of the chest provided. ...,"Focal consolidation at the left lung base, pos..."


In [27]:
# Order the p10 reports by subject, then by study within each subject.
df_p10 = df_p10.sort_values(by=['subject_id', 'study_id'])
df_p10

Unnamed: 0,subject_id,study_id,findings,impression
0,p10000032,s50414267,"There is no focal consolidation, pleural effus...",No acute cardiopulmonary process.
1,p10000032,s53189527,"The cardiac, mediastinal and hilar contours ar...",No acute cardiopulmonary abnormality.
2,p10000032,s53911762,Single frontal view of the chest provided.\n \...,No acute intrathoracic process.
3,p10000032,s56699142,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.
4,p10000764,s57375967,PA and lateral views of the chest provided. ...,"Focal consolidation at the left lung base, pos..."
...,...,...,...,...
22192,p10999395,s59802033,,Comparison to ___. No relevant change is note...
22193,p10999395,s59897888,,"Allowing for differences in technique, there h..."
22194,p10999512,s52257272,The lungs are well expanded and clear. There ...,No acute cardiopulmonary process.
22195,p10999737,s52341872,PA and lateral views of the chest provided. L...,No acute findings.


In [28]:
# Write the p10 reports, ordered by subject then study, as the training set.
df_p10 = df_p10.sort_values(['subject_id', 'study_id'])
train_path = Path('data/train_set.parquet')
# to_parquet does not create missing folders, so make sure data/ exists.
train_path.parent.mkdir(parents=True, exist_ok=True)
df_p10.to_parquet(train_path)

In [16]:
import pandas as pd
from pathlib import Path
import re

# Restrict this pass to the p11 patient group.
ROOT = Path("mimic_cxr_reports/files/p11")

# Every .txt report at any depth below ROOT.
report_paths = [*ROOT.glob("**/*.txt")]
print("p11 report count:", len(report_paths))

def parse_report_text(text):
    """Extract the FINDINGS and IMPRESSION sections of a report.

    Section headers are matched case-insensitively.

    Parameters
    ----------
    text : str
        Full text of one report.

    Returns
    -------
    tuple of (str, str)
        ``(findings, impression)``, each stripped of surrounding whitespace.
        A section that is not present is returned as ``""``.
    """
    # FINDINGS runs up to a following line that starts (after optional
    # whitespace) with "IMPRESSION:", or to the end of the text. The
    # lookahead with \Z means a report without a trailing newline still
    # matches, and no greedy \s* follows the header, so an empty FINDINGS
    # section cannot swallow the IMPRESSION that comes right after it.
    f_match = re.search(r"FINDINGS:(.*?)(?=\n\s*IMPRESSION:|\Z)", text, re.S | re.I)
    # IMPRESSION runs from its header to the end of the text.
    i_match = re.search(r"IMPRESSION:\s*(.*)", text, re.S | re.I)

    findings = f_match.group(1).strip() if f_match else ""
    impression = i_match.group(1).strip() if i_match else ""
    return findings, impression

# One record per report file. The IDs are taken from the file location,
# e.g. mimic_cxr_reports/files/p11/p11148901/s58832226.txt
#   -> subject_id "p11148901" (parent folder), study_id "s58832226" (file name
#      without its 4-character ".txt" suffix).
rows = []

for path in report_paths:
    sections = parse_report_text(path.read_text(errors="ignore"))

    record = {"subject_id": path.parent.name, "study_id": path.name[:-4]}
    record["findings"], record["impression"] = sections
    rows.append(record)

df_p11 = pd.DataFrame(rows)
df_p11.head()


p11 report count: 23358


Unnamed: 0,subject_id,study_id,findings,impression
0,p11000011,s51029426,No focal consolidation is seen. There is no p...,No focal consolidation to suggest pneumonia.
1,p11000183,s50336039,Again seen is the indwelling right-sided cathe...,"No gross effusion detected on either side, but..."
2,p11000183,s51967845,Right Port-A-Cath in place. Elevated right he...,"More prominent bibasilar opacities, likely ate..."
3,p11000183,s53970869,,"In comparison to ___ chest radiograph, pulmona..."
4,p11000183,s54898709,Mild cardiomegaly vascular congestion is impro...,No good evidence of aspiration pneumonia.


In [21]:
# Order by subject then study, and keep only the first 5217 rows.
df_p11 = df_p11.sort_values(by=['subject_id', 'study_id']).iloc[:5217]

In [25]:
# Write the selected p11 reports as the test set.
test_path = Path('data/test_set.parquet')
# to_parquet does not create missing folders, so make sure data/ exists.
test_path.parent.mkdir(parents=True, exist_ok=True)
df_p11.to_parquet(test_path)