# 1 `MIMIC-IV`

## 1.1 `Download Data`

In [None]:
import os, sys, subprocess
from getpass import getpass

username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")
location = input("Enter download location (full path): ")

os.makedirs(location, exist_ok=True)
os.chdir(location)

url = "https://physionet.org/files/mimiciv/3.1/"

# Force progress bar output even in Jupyter
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
)

for line in proc.stdout:
    print(line, end="")
    sys.stdout.flush()

proc.wait()
if proc.returncode == 0:
    print("\n‚úÖ Download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")


## 1.2 `Check files`

In [None]:
import os
import hashlib
from pathlib import Path

# --- Step 1: Define and check the base location ---
# location = Path("E:/Data/MIMIC_IV")  # example path ‚Äî adjust for your setup
if 'location' not in locals():
    location = Path(input("Enter download location (full path): "))

print(f"\n‚úÖ Checking data in folder: {location}")


# --- Step 2: Define checksum file path ---
checksum_file = location / "SHA256SUMS.txt"
if not checksum_file.exists():
    raise FileNotFoundError(f"Checksum file not found: {checksum_file}")

# --- Step 3: Load checksums ---
checksums = {}
with open(checksum_file, "r", encoding="utf-8") as f:
    for line in f:
        hash_val, rel_path = line.split(" ")
        cur_path = location / rel_path.strip()
        checksums[ cur_path ] = hash_val.strip()

print(f"Found {len(checksums)} entries in {checksum_file.name}")


# --- Step 4: Verify files ---
def sha256sum(file_path):
    """Compute SHA256 hash for a file."""
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

results = []
for rel_path, expected_hash in checksums.items():
    file_path = location / rel_path
    if not file_path.exists():
        results.append((rel_path, "‚ùå Missing", None))
        continue

    actual_hash = sha256sum(file_path)
    if actual_hash.lower() == expected_hash.lower():
        results.append((rel_path, "‚úÖ OK", actual_hash))
    else:
        results.append((rel_path, "‚ö†Ô∏è Mismatch", actual_hash))

# --- Step 5: Display results ---
print("\n=== Verification Report ===")
for rel_path, status, actual in results:
    print(f"{status:10s}  {rel_path}")

# Optional: summary counts
ok = sum(1 for r in results if "OK" in r[1])
missing = sum(1 for r in results if "Missing" in r[1])
mismatch = sum(1 for r in results if "Mismatch" in r[1])
print(f"\nSummary: {ok} OK, {missing} Missing, {mismatch} Mismatch\n")


## 1.3. `UnZip Files`

In [None]:
import os
import gzip
import shutil
from pathlib import Path

# --- 0) Base folder ---
if 'location' not in locals():
    location = Path(input("Enter download location (full path): ").strip()).expanduser().resolve()

print(f"\nüìÅ Using folder: {location}")

checksum_file = location / "SHA256SUMS.txt"
if not checksum_file.exists():
    raise FileNotFoundError(f"Checksum file not found: {checksum_file}")

# --- 1) Read relative paths from SHA256SUMS.txt ---
rel_paths = []
with open(checksum_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Robust split: first token is hash, the rest is the path (may contain spaces)
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue
        _, rel_path = parts
        rel_paths.append(rel_path.strip())

# --- 2) Filter only .gz files ---
gz_paths = [rp for rp in rel_paths if rp.endswith(".gz")]
print(f"Found {len(gz_paths)} .gz files to process")

# --- 3) Decompress settings ---
OVERWRITE = False  # set to True to overwrite existing outputs
CHUNK_SIZE = 1024 * 1024  # 1 MB chunks

# --- 4) Decompress loop ---
done = 0
skipped = 0
missing = 0
failed = 0
log = []

for rp in gz_paths:
    src = (location / rp).resolve()
    # Destination: remove the final ".gz" only
    dst = src.with_name(src.name[:-3])  # e.g., "file.csv.gz" -> "file.csv"

    if not src.exists():
        missing += 1
        log.append(("‚ùå Missing", rp))
        continue

    # Ensure destination folder exists
    dst.parent.mkdir(parents=True, exist_ok=True)

    if dst.exists() and not OVERWRITE:
        skipped += 1
        log.append(("‚è© Skipped (exists)", str(dst.relative_to(location))))
        continue

    try:
        with gzip.open(src, "rb") as f_in, open(dst, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out, length=CHUNK_SIZE)
        done += 1
        log.append(("‚úÖ Unzipped", f"{rp}  ->  {dst.relative_to(location)}"))
    except Exception as e:
        failed += 1
        log.append(("‚ö†Ô∏è Failed", f"{rp}  ({e})"))

# --- 5) Report ---
print("\n=== Decompression Report ===")
for status, msg in log:
    print(f"{status:16s} {msg}")

print(
    f"\nSummary: {done} unzipped, {skipped} skipped, {missing} missing, {failed} failed.\n"
    f"Outputs are written next to sources (e.g., *.csv beside *.csv.gz)."
)


# 2 `MIMIC-CXR`

## 2.1 `Download Data`

In [None]:
import os, sys, subprocess
from getpass import getpass

username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")
location = input("Enter download location (full path): ")

os.makedirs(location, exist_ok=True)
os.chdir(location)

"""
#url = "https://physionet.org/files/mimic-cxr/2.1.0/files/p10/"
url = "https://physionet.org/files/mimic-cxr/2.1.0/"

# Download data parent folder excluding pXX subfolders
# Force progress bar output even in Jupyter
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
    "--reject-regex", ".*/files/p[0-9]+",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
)

for line in proc.stdout:
    print(line, end="")
    sys.stdout.flush()

proc.wait()
if proc.returncode == 0:
    print("\n‚úÖ Root Folder download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")

"""

url = "https://physionet.org/files/mimic-cxr/2.1.0/"
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
#    "--accept-regex", "(/files/|/files/p10/|/files/p10/p1000.*)",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
)

for line in proc.stdout:
    print(line, end="")
    sys.stdout.flush()

proc.wait()
if proc.returncode == 0:
    print("\n‚úÖ Files folder download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")



--2025-11-10 12:30:14--  https://physionet.org/files/mimic-cxr/2.1.0/
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: 'physionet.org/files/mimic-cxr/2.1.0/index.html'


physionet.org/files     [<=>                 ]       0  --.-KB/s               
physionet.org/files     [ <=>                ]   1.05K  --.-KB/s    in 0s      

Last-modified header missing -- time-stamps turned off.
2025-11-10 12:30:15 (198 MB/s) - 'physionet.org/files/mimic-cxr/2.1.0/index.html' saved [1079]

Loading robots.txt; please ignore errors.
--2025-11-10 12:30:15--  https://physionet.org/robots.txt
Reusing existing connection to physionet.org:443.
HTTP request sent, awai

## 2.2 `Check files`

In [None]:
import os
import hashlib
from pathlib import Path

# --- Step 1: Define and check the base location ---
# location = Path("E:/Data/MIMIC_IV")  # example path ‚Äî adjust for your setup
if 'location' not in locals():
    location = Path(input("Enter download location (full path): "))

print(f"\n‚úÖ Checking data in folder: {location}")


# --- Step 2: Define checksum file path ---
checksum_file = location / "SHA256SUMS.txt"
if not checksum_file.exists():
    raise FileNotFoundError(f"Checksum file not found: {checksum_file}")

# --- Step 3: Load checksums ---
checksums = {}
with open(checksum_file, "r", encoding="utf-8") as f:
    for line in f:
        hash_val, rel_path = line.split(" ")
        cur_path = location / rel_path.strip()
        checksums[ cur_path ] = hash_val.strip()

print(f"Found {len(checksums)} entries in {checksum_file.name}")


# --- Step 4: Verify files ---
def sha256sum(file_path):
    """Compute SHA256 hash for a file."""
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

results = []
for rel_path, expected_hash in checksums.items():
    file_path = location / rel_path
    if not file_path.exists():
        results.append((rel_path, "‚ùå Missing", None))
        continue

    actual_hash = sha256sum(file_path)
    if actual_hash.lower() == expected_hash.lower():
        results.append((rel_path, "‚úÖ OK", actual_hash))
    else:
        results.append((rel_path, "‚ö†Ô∏è Mismatch", actual_hash))

# --- Step 5: Display results ---
print("\n=== Verification Report ===")
for rel_path, status, actual in results:
    print(f"{status:10s}  {rel_path}")

# Optional: summary counts
ok = sum(1 for r in results if "OK" in r[1])
missing = sum(1 for r in results if "Missing" in r[1])
mismatch = sum(1 for r in results if "Mismatch" in r[1])
print(f"\nSummary: {ok} OK, {missing} Missing, {mismatch} Mismatch\n")


# 3 `MIMIC-CXR: multi-threading`

## 3.1 `Download CHECKSUM`

In [31]:
# Add project root to sys.path

import sys
from pathlib import Path
import os, sys, subprocess
from getpass import getpass



ROOT = Path.cwd()
for parent in [ROOT] + list(ROOT.parents):
    if (parent / "config").is_dir():
        ROOT = parent
        break
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print( '[X] Root fount' )

from xrh.settings import write_parameter, read_parameter   

[X] Root fount


In [4]:

username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")
location = input("Enter download location (full path): ")

write_parameter("MIMIC_CXR_src_data_path", location)


In [None]:

os.makedirs(location, exist_ok=True)
os.chdir(location)

url = "https://physionet.org/files/mimic-cxr/2.1.0/SHA256SUMS.txt"
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
#    "--accept-regex", "(/files/|/files/p10/|/files/p10/p1000.*)",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
)

for line in proc.stdout:
    print(line, end="")
    sys.stdout.flush()

proc.wait()
if proc.returncode == 0:
    print("\n‚úÖ Files folder download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")



--2025-11-20 21:21:27--  https://physionet.org/files/mimic-cxr/2.1.0/SHA256SUMS.txt
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 76859934 (73M) [text/plain]
Saving to: 'physionet.org/files/mimic-cxr/2.1.0/SHA256SUMS.txt'


physionet.org/files   0%[                    ]       0  --.-KB/s               
physionet.org/files   0%[                    ]  47.49K   178KB/s               
physionet.org/files   0%[                    ]  63.49K   133KB/s               
physionet.org/files   0%[                    ] 143.49K   201KB/s               
physionet.org/files   0%[                    ] 191.49K   198KB/s               
physionet.org/files   0%[                    ] 223.49K   191KB/s

## 3.2 `Generate filelist for wget`


In [5]:
# Add project root to sys.path

import sys
from pathlib import Path
import os, sys, subprocess
from getpass import getpass



ROOT = Path.cwd()
for parent in [ROOT] + list(ROOT.parents):
    if (parent / "config").is_dir():
        ROOT = parent
        break
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print( '[X] Root fount' )

from xrh.settings import write_parameter, read_parameter   

[X] Root fount


In [6]:
from pathlib import Path
from collections import defaultdict
import math
import subprocess
from getpass import getpass
from urllib.parse import urljoin

patient_size = 2000
wget_threads_num = 15
wget_root = "https://physionet.org/files/mimic-cxr/2.1.0"
tmp_dir_name = "tmp.filelist"

location = read_parameter("MIMIC_CXR_src_data_path")

location = Path(location)


In [3]:




def build_patient_url_table(location, patient_size, wget_root):
    if wget_root is None:
        raise ValueError("wget_root must be provided")

    location = Path(location)
    sha_file = location / "physionet.org" / "files" / "mimic-cxr" / "2.1.0" / "SHA256SUMS.txt"

    if not sha_file.is_file():
        raise FileNotFoundError(f"SHA256SUMS.txt not found at: {sha_file}")

    patient_to_urls = defaultdict(list)

    with sha_file.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Expected format: "<checksum> <relative_path>"
            try:
                checksum, rel_path = line.split(maxsplit=1)
            except ValueError:
                # Malformed line, skip
                continue

            # Only DICOM files
            if not rel_path.endswith(".dcm"):
                continue

            parts = rel_path.split("/")
            # Expected: ["files", "p10", "p10000032", "s50414267", "xxxxx.dcm"]
            if len(parts) < 5:
                # Unexpected format, skip
                continue

            patient_id = parts[2]  # e.g. "p10000032"

            full_url = f"{wget_root}/{rel_path}"
            patient_to_urls[patient_id].append(full_url)

    # Sort patient IDs and select first N
    all_patients = sorted(patient_to_urls.keys())
    selected_patients = all_patients[:patient_size]

    reduced_table = {pid: patient_to_urls[pid] for pid in selected_patients}
    return reduced_table


def write_wget_filelists(patient_url_table, location, wget_threads_num=50, tmp_dir_name="tmp.filelist"):
    """
    Flatten URLs from the hash table and split into K filelists for wget.

    patient_url_table : dict
        Mapping patient_id -> list of URLs.
    location : Path or str
        Base folder where tmp_dir_name will be created.
    wget_threads_num : int
        Desired number of filelists (upper bound).
    tmp_dir_name : str
        Name of temporary subfolder.
    """
    location = Path(location)
    tmp_dir = location / tmp_dir_name
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Flatten URLs
    all_urls = []
    for urls in patient_url_table.values():
        all_urls.extend(urls)

    if not all_urls:
        raise ValueError("No URLs found in patient_url_table.")

    # Number of chunks (cannot exceed number of URLs)
    k = min(wget_threads_num, len(all_urls))
    chunk_size = math.ceil(len(all_urls) / k)

    filelist_paths = []
    for i in range(k):
        start = i * chunk_size
        end = start + chunk_size
        chunk = all_urls[start:end]
        if not chunk:
            continue

        fname = tmp_dir / f"filelist_{i:03d}.txt"
        with fname.open("w", encoding="utf-8") as f:
            for url in chunk:
                f.write(url + "\n")

        filelist_paths.append(fname)

    return filelist_paths


# Cell 3: build patient‚ÜíURLs mapping and create filelists

patient_url_table = build_patient_url_table(
    location=location,
    patient_size=patient_size,
    wget_root=wget_root,
)

print(f"Number of patients in table: {len(patient_url_table)}")

filelists = write_wget_filelists(
    patient_url_table=patient_url_table,
    location=location,
    wget_threads_num=wget_threads_num,
    tmp_dir_name=tmp_dir_name,
)

print("Created wget filelists:")
for p in filelists:
    print("  ", p)


Number of patients in table: 500
Created wget filelists:
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_000.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_001.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_002.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_003.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_004.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_005.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_006.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_007.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_008.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_009.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_010.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_011.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_012.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_013.txt
   D:\003.Data\MIMIC-CXR.v2.1\tmp.filelist\filelist_014.txt


In [None]:
checkSUM_path = Path(location) / "physionet.org" / "files" / "mimic-cxr" / "2.1.0" / "SHA256SUMS.txt"

root_file_paths = []
patient_dict = {}
pid_list = []

# Note: need to update - the last patient does not downloaded properly

with checkSUM_path.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        (checkSum, rel_path) = line.split(" ")
        path_type = rel_path.count('/')

        if path_type == 4:
            (file, GID, PID, SID, img_name) = rel_path.split("/")

            if len(patient_dict) >= patient_size:
                break
            if PID not in patient_dict:
                if len(patient_dict) >= patient_size:
                    break
                patient_dict[PID] = []
            full_url = wget_root + "/" + rel_path
            patient_dict[PID].append(full_url)

        elif path_type == 3:
            (file, GID, PID, txt_name) = rel_path.split("/")

            if PID not in patient_dict:
                if len(patient_dict) >= patient_size:
                    break
                patient_dict[PID] = []
            full_url = wget_root + "/" + rel_path
            patient_dict[PID].append(full_url)
        else:
            full_url = wget_root + "/" + rel_path
            root_file_paths.append(full_url)
        
print(f"Number of patients in table: {len(patient_dict)}")

patien_per_file = math.ceil(len(patient_dict) / wget_threads_num)
filelist_paths = location / tmp_dir_name
filelist_paths.mkdir(parents=True, exist_ok=True)

for i in range(wget_threads_num):
    start = i * patien_per_file
    end = start + patien_per_file
    chunk_PIDs = list(patient_dict.keys())[start:end]
    if not chunk_PIDs:
        continue

    fname = filelist_paths / f"filelist_{i:03d}.txt"
    with fname.open("w", encoding="utf-8") as f:
        for PID in chunk_PIDs:
            for url in patient_dict[PID]:
                f.write(url + "\n")

    #print("  ", fname)

print("[X] Created wget filelists")



Number of patients in table: 500
[X] Created wget filelists


## 3.3 `Download files`

In [None]:
import subprocess
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")

# Get all filelist_*.txt files
filelist_list = sorted(filelist_paths.glob("filelist_*.txt"))

# === Function to download one filelist ===
def download_filelist(filelist_path: Path):
    print(f"üöÄ Process started: {filelist_path.name}")
    
    cmd = [
    "wget",
    "-r",               # recursive
    "-N",               # only download if newer
    "-c",               # continue downloads
    "-np",              # no parent directories
    "--user", username,
    "--password", password,
    "-i", str(filelist_path),
    "-P", str(location)  # where to save downloads
]
    proc = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return filelist_path.name, proc


# === Run all in parallel ===
with ThreadPoolExecutor(max_workers=wget_threads_num) as executor:
    futures = [executor.submit(download_filelist, fp) for fp in filelist_list]
    
    for future in as_completed(futures):
        filelist_name, code = future.result()
        status = "‚úÖ OK" if code == 0 else f"‚ùå Failed)"
        print(f"{filelist_name}: {status}")





üöÄ Process started: filelist_000.txtüöÄ Process started: filelist_001.txt

üöÄ Process started: filelist_002.txt
üöÄ Process started: filelist_003.txt
üöÄ Process started: filelist_004.txt
üöÄ Process started: filelist_005.txt
üöÄ Process started: filelist_006.txt
üöÄ Process started: filelist_007.txt
üöÄ Process started: filelist_008.txt
üöÄ Process started: filelist_009.txt
üöÄ Process started: filelist_010.txt
üöÄ Process started: filelist_011.txt
üöÄ Process started: filelist_012.txt
üöÄ Process started: filelist_013.txt
üöÄ Process started: filelist_014.txt
filelist_014.txt: ‚ùå Failed)
filelist_002.txt: ‚ùå Failed)
filelist_000.txt: ‚ùå Failed)
filelist_008.txt: ‚ùå Failed)
filelist_013.txt: ‚ùå Failed)
filelist_006.txt: ‚ùå Failed)
filelist_004.txt: ‚ùå Failed)
filelist_011.txt: ‚ùå Failed)
filelist_010.txt: ‚ùå Failed)
filelist_003.txt: ‚ùå Failed)
filelist_007.txt: ‚ùå Failed)
filelist_005.txt: ‚ùå Failed)
filelist_009.txt: ‚ùå Failed)
filelist_001.txt: ‚ùå Fai

: 

## 3.4 Data Checksum Validation

In [56]:
import hashlib

data_path = Path(location) / "physionet.org" / "files" / "mimic-cxr" / "2.1.0" / "files"
checkSUM_path = Path(location) / "physionet.org" / "files" / "mimic-cxr" / "2.1.0" / "SHA256SUMS.txt"

root_file_paths = []
patient_dict = {}
pid_list = []
checkSum_dict = {}

patient_ids = []

data_subfolders = [p.name for p in data_path.iterdir() if p.is_dir()]

for gid in data_subfolders:
    gfolder = data_path / gid

pid_list = [p.name for p in gfolder.iterdir() if p.is_dir()]


with checkSUM_path.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        (checkSum, rel_path) = line.split(" ")
        path_type = rel_path.count('/')

        if path_type == 4:
            (file, GID, PID, SID, img_name) = rel_path.split("/")
            if PID in pid_list:
                file_path = data_path / GID / PID / SID / img_name
                checkSum_dict[file_path] = checkSum
        
        elif path_type == 3:
            (file, GID, PID, txt_name) = rel_path.split("/")
            if PID in pid_list:
                file_path = data_path / GID / PID / txt_name
                checkSum_dict[file_path] = checkSum
            

def sha256(path: Path, chunk_size=1024 * 1024):
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

missing = []
mismatch = []
ok = []

for file_path, expected_checksum in checkSum_dict.items():
    path = Path(file_path)

    if not path.exists():
        missing.append(file_path)
        continue

    actual_checksum = sha256(path)

    if actual_checksum != expected_checksum:
        mismatch.append(file_path)
    else:
        ok.append(file_path)

print(f"Missing files: {len(missing)}")
print(f"Mismatched files: {len(mismatch)}")
print(f"OK files: {len(ok)}")



Missing files: 0
Mismatched files: 0
OK files: 18876


In [55]:
# Create file list for re-downloading corrupted or missing files

filelist_paths = location / tmp_dir_name

for p in filelist_paths.iterdir():
    path = Path(p)
    if path.is_file():
        path.unlink()

missing_filelist_path = filelist_paths / "missing_files.txt"
with open(missing_filelist_path, "w") as f:
    for missing_file_path in missing:
        fid = missing_file_path.name          # last
        sid = missing_file_path.parts[-2]     # second from end
        pid = missing_file_path.parts[-3]     # third from end
        gid = missing_file_path.parts[-4]     # fourth from end
        if ".dcm" in fid:
            rel_path = 'files/' + gid + '/' + pid + '/' + sid + '/' + fid
        else:
            rel_path = gid + '/' + pid + '/' + sid + '/' + fid

        full_url = wget_root + "/" + rel_path
        f.write(full_url + "\n")

    for mismatch_file_path in mismatch:
        fid = mismatch_file_path.name          # last
        sid = mismatch_file_path.parts[-2]     # second from end
        pid = mismatch_file_path.parts[-3]     # third from end
        gid = mismatch_file_path.parts[-4]     # fourth from end

        if ".dcm" in fid:
            rel_path = 'files/' + gid + '/' + pid + '/' + sid + '/' + fid
        else:
            rel_path = gid + '/' + pid + '/' + sid + '/' + fid

        full_url = wget_root + "/" + rel_path
        f.write(full_url + "\n")




In [57]:
# download missing/corrupted files
username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")
location = Path(read_parameter("MIMIC_CXR_src_data_path"))

location.mkdir(parents=True, exist_ok=True)

# Build wget command
wget_cmd = [
    "wget",
    "-r",
    "-N",
    "-c",
    "-np",
    "--user", username,
    "--password", password,
    "-P", str(location),
    "-i", str(missing_filelist_path)
]

# Run the command
try:
    subprocess.run(wget_cmd, check=True)
except subprocess.CalledProcessError as e:
    print(f"\n‚ùå Download failed with exit code {e.returncode}")
