# 1 `MIMIC-IV`

## 1.1 `Download Data`

In [None]:
import os
import subprocess
import sys
from getpass import getpass
from pathlib import Path

# Ask for the PhysioNet credentials; getpass keeps the password off the screen.
username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")

raw_location = input("Enter download location (full path): ").strip()
if not raw_location:
    raise ValueError("Download location must not be empty.")

# Store the folder as an absolute Path *before* changing directory:
#  - a relative path would point somewhere else once os.chdir() has run;
#  - the later cells build paths with `location / "..."`, which needs a Path
#    rather than a plain string.
location = Path(os.path.abspath(os.path.expanduser(raw_location)))

# wget writes relative to the current working directory, so create the
# folder and switch into it.
os.makedirs(location, exist_ok=True)
os.chdir(location)

url = "https://physionet.org/files/mimiciv/3.1/"

# wget options: recursive (-r), only fetch newer files (-N), resume partial
# files (-c), never ascend above the start URL (-np). The progress options
# force a progress bar even though the output is a pipe (e.g. in Jupyter).
# NOTE(review): the password is handed to wget as a command-line argument,
# which other local users may be able to read from the process list;
# consider a ~/.netrc or wgetrc file on shared machines.
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible. stderr is merged into
# stdout because wget reports its progress there. The context manager closes
# the pipe and waits for the process to exit, so returncode is set afterwards.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,  # line-buffered
) as proc:
    for line in proc.stdout:
        print(line, end="", flush=True)

if proc.returncode == 0:
    print("\n‚úÖ Download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")


## 1.2 `Check files`

In [None]:
import os
import hashlib
from pathlib import Path

# --- Step 1: Define and check the base location ---
# location = Path("E:/Data/MIMIC_IV")  # example path — adjust for your setup
if 'location' not in locals():
    location = Path(input("Enter download location (full path): "))
# `location` may have been left behind by the download cell as a plain
# string; coerce it so the "/" path joins below work.
location = Path(location)

print(f"\n‚úÖ Checking data in folder: {location}")


# --- Step 2: Define checksum file path ---
# NOTE(review): `wget -r` in the download cell stores files under
# <location>/physionet.org/files/<dataset>/<version>/, so `location` may need
# to point at that sub-folder for the checksum file to be found — confirm.
checksum_file = location / "SHA256SUMS.txt"
if not checksum_file.exists():
    raise FileNotFoundError(f"Checksum file not found: {checksum_file}")

# --- Step 3: Load checksums ---
# Maps each listed path, kept RELATIVE to `location`, to its expected digest.
# The verification step joins the key with `location` itself, so storing an
# already-joined path here would double the prefix for a relative `location`.
checksums = {}
with open(checksum_file, "r", encoding="utf-8") as f:
    for line in f:
        # First token is the digest, the remainder is the path. Splitting on
        # any whitespace run accepts both "hash path" and "hash  path" and
        # keeps paths that contain spaces intact.
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue  # blank or malformed line
        hash_val, rel_path = parts
        checksums[Path(rel_path.strip())] = hash_val

print(f"Found {len(checksums)} entries in {checksum_file.name}")


# --- Step 4: Verify files ---
def sha256sum(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is opened in binary mode and read in pieces of *chunk_size*
    bytes, so memory use stays bounded regardless of the file size.

    Args:
        file_path: Path (str or path-like) of the file to hash.
        chunk_size: Number of bytes per read; must be positive.
            Defaults to 1 MiB.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently hash nothing.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# One (path, status label, computed digest) row per checksum entry; the
# digest is None when the file is not on disk.
results = []
for rel_path, expected_hash in checksums.items():
    file_path = location / rel_path
    if file_path.exists():
        actual_hash = sha256sum(file_path)
        # Hex digests are compared case-insensitively.
        if actual_hash.lower() != expected_hash.lower():
            status = "‚ö†Ô∏è Mismatch"
        else:
            status = "‚úÖ OK"
        results.append((rel_path, status, actual_hash))
    else:
        results.append((rel_path, "‚ùå Missing", None))

# --- Step 5: Display results ---
print("\n=== Verification Report ===")
for rel_path, status, actual in results:
    print(f"{status:10s}  {rel_path}")

# Optional: summary counts, tallied by the keyword inside each status label
labels = [row[1] for row in results]
ok = sum("OK" in label for label in labels)
missing = sum("Missing" in label for label in labels)
mismatch = sum("Mismatch" in label for label in labels)
print(f"\nSummary: {ok} OK, {missing} Missing, {mismatch} Mismatch\n")


## 1.3 `Unzip Files`

In [None]:
import os
import gzip
import shutil
from pathlib import Path

# --- 0) Base folder ---
if 'location' not in locals():
    location = Path(input("Enter download location (full path): ").strip()).expanduser().resolve()
# `location` may have been left behind by the download cell as a plain
# string; coerce it so the "/" path joins below work.
location = Path(location)

print(f"\nüìÅ Using folder: {location}")

checksum_file = location / "SHA256SUMS.txt"
if not checksum_file.exists():
    raise FileNotFoundError(f"Checksum file not found: {checksum_file}")

# --- 1) Read relative paths from SHA256SUMS.txt ---
# The checksum file doubles as the list of files that belong to the dataset.
rel_paths = []
with open(checksum_file, "r", encoding="utf-8") as f:
    for line in f:
        # Robust split: first token is hash, the rest is the path (may contain
        # spaces). Blank or malformed lines yield fewer than two parts.
        parts = line.split(maxsplit=1)
        if len(parts) == 2:
            rel_paths.append(parts[1].strip())

# --- 2) Filter only .gz files ---
gz_paths = [rp for rp in rel_paths if rp.endswith(".gz")]
print(f"Found {len(gz_paths)} .gz files to process")

# --- 3) Decompress settings ---
OVERWRITE = False  # set to True to overwrite existing outputs
CHUNK_SIZE = 1024 * 1024  # 1 MB chunks

# --- 4) Decompress loop ---
# Counters for the final summary, plus one (status, message) row per file.
done = 0
skipped = 0
missing = 0
failed = 0
log = []

for rp in gz_paths:
    src = (location / rp).resolve()
    # Destination: remove the final ".gz" only
    dst = src.with_name(src.name[:-3])  # e.g., "file.csv.gz" -> "file.csv"
    # Report label taken from the listed relative path. Deriving it with
    # dst.relative_to(location) raises ValueError whenever `location` itself
    # is not a resolved absolute path.
    dst_label = rp[:-3]

    if not src.exists():
        missing += 1
        log.append(("‚ùå Missing", rp))
        continue

    # Ensure destination folder exists
    dst.parent.mkdir(parents=True, exist_ok=True)

    if dst.exists() and not OVERWRITE:
        skipped += 1
        log.append(("‚è© Skipped (exists)", dst_label))
        continue

    # Decompress into a temporary sibling and move it into place only on
    # success. A failed or interrupted run then never leaves a truncated
    # `dst` that the next run would skip as "exists".
    tmp = dst.with_name(dst.name + ".part")
    try:
        with gzip.open(src, "rb") as f_in, open(tmp, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out, length=CHUNK_SIZE)
        os.replace(tmp, dst)
    except Exception as e:
        # Best-effort batch: record the failure and carry on with the rest.
        failed += 1
        log.append(("‚ö†Ô∏è Failed", f"{rp}  ({e})"))
        try:
            tmp.unlink()
        except OSError:
            pass  # nothing to clean up, or cleanup itself is not possible
    else:
        done += 1
        log.append(("‚úÖ Unzipped", f"{rp}  ->  {dst_label}"))

# --- 5) Report ---
# One line per processed file (status padded to 16 columns), then the totals.
print("\n=== Decompression Report ===")
for status, msg in log:
    print("{:16s} {}".format(status, msg))

summary_line = f"\nSummary: {done} unzipped, {skipped} skipped, {missing} missing, {failed} failed.\n"
layout_hint = "Outputs are written next to sources (e.g., *.csv beside *.csv.gz)."
print(summary_line + layout_hint)


# 2 `MIMIC-CXR`

## 2.1 `Download Data`

In [None]:
import os
import subprocess
import sys
from getpass import getpass
from pathlib import Path

# Ask for the PhysioNet credentials; getpass keeps the password off the screen.
username = input("Enter PhysioNet username: ")
password = getpass("Enter PhysioNet password: ")

raw_location = input("Enter download location (full path): ").strip()
if not raw_location:
    raise ValueError("Download location must not be empty.")

# Store the folder as an absolute Path *before* changing directory:
#  - a relative path would point somewhere else once os.chdir() has run;
#  - the later cells build paths with `location / "..."`, which needs a Path
#    rather than a plain string.
location = Path(os.path.abspath(os.path.expanduser(raw_location)))

# wget writes relative to the current working directory, so create the
# folder and switch into it.
os.makedirs(location, exist_ok=True)
os.chdir(location)

# Disabled alternative, kept for reference: the bare string literal below is
# never executed. It holds a variant of the download that fetches the dataset
# root while rejecting URLs matching ".*/files/p[0-9]+" (the pXX sub-folders).
"""
#url = "https://physionet.org/files/mimic-cxr/2.1.0/files/p10/"
url = "https://physionet.org/files/mimic-cxr/2.1.0/"

# Download data parent folder excluding pXX subfolders
# Force progress bar output even in Jupyter
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
    "--reject-regex", ".*/files/p[0-9]+",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
)

for line in proc.stdout:
    print(line, end="")
    sys.stdout.flush()

proc.wait()
if proc.returncode == 0:
    print("\n‚úÖ Root Folder download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")

"""

url = "https://physionet.org/files/mimic-cxr/2.1.0/"

# wget options: recursive (-r), only fetch newer files (-N), resume partial
# files (-c), never ascend above the start URL (-np). The progress options
# force a progress bar even though the output is a pipe (e.g. in Jupyter).
# NOTE(review): the password is handed to wget as a command-line argument,
# which other local users may be able to read from the process list;
# consider a ~/.netrc or wgetrc file on shared machines.
cmd = [
    "wget",
    "-r", "-N", "-c", "-np",
    "--show-progress", "--progress=bar:force:noscroll",
    # Optional filter, currently disabled: uncomment to limit the crawl to
    # URLs matching the pattern.
    # "--accept-regex", "(/files/|/files/p10/|/files/p10/p1000.*)",
    "--user", username,
    "--password", password,
    url,
]

# Stream wget output live so progress is visible. stderr is merged into
# stdout because wget reports its progress there. The context manager closes
# the pipe and waits for the process to exit, so returncode is set afterwards.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,  # line-buffered
) as proc:
    for line in proc.stdout:
        print(line, end="", flush=True)

if proc.returncode == 0:
    print("\n‚úÖ Files folder download complete.")
else:
    print(f"\n‚ùå wget exited with code {proc.returncode}")



--2025-11-10 12:30:14--  https://physionet.org/files/mimic-cxr/2.1.0/
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: 'physionet.org/files/mimic-cxr/2.1.0/index.html'


physionet.org/files     [<=>                 ]       0  --.-KB/s               
physionet.org/files     [ <=>                ]   1.05K  --.-KB/s    in 0s      

Last-modified header missing -- time-stamps turned off.
2025-11-10 12:30:15 (198 MB/s) - 'physionet.org/files/mimic-cxr/2.1.0/index.html' saved [1079]

Loading robots.txt; please ignore errors.
--2025-11-10 12:30:15--  https://physionet.org/robots.txt
Reusing existing connection to physionet.org:443.
HTTP request sent, awai

## 2.2 `Check files`

In [None]:
import os
import hashlib
from pathlib import Path

# --- Step 1: Define and check the base location ---
# location = Path("E:/Data/MIMIC_IV")  # example path — adjust for your setup
if 'location' not in locals():
    location = Path(input("Enter download location (full path): "))
# `location` may have been left behind by the download cell as a plain
# string; coerce it so the "/" path joins below work.
location = Path(location)

print(f"\n‚úÖ Checking data in folder: {location}")


# --- Step 2: Define checksum file path ---
# NOTE(review): `wget -r` in the download cell stores files under
# <location>/physionet.org/files/<dataset>/<version>/, so `location` may need
# to point at that sub-folder for the checksum file to be found — confirm.
checksum_file = location / "SHA256SUMS.txt"
if not checksum_file.exists():
    raise FileNotFoundError(f"Checksum file not found: {checksum_file}")

# --- Step 3: Load checksums ---
# Maps each listed path, kept RELATIVE to `location`, to its expected digest.
# The verification step joins the key with `location` itself, so storing an
# already-joined path here would double the prefix for a relative `location`.
checksums = {}
with open(checksum_file, "r", encoding="utf-8") as f:
    for line in f:
        # First token is the digest, the remainder is the path. Splitting on
        # any whitespace run accepts both "hash path" and "hash  path" and
        # keeps paths that contain spaces intact.
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue  # blank or malformed line
        hash_val, rel_path = parts
        checksums[Path(rel_path.strip())] = hash_val

print(f"Found {len(checksums)} entries in {checksum_file.name}")


# --- Step 4: Verify files ---
def sha256sum(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is opened in binary mode and read in pieces of *chunk_size*
    bytes, so memory use stays bounded regardless of the file size.

    Args:
        file_path: Path (str or path-like) of the file to hash.
        chunk_size: Number of bytes per read; must be positive.
            Defaults to 1 MiB.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently hash nothing.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# One (path, status label, computed digest) row per checksum entry; the
# digest is None when the file is not on disk.
results = []
for rel_path, expected_hash in checksums.items():
    file_path = location / rel_path
    if file_path.exists():
        actual_hash = sha256sum(file_path)
        # Hex digests are compared case-insensitively.
        if actual_hash.lower() != expected_hash.lower():
            status = "‚ö†Ô∏è Mismatch"
        else:
            status = "‚úÖ OK"
        results.append((rel_path, status, actual_hash))
    else:
        results.append((rel_path, "‚ùå Missing", None))

# --- Step 5: Display results ---
print("\n=== Verification Report ===")
for rel_path, status, actual in results:
    print(f"{status:10s}  {rel_path}")

# Optional: summary counts, tallied by the keyword inside each status label
labels = [row[1] for row in results]
ok = sum("OK" in label for label in labels)
missing = sum("Missing" in label for label in labels)
mismatch = sum("Mismatch" in label for label in labels)
print(f"\nSummary: {ok} OK, {missing} Missing, {mismatch} Mismatch\n")
