In [None]:
# -*- coding: utf-8 -*-
"""finetune-sam-xray-chest-download-and-preprocess.ipynb
Colab-ready script to fetch & preprocess X-Ray Chest (Montgomery + Shenzhen).
"""

# =========================
# 0) DOWNLOAD THE DATA
# =========================

# Make base dirs
!mkdir -p /content/datasets/Xray_Chest/montgomery/CXR_png
!mkdir -p /content/datasets/Xray_Chest/montgomery/ManualMask/leftMask
!mkdir -p /content/datasets/Xray_Chest/montgomery/ManualMask/rightMask
!mkdir -p /content/datasets/Xray_Chest/shenzhen/CXR_png
!mkdir -p /content/datasets/Xray_Chest/shenzhen/masks  # will hold Shenzhen lung masks (Kaggle)

# --- Montgomery (images) ---
!wget -q -r -np -nH --cut-dirs=5 -R "index.html*" -P /content/datasets/Xray_Chest/montgomery/CXR_png \
 https://data.lhncbc.nlm.nih.gov/public/Tuberculosis-Chest-X-ray-Datasets/Montgomery-County-CXR-Set/MontgomerySet/CXR_png/

# --- Montgomery (masks: left & right lung) ---
!wget -q -r -np -nH --cut-dirs=6 -R "index.html*" -P /content/datasets/Xray_Chest/montgomery/ManualMask/leftMask \
 https://data.lhncbc.nlm.nih.gov/public/Tuberculosis-Chest-X-ray-Datasets/Montgomery-County-CXR-Set/MontgomerySet/ManualMask/leftMask/

!wget -q -r -np -nH --cut-dirs=6 -R "index.html*" -P /content/datasets/Xray_Chest/montgomery/ManualMask/rightMask \
 https://data.lhncbc.nlm.nih.gov/public/Tuberculosis-Chest-X-ray-Datasets/Montgomery-County-CXR-Set/MontgomerySet/ManualMask/rightMask/

# --- Shenzhen (images) ---
!wget -q -r -np -nH --cut-dirs=5 -R "index.html*" -P /content/datasets/Xray_Chest/shenzhen/CXR_png \
 https://data.lhncbc.nlm.nih.gov/public/Tuberculosis-Chest-X-ray-Datasets/Shenzhen-Hospital-CXR-Set/CXR_png/

# --- Shenzhen LUNG masks via Kaggle (optional but recommended) ---
# If you have a Kaggle API token, place kaggle.json in /root/.kaggle or upload it.
# This will fetch 'yoctoman/shcxr-lung-mask' (binary lung masks for Shenzhen).
!pip -q install kaggle >/dev/null

import os, json, shutil, glob, zipfile

# If you're running this first time in Colab, uncomment to upload kaggle.json interactively:
# from google.colab import files
# files.upload()  # then move kaggle.json into ~/.kaggle

os.makedirs('//home/allank24/Programming/HQCNN/Segmentation/finetune-SAM/download_datasets/.kaggle', exist_ok=True)
if os.path.exists('../kaggle.json'):
    shutil.move('../kaggle.json', '../.kaggle/kaggle.json')

kaggle_ok = os.path.exists('/root/.kaggle/kaggle.json')
if kaggle_ok:
    # secure permissions
    os.chmod('.kaggle/kaggle.json', 0o600)
    # download and unzip
    !kaggle datasets download -d yoctoman/shcxr-lung-mask -p /content/datasets/Xray_Chest/shenzhen/masks -q
    mask_zip = '/content/datasets/Xray_Chest/shenzhen/masks/shcxr-lung-mask.zip'
    if os.path.exists(mask_zip):
        with zipfile.ZipFile(mask_zip, 'r') as zf:
            zf.extractall('/content/datasets/Xray_Chest/shenzhen/masks')
        os.remove(mask_zip)
else:
    print("Kaggle API credentials not found. "
          "Place Shenzhen lung masks in /content/datasets/Xray_Chest/shenzhen/masks/"
          " (e.g., from 'yoctoman/shcxr-lung-mask').")

# =========================
# 1) PREPROCESS (to PNG + CSV)
# =========================

import os
import glob
import csv
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# --- Configuration ---
BASE_INPUT_DIR = '/content/datasets/Xray_Chest'

# Montgomery
MONT_IMG_DIR = os.path.join(BASE_INPUT_DIR, 'montgomery', 'CXR_png')
MONT_LEFT_MASK_DIR = os.path.join(BASE_INPUT_DIR, 'montgomery', 'ManualMask', 'leftMask')
MONT_RIGHT_MASK_DIR = os.path.join(BASE_INPUT_DIR, 'montgomery', 'ManualMask', 'rightMask')

# Shenzhen
SHEN_IMG_DIR = os.path.join(BASE_INPUT_DIR, 'shenzhen', 'CXR_png')
SHEN_MASK_DIR = os.path.join(BASE_INPUT_DIR, 'shenzhen', 'masks')  # expects lung masks here

# Output
BASE_OUTPUT_DIR = '/content/preprocessed_datasets/xraychest'
IMAGE_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, 'images')
MASK_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, 'masks')

# Split ratios (match your hip script logic: 70/10/20 overall)
TEST_SIZE = 0.20        # 20% test
VAL_SIZE_OF_TRAIN = 0.125  # 10% overall (0.1 / 0.8)

# Optional: downscale very large CXRs to a max side (keep None to keep originals)
MAX_SIDE = None  # e.g., set to 2048 for memory-friendly training

# ---- Helpers ----
def to_uint8_minmax(img_np):
    """Min-max normalize to [0,255] uint8 (like your hip script, per-image)."""
    img = img_np.astype(np.float32)
    mn, mx = np.min(img), np.max(img)
    if mx > mn:
        img = (img - mn) / (mx - mn) * 255.0
    else:
        img = np.zeros_like(img)
    return img.clip(0, 255).astype(np.uint8)

def pil_open_gray(path):
    """Open an image as numpy grayscale (handles 8/16-bit)."""
    im = Image.open(path)
    if im.mode not in ('L', 'I;16', 'I'):
        im = im.convert('L')
    arr = np.array(im)
    # If 16-bit, scale down to 8-bit for consistency
    if arr.dtype == np.uint16:
        # normalize min-max to 8-bit
        arr = to_uint8_minmax(arr)
    elif arr.dtype != np.uint8:
        arr = to_uint8_minmax(arr)
    return arr

def resize_if_needed(img_np, target_max_side):
    if target_max_side is None:
        return img_np
    h, w = img_np.shape[:2]
    mx = max(h, w)
    if mx <= target_max_side:
        return img_np
    scale = target_max_side / float(mx)
    new_w = int(round(w * scale))
    new_h = int(round(h * scale))
    pil = Image.fromarray(img_np)
    # images: bilinear; masks: nearest (handled separately)
    return np.array(pil.resize((new_w, new_h), resample=Image.BILINEAR))

def ensure_size(mask_np, W, H):
    if mask_np.shape[0] == H and mask_np.shape[1] == W:
        return mask_np
    pil = Image.fromarray(mask_np)
    return np.array(pil.resize((W, H), resample=Image.NEAREST))

def binarize(mask_np):
    m = (mask_np > 0).astype(np.uint8) * 255
    return m

def find_shenzhen_mask(mask_root, base_noext):
    """
    Try several common filename patterns used by Shenzhen lung mask packages.
    Returns a path or None.
    """
    candidates = [
        os.path.join(mask_root, f'{base_noext}_mask.png'),
        os.path.join(mask_root, f'{base_noext}.png'),
        os.path.join(mask_root, f'{base_noext}_1.png'),
        os.path.join(mask_root, f'{base_noext}_mask.jpg'),
        os.path.join(mask_root, f'{base_noext}.jpg'),
    ]
    for c in candidates:
        if os.path.exists(c):
            return c
    # fallback: any file that starts with base and is an image
    globs = glob.glob(os.path.join(mask_root, base_noext + '*'))
    globs = [g for g in globs if g.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff'))]
    return globs[0] if globs else None

def save_pair(img_np, mask_np, out_stem):
    """Write image/mask PNGs under IMAGE_OUTPUT_DIR/MASK_OUTPUT_DIR."""
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
    os.makedirs(MASK_OUTPUT_DIR, exist_ok=True)
    img_p = os.path.join(IMAGE_OUTPUT_DIR, out_stem + '.png')
    msk_p = os.path.join(MASK_OUTPUT_DIR, out_stem + '.png')
    Image.fromarray(img_np).save(img_p)
    Image.fromarray(mask_np).save(msk_p)
    return img_p, msk_p

def preprocess_xray_chest():
    """
    Build 2D PNG pairs + CSVs for Montgomery + Shenzhen lung segmentation.
    """
    print('--- Starting Dataset Preprocessing for X-Ray Chest (Montgomery + Shenzhen) ---')
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
    os.makedirs(MASK_OUTPUT_DIR, exist_ok=True)

    # Gather file lists
    mont_images = sorted(glob.glob(os.path.join(MONT_IMG_DIR, '*.png')))
    shen_images = sorted(glob.glob(os.path.join(SHEN_IMG_DIR, '*.png')))

    if not mont_images:
        print(f'Warning: No Montgomery images found in {MONT_IMG_DIR}')
    if not shen_images:
        print(f'Warning: No Shenzhen images found in {SHEN_IMG_DIR}')

    # Data split bookkeeping
    entries = {'train': [], 'val': [], 'test': []}
    all_ids = []

    # ---------- Process Montgomery ----------
    print('\nProcessing Montgomery...')
    for img_path in tqdm(mont_images, desc='Montgomery'):
        base = os.path.basename(img_path)           # e.g., MCUCXR_0001_0.png
        stem = os.path.splitext(base)[0]

        left_mask_path  = os.path.join(MONT_LEFT_MASK_DIR,  base)
        right_mask_path = os.path.join(MONT_RIGHT_MASK_DIR, base)
        if not os.path.exists(left_mask_path) or not os.path.exists(right_mask_path):
            # some images might be missing one side; skip if masks incomplete
            # (you can relax this if you prefer)
            continue

        # Load image + masks
        img_np = pil_open_gray(img_path)
        ml = pil_open_gray(left_mask_path)
        mr = pil_open_gray(right_mask_path)

        # Combine left+right into a single binary lung mask
        mask_np = binarize(ml) | binarize(mr)

        # Match sizes
        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)

        # Optional resize
        img_np_resized = resize_if_needed(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            # recompute dims if resized
            H2, W2 = img_np_resized.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))
        else:
            H2, W2 = H, W

        # Normalize image to 0-255 uint8
        img_np_uint8 = to_uint8_minmax(img_np_resized)
        mask_np_bin  = binarize(mask_np)

        # Skip empty masks (shouldn't happen for Montgomery)
        if mask_np_bin.sum() == 0:
            continue

        out_stem = f"mc_{stem}"
        img_out, msk_out = save_pair(img_np_uint8, mask_np_bin, out_stem)
        # record relative paths
        rel_img = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'images', os.path.basename(img_out)).replace(os.sep, '/')
        rel_msk = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'masks', os.path.basename(msk_out)).replace(os.sep, '/')
        all_ids.append((out_stem, rel_img, rel_msk))

    # ---------- Process Shenzhen ----------
    print('\nProcessing Shenzhen...')
    have_sh_masks = os.path.isdir(SHEN_MASK_DIR) and len(glob.glob(os.path.join(SHEN_MASK_DIR, '*'))) > 0
    if not have_sh_masks:
        print(f"Warning: No Shenzhen lung masks found in {SHEN_MASK_DIR}. "
              f"To include Shenzhen, download masks (e.g., Kaggle 'yoctoman/shcxr-lung-mask') to that folder.")

    for img_path in tqdm(shen_images, desc='Shenzhen'):
        base = os.path.basename(img_path)      # e.g., CHNCXR_0001_0.png
        stem = os.path.splitext(base)[0]

        if not have_sh_masks:
            # skip Shenzhen if masks missing
            continue

        mask_path = find_shenzhen_mask(SHEN_MASK_DIR, stem)
        if mask_path is None:
            # no mask match => skip
            continue

        img_np  = pil_open_gray(img_path)
        mask_np = pil_open_gray(mask_path)

        # Match size
        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)

        # Optional resize
        img_np_resized = resize_if_needed(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_resized.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))
        else:
            H2, W2 = H, W

        img_np_uint8 = to_uint8_minmax(img_np_resized)
        mask_np_bin  = binarize(mask_np)

        if mask_np_bin.sum() == 0:
            continue

        out_stem = f"sh_{stem}"
        img_out, msk_out = save_pair(img_np_uint8, mask_np_bin, out_stem)
        rel_img = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'images', os.path.basename(img_out)).replace(os.sep, '/')
        rel_msk = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'masks', os.path.basename(msk_out)).replace(os.sep, '/')
        all_ids.append((out_stem, rel_img, rel_msk))

    # ---------- Splits ----------
    print('\nCreating splits...')
    all_stems = [sid for (sid, _, _) in all_ids]
    # Deterministic split
    train_val_ids, test_ids = train_test_split(all_stems, test_size=TEST_SIZE, random_state=42, shuffle=True)
    val_size = VAL_SIZE_OF_TRAIN
    train_ids, val_ids = train_test_split(train_val_ids, test_size=val_size, random_state=42, shuffle=True)

    # Maps for quick membership checks
    train_set = set(train_ids)
    val_set   = set(val_ids)
    test_set  = set(test_ids)

    # Collect rows
    for sid, rel_img, rel_msk in all_ids:
        if sid in train_set:
            entries['train'].append(f"{rel_img},{rel_msk}")
        elif sid in val_set:
            entries['val'].append(f"{rel_img},{rel_msk}")
        else:
            entries['test'].append(f"{rel_img},{rel_msk}")

    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
    for split_name in ['train', 'val', 'test']:
        csv_path = os.path.join(BASE_OUTPUT_DIR, f"{split_name}.csv")
        with open(csv_path, 'w') as f:
            f.write('\n'.join(entries[split_name]))
        print(f"Saved {csv_path} with {len(entries[split_name])} entries.")

    print('\n--- Preprocessing Complete! ---')

# Run
preprocess_xray_chest()

# =========================
# 2) ZIP THE PREPROCESSED FOLDER
# =========================
def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                fp = os.path.join(root, file)
                arc = os.path.relpath(fp, folder_path)
                zf.write(fp, arc)

folder_to_zip = "/content/preprocessed_datasets/xraychest"
output_zip = "xray_chest_dataset.zip"
if os.path.isdir(folder_to_zip):
    zip_folder(folder_to_zip, output_zip)
    print(f"Zipped {folder_to_zip} → {output_zip}")
else:
    print(f"Nothing to zip. Missing folder: {folder_to_zip}")

Dataset URL: https://www.kaggle.com/datasets/yoctoman/shcxr-lung-mask
License(s): CC-BY-NC-SA-4.0
--- Starting Dataset Preprocessing for X-Ray Chest (Montgomery + Shenzhen) ---

Processing Montgomery...


Montgomery: 0it [00:00, ?it/s]



Processing Shenzhen...


Shenzhen: 0it [00:00, ?it/s]


Creating splits...





ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

## V2

In [None]:
# -*- coding: utf-8 -*-
"""finetune-sam-xray-chest-704-kaggle.ipynb
Colab-friendly: downloads 704 lung-mask pairs (Shenzhen+Montgomery) and prepares CSV splits.
"""

# =========================
# 0) DOWNLOAD (Kaggle)  — avoids NIH 403 issues
# =========================
!pip -q install kaggle tqdm scikit-learn pillow >/dev/null

import os, shutil, zipfile, glob, re
from pathlib import Path

DATASET_SLUG = "iamtapendu/chest-x-ray-lungs-segmentation"  # 704 pairs (Shenzhen+Montgomery)

# Where to place the raw Kaggle dump
KAGGLE_DL_DIR = "/content/datasets/Xray_Chest_704_raw"
os.makedirs(KAGGLE_DL_DIR, exist_ok=True)

# Ensure Kaggle token (upload kaggle.json once if missing)
if not os.path.exists("/root/.kaggle/kaggle.json"):
    os.makedirs("/root/.kaggle", exist_ok=True)
    if os.path.exists("/content/kaggle.json"):
        shutil.move("/content/kaggle.json", "/root/.kaggle/kaggle.json")
    else:
        print("⚠️ Kaggle API credentials not found.\n"
              "• In Colab: upload your kaggle.json to /content (Files pane), OR run:\n"
              "    from google.colab import files; files.upload()\n"
              "  then move it:\n"
              "    !mkdir -p /root/.kaggle && mv /content/kaggle.json /root/.kaggle/kaggle.json && chmod 600 /root/.kaggle/kaggle.json\n")
# secure permissions if present
if os.path.exists("/root/.kaggle/kaggle.json"):
    os.chmod("/root/.kaggle/kaggle.json", 0o600)

# Try to download (quiet); if creds missing, this will error — we catch and explain.
download_ok = True
try:
    !kaggle datasets download -d $DATASET_SLUG -p $KAGGLE_DL_DIR -q
except Exception as e:
    download_ok = False
    print("❌ Kaggle download failed. Make sure kaggle.json is installed as shown above.")
    print("Error:", e)

# Unzip if we have the archive
if download_ok:
    zips = [p for p in glob.glob(os.path.join(KAGGLE_DL_DIR, "*.zip"))]
    if not zips:
        print("❌ Did not find the Kaggle zip in", KAGGLE_DL_DIR)
    else:
        for z in zips:
            with zipfile.ZipFile(z, 'r') as zf:
                zf.extractall(KAGGLE_DL_DIR)
        print("✅ Unzipped into", KAGGLE_DL_DIR)

# =========================
# 1) PREPROCESS to unified PNGs + CSVs (train/val/test)
# =========================
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Output layout (mirrors your hip script)
BASE_OUTPUT_DIR = '/content/preprocessed_datasets/xraychest'
IMAGE_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, 'images')
MASK_OUTPUT_DIR  = os.path.join(BASE_OUTPUT_DIR, 'masks')
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
os.makedirs(MASK_OUTPUT_DIR, exist_ok=True)

# Splits: 70/10/20 (like your hip script)
TEST_SIZE = 0.20
VAL_SIZE_OF_TRAIN = 0.125  # 0.1 / 0.8

# Optional resize upper bound (keep None to retain original size)
MAX_SIDE = None  # e.g., set 2048 to tame memory

# --- helpers ---
def to_uint8_minmax(img_np):
    img = img_np.astype(np.float32)
    mn, mx = img.min(), img.max()
    if mx > mn:
        img = (img - mn) / (mx - mn) * 255.0
    else:
        img = np.zeros_like(img)
    return img.clip(0, 255).astype(np.uint8)

def open_gray(path):
    im = Image.open(path)
    if im.mode not in ('L', 'I;16', 'I'):
        im = im.convert('L')
    arr = np.array(im)
    if arr.dtype != np.uint8:
        arr = to_uint8_minmax(arr)
    return arr

def resize_img(img_np, target_max_side):
    if target_max_side is None:
        return img_np
    h, w = img_np.shape[:2]
    mx = max(h, w)
    if mx <= target_max_side:
        return img_np
    scale = target_max_side / float(mx)
    new_w, new_h = int(round(w*scale)), int(round(h*scale))
    return np.array(Image.fromarray(img_np).resize((new_w, new_h), resample=Image.BILINEAR))

def ensure_size(mask_np, W, H):
    if mask_np.shape[:2] == (H, W):
        return mask_np
    return np.array(Image.fromarray(mask_np).resize((W, H), resample=Image.NEAREST))

def binarize(mask_np):
    return (mask_np > 0).astype(np.uint8) * 255

def find_all_images(root):
    # grab PNGs/JPGs that look like Shenzhen (CHNCXR_*.png) or Montgomery (MCUCXR_*.png)
    exts = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')
    files = []
    for p in Path(root).rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            name = p.name
            if re.match(r"(CHNCXR|MCUCXR)_\d+_\d+\.(png|jpg|jpeg|bmp|tif|tiff)", name, flags=re.I):
                # likely an image (not mask) — skip files containing 'mask' in their path/name
                if "mask" not in name.lower() and "manualmask" not in str(p.parent).lower():
                    files.append(str(p))
    return sorted(files)

def try_find_mask_for(base_root, image_path):
    """
    Heuristic: look for mask file with same stem anywhere under base_root.
    Accept common variations: same-name in 'masks' folder, suffix '_mask', or NIH ManualMask/left+right.
    """
    img_name = Path(image_path).name
    stem = Path(img_name).stem  # e.g., CHNCXR_0001_0 or MCUCXR_0001_0

    # 1) Direct same-name in a sibling masks folder somewhere
    cands = list(Path(base_root).rglob(f"*mask*{stem}*")) + \
            list(Path(base_root).rglob(f"{stem}*mask*")) + \
            list(Path(base_root).rglob(f"{stem}.png"))  # some bundles store masks with same filename
    # filter to image-like files
    cands = [c for c in cands if c.is_file() and c.suffix.lower() in ('.png','.jpg','.jpeg','.bmp','.tif','.tiff')]

    # NIH Montgomery structure: left/right masks live in ManualMask/{leftMask,rightMask}/SAME_FILENAME
    left = list(Path(base_root).rglob(f"ManualMask/leftMask/{img_name}"))
    right = list(Path(base_root).rglob(f"ManualMask/rightMask/{img_name}"))

    return [str(p) for p in cands], [str(p) for p in left], [str(p) for p in right]

def save_pair(img_np, mask_np, out_stem):
    img_p = os.path.join(IMAGE_OUTPUT_DIR, out_stem + '.png')
    msk_p = os.path.join(MASK_OUTPUT_DIR,  out_stem + '.png')
    Image.fromarray(img_np).save(img_p)
    Image.fromarray(mask_np).save(msk_p)
    return img_p, msk_p

def preprocess_704_bundle(bundle_root):
    print('--- Preprocessing X-Ray Chest (704 pairs: Shenzhen + Montgomery) ---')
    entries = {'train': [], 'val': [], 'test': []}
    all_ids = []

    images = find_all_images(bundle_root)
    if not images:
        print(f"❌ No images found under {bundle_root}. Check that the Kaggle zip was extracted.")
        return

    for ipath in tqdm(images, desc="Pairing & saving"):
        img_np = open_gray(ipath)

        cands, lefts, rights = try_find_mask_for(bundle_root, ipath)

        mask_np = None
        if lefts and rights:
            # Montgomery: combine left + right
            ml = open_gray(lefts[0])
            mr = open_gray(rights[0])
            mask_np = binarize(ml) | binarize(mr)
        else:
            # Shenzhen or already-unified masks
            chosen = None
            # prefer exact same-name in a folder named masks/Mask
            for p in cands:
                pname = Path(p).name.lower()
                if "manualmask" in p.lower():
                    continue
                if (("mask" in pname) or ("masks" in Path(p).parent.name.lower()) or (Path(p).parent.name.lower() in ("mask","masks"))):
                    chosen = p
                    break
            if chosen is None and cands:
                chosen = cands[0]
            if chosen:
                mask_np = open_gray(chosen)
                mask_np = binarize(mask_np)

        # if we still don't have a mask, skip
        if mask_np is None or mask_np.sum() == 0:
            continue

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)

        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))

        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b  = binarize(mask_np)

        # tag by dataset prefix for clarity
        stem = Path(ipath).stem  # CHNCXR_xxx or MCUCXR_xxx
        prefix = "sh" if stem.upper().startswith("CHN") else "mc"
        out_stem = f"{prefix}_{stem}"

        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        rel_img = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'images', os.path.basename(img_out)).replace(os.sep, '/')
        rel_msk = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'masks', os.path.basename(msk_out)).replace(os.sep, '/')
        all_ids.append((out_stem, rel_img, rel_msk))

    if not all_ids:
        print("❌ Found images but failed to match any masks. The bundle layout might differ.")
        return

    # Deterministic splits (70/10/20)
    all_ids_sorted = sorted(all_ids, key=lambda x: x[0])
    stems = [sid for (sid, _, _) in all_ids_sorted]
    train_val, test = train_test_split(stems, test_size=TEST_SIZE, random_state=42, shuffle=True)
    train, val = train_test_split(train_val, test_size=VAL_SIZE_OF_TRAIN, random_state=42, shuffle=True)

    train_s, val_s, test_s = set(train), set(val), set(test)
    for sid, rel_img, rel_msk in all_ids_sorted:
        if sid in train_s:
            entries['train'].append(f"{rel_img},{rel_msk}")
        elif sid in val_s:
            entries['val'].append(f"{rel_img},{rel_msk}")
        else:
            entries['test'].append(f"{rel_img},{rel_msk}")

    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
    for split_name in ['train','val','test']:
        with open(os.path.join(BASE_OUTPUT_DIR, f"{split_name}.csv"), 'w') as f:
            f.write('\n'.join(entries[split_name]))
        print(f"✅ Saved {os.path.join(BASE_OUTPUT_DIR, f'{split_name}.csv')} with {len(entries[split_name])} entries.")

    print("\n--- Preprocessing Complete! ---")

# Run on the Kaggle dump
preprocess_704_bundle(KAGGLE_DL_DIR)

# =========================
# 2) ZIP the preprocessed folder (like your hip script)
# =========================
def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                fp = os.path.join(root, file)
                arc = os.path.relpath(fp, folder_path)
                zf.write(fp, arc)

folder_to_zip = "/content/preprocessed_datasets/xraychest"
if os.path.isdir(folder_to_zip) and glob.glob(os.path.join(folder_to_zip, "images", "*.png")):
    output_zip = "xray_chest_dataset.zip"
    zip_folder(folder_to_zip, output_zip)
    print(f"📦 Zipped {folder_to_zip} → {output_zip}")
else:
    print(f"Nothing to zip yet at {folder_to_zip} (no images found).")

Dataset URL: https://www.kaggle.com/datasets/iamtapendu/chest-x-ray-lungs-segmentation
License(s): apache-2.0
✅ Unzipped into /content/datasets/Xray_Chest_704_raw
--- Preprocessing X-Ray Chest (704 pairs: Shenzhen + Montgomery) ---
❌ No images found under /content/datasets/Xray_Chest_704_raw. Check that the Kaggle zip was extracted.
Nothing to zip yet at /content/preprocessed_datasets/xraychest (no images found).


In [None]:
!apt-get install tree
!tree /content

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 0s (327 kB/s)
Selecting previously unselected package tree.
(Reading database ... 126371 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...
[01;34m/content[0m
├── [01;34mdatasets[0m
│   ├── [01;34mXray_Chest[0m
│   │   ├── [01;34mmontgomery[0m
│   │   │   ├── [01;34mCXR_png[0m
│   │   │   └── [01;34mManualMask[0m
│   │   │       ├── [01;34mleftMask[0m
│   │   │       └── [01;34mrightMask[0m
│   │   └── [01;3

# v3

In [1]:
# -*- coding: utf-8 -*-
"""finetune-sam-xray-chest-704-kaggle.ipynb
Colab-friendly: downloads 704 lung-mask pairs (Shenzhen+Montgomery) and prepares CSV splits.
"""

# =========================
# 0) DOWNLOAD (Kaggle)  — avoids NIH 403 issues
# =========================
!pip -q install kaggle tqdm scikit-learn pillow >/dev/null

import os, shutil, zipfile, glob, re
from pathlib import Path

DATASET_SLUG = "iamtapendu/chest-x-ray-lungs-segmentation"  # 704 pairs (Shenzhen+Montgomery)

# Where to place the raw Kaggle dump
KAGGLE_DL_DIR = "/content/datasets/Xray_Chest_704_raw"
os.makedirs(KAGGLE_DL_DIR, exist_ok=True)

# Ensure Kaggle token (upload kaggle.json once if missing)
if not os.path.exists("/root/.kaggle/kaggle.json"):
    os.makedirs("/root/.kaggle", exist_ok=True)
    if os.path.exists("/content/kaggle.json"):
        shutil.move("/content/kaggle.json", "/root/.kaggle/kaggle.json")
    else:
        print("⚠️ Kaggle API credentials not found.\n"
              "• In Colab: upload your kaggle.json to /content (Files pane), OR run:\n"
              "    from google.colab import files; files.upload()\n"
              "  then move it:\n"
              "    !mkdir -p /root/.kaggle && mv /content/kaggle.json /root/.kaggle/kaggle.json && chmod 600 /root/.kaggle/kaggle.json\n")
# secure permissions if present
if os.path.exists("/root/.kaggle/kaggle.json"):
    os.chmod("/root/.kaggle/kaggle.json", 0o600)

# Try to download (quiet); if creds missing, this will error — we catch and explain.
download_ok = True
try:
    !kaggle datasets download -d $DATASET_SLUG -p $KAGGLE_DL_DIR -q
except Exception as e:
    download_ok = False
    print("❌ Kaggle download failed. Make sure kaggle.json is installed as shown above.")
    print("Error:", e)

# Unzip if we have the archive
if download_ok:
    zips = [p for p in glob.glob(os.path.join(KAGGLE_DL_DIR, "*.zip"))]
    if not zips:
        print("❌ Did not find the Kaggle zip in", KAGGLE_DL_DIR)
    else:
        for z in zips:
            with zipfile.ZipFile(z, 'r') as zf:
                zf.extractall(KAGGLE_DL_DIR)
        print("✅ Unzipped into", KAGGLE_DL_DIR)

Dataset URL: https://www.kaggle.com/datasets/iamtapendu/chest-x-ray-lungs-segmentation
License(s): apache-2.0
✅ Unzipped into /content/datasets/Xray_Chest_704_raw


In [2]:
# -*- coding: utf-8 -*-
"""
Final Colab script: X-Ray Chest (Montgomery + Shenzhen + Kaggle 704) → PNG pairs + CSV splits
- Uses your existing directory layout under /content/datasets/
- Merges left/right lungs into a single binary mask (0 background, 255 foreground)
- Output: /content/preprocessed_datasets/xraychest/{images,masks,train.csv,val.csv,test.csv}
"""

# =========================
# 0) Setup
# =========================
!pip -q install kaggle tqdm scikit-learn pillow >/dev/null

import os, re, glob, zipfile, shutil, subprocess
from pathlib import Path
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# =========================
# 1) Config (edit if needed)
# =========================
# Sources (set either/both True). Script will skip gracefully if a source has no data.
INCLUDE_NIH     = True   # /content/datasets/Xray_Chest/{montgomery,shenzhen}
INCLUDE_KAGGLE  = True   # /content/datasets/Xray_Chest_704_raw (numeric 1000.png etc.)

# Base paths (match your structure)
NIH_BASE_DIR       = "/content/datasets/Xray_Chest"
MONT_IMG_DIR       = f"{NIH_BASE_DIR}/montgomery/CXR_png"
MONT_LEFT_MASK_DIR = f"{NIH_BASE_DIR}/montgomery/ManualMask/leftMask"
MONT_RIGHT_MASK_DIR= f"{NIH_BASE_DIR}/montgomery/ManualMask/rightMask"

SHEN_IMG_DIR       = f"{NIH_BASE_DIR}/shenzhen/CXR_png"
# We will search recursively under this root to handle masks/mask/... or masks/mask/mask/...
SHEN_MASK_ROOT     = f"{NIH_BASE_DIR}/shenzhen/masks"

# Kaggle bundle root
KAGGLE_ROOT        = "/content/datasets/Xray_Chest_704_raw"
KAGGLE_SLUG        = "iamtapendu/chest-x-ray-lungs-segmentation"

# Output
BASE_OUTPUT_DIR = "/content/preprocessed_datasets/xraychest"
IMAGE_OUTPUT_DIR= f"{BASE_OUTPUT_DIR}/images"
MASK_OUTPUT_DIR = f"{BASE_OUTPUT_DIR}/masks"
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
os.makedirs(MASK_OUTPUT_DIR,  exist_ok=True)

# Splits: 70/10/20 (your hip script style)
TEST_SIZE = 0.20
VAL_SIZE_OF_TRAIN = 0.125  # 0.1 / 0.8

# Optional: limit max side (longer edge) to save memory. None keeps original sizes.
MAX_SIDE = None  # e.g., 2048

# =========================
# 2) Helpers
# =========================
IMG_EXTS = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

def to_uint8_minmax(img_np):
    img = img_np.astype(np.float32)
    mn, mx = img.min(), img.max()
    if mx > mn:
        img = (img - mn) / (mx - mn) * 255.0
    else:
        img = np.zeros_like(img)
    return img.clip(0, 255).astype(np.uint8)

def open_gray(path):
    im = Image.open(path)
    if im.mode not in ('L', 'I;16', 'I'):
        im = im.convert('L')
    arr = np.array(im)
    if arr.dtype != np.uint8:
        arr = to_uint8_minmax(arr)
    return arr

def resize_img(img_np, target_max_side):
    if target_max_side is None: return img_np
    h, w = img_np.shape[:2]
    mx = max(h, w)
    if mx <= target_max_side: return img_np
    scale = target_max_side / float(mx)
    new_w, new_h = int(round(w*scale)), int(round(h*scale))
    return np.array(Image.fromarray(img_np).resize((new_w, new_h), resample=Image.BILINEAR))

def ensure_size(mask_np, W, H):
    if mask_np.shape[:2] == (H, W):
        return mask_np
    return np.array(Image.fromarray(mask_np).resize((W, H), resample=Image.NEAREST))

def binarize(mask_np):
    # single foreground class (lungs)
    return (mask_np > 0).astype(np.uint8) * 255

def save_pair(img_np, mask_np, out_stem):
    img_p = os.path.join(IMAGE_OUTPUT_DIR, out_stem + '.png')
    msk_p = os.path.join(MASK_OUTPUT_DIR,  out_stem + '.png')
    Image.fromarray(img_np).save(img_p)
    Image.fromarray(mask_np).save(msk_p)
    return img_p, msk_p

def rel_paths(img_out, msk_out):
    rel_img = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'images', os.path.basename(img_out)).replace(os.sep, '/')
    rel_msk = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'masks', os.path.basename(msk_out)).replace(os.sep, '/')
    return rel_img, rel_msk

# =========================
# 3) NIH processing
# =========================
def process_montgomery():
    pairs = []
    img_files = sorted([p for p in glob.glob(os.path.join(MONT_IMG_DIR, '*')) if p.lower().endswith(IMG_EXTS)])
    if not img_files:
        print(f"Montgomery: no images in {MONT_IMG_DIR}")
        return pairs
    print(f"Montgomery: found {len(img_files)} images")

    for ip in tqdm(img_files, desc="Montgomery"):
        base = os.path.basename(ip)  # MCUCXR_0001_0.png
        lmask = os.path.join(MONT_LEFT_MASK_DIR, base)
        rmask = os.path.join(MONT_RIGHT_MASK_DIR, base)
        if not (os.path.exists(lmask) and os.path.exists(rmask)):
            continue
        img_np = open_gray(ip)
        ml = open_gray(lmask)
        mr = open_gray(rmask)
        mask_np = binarize(ml) | binarize(mr)

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)
        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))
        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b = binarize(mask_np)
        if mask_np_b.sum() == 0:
            continue

        stem = Path(base).stem
        out_stem = f"mc_{stem}"
        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        pairs.append((out_stem, *rel_paths(img_out, msk_out)))
    print(f"Montgomery: saved {len(pairs)} pairs")
    return pairs

def find_shenzhen_mask(stem):
    """
    Search recursively under SHEN_MASK_ROOT for files corresponding to 'stem'
    Prefer names containing 'mask' and image file extensions.
    """
    root = Path(SHEN_MASK_ROOT)
    if not root.exists():
        return None
    candidates = []
    # e.g., CHNCXR_0001_0_mask.png or similar
    for p in root.rglob(f"{stem}*"):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            name_low = p.name.lower()
            if ('mask' in name_low) or ('masks' in p.parent.name.lower()) or ('mask' in p.parent.name.lower()):
                candidates.append(p)
    if candidates:
        # prefer ones with '_mask' in the filename
        candidates.sort(key=lambda x: (0 if '_mask' in x.name.lower() else 1, len(str(x))))
        return str(candidates[0])
    # fallback: any image with same stem under mask root
    for p in root.rglob(f"{stem}.*"):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            return str(p)
    return None

def process_shenzhen():
    pairs = []
    img_files = sorted([p for p in glob.glob(os.path.join(SHEN_IMG_DIR, '*')) if p.lower().endswith(IMG_EXTS)])
    if not img_files:
        print(f"Shenzhen: no images in {SHEN_IMG_DIR}")
        return pairs
    print(f"Shenzhen: found {len(img_files)} images. Searching masks under {SHEN_MASK_ROOT} ...")

    for ip in tqdm(img_files, desc="Shenzhen"):
        base = os.path.basename(ip)  # CHNCXR_0001_0.png
        stem = Path(base).stem
        mpath = find_shenzhen_mask(stem)
        if mpath is None:
            continue
        img_np  = open_gray(ip)
        mask_np = binarize(open_gray(mpath))

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)
        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))

        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b = binarize(mask_np)
        if mask_np_b.sum() == 0:
            continue

        out_stem = f"sh_{stem}"
        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        pairs.append((out_stem, *rel_paths(img_out, msk_out)))
    print(f"Shenzhen: saved {len(pairs)} pairs")
    return pairs

# =========================
# 4) Kaggle 704 processing
# =========================
def have_kaggle_image_mask_dirs(root):
    """Return (image_dir, mask_dir, matched_names) if found, else None."""
    rootp = Path(root)
    if not rootp.exists():
        return None
    img_dirs  = [p for p in rootp.rglob('image') if p.is_dir()]
    mask_dirs = [p for p in rootp.rglob('mask')  if p.is_dir()]
    if not img_dirs or not mask_dirs:
        return None

    best = None
    best_n = 0
    for I in img_dirs:
        imgs = {f.name for f in I.iterdir() if f.is_file() and f.suffix.lower() in IMG_EXTS}
        if not imgs:
            continue
        for M in mask_dirs:
            masks = {f.name for f in M.iterdir() if f.is_file() and f.suffix.lower() in IMG_EXTS}
            inter = sorted(imgs & masks)
            if len(inter) > best_n:
                best = (I, M, inter)
                best_n = len(inter)
    return best if best_n > 0 else None

def attempt_kaggle_download():
    """If Kaggle bundle not extracted, try to download it (requires kaggle.json)."""
    ok = have_kaggle_image_mask_dirs(KAGGLE_ROOT)
    if ok:
        return True
    print("Kaggle bundle not detected; trying to download (needs kaggle.json)...")
    # move kaggle.json from /content if present
    os.makedirs("/root/.kaggle", exist_ok=True)
    if os.path.exists("/content/kaggle.json"):
        shutil.move("/content/kaggle.json", "/root/.kaggle/kaggle.json")
    if os.path.exists("/root/.kaggle/kaggle.json"):
        os.chmod("/root/.kaggle/kaggle.json", 0o600)
        # download
        os.makedirs(KAGGLE_ROOT, exist_ok=True)
        subprocess.run(["kaggle", "datasets", "download", "-d", KAGGLE_SLUG, "-p", KAGGLE_ROOT, "-q"], check=False)
        # unzip any zips
        for z in Path(KAGGLE_ROOT).glob("*.zip"):
            with zipfile.ZipFile(z, 'r') as zf:
                zf.extractall(KAGGLE_ROOT)
        return have_kaggle_image_mask_dirs(KAGGLE_ROOT) is not None
    else:
        print("No kaggle.json found. Skipping Kaggle download.")
        return False

def process_kaggle_bundle():
    pairs = []
    found = have_kaggle_image_mask_dirs(KAGGLE_ROOT)
    if not found:
        print(f"Kaggle 704: no image/mask dirs detected under {KAGGLE_ROOT}")
        return pairs
    img_dir, mask_dir, names = found
    print(f"Kaggle 704: using image dir {img_dir}")
    print(f"Kaggle 704: using mask dir  {mask_dir}")
    print(f"Kaggle 704: matched pairs  : {len(names)}")

    for name in tqdm(names, desc="Kaggle 704"):
        ip = img_dir / name
        mp = mask_dir / name
        img_np  = open_gray(str(ip))
        mask_np = binarize(open_gray(str(mp)))

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)
        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))

        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b = binarize(mask_np)
        if mask_np_b.sum() == 0:
            continue

        stem = Path(name).stem   # e.g., "1000"
        out_stem = f"kg_{stem}"
        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        pairs.append((out_stem, *rel_paths(img_out, msk_out)))
    print(f"Kaggle 704: saved {len(pairs)} pairs")
    return pairs

# =========================
# 5) Main: run sources → split → CSVs → zip
# =========================
def main():
    all_pairs = []

    if INCLUDE_KAGGLE:
        # Try to use existing bundle; if missing, try to download with kaggle.json
        if not have_kaggle_image_mask_dirs(KAGGLE_ROOT):
            attempt_kaggle_download()
        all_pairs += process_kaggle_bundle()

    if INCLUDE_NIH:
        # Use what you already have under /content/datasets/Xray_Chest
        all_pairs += process_montgomery()
        all_pairs += process_shenzhen()

    # Deduplicate by out_stem if any collision (unlikely due to prefixes)
    seen = set()
    uniq_pairs = []
    for sid, rel_img, rel_msk in all_pairs:
        if sid in seen:
            continue
        seen.add(sid)
        uniq_pairs.append((sid, rel_img, rel_msk))

    if not uniq_pairs:
        print("❌ No valid image–mask pairs found from selected sources.")
        print("   Check paths, or set INCLUDE_* flags accordingly.")
        return

    print(f"\nTotal unique pairs: {len(uniq_pairs)}")

    # Splits
    stems = [sid for (sid, _, _) in uniq_pairs]
    train_val, test = train_test_split(stems, test_size=TEST_SIZE, random_state=42, shuffle=True)
    train, val = train_test_split(train_val, test_size=VAL_SIZE_OF_TRAIN, random_state=42, shuffle=True)
    s_train, s_val, s_test = set(train), set(val), set(test)

    entries = {'train': [], 'val': [], 'test': []}
    for sid, rel_img, rel_msk in uniq_pairs:
        if sid in s_train:
            entries['train'].append(f"{rel_img},{rel_msk}")
        elif sid in s_val:
            entries['val'].append(f"{rel_img},{rel_msk}")
        else:
            entries['test'].append(f"{rel_img},{rel_msk}")

    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
    for split_name in ['train','val','test']:
        csv_path = os.path.join(BASE_OUTPUT_DIR, f"{split_name}.csv")
        with open(csv_path, 'w') as f:
            f.write('\n'.join(entries[split_name]))
        print(f"✅ Saved {csv_path} with {len(entries[split_name])} entries.")

    # Zip
    def zip_folder(folder_path, output_path):
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    fp = os.path.join(root, file)
                    arc = os.path.relpath(fp, folder_path)
                    zf.write(fp, arc)

    folder_to_zip = BASE_OUTPUT_DIR
    if glob.glob(os.path.join(IMAGE_OUTPUT_DIR, "*.png")):
        output_zip = "/content/xray_chest_dataset.zip"
        zip_folder(folder_to_zip, output_zip)
        print(f"📦 Zipped {folder_to_zip} → {output_zip}")
    else:
        print(f"Nothing to zip at {folder_to_zip} (no images found).")

# Run
main()

Kaggle 704: using image dir /content/datasets/Xray_Chest_704_raw/Chest-X-Ray/Chest-X-Ray/image
Kaggle 704: using mask dir  /content/datasets/Xray_Chest_704_raw/Chest-X-Ray/Chest-X-Ray/mask
Kaggle 704: matched pairs  : 704


Kaggle 704: 100%|██████████| 704/704 [24:58<00:00,  2.13s/it]


Kaggle 704: saved 704 pairs
Montgomery: no images in /content/datasets/Xray_Chest/montgomery/CXR_png
Shenzhen: no images in /content/datasets/Xray_Chest/shenzhen/CXR_png

Total unique pairs: 704
✅ Saved /content/preprocessed_datasets/xraychest/train.csv with 492 entries.
✅ Saved /content/preprocessed_datasets/xraychest/val.csv with 71 entries.
✅ Saved /content/preprocessed_datasets/xraychest/test.csv with 141 entries.
📦 Zipped /content/preprocessed_datasets/xraychest → /content/xray_chest_dataset.zip


# v4

In [None]:
# -*- coding: utf-8 -*-
"""
Final Colab script: X-Ray Chest (Montgomery + Shenzhen + Kaggle 704) → PNG pairs + CSV splits
- Kaggle bundle: iamtapendu/chest-x-ray-lungs-segmentation (704 lung-mask pairs)
- Optional NIH folders if you already have them locally
- Single foreground class (lungs): 0 background, 255 foreground
- Deterministic: sorted IDs + random_state=42 (+ RNG seeds)
- Output: /content/preprocessed_datasets/xraychest/{images,masks,train.csv,val.csv,test.csv,manifest.csv}
"""

# =========================
# 0) Setup
# =========================
!pip -q install kaggle tqdm scikit-learn pillow >/dev/null

import os, re, glob, zipfile, shutil, subprocess, random
from pathlib import Path
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# RNG seeds for determinism
RAND_SEED = 42
np.random.seed(RAND_SEED)
random.seed(RAND_SEED)

# =========================
# 1) Config (edit if needed)
# =========================
# Sources (toggle as you like)
INCLUDE_KAGGLE  = True   # /content/datasets/Xray_Chest_704_raw
INCLUDE_NIH     = False  # /content/datasets/Xray_Chest/{montgomery,shenzhen} (turn on if you have them)

# Kaggle bundle
KAGGLE_SLUG  = "iamtapendu/chest-x-ray-lungs-segmentation"
KAGGLE_ROOT  = "datasets/Xray_Chest_704_raw"

# NIH layout (used only if INCLUDE_NIH=True and dirs exist)
NIH_BASE_DIR       = "datasets/Xray_Chest"
MONT_IMG_DIR       = f"{NIH_BASE_DIR}/montgomery/CXR_png"
MONT_LEFT_MASK_DIR = f"{NIH_BASE_DIR}/montgomery/ManualMask/leftMask"
MONT_RIGHT_MASK_DIR= f"{NIH_BASE_DIR}/montgomery/ManualMask/rightMask"
SHEN_IMG_DIR       = f"{NIH_BASE_DIR}/shenzhen/CXR_png"
SHEN_MASK_ROOT     = f"{NIH_BASE_DIR}/shenzhen/masks"   # we search recursively

# Output
BASE_OUTPUT_DIR = "preprocessed_datasets/xraychest"
IMAGE_OUTPUT_DIR= f"{BASE_OUTPUT_DIR}/images"
MASK_OUTPUT_DIR = f"{BASE_OUTPUT_DIR}/masks"
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
os.makedirs(MASK_OUTPUT_DIR,  exist_ok=True)

# Splits: 70/10/20 (like your hip pipeline)
TEST_SIZE = 0.20
VAL_SIZE_OF_TRAIN = 0.125  # 0.1 / 0.8

# Optional: limit max side (longer edge) to save memory. None keeps original sizes.
MAX_SIDE = None  # e.g., 2048

# =========================
# 2) Helpers
# =========================
IMG_EXTS = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

def to_uint8_minmax(img_np):
    img = img_np.astype(np.float32)
    mn, mx = img.min(), img.max()
    if mx > mn:
        img = (img - mn) / (mx - mn) * 255.0
    else:
        img = np.zeros_like(img)
    return img.clip(0, 255).astype(np.uint8)

def open_gray(path):
    im = Image.open(path)
    if im.mode not in ('L', 'I;16', 'I'):
        im = im.convert('L')
    arr = np.array(im)
    if arr.dtype != np.uint8:
        arr = to_uint8_minmax(arr)
    return arr

def resize_img(img_np, target_max_side):
    if target_max_side is None: return img_np
    h, w = img_np.shape[:2]
    mx = max(h, w)
    if mx <= target_max_side: return img_np
    scale = target_max_side / float(mx)
    new_w, new_h = int(round(w*scale)), int(round(h*scale))
    return np.array(Image.fromarray(img_np).resize((new_w, new_h), resample=Image.BILINEAR))

def ensure_size(mask_np, W, H):
    if mask_np.shape[:2] == (H, W):
        return mask_np
    return np.array(Image.fromarray(mask_np).resize((W, H), resample=Image.NEAREST))

def binarize(mask_np):
    # single foreground class (lungs)
    return (mask_np > 0).astype(np.uint8) * 255

def save_pair(img_np, mask_np, out_stem):
    img_p = os.path.join(IMAGE_OUTPUT_DIR, out_stem + '.png')
    msk_p = os.path.join(MASK_OUTPUT_DIR,  out_stem + '.png')
    Image.fromarray(img_np).save(img_p)
    Image.fromarray(mask_np).save(msk_p)
    return img_p, msk_p

def rel_paths(img_out, msk_out):
    rel_img = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'images', os.path.basename(img_out)).replace(os.sep, '/')
    rel_msk = os.path.join(os.path.basename(BASE_OUTPUT_DIR), 'masks', os.path.basename(msk_out)).replace(os.sep, '/')
    return rel_img, rel_msk

# =========================
# 3) Kaggle 704 processing
# =========================
def have_kaggle_image_mask_dirs(root):
    """Return (image_dir, mask_dir, matched_names) if found, else None."""
    rootp = Path(root)
    if not rootp.exists():
        return None
    img_dirs  = [p for p in rootp.rglob('image') if p.is_dir()]
    mask_dirs = [p for p in rootp.rglob('mask')  if p.is_dir()]
    if not img_dirs or not mask_dirs:
        return None

    best = None
    best_n = 0
    for I in img_dirs:
        imgs = {f.name for f in I.iterdir() if f.is_file() and f.suffix.lower() in IMG_EXTS}
        if not imgs:
            continue
        for M in mask_dirs:
            masks = {f.name for f in M.iterdir() if f.is_file() and f.suffix.lower() in IMG_EXTS}
            inter = sorted(imgs & masks)
            if len(inter) > best_n:
                best = (I, M, sorted(inter, key=str))  # sort for determinism
                best_n = len(inter)
    return best if best_n > 0 else None

def attempt_kaggle_download():
    """If Kaggle bundle not extracted, try to download it (requires kaggle.json)."""
    ok = have_kaggle_image_mask_dirs(KAGGLE_ROOT)
    if ok:
        return True
    print("Kaggle bundle not detected; trying to download (needs kaggle.json)...")
    # move kaggle.json from /content if present
    os.makedirs("/root/.kaggle", exist_ok=True)
    if os.path.exists("/content/kaggle.json"):
        shutil.move("/content/kaggle.json", "/root/.kaggle/kaggle.json")
    if os.path.exists("/root/.kaggle/kaggle.json"):
        os.chmod("/root/.kaggle/kaggle.json", 0o600)
        # download
        os.makedirs(KAGGLE_ROOT, exist_ok=True)
        subprocess.run(["kaggle", "datasets", "download", "-d", KAGGLE_SLUG, "-p", KAGGLE_ROOT, "-q"], check=False)
        # unzip any zips
        for z in Path(KAGGLE_ROOT).glob("*.zip"):
            with zipfile.ZipFile(z, 'r') as zf:
                zf.extractall(KAGGLE_ROOT)
        return have_kaggle_image_mask_dirs(KAGGLE_ROOT) is not None
    else:
        print("No kaggle.json found. Skipping Kaggle download.")
        return False

def process_kaggle_bundle():
    pairs = []
    found = have_kaggle_image_mask_dirs(KAGGLE_ROOT)
    if not found:
        print(f"Kaggle 704: no image/mask dirs detected under {KAGGLE_ROOT}")
        return pairs
    img_dir, mask_dir, names = found
    print(f"Kaggle 704: using image dir {img_dir}")
    print(f"Kaggle 704: using mask dir  {mask_dir}")
    print(f"Kaggle 704: matched pairs  : {len(names)}")

    for name in tqdm(names, desc="Kaggle 704"):
        ip = img_dir / name
        mp = mask_dir / name
        img_np  = open_gray(str(ip))
        mask_np = binarize(open_gray(str(mp)))

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)
        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))

        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b = binarize(mask_np)
        if mask_np_b.sum() == 0:
            continue

        stem = Path(name).stem   # e.g., "1000"
        out_stem = f"kg_{stem}"
        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        pairs.append((out_stem, *rel_paths(img_out, msk_out)))
    print(f"Kaggle 704: saved {len(pairs)} pairs")
    return pairs

# =========================
# 4) NIH processing (optional)
# =========================
def process_montgomery():
    pairs = []
    img_files = sorted([p for p in glob.glob(os.path.join(MONT_IMG_DIR, '*')) if p.lower().endswith(IMG_EXTS)])
    if not img_files:
        print(f"Montgomery: no images in {MONT_IMG_DIR}")
        return pairs
    print(f"Montgomery: found {len(img_files)} images")

    for ip in tqdm(img_files, desc="Montgomery"):
        base = os.path.basename(ip)  # MCUCXR_0001_0.png
        lmask = os.path.join(MONT_LEFT_MASK_DIR, base)
        rmask = os.path.join(MONT_RIGHT_MASK_DIR, base)
        if not (os.path.exists(lmask) and os.path.exists(rmask)):
            continue
        img_np = open_gray(ip)
        ml = open_gray(lmask)
        mr = open_gray(rmask)
        mask_np = binarize(ml) | binarize(mr)

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)
        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))
        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b = binarize(mask_np)
        if mask_np_b.sum() == 0:
            continue

        stem = Path(base).stem
        out_stem = f"mc_{stem}"
        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        pairs.append((out_stem, *rel_paths(img_out, msk_out)))
    print(f"Montgomery: saved {len(pairs)} pairs")
    return pairs

def find_shenzhen_mask(stem):
    root = Path(SHEN_MASK_ROOT)
    if not root.exists():
        return None
    candidates = []
    for p in root.rglob(f"{stem}*"):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            name_low = p.name.lower()
            if ('mask' in name_low) or ('masks' in p.parent.name.lower()) or ('mask' in p.parent.name.lower()):
                candidates.append(p)
    if candidates:
        candidates.sort(key=lambda x: (0 if '_mask' in x.name.lower() else 1,
                                       len(str(x)),
                                       str(x).lower()))
        return str(candidates[0])
    for p in root.rglob(f"{stem}.*"):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            return str(p)
    return None

def process_shenzhen():
    pairs = []
    img_files = sorted([p for p in glob.glob(os.path.join(SHEN_IMG_DIR, '*')) if p.lower().endswith(IMG_EXTS)])
    if not img_files:
        print(f"Shenzhen: no images in {SHEN_IMG_DIR}")
        return pairs
    print(f"Shenzhen: found {len(img_files)} images. Searching masks under {SHEN_MASK_ROOT} ...")

    for ip in tqdm(img_files, desc="Shenzhen"):
        base = os.path.basename(ip)  # CHNCXR_0001_0.png
        stem = Path(base).stem
        mpath = find_shenzhen_mask(stem)
        if mpath is None:
            continue
        img_np  = open_gray(ip)
        mask_np = binarize(open_gray(mpath))

        H, W = img_np.shape[:2]
        mask_np = ensure_size(mask_np, W, H)
        img_np_r = resize_img(img_np, MAX_SIDE)
        if MAX_SIDE is not None:
            H2, W2 = img_np_r.shape[:2]
            mask_np = np.array(Image.fromarray(mask_np).resize((W2, H2), resample=Image.NEAREST))

        img_np_u8 = to_uint8_minmax(img_np_r)
        mask_np_b = binarize(mask_np)
        if mask_np_b.sum() == 0:
            continue

        out_stem = f"sh_{stem}"
        img_out, msk_out = save_pair(img_np_u8, mask_np_b, out_stem)
        pairs.append((out_stem, *rel_paths(img_out, msk_out)))
    print(f"Shenzhen: saved {len(pairs)} pairs")
    return pairs

# =========================
# 5) Main: run sources → split → CSVs → zip
# =========================
def main():
    all_pairs = []

    if INCLUDE_KAGGLE:
        if not have_kaggle_image_mask_dirs(KAGGLE_ROOT):
            attempt_kaggle_download()
        all_pairs += process_kaggle_bundle()

    if INCLUDE_NIH:
        all_pairs += process_montgomery()
        all_pairs += process_shenzhen()

    # Deduplicate and sort by ID to make splits order-independent
    seen, uniq_pairs = set(), []
    for sid, rel_img, rel_msk in all_pairs:
        if sid in seen:
            continue
        seen.add(sid)
        uniq_pairs.append((sid, rel_img, rel_msk))
    if not uniq_pairs:
        print("❌ No valid image–mask pairs found from selected sources.")
        return

    uniq_pairs.sort(key=lambda x: x[0])
    print(f"\nTotal unique pairs: {len(uniq_pairs)}")

    # Splits (deterministic)
    stems = [sid for (sid, _, _) in uniq_pairs]
    train_val, test = train_test_split(stems, test_size=TEST_SIZE, random_state=RAND_SEED, shuffle=True)
    train, val = train_test_split(train_val, test_size=VAL_SIZE_OF_TRAIN, random_state=RAND_SEED, shuffle=True)
    s_train, s_val, s_test = set(train), set(val), set(test)

    entries = {'train': [], 'val': [], 'test': []}
    for sid, rel_img, rel_msk in uniq_pairs:
        if sid in s_train:
            entries['train'].append(f"{rel_img},{rel_msk}")
        elif sid in s_val:
            entries['val'].append(f"{rel_img},{rel_msk}")
        else:
            entries['test'].append(f"{rel_img},{rel_msk}")

    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
    for split_name in ['train','val','test']:
        csv_path = os.path.join(BASE_OUTPUT_DIR, f"{split_name}.csv")
        with open(csv_path, 'w') as f:
            f.write('\n'.join(entries[split_name]))
        print(f"✅ Saved {csv_path} with {len(entries[split_name])} entries.")

    # Write a manifest for audit/repro
    with open(os.path.join(BASE_OUTPUT_DIR, "manifest.csv"), "w") as f:
        f.write("id,image,mask\n")
        for sid, rel_img, rel_msk in uniq_pairs:
            f.write(f"{sid},{rel_img},{rel_msk}\n")

    # Zip the folder
    def zip_folder(folder_path, output_path):
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    fp = os.path.join(root, file)
                    arc = os.path.relpath(fp, folder_path)
                    zf.write(fp, arc)

    if glob.glob(os.path.join(IMAGE_OUTPUT_DIR, "*.png")):
        output_zip = "/content/xray_chest_dataset.zip"
        zip_folder(BASE_OUTPUT_DIR, output_zip)
        print(f"📦 Zipped {BASE_OUTPUT_DIR} → {output_zip}")
    else:
        print(f"Nothing to zip at {BASE_OUTPUT_DIR} (no images found).")

# Run
main()

Kaggle bundle not detected; trying to download (needs kaggle.json)...
Kaggle 704: using image dir /content/datasets/Xray_Chest_704_raw/Chest-X-Ray/Chest-X-Ray/image
Kaggle 704: using mask dir  /content/datasets/Xray_Chest_704_raw/Chest-X-Ray/Chest-X-Ray/mask
Kaggle 704: matched pairs  : 704


Kaggle 704: 100%|██████████| 704/704 [24:57<00:00,  2.13s/it]


Kaggle 704: saved 704 pairs

Total unique pairs: 704
✅ Saved /content/preprocessed_datasets/xraychest/train.csv with 492 entries.
✅ Saved /content/preprocessed_datasets/xraychest/val.csv with 71 entries.
✅ Saved /content/preprocessed_datasets/xraychest/test.csv with 141 entries.
📦 Zipped /content/preprocessed_datasets/xraychest → /content/xray_chest_dataset.zip
