In [None]:
!pip install -q kaggle==1.5.12 pydicom opencv-python pillow pandas matplotlib seaborn tqdm


In [None]:
import os, sys, pathlib, zipfile, shutil
import subprocess

# Local directory that receives the unpacked dataset; created if it does not exist.
DATA_ROOT = pathlib.Path("data/vinbigdata_png")
DATA_ROOT.mkdir(parents=True, exist_ok=True)

# Kaggle dataset identifier passed to `kaggle datasets download -d`.
KAGGLE_SLUG = "corochann/vinbigdata-chest-xray-original-png"

print("Downloading dataset (requires kaggle API token)...")
# Arguments are passed as a list (no shell string interpolation), and the exit
# status is checked so that a failed download is not reported as a success.
# If the `kaggle` executable is not on PATH, subprocess.run raises FileNotFoundError.
download = subprocess.run(
    ["kaggle", "datasets", "download", "-d", KAGGLE_SLUG, "-p", str(DATA_ROOT), "--unzip"]
)
if download.returncode != 0:
    raise RuntimeError(
        f"kaggle download failed with exit code {download.returncode}; "
        "check that the kaggle CLI is installed and an API token is configured"
    )
print("Download complete — check", DATA_ROOT)


In [None]:
from pathlib import Path

# Sanity check of the download directory: does it exist, and what is in it?
DATA_ROOT = Path("data/vinbigdata_png")
print("Exists:", DATA_ROOT.exists())

# Show at most the first 20 top-level entries to keep the output short.
top_level_entries = list(DATA_ROOT.glob("*"))
print("Top-level files:", top_level_entries[:20])

# Report every CSV file sitting directly under the data root.
for csv_path in DATA_ROOT.glob("*.csv"):
    print("CSV:", csv_path.name)


In [None]:
import pandas as pd

# Load the train/test metadata tables that sit directly under DATA_ROOT.
# The file names below are what this notebook expects; adapt them if the
# downloaded dataset uses different names.
train_meta_path = DATA_ROOT / "train_meta.csv"
test_meta_path = DATA_ROOT / "test_meta.csv"
train_meta = pd.read_csv(train_meta_path)
test_meta = pd.read_csv(test_meta_path)

print("Train meta shape:", train_meta.shape)
print("Test meta shape:", test_meta.shape)

# Last expression of the cell: rendered as a rich table preview.
train_meta.head()


In [None]:
import matplotlib.pyplot as plt
import cv2, ast

# Draw a reproducible random sample of up to 6 training rows and show them in a
# 2x3 grid. min(...) keeps the cell working on frames with fewer than 6 rows.
sample = train_meta.sample(min(6, len(train_meta)), random_state=1).reset_index(drop=True)

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
panels = axes.ravel()
for ax in panels:
    ax.axis('off')  # hide ticks/frames on every panel, including unused ones

for ax, (_, row) in zip(panels, sample.iterrows()):
    img_path = DATA_ROOT / "train" / f"{row['image_id']}.png"   # adjust if path differs
    img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None rather
        # than raising; show a placeholder instead of crashing in imshow.
        ax.text(0.5, 0.5, f"missing:\n{img_path.name}",
                ha='center', va='center', transform=ax.transAxes)
    else:
        ax.imshow(img, cmap='gray')
    # Use the first non-empty value among the candidate label columns as the title.
    ax.set_title(str(row.get('class','') or row.get('label','') or row.get('finding','')))
fig.tight_layout()


In [None]:
import ast  # imported here so this cell does not depend on another cell's import
import json
from collections import defaultdict

# Maps image_id -> list of {"bbox": ..., "label": ...} dicts, one entry per row
# of train_meta whose 'bbox' cell parses to a non-empty Python literal.
ann_by_image = defaultdict(list)
unparsable_bbox_rows = 0
if 'bbox' in train_meta.columns:
    for _, r in train_meta.iterrows():
        image_id = r['bbox'] if False else r['image_id']
        bbox = r['bbox']
        try:
            bbox_list = ast.literal_eval(bbox)
        except (ValueError, SyntaxError, TypeError):
            # Non-string cells (e.g. missing values) and malformed strings both
            # end up here; only malformed *strings* are counted as a data problem.
            if isinstance(bbox, str):
                unparsable_bbox_rows += 1
            bbox_list = None
        if bbox_list:
            ann_by_image[image_id].append({
                "bbox": bbox_list,
                "label": r.get('label',''),
            })

if unparsable_bbox_rows:
    print("Rows with unparsable bbox strings (skipped):", unparsable_bbox_rows)

# Write a small preview: how many images have annotations plus one example entry.
# default=str keeps the dump from failing on values json cannot encode natively.
preview = {
    "count_images": len(ann_by_image),
    "example": next(iter(ann_by_image.items())) if ann_by_image else {},
}
with open(DATA_ROOT/"annotations_preview.json","w") as fh:
    json.dump(preview, fh, indent=2, default=str)
print("Saved annotation preview at", DATA_ROOT/"annotations_preview.json")


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed; stratified by 'label' when
# that column exists so both splits keep the same label proportions.
# NOTE(review): the split is per row of train_meta. If train_meta holds several
# rows per image_id, the same image could land in both splits — confirm.
if 'label' in train_meta.columns:
    train_df, val_df = train_test_split(train_meta, test_size=0.2, stratify=train_meta['label'], random_state=42)
else:
    train_df, val_df = train_test_split(train_meta, test_size=0.2, random_state=42)

# A bare expression in the middle of a cell is not displayed, so print the shapes.
print("Split shapes (train, val):", train_df.shape, val_df.shape)
# Save splits
train_df.to_csv(DATA_ROOT/"train_split.csv", index=False)
val_df.to_csv(DATA_ROOT/"val_split.csv", index=False)
print("Saved train/val splits.")


In [None]:
from PIL import Image

# Write 512x512 grayscale copies of (up to) the first 50 training images into
# DATA_ROOT/prep_samples as a quick preprocessing preview.
OUTPUT_PREP = DATA_ROOT/"prep_samples"
OUTPUT_PREP.mkdir(exist_ok=True, parents=True)

saved_count = 0
for _, row in train_df.head(50).iterrows():
    src = DATA_ROOT/"train"/f"{row['image_id']}.png"
    if not src.exists():
        continue  # source image missing on disk; skip it
    # The context manager closes the source file handle after each image.
    with Image.open(src) as im:
        im.convert("L").resize((512,512)).save(OUTPUT_PREP/f"{row['image_id']}.png")
    saved_count += 1

# Report the number actually written; it can be below 50 when files are missing
# or train_df has fewer than 50 rows.
print(f"Saved {saved_count} resized preview images to", OUTPUT_PREP)
