# FL-EHDS: Imaging DP Experiments on Colab

Runs Differential Privacy evaluation on imaging datasets (Chest X-Ray, Brain Tumor, Skin Cancer).

**Setup:** Runtime > Change runtime type > **T4 GPU**

**Experiments:** 3 algos × 3 datasets × 4 DP levels × 3 seeds = 108 experiments

**Checkpoint:** Saved to Google Drive after every round (~1-2 min granularity)

## 1. Setup Environment

In [None]:
# Google Drive is used as persistent storage for checkpoints and results
import os
from google.colab import drive

drive.mount('/content/drive')

# Output folder on Drive; created on first use, reused on later runs
DRIVE_OUTPUT = '/content/drive/MyDrive/FL-EHDS-FLICS2026/colab_results'
os.makedirs(DRIVE_OUTPUT, exist_ok=True)
print(f'Drive output: {DRIVE_OUTPUT}')

In [None]:
# Report the PyTorch build and, when CUDA is usable, the first GPU's name and memory
import torch

cuda_ok = torch.cuda.is_available()
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {cuda_ok}')
if cuda_ok:
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    device_props = torch.cuda.get_device_properties(0)
    # Read `total_memory` first; fall back to `total_mem` (default 0) if it is absent or zero
    total_bytes = getattr(device_props, 'total_memory', None)
    if not total_bytes:
        total_bytes = getattr(device_props, 'total_mem', 0)
    print(f'Memory: {total_bytes / 1e9:.1f} GB')

In [None]:
# Clone repository
!git clone https://github.com/FabioLiberti/FL-EHDS-FLICS2026.git /content/FL-EHDS-FLICS2026
%cd /content/FL-EHDS-FLICS2026/fl-ehds-framework

In [None]:
# Install minimal dependencies (most already in Colab)
# `-q` reduces pip's console output.
!pip install -q scikit-learn scipy tqdm Pillow

## 2. Download Datasets

In [None]:
# Setup Kaggle API (using kagglehub - supports KGAT_ tokens)
!pip install -q kagglehub

import os
os.environ['KAGGLE_API_TOKEN'] = 'KGAT_edd561c1bc682c9ad06930bacd164431'

import kagglehub
print(f'kagglehub version: {kagglehub.__version__}')
print('Kaggle auth ready')

In [None]:
%%time
# Download Chest X-Ray Pneumonia (~2.3 GB)
import kagglehub, shutil, os

cache_path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")
print(f'Downloaded to cache: {cache_path}')

# Copy to data/chest_xray with correct structure
os.makedirs('data/chest_xray', exist_ok=True)
for item in ['train', 'test', 'val']:
    src = os.path.join(cache_path, 'chest_xray', item)
    if not os.path.exists(src):
        src = os.path.join(cache_path, item)
    dst = f'data/chest_xray/{item}'
    if os.path.exists(src) and not os.path.exists(dst):
        shutil.copytree(src, dst)
        print(f'  Copied {item}')

# Remove macOS junk
shutil.rmtree('data/chest_xray/__MACOSX', ignore_errors=True)

print('Chest X-Ray ready:')
!find data/chest_xray -name '*.jpeg' -o -name '*.jpg' -o -name '*.png' | wc -l

In [None]:
%%time
# Download Skin Cancer (~325 MB)
cache_path = kagglehub.dataset_download("fanconic/skin-cancer-malignant-vs-benign")
print(f'Downloaded to cache: {cache_path}')

# Copy to data/Skin Cancer
dst = 'data/Skin Cancer'
if not os.path.exists(dst):
    shutil.copytree(cache_path, dst)

print('Skin Cancer ready:')
!find "data/Skin Cancer" -name '*.jpg' -o -name '*.jpeg' -o -name '*.png' | wc -l

In [None]:
%%time
# Download Brain Tumor (~250 MB)
import glob

cache_path = kagglehub.dataset_download("masoudnickparvar/brain-tumor-mri-dataset")
print(f'Downloaded to cache: {cache_path}')

# Copy classes to data/Brain_Tumor (merge Training/Testing if split)
os.makedirs('data/Brain_Tumor', exist_ok=True)

for root, dirs, files in os.walk(cache_path):
    for d in dirs:
        d_lower = d.lower()
        if d_lower in ['glioma', 'meningioma', 'pituitary', 'notumor', 'no_tumor', 'healthy']:
            target = 'healthy' if d_lower in ['notumor', 'no_tumor'] else d_lower
            src = os.path.join(root, d)
            dst = f'data/Brain_Tumor/{target}'
            if not os.path.exists(dst):
                shutil.copytree(src, dst)
            else:
                for f in os.listdir(src):
                    src_f = os.path.join(src, f)
                    dst_f = os.path.join(dst, f)
                    if os.path.isfile(src_f) and not os.path.exists(dst_f):
                        shutil.copy2(src_f, dst_f)

print('Brain Tumor ready:')
!find data/Brain_Tumor -name '*.jpg' -o -name '*.jpeg' -o -name '*.png' | wc -l
!ls data/Brain_Tumor/

In [None]:
# Verify all datasets
# Imported here so the cell does not depend on imports made by the download cells.
import glob
import os

IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def summarize_dataset(ds_path):
    """Return ``(image_count, subdirectory_names)`` for *ds_path*.

    Images are files anywhere below *ds_path* whose name ends (case-insensitively)
    with one of IMAGE_EXTENSIONS. Subdirectory names are the immediate child
    directories, sorted for stable output. Returns ``None`` when *ds_path* is
    not an existing directory, so the caller can report it instead of crashing.
    """
    if not os.path.isdir(ds_path):
        return None
    count = sum(
        1
        for path in glob.iglob(f'{ds_path}/**/*.*', recursive=True)
        if path.lower().endswith(IMAGE_EXTENSIONS)
    )
    subdirs = sorted(
        d for d in os.listdir(ds_path) if os.path.isdir(os.path.join(ds_path, d))
    )
    return count, subdirs


print('=== Dataset Summary ===')
for ds_name, ds_path in [('Chest X-Ray', 'data/chest_xray'),
                          ('Skin Cancer', 'data/Skin Cancer'),
                          ('Brain Tumor', 'data/Brain_Tumor')]:
    summary = summarize_dataset(ds_path)
    if summary is None:
        # A skipped or failed download cell leaves no folder behind
        print(f'  {ds_name:15s}: MISSING ({ds_path} not found)')
        continue
    count, subdirs = summary
    print(f'  {ds_name:15s}: {count:5d} images, classes: {subdirs}')

## 3. Patch Script for Colab + Drive Checkpoint

Redirect checkpoint output to Google Drive for persistence across sessions.

In [None]:
# Point the benchmark module's output directory at Google Drive
import sys

FRAMEWORK_ROOT = '/content/FL-EHDS-FLICS2026/fl-ehds-framework'
sys.path.insert(0, FRAMEWORK_ROOT)

import benchmarks.run_imaging_dp as dp_module
from pathlib import Path

# Replace the module-level OUTPUT_DIR with the Drive folder
dp_module.OUTPUT_DIR = Path(DRIVE_OUTPUT)
checkpoint_file = dp_module.OUTPUT_DIR / dp_module.CHECKPOINT_FILE
print(f'Checkpoint directory: {dp_module.OUTPUT_DIR}')
print(f'Checkpoint file: {checkpoint_file}')

## 4. Quick Validation (Optional)

Run a quick 3-round test to verify everything works before the full run.

In [None]:
# Smoke test: 1 algo x 1 dataset x 1 epsilon x 1 seed x 3 rounds
# Should complete in ~2-3 minutes
import sys
import os

FRAMEWORK_DIR = '/content/FL-EHDS-FLICS2026/fl-ehds-framework'
sys.path.insert(0, FRAMEWORK_DIR)
os.chdir(FRAMEWORK_DIR)

# Provides IMAGING_CONFIG and run_single_imaging used below
from benchmarks.run_imaging_dp import *
from pathlib import Path

# Base imaging config with a shortened schedule; early stopping switched off
config = dict(IMAGING_CONFIG, num_rounds=3, local_epochs=1)
es_config = {'enabled': False}

print('Quick validation: chest_xray / FedAvg / eps=10 / seed=42 / 3 rounds')
quick_run = dict(
    ds_name='chest_xray',
    data_dir=str(Path('.') / 'data' / 'chest_xray'),
    algorithm='FedAvg',
    dp_epsilon=10,
    seed=42,
    config=config,
    es_config=es_config,
    exp_idx=1,
    total_exps=1,
)
result = run_single_imaging(**quick_run)
best = result["best_metrics"]
print(f'Result: {best}')
print('Validation OK!')

## 5. Run Full DP Experiments

**108 experiments** = 3 algos (FedAvg, Ditto, HPFL) × 3 datasets × 4 DP levels (No-DP, eps=1, 5, 10) × 3 seeds

Checkpoint saved to Google Drive after **every round** (~1-2 min).

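If the session disconnects, re-run this cell — it auto-resumes from the last checkpoint. If the runtime itself was reset (files under `/content` are gone), first re-run Sections 1–2 to remount Drive, re-clone the repository and re-download the datasets.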

In [None]:
%%time
# Full DP experiment run with checkpoint on Google Drive
# Auto-resumes if session disconnects

# NOTE: subprocess is imported but not used in this cell.
import subprocess
import sys

# Create a wrapper script that patches OUTPUT_DIR before running
# The wrapper is generated as text because it is executed by a separate
# `python` process (see the last line), which cannot see notebook variables.
# The f-string therefore bakes the current value of DRIVE_OUTPUT into the
# script source. Lines starting with '#' inside the template are part of the
# generated script, not comments of this cell.
wrapper_code = f'''
import sys, os
sys.path.insert(0, '/content/FL-EHDS-FLICS2026/fl-ehds-framework')
os.chdir('/content/FL-EHDS-FLICS2026/fl-ehds-framework')

# Patch OUTPUT_DIR to Google Drive before importing main()
import benchmarks.run_imaging_dp as dp_mod
from pathlib import Path
dp_mod.OUTPUT_DIR = Path("{DRIVE_OUTPUT}")

# Patch sys.argv for argparse
sys.argv = ["run_imaging_dp.py"]

# Run
dp_mod.main()
'''

# Write the generated script to a temporary file ...
with open('/tmp/run_dp.py', 'w') as f:
    f.write(wrapper_code)

# ... and execute it as a child process of the notebook.
!python /tmp/run_dp.py

## 5b. Alternative: Run One Dataset at a Time

If session time is limited, run one dataset per session.

In [None]:
# Run only Chest X-Ray (36 experiments, ~2-3h on T4)
# Change to Brain_Tumor or Skin_Cancer as needed
# NOTE(review): the download cell stores the skin dataset under 'data/Skin Cancer'
# (with a space); confirm that run_imaging_dp maps 'Skin_Cancer' to that folder.
DATASET = "chest_xray"  # Options: chest_xray, Brain_Tumor, Skin_Cancer

# Same generated-wrapper approach as the full run: the f-string bakes
# DRIVE_OUTPUT and DATASET into the script text, since the child process cannot
# see notebook variables. DATASET is passed as the value of `--dataset`;
# presumably this restricts the run to that dataset — the accepted values are
# defined by run_imaging_dp's argument parser, which is not shown here.
wrapper_code = f'''
import sys, os
sys.path.insert(0, '/content/FL-EHDS-FLICS2026/fl-ehds-framework')
os.chdir('/content/FL-EHDS-FLICS2026/fl-ehds-framework')

import benchmarks.run_imaging_dp as dp_mod
from pathlib import Path
dp_mod.OUTPUT_DIR = Path("{DRIVE_OUTPUT}")

sys.argv = ["run_imaging_dp.py", "--dataset", "{DATASET}"]
dp_mod.main()
'''

# Write the generated script to a temporary file ...
with open('/tmp/run_dp_single.py', 'w') as f:
    f.write(wrapper_code)

# ... and execute it as a child process of the notebook.
!python /tmp/run_dp_single.py

## 6. Check Progress & Results

In [None]:
# Summarise the Drive checkpoint: overall progress plus a table of mean best
# accuracy (over the seeds finished so far) per dataset / algorithm / DP level
import json


def _seed_accuracies(finished, ds, algo, eps_tag, seeds=(42, 123, 456)):
    """Collect the best accuracy of each seed of one configuration that finished without error."""
    values = []
    for seed in seeds:
        record = finished.get(f'{ds}_{algo}_{eps_tag}_s{seed}', {})
        # Skip seeds with no entry (empty) and seeds whose entry carries an 'error' key
        if record and 'error' not in record:
            values.append(record.get('best_metrics', {}).get('accuracy', 0))
    return values


ckpt_path = f'{DRIVE_OUTPUT}/checkpoint_imaging_dp.json'
if not os.path.exists(ckpt_path):
    print('No checkpoint found yet.')
else:
    with open(ckpt_path) as f:
        data = json.load(f)

    completed = data.get('completed', {})
    in_progress = data.get('in_progress', {})
    total = data.get('metadata', {}).get('total_experiments', '?')

    # Entries with an 'error' key count as failures, everything else as successes
    n_ok = sum('error' not in v for v in completed.values())
    n_err = len(completed) - n_ok
    print(f'Completed: {n_ok}/{total} (errors: {n_err})')

    if in_progress:
        current_key = in_progress.get("key", "?")
        current_round = in_progress.get("round", "?")
        planned_rounds = in_progress.get("total_rounds", "?")
        print(f'In progress: {current_key} round {current_round}/{planned_rounds}')

    # Summary table
    import numpy as np
    print(f'\n{"Dataset":<14} {"Algorithm":<10} {"No-DP":>8} {"eps=1":>8} {"eps=5":>8} {"eps=10":>8}')
    print('-' * 62)

    for ds in ['chest_xray', 'Brain_Tumor', 'Skin_Cancer']:
        for algo in ['FedAvg', 'Ditto', 'HPFL']:
            cells = [f'{ds:<14} {algo:<10}']
            for eps in [None, 1, 5, 10]:
                # Checkpoint keys use 'noDP' for the run without DP and 'eps<N>' otherwise
                eps_tag = f'eps{eps}' if eps else 'noDP'
                accs = _seed_accuracies(completed, ds, algo, eps_tag)
                if accs:
                    cells.append(f' {100*np.mean(accs):>7.1f}%')
                else:
                    cells.append(f' {"--":>8}')
            print(''.join(cells))

## 7. Download Results

In [None]:
# Download the checkpoint JSON (and the experiment log, when present) for local integration
from google.colab import files

ckpt_path = f'{DRIVE_OUTPUT}/checkpoint_imaging_dp.json'
log_path = f'{DRIVE_OUTPUT}/experiment_imaging_dp.log'

if not os.path.exists(ckpt_path):
    print('No checkpoint to download yet.')
else:
    files.download(ckpt_path)
    print('Downloaded: checkpoint_imaging_dp.json')

    # The log is only offered together with a checkpoint, and only if it exists
    if os.path.exists(log_path):
        files.download(log_path)