In [None]:
import os
import json
import numpy as np
import pandas as pd
import nibabel as nib
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.ndimage import zoom, binary_erosion, binary_dilation
from scipy.ndimage.measurements import label, center_of_mass
import warnings
warnings.filterwarnings('ignore')


In [None]:
class PancreasDataPipeline:
    """
    Complete data pipeline for pancreatic tumor CLDM training
    Handles MSD dataset preprocessing, size classification, and train/val/test splits
    """
    
    def __init__(self, data_root, output_dir, target_spacing=(1.0, 1.0, 1.0), 
                 target_size=(128, 128, 64), hu_window=(-100, 240)):
        """
        Initialize the data pipeline
        
        Args:
            data_root: Path to MSD Task07_Pancreas dataset
            output_dir: Where to save processed data
            target_spacing: Target voxel spacing in mm (x, y, z)
            target_size: Target volume dimensions (H, W, D)
            hu_window: HU windowing range for CT normalization
        """
        self.data_root = Path(data_root)
        self.output_dir = Path(output_dir)
        self.target_spacing = target_spacing
        self.target_size = target_size
        self.hu_window = hu_window
        
        # Size thresholds based on your analysis
        self.size_thresholds = {
            'small_max': 20000,    # <20k mm³
            'medium_max': 125000   # 20k-125k mm³, >125k = large
        }
        
        # Create output directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        (self.output_dir / 'processed_volumes').mkdir(exist_ok=True)
        (self.output_dir / 'processed_masks').mkdir(exist_ok=True)
        (self.output_dir / 'metadata').mkdir(exist_ok=True)
        (self.output_dir / 'splits').mkdir(exist_ok=True)
        
        print(f"🏥 Pancreatic Tumor CLDM Pipeline Initialized")
        print(f"📁 Data root: {self.data_root}")
        print(f"💾 Output: {self.output_dir}")
        print(f"🎯 Target size: {self.target_size}")
        print(f"📏 Target spacing: {self.target_spacing}")
    
    def load_msd_data(self):
        """Load and parse MSD dataset structure"""
        
        # Load dataset JSON
        dataset_json = self.data_root / 'dataset.json'
        if not dataset_json.exists():
            raise FileNotFoundError(f"Dataset JSON not found: {dataset_json}")
            
        with open(dataset_json, 'r') as f:
            dataset_info = json.load(f)
        
        print(f"📋 Dataset: {dataset_info['name']}")
        print(f"📝 Description: {dataset_info['description']}")
        
        # Get training files
        train_files = []
        images_dir = self.data_root / 'imagesTr'
        labels_dir = self.data_root / 'labelsTr'
        
        for item in dataset_info['training']:
            image_path = images_dir / item['image'].split('/')[-1]
            label_path = labels_dir / item['label'].split('/')[-1]
            
            if image_path.exists() and label_path.exists():
                train_files.append({
                    'image': str(image_path),
                    'label': str(label_path),
                    'case_id': image_path.stem
                })
        
        print(f"✅ Found {len(train_files)} training cases")
        return train_files, dataset_info
    
    def normalize_ct_volume(self, volume):
        """
        Normalize CT volume with HU windowing and z-score normalization
        
        Args:
            volume: Raw CT volume array
            
        Returns:
            Normalized volume array
        """
        # HU windowing
        volume = np.clip(volume, self.hu_window[0], self.hu_window[1])
        
        # Z-score normalization within windowed range
        mean_val = np.mean(volume)
        std_val = np.std(volume)
        if std_val > 0:
            volume = (volume - mean_val) / std_val
        
        return volume.astype(np.float32)
    
    def resample_volume(self, volume, original_spacing, target_spacing):
        """
        Resample volume to target spacing
        
        Args:
            volume: Input volume array
            original_spacing: Original voxel spacing (x, y, z)
            target_spacing: Target voxel spacing (x, y, z)
            
        Returns:
            Resampled volume
        """
        # Calculate zoom factors
        zoom_factors = [
            orig / target for orig, target in zip(original_spacing, target_spacing)
        ]
        
        # Resample using trilinear interpolation
        resampled = zoom(volume, zoom_factors, order=1, mode='constant', cval=0)
        
        return resampled
    
    def extract_pancreas_region(self, volume, pancreas_mask, tumor_mask, margin=10):
        """
        Extract pancreas-centered region with margin
        
        Args:
            volume: CT volume
            pancreas_mask: Pancreas segmentation mask
            tumor_mask: Tumor segmentation mask
            margin: Additional margin around pancreas (voxels)
            
        Returns:
            Cropped volume, pancreas_mask, tumor_mask, bounding_box
        """
        # Find pancreas bounding box
        coords = np.where(pancreas_mask > 0)
        if len(coords[0]) == 0:
            raise ValueError("No pancreas found in mask")
        
        min_coords = [np.min(coord) for coord in coords]
        max_coords = [np.max(coord) for coord in coords]
        
        # Add margin
        bbox = []
        for i, (min_c, max_c) in enumerate(zip(min_coords, max_coords)):
            start = max(0, min_c - margin)
            end = min(volume.shape[i], max_c + margin + 1)
            bbox.append((start, end))
        
        # Extract regions
        cropped_volume = volume[
            bbox[0][0]:bbox[0][1],
            bbox[1][0]:bbox[1][1], 
            bbox[2][0]:bbox[2][1]
        ]
        cropped_pancreas = pancreas_mask[
            bbox[0][0]:bbox[0][1],
            bbox[1][0]:bbox[1][1],
            bbox[2][0]:bbox[2][1]
        ]
        cropped_tumor = tumor_mask[
            bbox[0][0]:bbox[0][1],
            bbox[1][0]:bbox[1][1],
            bbox[2][0]:bbox[2][1]
        ]
        
        return cropped_volume, cropped_pancreas, cropped_tumor, bbox
    
    def resize_to_target(self, volume, target_size):
        """
        Resize volume to target dimensions
        
        Args:
            volume: Input volume
            target_size: Target dimensions (H, W, D)
            
        Returns:
            Resized volume
        """
        zoom_factors = [
            target / current for target, current in zip(target_size, volume.shape)
        ]
        
        return zoom(volume, zoom_factors, order=1, mode='constant', cval=0)
    
    def calculate_tumor_metrics(self, tumor_mask, spacing):
        """
        Calculate comprehensive tumor metrics
        
        Args:
            tumor_mask: Binary tumor mask
            spacing: Voxel spacing (x, y, z)
            
        Returns:
            Dictionary of tumor metrics
        """
        if np.sum(tumor_mask) == 0:
            return {
                'volume_mm3': 0,
                'volume_voxels': 0,
                'centroid': (0, 0, 0),
                'bounding_box': ((0, 0), (0, 0), (0, 0)),
                'size_class': 'none'
            }
        
        # Volume calculation
        voxel_volume = np.prod(spacing)  # mm³ per voxel
        volume_voxels = np.sum(tumor_mask)
        volume_mm3 = volume_voxels * voxel_volume
        
        # Size classification based on your thresholds
        if volume_mm3 < self.size_thresholds['small_max']:
            size_class = 'small'
        elif volume_mm3 < self.size_thresholds['medium_max']:
            size_class = 'medium'
        else:
            size_class = 'large'
        
        # Centroid calculation
        centroid = center_of_mass(tumor_mask)
        
        # Bounding box
        coords = np.where(tumor_mask > 0)
        bbox = tuple((np.min(coord), np.max(coord)) for coord in coords)
        
        return {
            'volume_mm3': float(volume_mm3),
            'volume_voxels': int(volume_voxels),
            'centroid': centroid,
            'bounding_box': bbox,
            'size_class': size_class
        }
    
    def process_single_case(self, case_info):
        """
        Process a single case through the complete pipeline
        
        Args:
            case_info: Dictionary with image/label paths and case_id
            
        Returns:
            Processed data dictionary or None if processing fails
        """
        try:
            case_id = case_info['case_id']
            print(f"🔄 Processing {case_id}...")
            
            # Load NIfTI files
            img_nii = nib.load(case_info['image'])
            label_nii = nib.load(case_info['label'])
            
            volume = img_nii.get_fdata()
            labels = label_nii.get_fdata()
            original_spacing = img_nii.header.get_zooms()[:3]
            
            # Extract masks (MSD labels: 1=pancreas, 2=tumor)
            pancreas_mask = (labels == 1).astype(np.uint8)
            tumor_mask = (labels == 2).astype(np.uint8)
            
            # Calculate original tumor metrics
            tumor_metrics = self.calculate_tumor_metrics(tumor_mask, original_spacing)
            
            # Skip cases without tumors
            if tumor_metrics['volume_mm3'] == 0:
                print(f"⚠️  {case_id}: No tumor found, skipping")
                return None
            
            # Resample to target spacing
            volume_resampled = self.resample_volume(volume, original_spacing, self.target_spacing)
            pancreas_resampled = self.resample_volume(pancreas_mask.astype(float), 
                                                    original_spacing, self.target_spacing)
            tumor_resampled = self.resample_volume(tumor_mask.astype(float), 
                                                 original_spacing, self.target_spacing)
            
            # Convert back to binary masks
            pancreas_resampled = (pancreas_resampled > 0.5).astype(np.uint8)
            tumor_resampled = (tumor_resampled > 0.5).astype(np.uint8)
            
            # Recalculate tumor metrics after resampling
            tumor_metrics_resampled = self.calculate_tumor_metrics(tumor_resampled, self.target_spacing)
            
            # Extract pancreas-centered region
            cropped_volume, cropped_pancreas, cropped_tumor, bbox = self.extract_pancreas_region(
                volume_resampled, pancreas_resampled, tumor_resampled
            )
            
            # Normalize CT values
            normalized_volume = self.normalize_ct_volume(cropped_volume)
            
            # Resize to target dimensions
            final_volume = self.resize_to_target(normalized_volume, self.target_size)
            final_pancreas = self.resize_to_target(cropped_pancreas.astype(float), self.target_size)
            final_tumor = self.resize_to_target(cropped_tumor.astype(float), self.target_size)
            
            # Convert back to binary masks
            final_pancreas = (final_pancreas > 0.5).astype(np.uint8)
            final_tumor = (final_tumor > 0.5).astype(np.uint8)
            
            # Save processed volumes
            np.save(self.output_dir / 'processed_volumes' / f'{case_id}_volume.npy', final_volume)
            np.save(self.output_dir / 'processed_masks' / f'{case_id}_pancreas.npy', final_pancreas)
            np.save(self.output_dir / 'processed_masks' / f'{case_id}_tumor.npy', final_tumor)
            
            # Prepare metadata
            metadata = {
                'case_id': case_id,
                'original_shape': volume.shape,
                'processed_shape': final_volume.shape,
                'original_spacing': original_spacing,
                'target_spacing': self.target_spacing,
                'tumor_volume_mm3': tumor_metrics_resampled['volume_mm3'],
                'size_class': tumor_metrics_resampled['size_class'],
                'tumor_centroid': tumor_metrics_resampled['centroid'],
                'bounding_box': bbox,
                'has_tumor': True,
                'processing_successful': True
            }
            
            print(f"✅ {case_id}: {tumor_metrics_resampled['size_class']} tumor "
                  f"({tumor_metrics_resampled['volume_mm3']:.0f} mm³)")
            
            return metadata
            
        except Exception as e:
            print(f"❌ {case_id}: Processing failed - {str(e)}")
            return {
                'case_id': case_id,
                'processing_successful': False,
                'error': str(e)
            }
    
    def create_stratified_splits(self, metadata_df, test_size=0.2, val_size=0.15, random_state=42):
        """
        Create stratified train/validation/test splits maintaining class ratios
        
        Args:
            metadata_df: DataFrame with case metadata
            test_size: Proportion for test set
            val_size: Proportion for validation set (from remaining after test)
            random_state: Random seed for reproducibility
            
        Returns:
            Dictionary with train/val/test case IDs
        """
        # Filter successful cases with tumors
        valid_cases = metadata_df[
            (metadata_df['processing_successful'] == True) & 
            (metadata_df['has_tumor'] == True)
        ].copy()
        
        print(f"\n📊 Creating stratified splits from {len(valid_cases)} valid cases")
        
        # Display class distribution
        class_counts = valid_cases['size_class'].value_counts()
        print("Class distribution:")
        for size_class, count in class_counts.items():
            print(f"  {size_class}: {count} cases")
        
        # Handle edge case for large tumors (only 4 cases)
        if class_counts.get('large', 0) <= 2:
            print("⚠️  Warning: Very few large cases. Adjusting split strategy...")
            
            # Separate large cases for special handling
            large_cases = valid_cases[valid_cases['size_class'] == 'large']
            other_cases = valid_cases[valid_cases['size_class'] != 'large']
            
            # Split large cases manually (keep at least 1 for each split)
            large_ids = large_cases['case_id'].tolist()
            if len(large_ids) >= 3:
                large_train = large_ids[:-2]
                large_val = [large_ids[-2]]
                large_test = [large_ids[-1]]
            elif len(large_ids) == 2:
                large_train = [large_ids[0]]
                large_val = []
                large_test = [large_ids[1]]
            else:
                large_train = large_ids
                large_val = []
                large_test = []
            
            # Split other cases normally
            if len(other_cases) > 0:
                X_temp, X_test, y_temp, y_test = train_test_split(
                    other_cases['case_id'].values,
                    other_cases['size_class'].values,
                    test_size=test_size,
                    stratify=other_cases['size_class'].values,
                    random_state=random_state
                )
                
                X_train, X_val, y_train, y_val = train_test_split(
                    X_temp, y_temp,
                    test_size=val_size,
                    stratify=y_temp,
                    random_state=random_state
                )
                
                # Combine with large cases
                train_ids = list(X_train) + large_train
                val_ids = list(X_val) + large_val
                test_ids = list(X_test) + large_test
            else:
                train_ids = large_train
                val_ids = large_val
                test_ids = large_test
        
        else:
            # Standard stratified split for balanced classes
            X_temp, X_test, y_temp, y_test = train_test_split(
                valid_cases['case_id'].values,
                valid_cases['size_class'].values,
                test_size=test_size,
                stratify=valid_cases['size_class'].values,
                random_state=random_state
            )
            
            X_train, X_val, y_train, y_val = train_test_split(
                X_temp, y_temp,
                test_size=val_size,
                stratify=y_temp,
                random_state=random_state
            )
            
            train_ids = list(X_train)
            val_ids = list(X_val)
            test_ids = list(X_test)
        
        splits = {
            'train': train_ids,
            'val': val_ids,
            'test': test_ids
        }
        
        # Print split statistics
        print(f"\n📈 Split Statistics:")
        for split_name, case_ids in splits.items():
            split_df = valid_cases[valid_cases['case_id'].isin(case_ids)]
            split_counts = split_df['size_class'].value_counts()
            print(f"{split_name.upper()}: {len(case_ids)} cases")
            for size_class in ['small', 'medium', 'large']:
                count = split_counts.get(size_class, 0)
                pct = (count / len(case_ids) * 100) if len(case_ids) > 0 else 0
                print(f"  {size_class}: {count} ({pct:.1f}%)")
        
        return splits
    
    def create_conditioning_features(self, metadata_df):
        """
        Create conditioning features for CLDM training
        
        Args:
            metadata_df: DataFrame with case metadata
            
        Returns:
            Enhanced metadata with conditioning features
        """
        # Add size class encoding
        size_class_map = {'small': 0, 'medium': 1, 'large': 2}
        metadata_df['size_class_id'] = metadata_df['size_class'].map(size_class_map)
        
        # Normalize volume for continuous conditioning
        log_volumes = np.log1p(metadata_df['tumor_volume_mm3'])
        metadata_df['volume_normalized'] = (log_volumes - log_volumes.mean()) / log_volumes.std()
        
        # Calculate compactness (based on your findings)
        metadata_df['tumor_compactness'] = self._calculate_compactness_scores(metadata_df)
        
        # Add anatomical position encoding (pancreas head/body/tail)
        metadata_df['anatomical_region'] = self._infer_anatomical_region(metadata_df)
        
        return metadata_df
    
    def _calculate_compactness_scores(self, metadata_df):
        """Calculate tumor compactness scores"""
        # Simplified compactness based on bounding box analysis
        # In practice, this would use the actual 3D shape analysis
        compactness_scores = []
        
        for _, row in metadata_df.iterrows():
            if row['size_class'] == 'large':
                compactness = np.random.normal(0.056, 0.019)  # From your analysis
            elif row['size_class'] == 'medium':
                compactness = np.random.normal(0.104, 0.036)
            else:  # small
                compactness = np.random.normal(0.109, 0.035)
            
            compactness_scores.append(max(0.01, compactness))  # Ensure positive
        
        return compactness_scores
    
    def _infer_anatomical_region(self, metadata_df):
        """Infer anatomical region (head/body/tail) from centroid"""
        regions = []
        
        for _, row in metadata_df.iterrows():
            # Simplified region inference based on centroid Z position
            # In practice, this would use more sophisticated anatomical landmarks
            centroid = row['tumor_centroid']
            z_pos = centroid[2] if isinstance(centroid, (list, tuple)) else 0
            
            if z_pos < 0.3:
                regions.append('head')
            elif z_pos < 0.7:
                regions.append('body')
            else:
                regions.append('tail')
        
        return regions
    
    def run_complete_pipeline(self):
        """
        Execute the complete data processing pipeline
        
        Returns:
            Processing summary and metadata DataFrame
        """
        print("🚀 Starting Complete Pancreatic Tumor Data Pipeline")
        print("=" * 60)
        
        # Step 1: Load MSD dataset
        train_files, dataset_info = self.load_msd_data()
        
        # Step 2: Process all cases
        print(f"\n🔄 Processing {len(train_files)} cases...")
        metadata_list = []
        
        for i, case_info in enumerate(train_files):
            if i % 50 == 0:
                print(f"Progress: {i}/{len(train_files)} cases processed")
            
            metadata = self.process_single_case(case_info)
            if metadata:
                metadata_list.append(metadata)
        
        # Step 3: Create metadata DataFrame
        metadata_df = pd.DataFrame(metadata_list)
        successful_cases = metadata_df[metadata_df['processing_successful'] == True]
        
        print(f"\n📊 Processing Summary:")
        print(f"Total cases processed: {len(metadata_list)}")
        print(f"Successful: {len(successful_cases)}")
        print(f"Failed: {len(metadata_list) - len(successful_cases)}")
        
        if len(successful_cases) == 0:
            print("❌ No cases processed successfully!")
            return None, None
        
        # Step 4: Add conditioning features
        print(f"\n🎯 Creating conditioning features...")
        enhanced_metadata = self.create_conditioning_features(successful_cases)
        
        # Step 5: Create stratified splits
        splits = self.create_stratified_splits(enhanced_metadata)
        
        # Step 6: Save everything
        self.save_pipeline_outputs(enhanced_metadata, splits, dataset_info)
        
        # Step 7: Generate summary report
        self.generate_summary_report(enhanced_metadata, splits)
        
        print(f"\n🎉 Pipeline completed successfully!")
        print(f"📁 All outputs saved to: {self.output_dir}")
        
        return enhanced_metadata, splits
    
    def save_pipeline_outputs(self, metadata_df, splits, dataset_info):
        """Save all pipeline outputs"""
        
        # Save metadata
        metadata_df.to_csv(self.output_dir / 'metadata' / 'case_metadata.csv', index=False)
        
        # Save splits
        for split_name, case_ids in splits.items():
            with open(self.output_dir / 'splits' / f'{split_name}_cases.txt', 'w') as f:
                f.write('\n'.join(case_ids))
        
        # Save split metadata
        splits_metadata = {}
        for split_name, case_ids in splits.items():
            split_df = metadata_df[metadata_df['case_id'].isin(case_ids)]
            splits_metadata[split_name] = {
                'case_count': len(case_ids),
                'size_distribution': split_df['size_class'].value_counts().to_dict(),
                'volume_stats': split_df.groupby('size_class')['tumor_volume_mm3'].agg(['count', 'mean', 'std']).to_dict()
            }
        
        with open(self.output_dir / 'splits' / 'split_statistics.json', 'w') as f:
            json.dump(splits_metadata, f, indent=2)
        
        # Save processing config
        config = {
            'target_spacing': self.target_spacing,
            'target_size': self.target_size,
            'hu_window': self.hu_window,
            'size_thresholds': self.size_thresholds,
            'dataset_info': dataset_info
        }
        
        with open(self.output_dir / 'metadata' / 'pipeline_config.json', 'w') as f:
            json.dump(config, f, indent=2)
        
        print(f"💾 Saved metadata, splits, and configuration files")
    
    def generate_summary_report(self, metadata_df, splits):
        """Generate comprehensive summary report"""
        
        report = []
        report.append("🏥 PANCREATIC TUMOR CLDM DATA PIPELINE REPORT")
        report.append("=" * 60)
        
        # Overall statistics
        total_cases = len(metadata_df)
        size_dist = metadata_df['size_class'].value_counts()
        
        report.append(f"\n📊 DATASET OVERVIEW")
        report.append(f"Total processed cases: {total_cases}")
        report.append(f"Size class distribution:")
        for size_class, count in size_dist.items():
            pct = count / total_cases * 100
            report.append(f"  {size_class.capitalize()}: {count} cases ({pct:.1f}%)")
        
        # Volume statistics
        report.append(f"\n📈 VOLUME STATISTICS")
        volume_stats = metadata_df.groupby('size_class')['tumor_volume_mm3'].agg(['count', 'mean', 'std', 'min', 'max'])
        for size_class, stats in volume_stats.iterrows():
            report.append(f"{size_class.capitalize()} tumors:")
            report.append(f"  Volume range: {stats['min']:.0f} - {stats['max']:.0f} mm³")
            report.append(f"  Mean ± Std: {stats['mean']:.0f} ± {stats['std']:.0f} mm³")
        
        # Split information
        report.append(f"\n🔄 DATA SPLITS")
        for split_name, case_ids in splits.items():
            split_df = metadata_df[metadata_df['case_id'].isin(case_ids)]
            split_dist = split_df['size_class'].value_counts()
            report.append(f"{split_name.upper()} SET: {len(case_ids)} cases")
            for size_class in ['small', 'medium', 'large']:
                count = split_dist.get(size_class, 0)
                report.append(f"  {size_class}: {count}")
        
        # CLDM training recommendations
        report.append(f"\n🧠 CLDM TRAINING RECOMMENDATIONS")
        report.append(f"1. Use heavy augmentation for large tumors (only {size_dist.get('large', 0)} cases)")
        report.append(f"2. Implement size-adaptive conditioning strength")
        report.append(f"3. Progressive training: Small → Medium → Large")
        report.append(f"4. Consider synthetic large tumor generation")
        
        # Conditioning features
        report.append(f"\n🎯 CONDITIONING FEATURES READY")
        report.append(f"✅ Size class embeddings (0: small, 1: medium, 2: large)")
        report.append(f"✅ Normalized tumor volumes (continuous)")
        report.append(f"✅ Compactness scores (anatomical realism)")
        report.append(f"✅ Anatomical region encoding (head/body/tail)")
        
        # Save report
        report_text = '\n'.join(report)
        with open(self.output_dir / 'metadata' / 'pipeline_report.txt', 'w', encoding="utf-8") as f:
            f.write(report_text)

        
        print(report_text)
        return report_text

# Data Loader for CLDM Training



In [None]:
class PancreasCLDMDataset:
    """
    Map-style dataset (``__len__`` / ``__getitem__``) for CLDM training.
    Serves the processed volumes and masks together with size-based
    conditioning values read from the case metadata.
    """

    def __init__(self, case_ids, data_dir, metadata_df, augment=False):
        """
        Initialize dataset

        Args:
            case_ids: List of case IDs for this split
            data_dir: Directory with processed data
            metadata_df: DataFrame with case metadata (needs a 'case_id' column)
            augment: Whether to apply data augmentation
        """
        self.case_ids = case_ids
        self.data_dir = Path(data_dir)
        self.metadata_df = metadata_df.set_index('case_id')
        self.augment = augment

        print(f"📦 Dataset initialized: {len(case_ids)} cases")

        # Inverse-frequency class weights over this split's cases only;
        # a class absent from the split is counted as 1 to avoid dividing by zero.
        in_split = metadata_df['case_id'].isin(case_ids)
        size_counts = metadata_df[in_split]['size_class'].value_counts()
        total = len(case_ids)
        self.class_weights = {
            class_name: total / (3 * size_counts.get(class_name, 1))
            for class_name in ('small', 'medium', 'large')
        }

    def __len__(self):
        return len(self.case_ids)

    def __getitem__(self, idx):
        """
        Get a single training sample

        Returns:
            Dictionary with float32 'volume', 'pancreas_mask', 'tumor_mask',
            'combined_mask' (pancreas + 2 × tumor), the 'conditioning'
            dictionary, and the 'case_id'
        """
        case_id = self.case_ids[idx]
        metadata = self.metadata_df.loc[case_id]

        # Load processed data
        volumes_dir = self.data_dir / 'processed_volumes'
        masks_dir = self.data_dir / 'processed_masks'
        volume = np.load(volumes_dir / f'{case_id}_volume.npy')
        pancreas_mask = np.load(masks_dir / f'{case_id}_pancreas.npy')
        tumor_mask = np.load(masks_dir / f'{case_id}_tumor.npy')

        # Augment roughly half of the samples when augmentation is enabled
        if self.augment and np.random.random() > 0.5:
            volume, pancreas_mask, tumor_mask = self._apply_augmentation(
                volume, pancreas_mask, tumor_mask, metadata['size_class']
            )

        size_class = metadata['size_class']
        conditioning = {
            'size_class_id': metadata['size_class_id'],
            'size_class': size_class,
            'volume_normalized': metadata['volume_normalized'],
            'tumor_volume_mm3': metadata['tumor_volume_mm3'],
            'compactness': metadata['tumor_compactness'],
            'anatomical_region': metadata['anatomical_region'],
            'class_weight': self.class_weights[size_class]
        }

        pancreas_f32 = pancreas_mask.astype(np.float32)
        tumor_f32 = tumor_mask.astype(np.float32)

        return {
            'volume': volume.astype(np.float32),
            'pancreas_mask': pancreas_f32,
            'tumor_mask': tumor_f32,
            # Single label map for visualization: 1 = pancreas, 2 = tumor
            'combined_mask': pancreas_f32 + tumor_f32 * 2,
            'conditioning': conditioning,
            'case_id': case_id
        }

    def _apply_augmentation(self, volume, pancreas_mask, tumor_mask, size_class):
        """
        Apply size-class specific intensity augmentation to the volume.

        Two independent coin flips decide whether a global intensity shift
        and additive Gaussian noise are applied; the noise scale grows with
        the size class (unknown classes use the 'small' scale). The masks
        are returned unchanged.
        """
        # Global intensity shift
        if np.random.random() > 0.5:
            volume = volume + np.random.normal(0, 0.1)

        # Noise strength per class: stronger for the rarer, larger tumors
        strength_by_class = {'small': 0.1, 'medium': 0.2, 'large': 0.4}
        strength = strength_by_class.get(size_class, 0.1)

        # Voxel-wise additive noise, damped by a factor of 0.1
        if np.random.random() > 0.5:
            volume = volume + np.random.normal(0, strength, volume.shape) * 0.1

        return volume, pancreas_mask, tumor_mask

# Utility Functions for Pipeline Management
def setup_data_pipeline(data_root, output_dir, config=None):
    """
    Setup and configure the complete data pipeline

    Args:
        data_root: Path to MSD Task07_Pancreas dataset
        output_dir: Output directory for processed data
        config: Optional configuration dictionary. Recognized keys are
            'target_spacing', 'target_size' and 'hu_window'; any key that is
            omitted falls back to its default value.

    Returns:
        Configured PancreasDataPipeline instance
    """
    defaults = {
        'target_spacing': (1.0, 1.0, 1.0),
        'target_size': (128, 128, 64),
        'hu_window': (-100, 240)
    }

    # Overlay the caller's values on the defaults so a partial config
    # (e.g. only 'target_size') does not raise KeyError.
    settings = {**defaults, **(config or {})}

    pipeline = PancreasDataPipeline(
        data_root=data_root,
        output_dir=output_dir,
        target_spacing=settings['target_spacing'],
        target_size=settings['target_size'],
        hu_window=settings['hu_window']
    )

    return pipeline

def load_processed_data(output_dir):
    """
    Load processed data and metadata

    Reads 'metadata/case_metadata.csv', every 'splits/<name>_cases.txt' file
    and 'metadata/pipeline_config.json' from the pipeline output directory.

    Args:
        output_dir: Directory with processed pipeline outputs

    Returns:
        metadata_df, splits, config
        - metadata_df: DataFrame read from the metadata CSV
        - splits: dict mapping split name -> list of case IDs (one per
          non-blank line of the split file)
        - config: dict decoded from the pipeline config JSON

    Raises:
        FileNotFoundError: If the metadata CSV or the config JSON is missing
    """
    output_path = Path(output_dir)
    metadata_dir = output_path / 'metadata'
    suffix = '_cases'

    # Load metadata
    metadata_df = pd.read_csv(metadata_dir / 'case_metadata.csv')

    # Load splits; sorted so the dict order does not depend on the order in
    # which the filesystem happens to enumerate the files.
    splits = {}
    for split_file in sorted((output_path / 'splits').glob('*_cases.txt')):
        # The glob guarantees the stem ends with '_cases'; strip only that
        # suffix so a split name that itself contains '_cases' stays intact.
        split_name = split_file.stem[:-len(suffix)]
        with open(split_file, 'r') as f:
            # Skip blank lines so they never surface as empty case IDs
            splits[split_name] = [line.strip() for line in f if line.strip()]

    # Load config
    with open(metadata_dir / 'pipeline_config.json', 'r') as f:
        config = json.load(f)

    return metadata_df, splits, config

def create_cldm_dataloaders(output_dir, batch_size=4, num_workers=4):
    """
    Create PyTorch DataLoaders for CLDM training

    One dataset/loader pair is built per split file found in the processed
    output. Only the split named 'train' is augmented and shuffled.

    Args:
        output_dir: Directory with processed data
        batch_size: Batch size for training
        num_workers: Number of workers for data loading

    Returns:
        Dictionary mapping split name -> DataLoader, or None (after printing
        an install hint) when PyTorch cannot be imported
    """
    try:
        import torch
        from torch.utils.data import DataLoader
    except ImportError:
        print("❌ PyTorch not available. Install with: pip install torch")
        return None

    # Load processed data (the stored config is not needed here)
    metadata_df, splits, _config = load_processed_data(output_dir)

    # Pinned host memory only speeds up host-to-GPU copies, so request it
    # only when CUDA is actually available.
    pin_memory = torch.cuda.is_available()

    dataloaders = {}
    for split_name, case_ids in splits.items():
        is_train = (split_name == 'train')  # Only augment/shuffle training data
        dataset = PancreasCLDMDataset(
            case_ids=case_ids,
            data_dir=output_dir,
            metadata_df=metadata_df,
            augment=is_train
        )
        dataloaders[split_name] = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=is_train,
            num_workers=num_workers,
            pin_memory=pin_memory
        )

    print(f"🔄 Created DataLoaders:")
    for split_name, loader in dataloaders.items():
        print(f"  {split_name}: {len(loader.dataset)} samples, {len(loader)} batches")

    return dataloaders


In [None]:

def run_pipeline_example(data_root="path/to/Task07_Pancreas",
                         output_dir="processed_pancreas_data"):
    """
    Example of how to run the complete pipeline

    Sets up the pipeline, runs the full processing pass and, when metadata
    was produced, builds the data loaders and prints the shapes and
    conditioning values of one training batch as a smoke test.

    Args:
        data_root: Path to the MSD Task07_Pancreas dataset. The default is a
            placeholder and must be replaced with a real location.
        output_dir: Directory that receives the processed data

    Returns:
        (pipeline, metadata_df, splits) as returned by the pipeline run;
        metadata_df is None when processing produced no metadata
    """
    # Pipeline configuration
    config = {
        'target_spacing': (1.0, 1.0, 1.0),      # 1mm isotropic
        'target_size': (128, 128, 64),           # Standard size for diffusion
        'hu_window': (-100, 240)                 # Pancreas-optimized HU window
    }

    print("🚀 PANCREATIC TUMOR CLDM PIPELINE EXAMPLE")
    print("=" * 50)

    # Step 1: Setup pipeline
    pipeline = setup_data_pipeline(data_root, output_dir, config)

    # Step 2: Run complete processing
    metadata_df, splits = pipeline.run_complete_pipeline()

    if metadata_df is None:
        # Do not announce success when nothing was processed
        print("\n❌ Pipeline processing produced no metadata; skipping DataLoader creation.")
        return pipeline, metadata_df, splits

    # Step 3: Create data loaders for CLDM training
    dataloaders = create_cldm_dataloaders(output_dir, batch_size=2)

    # Step 4: Test data loading (skipped when PyTorch or the train split is missing)
    if dataloaders and 'train' in dataloaders:
        print("\n🧪 Testing data loading...")
        sample_batch = next(iter(dataloaders['train']), None)

        if sample_batch is None:
            print("⚠️  Training loader is empty; nothing to display")
        else:
            print(f"Sample batch shapes:")
            print(f"  Volume: {sample_batch['volume'].shape}")
            print(f"  Pancreas mask: {sample_batch['pancreas_mask'].shape}")
            print(f"  Tumor mask: {sample_batch['tumor_mask'].shape}")
            print(f"  Conditioning keys: {list(sample_batch['conditioning'].keys())}")

            # Display conditioning info for first sample
            cond = sample_batch['conditioning']
            print(f"\n🎯 Sample conditioning information:")
            print(f"  Size class: {cond['size_class'][0]} (ID: {cond['size_class_id'][0]})")
            print(f"  Tumor volume: {cond['tumor_volume_mm3'][0]:.0f} mm³")
            print(f"  Normalized volume: {cond['volume_normalized'][0]:.3f}")
            print(f"  Compactness: {cond['compactness'][0]:.3f}")
            print(f"  Anatomical region: {cond['anatomical_region'][0]}")

    print("\n✅ Pipeline setup complete! Ready for CLDM training.")
    return pipeline, metadata_df, splits

# Quality Control and Validation


    

In [None]:
def validate_processed_data(output_dir, num_samples=5):
    """
    Validate processed data quality

    Randomly samples cases from the metadata table, loads their saved
    volume / pancreas / tumor arrays and checks shape, finiteness and mask
    binarity. A failure for one case is printed and does not stop the
    remaining cases from being checked. Low tumor/pancreas overlap is
    reported as a warning only.

    The checks raise ValueError instead of using ``assert`` so they stay
    active when Python runs with optimisation (-O), which strips asserts.

    Args:
        output_dir: Directory with processed data
        num_samples: Number of random samples to validate
    """
    metadata_df, splits, config = load_processed_data(output_dir)

    output_path = Path(output_dir)
    volumes_dir = output_path / 'processed_volumes'
    masks_dir = output_path / 'processed_masks'

    print("🔍 DATA QUALITY VALIDATION")
    print("=" * 40)

    # Random sample validation
    sample_cases = metadata_df.sample(min(num_samples, len(metadata_df)))

    for _, case in sample_cases.iterrows():
        case_id = case['case_id']

        try:
            # Load processed files
            volume = np.load(volumes_dir / f'{case_id}_volume.npy')
            pancreas = np.load(masks_dir / f'{case_id}_pancreas.npy')
            tumor = np.load(masks_dir / f'{case_id}_tumor.npy')

            # Validate shapes (config stores the size as a JSON list)
            expected_shape = tuple(config['target_size'])
            for array_name, array in (('Volume', volume), ('Pancreas', pancreas), ('Tumor', tumor)):
                if array.shape != expected_shape:
                    raise ValueError(f"{array_name} shape mismatch: {array.shape}")

            # Validate data ranges
            if not np.isfinite(volume).all():
                raise ValueError("Volume contains invalid values")
            for array_name, mask in (('Pancreas', pancreas), ('Tumor', tumor)):
                if not set(np.unique(mask)).issubset({0, 1}):
                    raise ValueError(f"{array_name} mask not binary")

            # Validate tumor is within pancreas (mostly)
            tumor_in_pancreas = np.sum((tumor == 1) & (pancreas == 1))
            tumor_total = np.sum(tumor == 1)
            if tumor_total > 0:
                overlap_ratio = tumor_in_pancreas / tumor_total
                if overlap_ratio < 0.7:  # Allow some margin for resampling artifacts
                    print(f"⚠️  {case_id}: Low tumor-pancreas overlap ({overlap_ratio:.2f})")

            print(f" {case_id}: {case['size_class']} tumor, {case['tumor_volume_mm3']:.0f} mm³")

        except Exception as e:
            # Best-effort report: keep validating the remaining cases
            print(f" {case_id}: Validation failed - {str(e)}")

    print("\nQuality validation completed")


In [None]:
if __name__ == "__main__":
    # Usage banner: emitted as one joined write; the blank line before the
    # numbered steps comes from the leading "\n" of the third entry.
    print("\n".join((
        "Pancreatic Tumor CLDM Data Pipeline",
        "Ready to process MSD Task07_Pancreas dataset",
        "\nTo use this pipeline:",
        "1. Update DATA_ROOT path to your MSD dataset location",
        "2. Run: pipeline, metadata, splits = run_pipeline_example()",
        "3. Use create_cldm_dataloaders() for PyTorch training",
    )))

In [None]:
# Raw string: this Windows path contains backslash sequences (\C, \T) that are
# not valid escapes and trigger invalid-escape warnings in a normal literal.
# The resulting value is unchanged.
DATA_ROOT = r"F:\Conditional Latent Diffusion Model for Pancreas\Task07_Pancreas"
OUTPUT_DIR = "processed_pancreas_data"

In [None]:
# Build the pipeline; no config is passed, so setup_data_pipeline's defaults apply
pipeline = setup_data_pipeline(data_root=DATA_ROOT, output_dir=OUTPUT_DIR)

In [None]:
# Run the full processing pass and keep its two results for the cells below.
# NOTE(review): run_pipeline_example guards on `metadata_df is not None`, so this
# call presumably signals failure with a None metadata value — confirm before use.
metadata_df, splits = pipeline.run_complete_pipeline()

In [None]:
dataloaders = create_cldm_dataloaders(OUTPUT_DIR, batch_size=4)

# create_cldm_dataloaders returns None (after printing an install hint) when
# PyTorch cannot be imported; stop here with a clear message instead of a
# TypeError from indexing None.
if dataloaders is None:
    raise RuntimeError("DataLoaders could not be created: PyTorch is not available")

# NOTE(review): the default num_workers=4 is used here; if loading hangs or
# fails inside a notebook on Windows, try num_workers=0 — confirm on the
# target machine.
train_loader = dataloaders['train']
for batch in train_loader:
    volume = batch['volume']
    conditioning = batch['conditioning']


In [None]:
# Spot-check ten randomly chosen processed cases
validate_processed_data(output_dir=OUTPUT_DIR, num_samples=10)