---

## 📊 Summary: Phase 1 Improvements

### ✅ Đã Implement

| Improvement | Implementation | Expected Impact |
|-------------|----------------|----------------|
| **Advanced Augmentation** | Albumentations pipeline với CLAHE, ShiftScaleRotate, Noise/Blur | +1-2% AUC |
| **Class Imbalance** | Focal Loss + Weighted BCE + Label Smoothing | +3-5% (rare classes) |
| **Transfer Learning** | ImageNet pre-trained weights + Progressive unfreezing | +2-4% AUC |

### 🎯 Combined Expected Impact
- **Total: +5-10% AUC improvement**
- **Faster convergence** (50% fewer epochs)
- **Better generalization**
- **More clinically useful** (better on rare diseases)

### 📝 Next Steps

Trong các cells tiếp theo, chúng ta sẽ:
1. **Load và preprocess data**
2. **Create datasets với advanced augmentation**
3. **Train models với all improvements**
4. **Evaluate và compare với baseline**
5. **Visualize results và insights**

---

# 🗂️ PHASE 2: Data Loading & Preprocessing

## 2.1 Load NIH Chest X-ray Dataset

### Dataset Overview
- **Total images**: 112,120
- **Number of classes**: 15 (multi-label)
- **Format**: PNG grayscale images
- **Labels**: NLP-extracted from radiology reports (~10% noise)

In [None]:
def _split_frame(frame, fraction, random_state, group_column=None):
    """Split ``frame`` into (kept, held_out), optionally keeping groups intact."""
    from sklearn.model_selection import GroupShuffleSplit, train_test_split

    if group_column is None:
        # Plain row-level shuffle split.
        return train_test_split(
            frame, test_size=fraction, random_state=random_state, shuffle=True
        )

    # Group-aware split: all rows sharing a group value land on the same side.
    splitter = GroupShuffleSplit(
        n_splits=1, test_size=fraction, random_state=random_state
    )
    kept_idx, held_idx = next(splitter.split(frame, groups=frame[group_column]))
    return frame.iloc[kept_idx], frame.iloc[held_idx]


def load_and_prepare_data(csv_path, test_size=0.2, val_size=0.1, random_state=42,
                          group_column=None):
    """
    Load the NIH Chest X-ray metadata CSV, one-hot encode labels and split it.

    The 'Finding Labels' column holds '|'-separated label names (for example
    "Disease1|Disease2" or "No Finding"). One 0/1 column is added per distinct
    label, then the rows are split into train / validation / test frames.

    Args:
        csv_path: Path to Data_Entry_2017_v2020.csv
        test_size: Fraction of all rows held out for the test set
        val_size: Fraction of the remaining (train+val) rows held out for
            validation -- not a fraction of the whole dataset
        random_state: Seed passed to both splits so they are reproducible
        group_column: Optional column name. When given, rows sharing a value
            in this column are never spread across different splits (useful
            for a patient-identifier column, if the CSV has one, to avoid
            leakage between splits). The fractions then apply to groups, so
            row counts are approximate. Default ``None`` keeps the plain
            row-level random split.

    Returns:
        train_df, val_df, test_df, disease_columns
        (``disease_columns`` is the alphabetically sorted list of label names,
        which are also the names of the added binary columns)
    """
    print("üìÇ Loading dataset...")
    df = pd.read_csv(csv_path)

    print(f"   Total samples: {len(df):,}")

    # Split every label string exactly once; the resulting sets are reused
    # both to discover the label vocabulary and to fill the binary columns.
    label_sets = [set(labels.split('|')) for labels in df['Finding Labels'].values]

    all_diseases = set()
    for found in label_sets:
        all_diseases |= found

    disease_columns = sorted(all_diseases)
    print(f"   Diseases found: {len(disease_columns)}")
    print(f"   {disease_columns}")

    # Create binary columns for each disease
    for disease in disease_columns:
        df[disease] = [int(disease in found) for found in label_sets]

    # Print class distribution, most frequent label first
    print("\nüìä Class Distribution:")
    class_counts = df[disease_columns].sum().sort_values(ascending=False)
    for disease, count in class_counts.items():
        percentage = count / len(df) * 100
        print(f"   {disease:25s}: {count:6,} ({percentage:5.2f}%)")

    # First split: train+val vs test
    train_val_df, test_df = _split_frame(df, test_size, random_state, group_column)

    # Second split: train vs val (val_size is relative to train_val_df)
    train_df, val_df = _split_frame(train_val_df, val_size, random_state, group_column)

    print(f"\nüì¶ Data Split:")
    print(f"   Train: {len(train_df):,} samples ({len(train_df)/len(df)*100:.1f}%)")
    print(f"   Val:   {len(val_df):,} samples ({len(val_df)/len(df)*100:.1f}%)")
    print(f"   Test:  {len(test_df):,} samples ({len(test_df)/len(df)*100:.1f}%)")

    return train_df, val_df, test_df, disease_columns


# Load the metadata only when the configured CSV is present; otherwise tell
# the user which setting to fix.
if not CSV_PATH.exists():
    print(f"‚ùå CSV file not found: {CSV_PATH}")
    print("   Please update CSV_PATH in configuration")
else:
    split_fractions = {'test_size': 0.2, 'val_size': 0.1}
    train_df, val_df, test_df, disease_columns = load_and_prepare_data(
        CSV_PATH, **split_fractions
    )
    print("\n‚úÖ Data loaded successfully!")

## 2.2 Custom Dataset Class

### Design Principles
1. **Efficient loading**: Only load images when needed
2. **Flexible augmentation**: Support different transforms for train/val
3. **Error handling**: Substitute a black placeholder image when a file cannot be read
4. **Memory efficient**: Don't load all images to RAM

In [None]:
class ChestXrayDataset(Dataset):
    """
    Multi-label dataset over NIH Chest X-ray images listed in a DataFrame.

    Features:
    - Lazy loading: image files are read from disk only in ``__getitem__``
    - Labels and file names are extracted from the DataFrame once, at
      construction, so fetching a sample does no pandas row lookup
    - Optional transform, called as ``transform(image=...)['image']``
      (the Albumentations calling convention)
    - Unreadable images are replaced by an all-black placeholder
    - Multi-label support: one float per entry of ``disease_columns``
    """
    def __init__(self, dataframe, image_dir, disease_columns, transform=None,
                 fallback_size=(224, 224)):
        """
        Args:
            dataframe: DataFrame with an 'Image Index' column (file names)
                and one numeric column per entry of ``disease_columns``
            image_dir: Root directory containing images
            disease_columns: List of disease column names
            transform: Optional callable invoked as ``transform(image=img)``
                that returns a mapping with an 'image' entry
            fallback_size: (height, width) of the black placeholder returned
                when an image cannot be loaded

        Raises:
            KeyError: if 'Image Index' or a disease column is missing
        """
        self.df = dataframe.reset_index(drop=True)
        self.image_dir = Path(image_dir)
        self.disease_columns = disease_columns
        self.transform = transform
        self.fallback_size = tuple(fallback_size)

        # Snapshot what __getitem__ needs so each fetch is a plain list /
        # array index instead of building a pandas row object.
        self.image_names = self.df['Image Index'].tolist()
        self.labels = self.df[list(disease_columns)].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.image_names)

    def _load_rgb(self, img_path):
        """Read ``img_path`` as an RGB array, or return a black placeholder."""
        try:
            image = cv2.imread(str(img_path))

            if image is None:
                raise ValueError(f"Failed to load image: {img_path}")

            # imread yields 3-channel BGR; reorder to RGB so the 3-channel
            # input expected by pre-trained models gets the usual ordering.
            return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        except Exception as e:
            # Deliberate best-effort: one bad file must not abort an epoch.
            print(f"‚ö†Ô∏è  Error loading {img_path}: {e}")
            return np.zeros((*self.fallback_size, 3), dtype=np.uint8)

    def __getitem__(self, idx):
        """
        Load and return one sample

        Returns:
            image: whatever ``transform`` produces under its 'image' key;
                without a transform, the raw (H, W, 3) uint8 RGB array
            labels: (num_classes,) float32 tensor of 0/1 values
        """
        img_path = self.image_dir / self.image_names[idx]
        image = self._load_rgb(img_path)

        # Apply transforms
        if self.transform:
            image = self.transform(image=image)['image']

        # torch.tensor copies, so callers cannot mutate the cached labels.
        labels = torch.tensor(self.labels[idx])

        return image, labels

# Confirm the class definition ran and summarise what it offers.
print(
    "‚úÖ ChestXrayDataset class created",
    "   Features: Lazy loading, error handling, multi-label support",
    sep="\n",
)

## 2.3 Create DataLoaders

### Strategy
1. **Train**: Advanced augmentation + WeightedRandomSampler
2. **Val/Test**: Simple resize + normalize only
3. **Batch size**: Balance between GPU memory and convergence

In [None]:
def create_dataloaders(train_df, val_df, test_df, disease_columns, image_dir, config):
    """
    Build the train, validation and test DataLoaders.

    The training loader draws samples through a WeightedRandomSampler (with
    replacement, one epoch = len(train_df) draws) and uses the training
    transforms. The validation and test loaders iterate in order and share
    the validation transforms. ``config`` must provide 'img_size',
    'batch_size' and 'num_workers'.

    Returns:
        train_loader, val_loader, test_loader
    """
    # Transform pipelines: one for training, one shared by val and test
    train_tfms = get_train_transforms(config['img_size'])
    eval_tfms = get_valid_transforms(config['img_size'])

    def build_dataset(frame, tfms):
        return ChestXrayDataset(frame, image_dir, disease_columns, tfms)

    train_ds = build_dataset(train_df, train_tfms)
    val_ds = build_dataset(val_df, eval_tfms)
    test_ds = build_dataset(test_df, eval_tfms)

    # Per-sample weights drive the sampler used for the training loader
    print("‚öôÔ∏è  Computing sample weights...")
    weights = compute_sample_weights(train_df, disease_columns)
    balanced_sampler = WeightedRandomSampler(
        weights=weights,
        num_samples=len(weights),
        replacement=True
    )

    def loader_options():
        # Settings common to all three loaders
        return {
            'batch_size': config['batch_size'],
            'num_workers': config['num_workers'],
            'pin_memory': True,
        }

    train_loader = DataLoader(train_ds, sampler=balanced_sampler, **loader_options())
    val_loader = DataLoader(val_ds, shuffle=False, **loader_options())
    test_loader = DataLoader(test_ds, shuffle=False, **loader_options())

    print(f"\n‚úÖ DataLoaders created:")
    for split_name, loader in (
        ("Train:", train_loader),
        ("Val:  ", val_loader),
        ("Test: ", test_loader),
    ):
        print(f"   {split_name} {len(loader)} batches")

    return train_loader, val_loader, test_loader


# Build the loaders only when the earlier loading cell actually produced the
# split DataFrames; otherwise report that this step is skipped.
if 'train_df' not in locals():
    print("‚ö†Ô∏è  Data not loaded, skip DataLoader creation")
else:
    train_loader, val_loader, test_loader = create_dataloaders(
        train_df, val_df, test_df, disease_columns, IMAGE_DIR, CONFIG
    )