# Stage 1: Image Enhancement for Eye Disease Classification

## Innovation Feature 2: Specialized Preprocessing

This notebook implements:
- **CLAHE (Contrast Limited Adaptive Histogram Equalization)**: Enhances blood vessels and lesions
- **Ben Graham's Preprocessing**: Color normalization and automatic margin cropping

**Expected Output**: Enhanced dataset ready for model training in Stage 2

In [None]:
# Install required libraries (Colab-specific)
!pip install opencv-python-headless scikit-image -q

In [None]:
# Import libraries
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from glob import glob
from tqdm import tqdm
import zipfile
from pathlib import Path

## 1. Extract Dataset from Archive

Upload your `archive.zip` file to Colab, then run this cell.

In [None]:
# Extract archive
# Unpacks the uploaded zip into `extract_path`; later cells read `extract_path`.
archive_path = 'archive.zip'  # Update if your zip has a different name
extract_path = 'dataset'

if os.path.exists(archive_path):
    print('Extracting archive...')
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f'Extracted to {extract_path}/')
else:
    # Report the configured name so the hint stays correct when archive_path is edited
    print(f'Please upload {archive_path} to Colab first!')

In [None]:
# Find dataset structure
# Prints every directory under the extraction root as an indented tree.
# File names are previewed (at most three per directory) only for the
# first two levels.
dataset_root = extract_path
print('Dataset structure:')
for current_dir, _subdirs, filenames in os.walk(dataset_root):
    # Depth = number of path separators below the root
    depth = current_dir.replace(dataset_root, '').count(os.sep)
    dir_indent = ' ' * 2 * depth
    print(f'{dir_indent}{os.path.basename(current_dir)}/')
    if depth >= 2:
        continue  # deeper levels: directory name only, no file preview
    file_indent = ' ' * 2 * (depth + 1)
    for name in filenames[:3]:
        print(f'{file_indent}{name}')
    remaining = len(filenames) - 3
    if remaining > 0:
        print(f'{file_indent}... and {remaining} more files')

## 2. Preprocessing Functions

### CLAHE + Ben Graham's Method

In [None]:
def crop_black_margins(image, threshold=10):
    """
    Crop an image to the bounding box of its non-dark pixels.

    Args:
        image: Colour image array; it is converted with COLOR_BGR2GRAY,
            so BGR channel order is expected.
        threshold: Grayscale value a pixel must exceed to count as content.

    Returns:
        The cropped view of `image`, or `image` unchanged when no pixel
        exceeds `threshold`.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    content_mask = (grayscale > threshold).astype(np.uint8)
    content_points = cv2.findNonZero(content_mask)

    # Nothing brighter than the threshold: leave the image untouched
    if content_points is None:
        return image

    left, top, width, height = cv2.boundingRect(content_points)
    return image[top:top + height, left:left + width]

def ben_graham_preprocessing(image, target_size=512):
    """
    Ben Graham-style preprocessing of a fundus image.

    Pipeline:
    1. Crop black margins (crop_black_margins)
    2. Resize to a target_size x target_size square
    3. Subtract a Gaussian-blurred copy (the local average colour) and
       re-centre the result around 128
    4. Smooth with a 5x5 Gaussian filter

    Args:
        image: Colour image array as accepted by crop_black_margins.
        target_size: Side length of the square output; also sets the blur
            sigma for the local average (target_size / 30).

    Returns:
        uint8 image of shape (target_size, target_size, channels).
    """
    # Steps 1-2: crop, then resize; float32 so the subtraction below can go negative
    cropped = crop_black_margins(image)
    resized = cv2.resize(cropped, (target_size, target_size)).astype(np.float32)

    # Step 3: estimate the local average; kernel size (0, 0) lets OpenCV derive it from sigma
    local_average = cv2.GaussianBlur(resized, (0, 0), target_size/30)
    centred = resized - local_average + 128
    normalized = np.clip(centred, 0, 255).astype(np.uint8)

    # Step 4: light denoising
    return cv2.GaussianBlur(normalized, (5, 5), 0)

def apply_clahe(image):
    """
    Enhance contrast with CLAHE (Contrast Limited Adaptive Histogram
    Equalization) applied to the lightness channel only.

    The image is converted BGR -> LAB, the L channel is equalized
    (clipLimit=2.0, 8x8 tile grid), and the result is converted back to BGR.
    The A and B channels are passed through unchanged.

    Args:
        image: BGR image array (it is converted with COLOR_BGR2LAB).

    Returns:
        The contrast-enhanced BGR image.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    equalized_lightness = equalizer.apply(lightness)

    merged_lab = cv2.merge([equalized_lightness, chan_a, chan_b])
    return cv2.cvtColor(merged_lab, cv2.COLOR_LAB2BGR)

def enhance_fundus_image(image_path, target_size=224):
    """
    Run the full enhancement pipeline on one image file.

    Steps: load with cv2.imread, apply ben_graham_preprocessing, then
    apply_clahe.

    Args:
        image_path: Path of the image file to load.
        target_size: Side length passed on to ben_graham_preprocessing.

    Returns:
        The enhanced image, or None when cv2.imread returns None for the path.
    """
    loaded = cv2.imread(image_path)

    # cv2.imread gave no image: report it to the caller as None
    if loaded is None:
        return None

    return apply_clahe(ben_graham_preprocessing(loaded, target_size))

## 3. Process Dataset

**Note**: Update `dataset_folder` to match your extracted folder structure

In [None]:
# Define paths
dataset_folder = 'dataset/dataset'  # Update based on your structure
output_folder = 'enhanced_dataset'

# Create output directory
os.makedirs(output_folder, exist_ok=True)

# Find all image files (searched recursively under dataset_folder)
image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.JPG', '*.JPEG', '*.PNG']

# Collect into a set: on case-insensitive filesystems '*.jpg' and '*.JPG'
# match the same file, which would otherwise be listed (and processed) twice.
unique_paths = set()
for ext in image_extensions:
    unique_paths.update(glob(os.path.join(dataset_folder, '**', ext), recursive=True))

# Sort so the list order - and the seeded random sampling in the
# visualization cell - is the same on every run.
all_images = sorted(unique_paths)

print(f'Found {len(all_images)} images')
print(f'Sample paths: {all_images[:3]}')

In [None]:
# Process all images
# Each input is enhanced and written to the same relative path under
# output_folder; failures are reported and counted, never fatal.
print('Processing images...')
processed_count = 0
failed_count = 0

for img_path in tqdm(all_images):
    try:
        # Get relative path to preserve folder structure
        rel_path = os.path.relpath(img_path, dataset_folder)
        output_path = os.path.join(output_folder, rel_path)

        # Create output subdirectory if needed
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Enhance image
        enhanced = enhance_fundus_image(img_path, target_size=224)

        if enhanced is None:
            # enhance_fundus_image returns None when cv2.imread yields no image
            print(f'Could not read {img_path}')
            failed_count += 1
            continue

        # cv2.imwrite can signal failure by returning False instead of raising,
        # so only count the image as processed when the write succeeded.
        if cv2.imwrite(output_path, enhanced):
            processed_count += 1
        else:
            print(f'Could not write {output_path}')
            failed_count += 1

    except Exception as e:
        # Best-effort batch: report the problem and continue with the next image
        print(f'Error processing {img_path}: {e}')
        failed_count += 1

print('\nProcessing complete!')
print(f'Successfully processed: {processed_count}')
print(f'Failed: {failed_count}')

## 4. Visualization: Before vs After

Let's compare original and enhanced images

In [None]:
# Select random samples for visualization
np.random.seed(42)
sample_count = min(6, len(all_images))
if sample_count > 0:
    sample_images = np.random.choice(all_images, sample_count, replace=False)
else:
    # Keep sample_images defined (and empty) so later cells can check it
    sample_images = np.array([], dtype=str)

if sample_count == 0:
    print('No images available to visualize - check dataset_folder.')
else:
    # squeeze=False keeps `axes` two-dimensional even when there is only one
    # sample row, so axes[row, col] indexing works for any sample count.
    fig, axes = plt.subplots(
        sample_count, 2, figsize=(12, 4 * sample_count), squeeze=False
    )

    for idx, img_path in enumerate(sample_images):
        # The enhanced copy lives at the same relative path under output_folder
        rel_path = os.path.relpath(img_path, dataset_folder)
        enhanced_path = os.path.join(output_folder, rel_path)

        panels = [
            ('Original', cv2.imread(img_path)),
            ('Enhanced (CLAHE + Ben Graham)', cv2.imread(enhanced_path)),
        ]

        for col, (title, bgr_image) in enumerate(panels):
            ax = axes[idx, col]
            if bgr_image is None:
                # cv2.imread returns None for a missing/unreadable file
                # (e.g. an image that failed during processing)
                ax.text(0.5, 0.5, 'image unavailable',
                        ha='center', va='center', transform=ax.transAxes)
            else:
                # OpenCV loads BGR; matplotlib expects RGB
                ax.imshow(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))
            ax.set_title(title)
            ax.axis('off')

    plt.tight_layout()
    plt.show()

## 5. Histogram Comparison

Verify contrast enhancement through histogram analysis

In [None]:
# Select one image for detailed analysis
# Compares the grayscale intensity histograms of the first sampled image
# before and after enhancement.
if len(sample_images) == 0:
    print('No sample images available for histogram analysis.')
else:
    sample_img = sample_images[0]

    # Load original and enhanced (same relative path under output_folder)
    original = cv2.imread(sample_img)
    rel_path = os.path.relpath(sample_img, dataset_folder)
    enhanced_path = os.path.join(output_folder, rel_path)
    enhanced = cv2.imread(enhanced_path)

    if original is None or enhanced is None:
        # cv2.imread returns None for a missing/unreadable file
        print(f'Could not load original/enhanced pair for {sample_img}; skipping histogram.')
    else:
        # Convert to grayscale for histogram
        orig_gray = cv2.cvtColor(original, cv2.COLOR_BGR2GRAY)
        enh_gray = cv2.cvtColor(enhanced, cv2.COLOR_BGR2GRAY)

        # Plot histograms side by side with identical binning
        fig, axes = plt.subplots(1, 2, figsize=(14, 4))
        histogram_specs = [
            (axes[0], orig_gray, 'blue', 'Original - Histogram'),
            (axes[1], enh_gray, 'green', 'Enhanced - Histogram'),
        ]
        for ax, gray, color, title in histogram_specs:
            ax.hist(gray.ravel(), bins=256, range=(0, 256), color=color, alpha=0.7)
            ax.set_title(title)
            ax.set_xlabel('Pixel Intensity')
            ax.set_ylabel('Frequency')

        plt.tight_layout()
        plt.show()

        print('Notice: Enhanced histogram shows better distribution across intensity range')

## 6. Download Enhanced Dataset

Compress and download for use in Stage 2

In [None]:
# Create zip file
# Packs the contents of output_folder into enhanced_dataset.zip in the
# current working directory.
import shutil

print('Creating zip archive...')
shutil.make_archive(base_name='enhanced_dataset', format='zip', root_dir=output_folder)
print('Done! Download enhanced_dataset.zip from the Files panel')
print('You will use this in Stage 2 (Hyperparameter Tuning)')

## Summary

✅ **Completed:**
- Dataset extraction
- Ben Graham's preprocessing (color normalization + margin cropping)
- CLAHE enhancement for better contrast
- Visual comparison (Before/After)
- Histogram analysis confirming improved contrast distribution

**Next Step:** Use `enhanced_dataset.zip` in Stage 2 (Hyperparameter Tuning)