# Chest X-Ray Lung Segmentation - Data Visualization & Preparation

**Author**: Deep Learning Project  
**Dataset**: Combined chest X-ray images from Darwin, Montgomery, and Shenzhen datasets  
**Total Images**: 3,211 images with corresponding lung segmentation masks  

This notebook provides comprehensive visualization and data preparation for the lung segmentation task.

## 1. Import Required Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from pathlib import Path
import random
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set style
# Apply a matplotlib style sheet first, then seaborn's "husl" palette.
# NOTE(review): the palette is presumably set second so that the style sheet's
# own colour cycle does not override it — keep this order.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries imported successfully!")

## 2. Dataset Configuration

In [None]:
# Dataset paths (machine-specific: adjust BASE_DIR / OUTPUT_DIR for your environment)
BASE_DIR = Path(r"d:\DEEP LEARNING\Dataset\ChestXray")
IMAGE_DIR = BASE_DIR / "CXR_Combined" / "images"
MASK_DIR = BASE_DIR / "CXR_Combined" / "masks"
CSV_LOG = BASE_DIR / "CXR_Selected-Image-Dataset_Log.csv"

# Output directory for processed data.
# parents=True so the cell also works when intermediate folders do not exist yet.
OUTPUT_DIR = Path(r"d:\DEEP LEARNING\ChestXraySegmentation")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Image Directory: {IMAGE_DIR}")
print(f"Mask Directory: {MASK_DIR}")
print(f"CSV Log: {CSV_LOG}")
print(f"Output Directory: {OUTPUT_DIR}")

# Verify the input directories exist. Raise explicitly instead of using
# `assert`, which is stripped when Python runs with -O.
for _label, _directory in (("Image", IMAGE_DIR), ("Mask", MASK_DIR)):
    if not _directory.is_dir():
        raise FileNotFoundError(f"{_label} directory not found: {_directory}")
print("\n✓ All directories verified!")

## 3. Load Dataset Metadata

In [None]:
# The selection log is optional: df_log is a DataFrame when the CSV is present
# and None otherwise (later cells scan the directories directly).
df_log = pd.read_csv(CSV_LOG) if CSV_LOG.exists() else None

if df_log is None:
    print("CSV log not found, will scan directories directly")
else:
    print("Dataset Log loaded:")
    print(df_log.head())
    print(f"\nTotal entries in log: {len(df_log)}")

In [None]:
# Scan image and mask directories (sorted so the file order is deterministic)
image_files = sorted(IMAGE_DIR.glob("*.png"))
mask_files = sorted(MASK_DIR.glob("*.png"))

print(f"Total Images Found: {len(image_files)}")
print(f"Total Masks Found: {len(mask_files)}")

# Images and masks are paired by identical filename
image_names = {f.name for f in image_files}
mask_names = {f.name for f in mask_files}

matching = image_names & mask_names
print(f"\nMatching Image-Mask Pairs: {len(matching)}")

# Compute both differences unconditionally: orphan masks must be reported
# even when every image has a mask.
missing_masks = image_names - mask_names
missing_images = mask_names - image_names

if missing_masks:
    print(f"\n⚠ Images without masks: {len(missing_masks)}")
    # sorted() gives stable examples; set iteration order is arbitrary
    print(f"Examples: {sorted(missing_masks)[:5]}")
else:
    print("✓ All images have corresponding masks!")

if missing_images:
    print(f"\n⚠ Masks without images: {len(missing_images)}")
    print(f"Examples: {sorted(missing_images)[:5]}")

## 4. Dataset Statistics & Analysis

In [None]:
# Tally images per source dataset; the source is identified by filename prefix.
# Names matching none of the prefixes are not counted.
prefix_to_source = {'DARCXR': 'Darwin', 'CHNCXR': 'Shenzhen', 'MCUCXR': 'Montgomery'}
dataset_counts = dict.fromkeys(prefix_to_source.values(), 0)

for img_name in image_names:
    for prefix, source in prefix_to_source.items():
        if img_name.startswith(prefix):
            dataset_counts[source] += 1
            break

total_images = len(image_names)
print("Dataset Distribution:")
for dataset, count in dataset_counts.items():
    print(f"  {dataset}: {count} images ({count/total_images*100:.1f}%)")

# Two side-by-side views of the same counts: pie (shares) and bar (absolute)
source_colors = ['#ff9999', '#66b3ff', '#99ff99']
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = ax

# Pie chart
pie_ax.pie(
    dataset_counts.values(),
    labels=dataset_counts.keys(),
    autopct='%1.1f%%',
    startangle=90,
    colors=source_colors,
)
pie_ax.set_title('Dataset Distribution', fontsize=14, fontweight='bold')

# Bar chart
bars = bar_ax.bar(
    dataset_counts.keys(),
    dataset_counts.values(),
    color=source_colors,
    edgecolor='black',
)
bar_ax.set_ylabel('Number of Images', fontsize=12)
bar_ax.set_title('Images per Dataset', fontsize=14, fontweight='bold')
bar_ax.grid(axis='y', alpha=0.3)

# Write each bar's count just above it, centred horizontally
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2.
    bar_ax.text(x_center, height, f'{int(height)}',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'dataset_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Analyze Image Dimensions

In [None]:
# Inspect a random sample of up to 100 images (reading every file would be slow).
# The sample is unseeded, so the reported statistics vary slightly between runs.
sample_size = min(100, len(image_files))
sample_images = random.sample(image_files, sample_size)

dimensions = []
aspect_ratios = []

print(f"Analyzing dimensions of {sample_size} sample images...\n")

for img_path in sample_images:
    # Use a context manager so each file handle is closed right away;
    # Image.open is lazy and otherwise leaves the file open.
    with Image.open(img_path) as img:
        width, height = img.size
    dimensions.append((width, height))
    aspect_ratios.append(width / height)

# Convert to numpy for analysis: column 0 is width, column 1 is height
dimensions = np.array(dimensions)
widths = dimensions[:, 0]
heights = dimensions[:, 1]

print("Image Dimensions Statistics:")
print(f"  Width  - Min: {widths.min()}, Max: {widths.max()}, Mean: {widths.mean():.1f}, Std: {widths.std():.1f}")
print(f"  Height - Min: {heights.min()}, Max: {heights.max()}, Mean: {heights.mean():.1f}, Std: {heights.std():.1f}")
print(f"  Aspect Ratio - Min: {min(aspect_ratios):.3f}, Max: {max(aspect_ratios):.3f}, Mean: {np.mean(aspect_ratios):.3f}")

# Find most common dimensions
dim_counter = Counter(f"{w}x{h}" for w, h in dimensions)
print(f"\nMost Common Dimensions:")
for dim, count in dim_counter.most_common(5):
    print(f"  {dim}: {count} images ({count/sample_size*100:.1f}%)")

In [None]:
# 2x2 overview of the sampled image geometry:
# three histograms (width, height, aspect ratio) plus a width-vs-height scatter.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (axes, values, bar colour, x-label, title, format spec for the mean in the legend)
histogram_panels = [
    (axes[0, 0], widths, 'skyblue', 'Width (pixels)', 'Image Width Distribution', '.0f'),
    (axes[0, 1], heights, 'lightcoral', 'Height (pixels)', 'Image Height Distribution', '.0f'),
    (axes[1, 1], aspect_ratios, 'orange', 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution', '.2f'),
]

for panel, values, bar_color, x_label, panel_title, mean_spec in histogram_panels:
    mean_value = np.mean(values)
    panel.hist(values, bins=30, color=bar_color, edgecolor='black', alpha=0.7)
    panel.set_xlabel(x_label, fontsize=11)
    panel.set_ylabel('Frequency', fontsize=11)
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    # Dashed red line marks the sample mean
    panel.axvline(mean_value, color='red', linestyle='--',
                  label=f'Mean: {mean_value:{mean_spec}}')
    panel.legend()
    panel.grid(alpha=0.3)

# Scatter plot of dimensions
scatter_ax = axes[1, 0]
scatter_ax.scatter(widths, heights, alpha=0.6, s=50, c='green')
scatter_ax.set_xlabel('Width (pixels)', fontsize=11)
scatter_ax.set_ylabel('Height (pixels)', fontsize=11)
scatter_ax.set_title('Width vs Height', fontsize=12, fontweight='bold')
scatter_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'image_dimensions_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Visualize Sample Images with Masks

In [None]:
def visualize_sample(image_path, mask_path, ax=None):
    """Show an X-ray, its lung mask, and a green-tinted overlay side by side.

    Args:
        image_path: Path of the X-ray image file.
        mask_path: Path of the mask file; every non-zero pixel is treated as lung.
        ax: Optional indexable collection of three matplotlib axes to draw
            into. When omitted, a new 1x3 figure is created.

    Returns:
        Tuple ``(image, mask)``: the image as an RGB array and the mask as a
        grayscale array, as loaded by OpenCV.

    Raises:
        OSError: If OpenCV cannot read the image or the mask.
        ValueError: If the mask's height/width differ from the image's.
    """
    # cv2.imread signals failure by returning None rather than raising, so
    # check explicitly to get an error that names the offending file.
    image = cv2.imread(str(image_path))
    if image is None:
        raise OSError(f"Could not read image: {image_path}")
    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise OSError(f"Could not read mask: {mask_path}")
    if mask.shape != image.shape[:2]:
        raise ValueError(
            f"Mask size {mask.shape} does not match image size {image.shape[:2]}: {mask_path}"
        )

    # OpenCV loads colour images as BGR; matplotlib expects RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Create overlay
    overlay = image.copy()
    overlay[mask > 0] = [0, 255, 0]  # Green overlay for lung regions

    # Blend: 70% original, 30% overlay
    blended = cv2.addWeighted(image, 0.7, overlay, 0.3, 0)

    if ax is None:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    else:
        axes = ax

    # Original image
    axes[0].imshow(image, cmap='gray')
    axes[0].set_title('Original X-Ray', fontsize=11, fontweight='bold')
    axes[0].axis('off')

    # Mask
    axes[1].imshow(mask, cmap='gray')
    axes[1].set_title('Lung Mask', fontsize=11, fontweight='bold')
    axes[1].axis('off')

    # Overlay
    axes[2].imshow(blended)
    axes[2].set_title('Overlay (Green = Lungs)', fontsize=11, fontweight='bold')
    axes[2].axis('off')

    return image, mask

# Take the first three images (in sorted filename order) from each dataset
darwin_samples, shenzhen_samples, montgomery_samples = (
    [f for f in image_files if f.name.startswith(prefix)][:3]
    for prefix in ('DARCXR', 'CHNCXR', 'MCUCXR')
)

all_samples = darwin_samples + shenzhen_samples + montgomery_samples
n_rows = len(all_samples)

# One row of three panels per sample. squeeze=False keeps `axes` two-dimensional
# even for a single row, so axes[idx] is always that row's three axes.
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5 * n_rows), squeeze=False)

for idx, img_path in enumerate(all_samples):
    mask_path = MASK_DIR / img_path.name
    if not mask_path.exists():
        # No mask for this image: leave its row empty
        continue

    if img_path.name.startswith('DARCXR'):
        dataset_name = "Darwin"
    elif img_path.name.startswith('CHNCXR'):
        dataset_name = "Shenzhen"
    else:
        dataset_name = "Montgomery"

    visualize_sample(img_path, mask_path, axes[idx])

    # Label the row with its dataset and filename (figure coordinates)
    row_y = 1 - (idx + 0.5) / n_rows
    fig.text(0.5, row_y, f'{dataset_name} - {img_path.name}',
             ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.subplots_adjust(top=0.98)
plt.savefig(OUTPUT_DIR / 'sample_visualizations.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Analyze Mask Properties

In [None]:
# Estimate lung coverage (share of non-zero mask pixels, in percent)
# from a random sample of up to 100 masks
sample_masks = random.sample(mask_files, min(100, len(mask_files)))
mask_coverage = []

print(f"Analyzing {len(sample_masks)} sample masks...\n")

for mask_path in sample_masks:
    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    lung_fraction = (mask > 0).sum() / mask.size
    mask_coverage.append(lung_fraction * 100)

print("Mask Coverage Statistics (% of image):")
coverage_stats = (
    ('Min', min(mask_coverage)),
    ('Max', max(mask_coverage)),
    ('Mean', np.mean(mask_coverage)),
    ('Median', np.median(mask_coverage)),
    ('Std', np.std(mask_coverage)),
)
for stat_label, stat_value in coverage_stats:
    print(f"  {stat_label}: {stat_value:.2f}%")

# Visualize mask coverage: histogram on the left, box plot on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes
coverage_mean = np.mean(mask_coverage)

# Histogram, with the sample mean marked by a dashed red line
hist_ax.hist(mask_coverage, bins=30, color='mediumpurple', edgecolor='black', alpha=0.7)
hist_ax.axvline(coverage_mean, color='red', linestyle='--',
                label=f'Mean: {coverage_mean:.1f}%', linewidth=2)
hist_ax.set_xlabel('Lung Coverage (%)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of Lung Coverage in Masks', fontsize=13, fontweight='bold')
hist_ax.legend(fontsize=11)
hist_ax.grid(alpha=0.3)

# Box plot
black_line = dict(color='black')
box = box_ax.boxplot(
    mask_coverage,
    vert=True,
    patch_artist=True,
    boxprops=dict(facecolor='lightblue', color='black'),
    whiskerprops=black_line,
    capprops=black_line,
    medianprops=dict(color='red', linewidth=2),
)
box_ax.set_ylabel('Lung Coverage (%)', fontsize=12)
box_ax.set_title('Lung Coverage Box Plot', fontsize=13, fontweight='bold')
box_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'mask_coverage_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Create Train/Val/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Keep only images whose same-named mask file is present on disk
valid_pairs = []
for img in image_files:
    candidate_mask = MASK_DIR / img.name
    if candidate_mask.exists():
        valid_pairs.append((img, candidate_mask))

n_pairs = len(valid_pairs)
print(f"Total valid image-mask pairs: {n_pairs}")

# Split: 70% train, 15% val, 15% test.
# Done in two steps: hold out 15% for test, then take 0.1765 of the remaining
# 85% for validation (0.1765 of 0.85 ≈ 0.15 of the whole).
train_val_pairs, test_pairs = train_test_split(
    valid_pairs, test_size=0.15, random_state=42
)
train_pairs, val_pairs = train_test_split(
    train_val_pairs, test_size=0.1765, random_state=42
)

print(f"\nDataset Split:")
for split_label, split_pairs in (
    ('Training', train_pairs),
    ('Validation', val_pairs),
    ('Testing', test_pairs),
):
    print(f"  {split_label + ':':<12}{len(split_pairs)} pairs "
          f"({len(split_pairs)/n_pairs*100:.1f}%)")

# Filenames per split (masks share the image's filename, so one column suffices)
split_info = {
    split_key: [img.name for img, _ in split_pairs]
    for split_key, split_pairs in (
        ('train', train_pairs),
        ('val', val_pairs),
        ('test', test_pairs),
    )
}

# Save each split as <name>_split.csv with a single 'filename' column
for split_name, file_list in split_info.items():
    df = pd.DataFrame({'filename': file_list})
    df.to_csv(OUTPUT_DIR / f'{split_name}_split.csv', index=False)
    print(f"✓ Saved {split_name}_split.csv")

print("\n✓ Dataset split completed and saved!")

## 9. Visualize Split Distribution

In [None]:
# Analyze dataset distribution in splits
def count_datasets_in_split(pairs):
    """Tally how many images in ``pairs`` come from each source dataset.

    Args:
        pairs: Iterable of ``(image, mask)`` tuples; only ``image.name`` is
            read, and the source is identified by its filename prefix.

    Returns:
        Dict with keys 'Darwin', 'Shenzhen' and 'Montgomery' (in that order)
        mapping to counts. Names matching no known prefix are not counted.
    """
    prefix_to_source = (
        ('DARCXR', 'Darwin'),
        ('CHNCXR', 'Shenzhen'),
        ('MCUCXR', 'Montgomery'),
    )
    tally = {source: 0 for _, source in prefix_to_source}

    for image_path, _mask in pairs:
        for prefix, source in prefix_to_source:
            if image_path.name.startswith(prefix):
                tally[source] += 1
                break

    return tally

train_dist = count_datasets_in_split(train_pairs)
val_dist = count_datasets_in_split(val_pairs)
test_dist = count_datasets_in_split(test_pairs)

# Table for plotting: one row per dataset source, one column per split
df_splits = pd.DataFrame(
    {
        column: list(dist.values())
        for column, dist in (
            ('Train', train_dist),
            ('Validation', val_dist),
            ('Test', test_dist),
        )
    },
    index=list(train_dist),
)

# Same three colours for train / validation / test in both panels
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Grouped (side-by-side, not stacked) bar chart: source counts per split
df_splits.plot(kind='bar', stacked=False, ax=bar_ax,
               color=colors, edgecolor='black', alpha=0.8)
bar_ax.set_title('Dataset Distribution Across Splits', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Dataset Source', fontsize=12)
bar_ax.set_ylabel('Number of Images', fontsize=12)
bar_ax.legend(title='Split', fontsize=10)
bar_ax.grid(axis='y', alpha=0.3)
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=0)

# Single pie chart of the overall split sizes. The labels carry the nominal
# percentages; autopct prints the actual ones.
split_sizes = [len(pairs) for pairs in (train_pairs, val_pairs, test_pairs)]
split_labels = ['Train\n(70%)', 'Validation\n(15%)', 'Test\n(15%)']

wedges, texts, autotexts = pie_ax.pie(
    split_sizes,
    labels=split_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
    explode=(0.05, 0.05, 0.05),
    textprops={'fontsize': 11, 'fontweight': 'bold'},
)
pie_ax.set_title('Overall Split Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'split_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Print detailed split info
print("\nDetailed Split Distribution:")
print(df_splits)
print(f"\nTotal: {df_splits.sum().sum()}")

## 10. Intensity Analysis

In [None]:
# Per-image grayscale mean and standard deviation for a random sample of up to 20 images
print("Analyzing pixel intensities of sample images...\n")

sample_images_intensity = random.sample(image_files, min(20, len(image_files)))
mean_intensities = []
std_intensities = []

for img_path in sample_images_intensity:
    img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
    mean_intensities.append(np.mean(img))
    std_intensities.append(np.std(img))

print("Intensity Statistics:")
# The trailing space in the second label keeps the " - " separators aligned
for stat_label, stat_values in (
    ("Mean Intensity", mean_intensities),
    ("Std Intensity ", std_intensities),
):
    print(f"  {stat_label} - Min: {min(stat_values):.1f}, Max: {max(stat_values):.1f}, "
          f"Avg: {np.mean(stat_values):.1f}")

# Three panels: one image's pixel histogram, then the per-image means and stds
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Pixel histogram of the first sampled image
first_sample = sample_images_intensity[0]
sample_img = cv2.imread(str(first_sample), cv2.IMREAD_GRAYSCALE)
axes[0].hist(sample_img.ravel(), bins=256, range=[0, 256], color='steelblue', alpha=0.7)
axes[0].set_xlabel('Pixel Intensity', fontsize=11)
axes[0].set_ylabel('Frequency', fontsize=11)
axes[0].set_title(f'Sample Histogram\n({first_sample.name})', fontsize=12, fontweight='bold')
axes[0].grid(alpha=0.3)

# (axes, values, bar colour, x-label, title) for the two summary histograms
summary_panels = (
    (axes[1], mean_intensities, 'coral',
     'Mean Pixel Intensity', 'Distribution of Mean Intensities'),
    (axes[2], std_intensities, 'mediumseagreen',
     'Std Deviation of Intensity', 'Distribution of Intensity Std Dev'),
)
for panel, values, bar_color, x_label, panel_title in summary_panels:
    overall_mean = np.mean(values)
    panel.hist(values, bins=20, color=bar_color, edgecolor='black', alpha=0.7)
    # Dashed red line marks the mean across the sampled images
    panel.axvline(overall_mean, color='red', linestyle='--',
                  label=f'Overall Mean: {overall_mean:.1f}', linewidth=2)
    panel.set_xlabel(x_label, fontsize=11)
    panel.set_ylabel('Frequency', fontsize=11)
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'intensity_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Summary Report

In [None]:
# Generate comprehensive summary.
# Every figure below is read from variables computed in earlier cells, so this
# cell must run after them. Sections marked "(sampled)" describe random
# samples, not the full dataset.
summary_report = f"""
{'='*70}
CHEST X-RAY LUNG SEGMENTATION DATASET - SUMMARY REPORT
{'='*70}

DATASET OVERVIEW
{'-'*70}
Total Images:              {len(image_files)}
Total Masks:               {len(mask_files)}
Valid Image-Mask Pairs:    {len(valid_pairs)}

SOURCE DISTRIBUTION
{'-'*70}
Darwin Dataset:            {dataset_counts['Darwin']} ({dataset_counts['Darwin']/len(image_names)*100:.1f}%)
Shenzhen Dataset:          {dataset_counts['Shenzhen']} ({dataset_counts['Shenzhen']/len(image_names)*100:.1f}%)
Montgomery Dataset:        {dataset_counts['Montgomery']} ({dataset_counts['Montgomery']/len(image_names)*100:.1f}%)

IMAGE DIMENSIONS (sampled)
{'-'*70}
Width  - Range: {widths.min()}-{widths.max()} px, Mean: {widths.mean():.0f} px
Height - Range: {heights.min()}-{heights.max()} px, Mean: {heights.mean():.0f} px
Aspect Ratio - Mean: {np.mean(aspect_ratios):.3f}

MASK COVERAGE (sampled)
{'-'*70}
Lung Coverage - Range: {min(mask_coverage):.1f}%-{max(mask_coverage):.1f}%
Mean Coverage: {np.mean(mask_coverage):.1f}%

DATASET SPLIT
{'-'*70}
Training Set:              {len(train_pairs)} images ({len(train_pairs)/len(valid_pairs)*100:.1f}%)
Validation Set:            {len(val_pairs)} images ({len(val_pairs)/len(valid_pairs)*100:.1f}%)
Test Set:                  {len(test_pairs)} images ({len(test_pairs)/len(valid_pairs)*100:.1f}%)

INTENSITY STATISTICS (sampled)
{'-'*70}
Mean Pixel Intensity:      {np.mean(mean_intensities):.1f} ± {np.std(mean_intensities):.1f}
Mean Std Deviation:        {np.mean(std_intensities):.1f} ± {np.std(std_intensities):.1f}

OUTPUT FILES GENERATED
{'-'*70}
✓ train_split.csv
✓ val_split.csv
✓ test_split.csv
✓ dataset_distribution.png
✓ image_dimensions_analysis.png
✓ sample_visualizations.png
✓ mask_coverage_analysis.png
✓ split_distribution.png
✓ intensity_analysis.png

{'='*70}
Dataset is ready for training!
{'='*70}
"""

print(summary_report)

# Save summary to file.
# Write as UTF-8 explicitly: the report contains '✓', which legacy locale
# encodings such as Windows cp1252 cannot encode, so relying on the platform
# default would raise UnicodeEncodeError there.
with open(OUTPUT_DIR / 'dataset_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("\n✓ Summary report saved to 'dataset_summary.txt'")

## Conclusion

The dataset has been thoroughly analyzed and prepared for training:

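1. **Dataset Statistics**: Analysis of 3,211 chest X-ray images (source counts cover every image; dimension, mask-coverage, and intensity statistics are computed on random samples)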
2. **Source Distribution**: Images from Darwin, Montgomery, and Shenzhen datasets
3. **Quality Check**: Image–mask pairing is verified in Section 3, and only images with a corresponding lung segmentation mask are included in the splits
4. **Train/Val/Test Split**: 70/15/15 split saved to CSV files
5. **Visualizations**: Multiple analysis plots saved for reference

The dataset is now ready for model training in `training.ipynb`!