In [None]:
import os
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from pathlib import Path

# --- CONFIGURATION ---
# The notebook lives in 'ml/notebooks'; stepping up one level ('..') lands in 'ml',
# and the raw dataset splits sit below it in 'data/raw'.
# NOTE: these are relative paths, so they resolve against the kernel's working directory.
PROJECT_ROOT = Path('..')
TRAIN_DIR = PROJECT_ROOT.joinpath('data', 'raw', 'train')
VAL_DIR = PROJECT_ROOT.joinpath('data', 'raw', 'val')

# Report, for each split, whether its directory is present.
# Only the selected branch is evaluated, so resolve() runs solely for existing paths.
print(
    f"✅ Training directory found: {TRAIN_DIR.resolve()}"
    if TRAIN_DIR.exists()
    else f"❌ Error: Training directory not found at {TRAIN_DIR}"
)

print(
    f"✅ Validation directory found: {VAL_DIR.resolve()}"
    if VAL_DIR.exists()
    else f"❌ Error: Validation directory not found at {VAL_DIR}"
)

In [None]:
def count_images(directory):
    """
    Count the image files in each class sub-folder of ``directory``.

    A file counts as an image when its suffix, compared case-insensitively,
    is one of ``.jpg``, ``.jpeg`` or ``.png``. Matching on the lowered suffix
    (instead of globbing once per upper/lower-case pattern) means each file is
    counted exactly once on case-insensitive filesystems and mixed-case
    suffixes such as ``.Jpeg`` are not missed on case-sensitive ones.

    Args:
        directory: Path (``pathlib.Path`` or ``str``) whose immediate
            sub-directories are the class folders.

    Returns:
        dict mapping each class folder name to its image count, inserted in
        sorted folder order. Empty if ``directory`` is not an existing
        directory (an error message is printed in that case).
    """
    directory = Path(directory)
    if not directory.is_dir():
        print(f"❌ Error: Directory not found at {directory}")
        return {}

    # Lower-case suffixes; candidates are lowered before the membership test
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    counts = {}
    for class_folder in sorted(directory.iterdir()):
        # Loose files next to the class folders are not classes
        if not class_folder.is_dir():
            continue
        counts[class_folder.name] = sum(
            1
            for entry in class_folder.iterdir()
            # is_file() keeps sub-directories named like 'foo.png' out of the tally
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )
    return counts

# Tally the training split, then report per-class counts followed by the overall total
train_counts = count_images(TRAIN_DIR)

print("\n--- Training Data Distribution ---")
for class_name, count in sorted(train_counts.items()):
    print(f"{class_name}: {count} images")

# Overall size of the training split across all classes
total_images = sum(train_counts.values())

print(f"\nTotal images in training set: {total_images}")

In [None]:
def plot_random_samples(directory, num_samples=9, seed=None):
    """
    Display a grid of randomly chosen images from the class sub-folders of ``directory``.

    Each grid cell is filled by picking a random class folder and then a random
    image inside it; the cell title shows the class name and the array shape of
    the loaded image. Only files with a ``.jpg``/``.jpeg``/``.png`` suffix
    (case-insensitive) are candidates, and class folders without any such file
    are never picked, so the grid has no blank cells.

    Args:
        directory: Path (``pathlib.Path`` or ``str``) whose immediate
            sub-directories are the class folders.
        num_samples: Number of images to draw. The grid uses up to 3 columns
            and as many rows as needed (3x3 for the default of 9).
        seed: Optional seed for a private random generator so the same
            samples are drawn on every run. When ``None`` the module-level
            ``random`` state is used.

    Returns:
        None. If ``directory`` is not an existing directory, or no images are
        found, an error message is printed and no figure is created.
    """
    directory = Path(directory)
    if not directory.is_dir():
        print(f"❌ Error: Directory not found at {directory}")
        return

    # Lower-case suffixes; candidates are lowered before the membership test
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    # Scan the tree once up front instead of re-listing a folder per sample.
    # Sorting makes seeded runs independent of filesystem listing order.
    images_by_class = {}
    for class_dir in sorted(directory.iterdir()):
        if not class_dir.is_dir():
            continue
        image_paths = sorted(
            entry
            for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )
        if image_paths:
            images_by_class[class_dir.name] = image_paths

    if not images_by_class:
        print(f"❌ Error: No images found under {directory}")
        return
    if num_samples < 1:
        # Nothing was requested, so there is nothing to draw
        return

    rng = random.Random(seed) if seed is not None else random
    classes = list(images_by_class)

    # Up to 3 columns; ceil-divide for the row count so any num_samples fits
    n_cols = min(3, num_samples)
    n_rows = -(-num_samples // n_cols)
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    for i in range(num_samples):
        # Pick a random class and a random image from that class
        random_class = rng.choice(classes)
        random_image_path = rng.choice(images_by_class[random_class])

        # Load and plot
        img = mpimg.imread(str(random_image_path))

        plt.subplot(n_rows, n_cols, i + 1)
        if img.ndim == 2:
            # Single-channel image: use a gray colormap instead of the default false-colour one
            plt.imshow(img, cmap='gray')
        else:
            plt.imshow(img)
        # Title format: Class Name \n Image Dimensions
        plt.title(f"{random_class}\nShape: {img.shape}", fontsize=10)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Visual sanity check: draw nine random images from the training split
print("Displaying random samples from Training Set...")
plot_random_samples(directory=TRAIN_DIR, num_samples=9)