# 01 — Data Exploration

**Project:** Maize Disease Classification using Deep Learning  
**Candidate:** WANDABWA Frieze (ST62/55175/2025)  
**Institution:** Open University of Kenya — MSc AI  

---

This notebook covers:
1. Dataset structure & image counts per class
2. Class distribution analysis
3. Sample image visualisation
4. Basic image statistics (dimensions, colour channels)
5. Recommendations for data collection

## 0. Setup

In [None]:
import sys
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import cv2

# Add project root to path.
# The root is taken to be the parent of the current working directory, so
# this cell expects to be run from a sub-folder of the project.
PROJECT_ROOT = Path().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

import config

# Plot style.
# Apply the seaborn style FIRST: set_style() writes its own font settings
# into matplotlib's rcParams, so calling it after the explicit rcParams
# assignments would silently override the font family chosen below.
sns.set_style('whitegrid')
plt.rcParams['figure.dpi'] = 120
plt.rcParams['font.family'] = 'DejaVu Sans'

# Echo the resolved configuration so path problems are visible immediately.
print(f'Project root : {PROJECT_ROOT}')
print(f'Data directory: {config.DATA_DIR}')
print(f'Classes       : {config.CLASSES}')

## 1. Dataset Structure & Image Counts

In [None]:
# File suffixes (lower-case, with leading dot) that are treated as images.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}

def count_images(class_dir: Path) -> int:
    """Count the image files located directly inside *class_dir*.

    Only regular files whose suffix (compared case-insensitively) is in
    ``IMAGE_EXTENSIONS`` are counted; sub-directories are not descended into.

    Args:
        class_dir: Directory expected to hold the images of one class.

    Returns:
        The number of matching files, or 0 when *class_dir* is missing or
        is not a directory.
    """
    # is_dir() is False both for a missing path and for a path that points
    # at a regular file, so iterdir() below can never be called on a
    # non-directory.
    if not class_dir.is_dir():
        return 0
    return sum(
        1
        for entry in class_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
    )

# Tally the images available for every configured class, plus a grand total.
counts = {
    class_name: count_images(config.DATA_DIR / class_name)
    for class_name in config.CLASSES
}
total = sum(counts.values())

# Print a fixed-width summary table; classes with fewer than 50 images
# are flagged with a warning marker instead of a tick.
heavy_rule = '=' * 50
light_rule = '-' * 50

print(heavy_rule)
print('  DATASET SUMMARY')
print(heavy_rule)
for cls, n in counts.items():
    if n >= 50:
        status = '✅'
    else:
        status = '⚠️ '
    print(f'  {status}  {cls:20s} : {n:4d} images')
print(light_rule)
print(f'       {"TOTAL":20s} : {total:4d} images')
print(heavy_rule)

## 2. Class Distribution Plot

In [None]:
# Human-readable class labels: drop the 'maize_' prefix and capitalise.
short_names = [c.replace('maize_', '').capitalize() for c in config.CLASSES]
values = list(counts.values())
colours = ['#2ECC71', '#E74C3C', '#E67E22']

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# Bar chart: one bar per class, annotated with its exact count.
bars = axes[0].bar(short_names, values, color=colours, edgecolor='white', linewidth=1.2)
for bar, val in zip(bars, values):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 str(val), ha='center', va='bottom', fontweight='bold')
# Horizontal reference lines for the dataset-size thresholds.
axes[0].axhline(50,  color='red',   linestyle='--', alpha=0.6, label='Min (50)')
axes[0].axhline(200, color='orange',linestyle='--', alpha=0.6, label='Recommended (200)')
axes[0].axhline(500, color='green', linestyle='--', alpha=0.6, label='Optimal (500)')
axes[0].set_title('Images per Class', fontsize=13, fontweight='bold')
axes[0].set_ylabel('Count')
axes[0].legend(fontsize=9)

# Pie chart (only if we have data); otherwise show a placeholder message,
# since a pie of all-zero values cannot be drawn.
if total > 0:
    axes[1].pie(values, labels=short_names, colors=colours, autopct='%1.1f%%',
                startangle=140, pctdistance=0.75,
                wedgeprops=dict(width=0.5, edgecolor='white'))
else:
    axes[1].text(0.5, 0.5, 'No images yet\nAdd photos to data/raw/',
                 ha='center', va='center', fontsize=12,
                 transform=axes[1].transAxes)
axes[1].set_title('Class Distribution', fontsize=13, fontweight='bold')

plt.suptitle('Maize Disease Dataset Overview', fontsize=15, fontweight='bold')
plt.tight_layout()

# savefig() does not create missing parent folders, so make sure the
# results directory exists before writing the figure.
results_dir = PROJECT_ROOT / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(results_dir / 'class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved → results/class_distribution.png')

## 3. Sample Image Visualisation

> **Note:** This cell requires at least one image per class in `data/raw/`.

In [None]:
def load_sample_images(class_name: str, n: int = 4) -> list:
    """Return up to n RGB image arrays for a given class.

    Image files directly inside ``config.DATA_DIR / class_name`` are visited
    in sorted path order. Files that OpenCV cannot decode are skipped and the
    next candidate is tried, so one unreadable file does not shrink the
    sample while other readable images remain.

    Args:
        class_name: Name of the class sub-directory under ``config.DATA_DIR``.
        n: Maximum number of images to return.

    Returns:
        A list of at most *n* arrays converted from OpenCV's BGR channel
        order to RGB. The list is empty when the class directory does not
        exist or contains no readable image.
    """
    class_dir = config.DATA_DIR / class_name
    # Mirror count_images(): a missing class folder means "no images",
    # not an error.
    if not class_dir.is_dir():
        return []

    candidates = sorted(
        f for f in class_dir.iterdir()
        if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS
    )

    images = []
    for path in candidates:
        if len(images) >= n:
            break
        img = cv2.imread(str(path))
        # imread signals an unreadable file by returning None.
        if img is None:
            continue
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    return images

# Grid layout: one row per class, n_cols sample images per row.
n_cols = 4
# squeeze=False keeps `axes` two-dimensional even when there is a single
# class, so the [row, col] indexing below always works.
fig, axes = plt.subplots(config.NUM_CLASSES, n_cols,
                          figsize=(14, 3 * config.NUM_CLASSES),
                          squeeze=False)

for row, cls in enumerate(config.CLASSES):
    images = load_sample_images(cls, n=n_cols)
    label  = cls.replace('maize_', '').upper()
    for col in range(n_cols):
        ax = axes[row, col]
        if col < len(images):
            ax.imshow(images[col])
            # Only the first image of each row carries the class label.
            ax.set_title(f'{label}' if col == 0 else '', fontweight='bold')
        else:
            # Fewer than n_cols images available: show a placeholder cell.
            ax.text(0.5, 0.5, 'No image', ha='center', va='center',
                    transform=ax.transAxes, color='grey')
        ax.axis('off')

plt.suptitle('Sample Images — Maize Disease Classes', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig() does not create missing parent folders, so make sure the
# results directory exists before writing the figure.
results_dir = PROJECT_ROOT / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(results_dir / 'sample_images.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved → results/sample_images.png')

## 4. Image Statistics

In [None]:
import pandas as pd

# Collect one record per readable image: pixel dimensions, aspect ratio
# (width / height) and on-disk size in kilobytes.
stats = []
for cls in config.CLASSES:
    class_dir = config.DATA_DIR / cls
    # A class folder that has not been created yet simply contributes no
    # rows, so the "No images found" message below can still be reached.
    if not class_dir.is_dir():
        continue
    for p in class_dir.iterdir():
        if not (p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS):
            continue
        img = cv2.imread(str(p))
        # imread signals an unreadable file by returning None; skip it.
        if img is None:
            continue
        # Only the first two shape entries (rows, columns) are needed.
        h, w = img.shape[:2]
        stats.append({
            'class': cls,
            'height': h,
            'width': w,
            'aspect': round(w / h, 2),
            'file_kb': round(p.stat().st_size / 1024, 1)
        })

if stats:
    df = pd.DataFrame(stats)
    print('Image dimension statistics:')
    # Per-class mean / min / max of every numeric column.
    display(df.groupby('class')[['height', 'width', 'aspect', 'file_kb']]
              .agg(['mean', 'min', 'max']).round(1))
else:
    print('No images found. Add images to data/raw/ subdirectories first.')

## 5. Next Steps

Based on the dataset analysis above:

| Step | Action |
|------|--------|
| 1 | Collect more images if any class has fewer than 200 images |
| 2 | Run `python src/project_status.py` to check FRIENDS compliance |
| 3 | Run `python src/train.py` to begin training |
| 4 | Run `python src/evaluate.py` for metrics & XAI explanations |
| 5 | Open `notebooks/02_model_training.ipynb` (future) for deep-dive |
