## 1. Imports et Configuration

In [7]:
# =============================================================================
# IMPORTS
# =============================================================================

# Data manipulation
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple, Optional

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models
from torchsummary import summary

# Image processing
from PIL import Image
import cv2

# LIME for interpretability
import lime
from lime import lime_image

# Utilities
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# DEVICE CONFIGURATION
# =============================================================================

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üñ•Ô∏è Device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   CUDA Version: {torch.version.cuda}")
else:
    print("   Running on CPU")

# Random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

print("\n‚úì Imports et configuration termin√©s")

üñ•Ô∏è Device: cuda
   GPU: NVIDIA GeForce RTX 4050 Laptop GPU
   CUDA Version: 13.0

‚úì Imports et configuration termin√©s


## 2. Acquisition des Donn√©es

### Instructions de t√©l√©chargement

**Dataset Kaggle:** https://www.kaggle.com/datasets/rodrigonuneswessner/labeledcorndataset

**Option 1 - Kaggle API:**
```bash
# Installer kaggle CLI
pip install kaggle

# Configurer API token (~/.kaggle/kaggle.json)
kaggle datasets download -d rodrigonuneswessner/labeledcorndataset
unzip labeledcorndataset.zip -d ../data/corn_images/
```

**Option 2 - Manuel:**
1. T√©l√©charger depuis Kaggle
2. D√©zipper dans `../data/corn_images/`

**Structure attendue:**
```
data/corn_images/
‚îú‚îÄ‚îÄ chao/          # ground (sol)
‚îú‚îÄ‚îÄ milho/         # corn (ma√Øs)
‚îú‚îÄ‚îÄ hervas/        # weeds (mauvaises herbes)
‚îî‚îÄ‚îÄ milho_ervas/   # corn/weeds (mixte)
```

In [8]:
# =============================================================================
# CONFIGURATION DES CHEMINS
# =============================================================================

# Le dataset est d√©j√† divis√© en Train/Test/Validation
DATA_ROOT = Path("../data/corn_images/ImagensTCCRotuladas/ImagensTCCRotuladas")
TRAIN_DIR = DATA_ROOT / "Treino"
TEST_DIR = DATA_ROOT / "Teste"
# Utiliser glob pour g√©rer l'encodage du nom Valida√ß√£o
VAL_DIR = list(DATA_ROOT.glob("Valida*"))[0]

# Mapping labels portugais ‚Üí anglais
CLASS_MAPPING = {
    'Chao': 'ground',
    'Milho': 'corn', 
    'Ervas': 'weeds',
    'Milho_ervas': 'corn_weeds'
}

# Configuration exp√©rimentation
CONFIG = {
    # Phase 1: 3 classes (ground, corn, weeds)
    'classes_3': ['Chao', 'Milho', 'Ervas'],
    
    # Phase 2: 4 classes (ajout corn/weeds)
    'classes_4': ['Chao', 'Milho', 'Ervas', 'Milho_ervas'],
    
    # Hyperparam√®tres
    'img_size': (224, 224),  # VGG16/ResNet standard
    'batch_size': 32,
    'epochs': 30,
    'validation_split': 0.2,
    'test_split': 0.1,
    
    # Optimisation
    'learning_rate': 0.001,
    'dropout_rate': 0.5,
}

print("="*70)
print("CONFIGURATION DU PROJET")
print("="*70)
print(f"Data root: {DATA_ROOT}")
print(f"Train directory: {TRAIN_DIR}")
print(f"Test directory: {TEST_DIR}")
print(f"Validation directory: {VAL_DIR}")
print(f"Image size: {CONFIG['img_size']}")
print(f"Batch size: {CONFIG['batch_size']}")
print(f"Epochs: {CONFIG['epochs']}")
print(f"\n3 classes: {[CLASS_MAPPING[c] for c in CONFIG['classes_3']]}")
print(f"4 classes: {[CLASS_MAPPING[c] for c in CONFIG['classes_4']]}")
print("="*70)

CONFIGURATION DU PROJET
Data root: ../data/corn_images/ImagensTCCRotuladas/ImagensTCCRotuladas
Train directory: ../data/corn_images/ImagensTCCRotuladas/ImagensTCCRotuladas/Treino
Test directory: ../data/corn_images/ImagensTCCRotuladas/ImagensTCCRotuladas/Teste
Validation directory: ../data/corn_images/ImagensTCCRotuladas/ImagensTCCRotuladas/Valida‚îú–∑‚îú–≥o
Image size: (224, 224)
Batch size: 32
Epochs: 30

3 classes: ['ground', 'corn', 'weeds']
4 classes: ['ground', 'corn', 'weeds', 'corn_weeds']


In [9]:
# =============================================================================
# V√âRIFICATION DONN√âES & STRUCTURE
# =============================================================================

def check_data_availability(data_dir: Path, classes: List[str]) -> Dict:
    """V√©rifie la pr√©sence et compte les images par classe."""
    stats = {}
    
    if not data_dir.exists():
        print(f"‚ùå ERROR: {data_dir} does not exist!")
        return stats
    
    for class_name in classes:
        class_dir = data_dir / class_name
        if class_dir.exists():
            images = list(class_dir.glob("*.jpg")) + list(class_dir.glob("*.png"))
            stats[class_name] = len(images)
        else:
            stats[class_name] = 0
            print(f"‚ö† WARNING: {class_dir} not found")
    
    return stats

# Check TRAIN set (3 classes)
print("\nüìä TRAIN Dataset Statistics (3 classes):")
print("-" * 70)
stats_train_3 = check_data_availability(TRAIN_DIR, CONFIG['classes_3'])
for class_name, count in stats_train_3.items():
    print(f"{CLASS_MAPPING[class_name]:15s} ({class_name:12s}): {count:5d} images")
print(f"{'Total':15s} {' '*13}: {sum(stats_train_3.values()):5d} images")

# Check TEST set (3 classes)
print("\nüìä TEST Dataset Statistics (3 classes):")
print("-" * 70)
stats_test_3 = check_data_availability(TEST_DIR, CONFIG['classes_3'])
for class_name, count in stats_test_3.items():
    print(f"{CLASS_MAPPING[class_name]:15s} ({class_name:12s}): {count:5d} images")
print(f"{'Total':15s} {' '*13}: {sum(stats_test_3.values()):5d} images")

# Check VALIDATION set (3 classes)
print("\nüìä VALIDATION Dataset Statistics (3 classes):")
print("-" * 70)
stats_val_3 = check_data_availability(VAL_DIR, CONFIG['classes_3'])
for class_name, count in stats_val_3.items():
    print(f"{CLASS_MAPPING[class_name]:15s} ({class_name:12s}): {count:5d} images")
print(f"{'Total':15s} {' '*13}: {sum(stats_val_3.values()):5d} images")

# Check 4 classes (TRAIN only for overview)
print("\nüìä TRAIN Dataset Statistics (4 classes):")
print("-" * 70)
stats_train_4 = check_data_availability(TRAIN_DIR, CONFIG['classes_4'])
for class_name, count in stats_train_4.items():
    print(f"{CLASS_MAPPING[class_name]:15s} ({class_name:12s}): {count:5d} images")
print(f"{'Total':15s} {' '*13}: {sum(stats_train_4.values()):5d} images")

# Balance check
if stats_train_3:
    max_count = max(stats_train_3.values())
    min_count = min(stats_train_3.values())
    imbalance_ratio = max_count / min_count if min_count > 0 else 0
    print(f"\n‚öñÔ∏è Class Imbalance Ratio (3 classes - TRAIN): {imbalance_ratio:.2f}")
    if imbalance_ratio > 2:
        print("   ‚Üí D√©s√©quilibre significatif: augmentation de donn√©es recommand√©e")
    else:
        print("   ‚Üí Classes relativement √©quilibr√©es")


üìä TRAIN Dataset Statistics (3 classes):
----------------------------------------------------------------------
ground          (Chao        ):  6134 images
corn            (Milho       ):  6255 images
weeds           (Ervas       ):  6015 images
Total                        : 18404 images

üìä TEST Dataset Statistics (3 classes):
----------------------------------------------------------------------
ground          (Chao        ):   100 images
corn            (Milho       ):   100 images
weeds           (Ervas       ):   100 images
Total                        :   300 images

üìä VALIDATION Dataset Statistics (3 classes):
----------------------------------------------------------------------
ground          (Chao        ):   646 images
corn            (Milho       ):   695 images
weeds           (Ervas       ):   668 images
Total                        :  2009 images

üìä TRAIN Dataset Statistics (4 classes):
----------------------------------------------------------------------

## 3. Analyse Exploratoire des Donn√©es (EDA)

Cette section explore:
- Distribution des classes
- Tailles d'images (width, height)
- Statistiques RGB (moyenne, √©cart-type)
- Exemples visuels par classe

In [10]:
# =============================================================================
# VISUALISATION √âCHANTILLONS PAR CLASSE
# =============================================================================

def plot_samples(base_dir: Path, classes: List[str], n_samples: int = 5, figsize=(15, 10)):
    """Affiche n √©chantillons al√©atoires de chaque classe."""
    fig, axes = plt.subplots(len(classes), n_samples, figsize=figsize)
    
    for i, class_name in enumerate(classes):
        class_dir = base_dir / class_name
        images = list(class_dir.glob("*.jpg")) + list(class_dir.glob("*.png"))
        
        if len(images) == 0:
            continue
            
        # S√©lection al√©atoire
        samples = np.random.choice(images, min(n_samples, len(images)), replace=False)
        
        for j, img_path in enumerate(samples):
            img = load_img(img_path)
            ax = axes[i, j] if len(classes) > 1 else axes[j]
            ax.imshow(img)
            ax.axis('off')
            
            if j == 0:
                ax.set_ylabel(f"{CLASS_MAPPING[class_name]}\n({class_name})", 
                            fontsize=12, fontweight='bold')
    
    plt.suptitle('√âchantillons par Classe', fontsize=14, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.show()

# Visualisation 3 classes
if stats_3:
    print("\nüñºÔ∏è Visualisation des √©chantillons (3 classes):")
    plot_samples(BASE_DIR, CONFIG['classes_3'], n_samples=5)

NameError: name 'stats_3' is not defined

**Observations visuelles:**
- `ground` (chao): Sol sec, couleurs terre/beige, pas de v√©g√©tation
- `corn` (milho): Feuilles vertes de ma√Øs, texture v√©g√©tale homog√®ne
- `weeds` (hervas): Plantes herbac√©es diverses, feuilles plus petites/d√©sordonn√©es
- `corn/weeds` (milho_ervas): M√©lange visible des deux types de v√©g√©tation

**Difficult√©s attendues:**
1. Distinction `corn` vs `corn/weeds`: pr√©sence partielle difficile √† d√©tecter
2. Variabilit√© d'√©clairage (photos smartphone en ext√©rieur)
3. Angles de vue vari√©s

In [None]:
# =============================================================================
# ANALYSE DISTRIBUTION DES TAILLES D'IMAGES
# =============================================================================

def analyze_image_sizes(base_dir: Path, classes: List[str], n_sample: int = 100) -> pd.DataFrame:
    """Analyse les dimensions des images (√©chantillon al√©atoire)."""
    data = []
    
    for class_name in classes:
        class_dir = base_dir / class_name
        images = list(class_dir.glob("*.jpg")) + list(class_dir.glob("*.png"))
        
        # √âchantillonnage pour acc√©l√©rer
        sampled = np.random.choice(images, min(n_sample, len(images)), replace=False)
        
        for img_path in sampled:
            img = load_img(img_path)
            width, height = img.size
            data.append({
                'class': class_name,
                'class_label': CLASS_MAPPING[class_name],
                'width': width,
                'height': height,
                'aspect_ratio': width / height,
                'total_pixels': width * height
            })
    
    return pd.DataFrame(data)

if stats_3:
    print("\nüìê Analyse des dimensions d'images (√©chantillon 100/classe):")
    df_sizes = analyze_image_sizes(BASE_DIR, CONFIG['classes_3'], n_sample=100)
    
    # Statistiques
    print("\nStatistiques globales:")
    print(df_sizes[['width', 'height', 'aspect_ratio']].describe())
    
    # Visualisation
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Distribution width/height
    axes[0].scatter(df_sizes['width'], df_sizes['height'], 
                   c=pd.Categorical(df_sizes['class_label']).codes, 
                   alpha=0.6, cmap='viridis')
    axes[0].set_xlabel('Width (px)')
    axes[0].set_ylabel('Height (px)')
    axes[0].set_title('Distribution des dimensions')
    axes[0].grid(True, alpha=0.3)
    
    # Aspect ratio
    df_sizes.boxplot(column='aspect_ratio', by='class_label', ax=axes[1])
    axes[1].set_title('Aspect Ratio par classe')
    axes[1].set_xlabel('Classe')
    axes[1].set_ylabel('Aspect Ratio (width/height)')
    
    plt.suptitle('Analyse G√©om√©trique des Images', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
    
    print(f"\n‚úì Redimensionnement √† {CONFIG['img_size']} n√©cessaire pour uniformit√©")


üìê Analyse des dimensions d'images (√©chantillon 100/classe):

Statistiques globales:


KeyError: "None of [Index(['width', 'height', 'aspect_ratio'], dtype='object')] are in the [columns]"

## 4. Pr√©traitement et Augmentation de Donn√©es

### Strat√©gies de pr√©traitement:
1. **Redimensionnement**: 224√ó224 (standard VGG16/ResNet)
2. **Normalisation**: [0, 255] ‚Üí [0, 1] (rescale=1./255)
3. **Augmentation** (train uniquement):
   - Rotation: ¬±20¬∞
   - Zoom: ¬±15%
   - Flip horizontal
   - Shift: ¬±10% (width/height)

**Justification:** Les photos sont prises en conditions naturelles avec variabilit√© d'angle et d'√©clairage ‚Üí augmentation robustifie le mod√®le.

In [None]:
# =============================================================================
# DATA GENERATORS (avec augmentation pour training)
# =============================================================================

def create_data_generators(base_dir: Path, 
                          classes: List[str],
                          img_size: Tuple[int, int],
                          batch_size: int,
                          validation_split: float = 0.2):
    """
    Cr√©e les g√©n√©rateurs train/validation avec augmentation.
    
    Returns:
        train_gen, val_gen, class_indices
    """
    
    # G√©n√©rateur TRAIN avec augmentation
    train_datagen = ImageDataGenerator(
        rescale=1./255,              # Normalisation [0,1]
        rotation_range=20,           # Rotation ¬±20¬∞
        width_shift_range=0.1,       # Shift horizontal ¬±10%
        height_shift_range=0.1,      # Shift vertical ¬±10%
        zoom_range=0.15,             # Zoom ¬±15%
        horizontal_flip=True,        # Flip al√©atoire
        fill_mode='nearest',         # Remplissage pixels manquants
        validation_split=validation_split
    )
    
    # G√©n√©rateur VALIDATION (pas d'augmentation)
    val_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=validation_split
    )
    
    # Flow from directory (train)
    train_generator = train_datagen.flow_from_directory(
        base_dir,
        target_size=img_size,
        batch_size=batch_size,
        class_mode='categorical',
        classes=classes,
        subset='training',
        shuffle=True,
        seed=42
    )
    
    # Flow from directory (validation)
    val_generator = val_datagen.flow_from_directory(
        base_dir,
        target_size=img_size,
        batch_size=batch_size,
        class_mode='categorical',
        classes=classes,
        subset='validation',
        shuffle=False,
        seed=42
    )
    
    return train_generator, val_generator, train_generator.class_indices

# Cr√©ation des g√©n√©rateurs (3 classes)
if stats_3:
    print("\nüîÑ Cr√©ation des g√©n√©rateurs de donn√©es (3 classes):")
    print("="*70)
    train_gen_3, val_gen_3, class_indices_3 = create_data_generators(
        BASE_DIR,
        CONFIG['classes_3'],
        CONFIG['img_size'],
        CONFIG['batch_size'],
        CONFIG['validation_split']
    )
    
    print(f"\nClass indices: {class_indices_3}")
    print(f"Train samples: {train_gen_3.samples}")
    print(f"Validation samples: {val_gen_3.samples}")
    print(f"Batch size: {CONFIG['batch_size']}")
    print(f"Steps per epoch (train): {train_gen_3.samples // CONFIG['batch_size']}")
    print(f"Validation steps: {val_gen_3.samples // CONFIG['batch_size']}")
    print("="*70)

In [None]:
# =============================================================================
# VISUALISATION AUGMENTATION
# =============================================================================

def visualize_augmentation(generator, n_images: int = 4):
    """Affiche des exemples d'images augment√©es."""
    batch = next(generator)
    images = batch[0][:n_images]
    labels = batch[1][:n_images]
    
    fig, axes = plt.subplots(1, n_images, figsize=(15, 4))
    
    # Reverse class indices for display
    idx_to_class = {v: CLASS_MAPPING[k] for k, v in generator.class_indices.items()}
    
    for i in range(n_images):
        axes[i].imshow(images[i])
        axes[i].axis('off')
        
        # Get predicted class
        class_idx = np.argmax(labels[i])
        class_name = idx_to_class[class_idx]
        axes[i].set_title(f"{class_name}", fontsize=11, fontweight='bold')
    
    plt.suptitle('Exemples d\'images augment√©es (training set)', 
                fontsize=13, fontweight='bold')
    plt.tight_layout()
    plt.show()

if stats_3:
    print("\nüé® Visualisation de l'augmentation de donn√©es:")
    visualize_augmentation(train_gen_3, n_images=4)

## 5. Mod√©lisation - Phase 1: Baseline CNN (3 classes)

### Architecture CNN Simple

**Justification du design:**
- 3 blocs Conv2D + MaxPooling (extraction features hi√©rarchiques)
- BatchNormalization apr√®s chaque conv (stabilit√© training)
- Dropout 0.5 avant classification (r√©gularisation)
- Dense layer finale softmax (3 classes)

**Optimiseur:** Adam (lr=0.001) - adaptatif, converge rapidement

In [None]:
# =============================================================================
# CNN BASELINE (3 classes)
# =============================================================================

def build_baseline_cnn(input_shape: Tuple[int, int, int], 
                      num_classes: int,
                      dropout_rate: float = 0.5) -> keras.Model:
    """
    Construit un CNN simple baseline.
    
    Architecture:
        Conv2D(32) ‚Üí BatchNorm ‚Üí MaxPool ‚Üí Dropout(0.25)
        Conv2D(64) ‚Üí BatchNorm ‚Üí MaxPool ‚Üí Dropout(0.25)
        Conv2D(128) ‚Üí BatchNorm ‚Üí MaxPool ‚Üí Dropout(0.25)
        Flatten ‚Üí Dense(256) ‚Üí Dropout(0.5) ‚Üí Dense(num_classes)
    """
    model = models.Sequential([
        # Block 1
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', 
                     input_shape=input_shape, name='conv1'),
        layers.BatchNormalization(name='bn1'),
        layers.MaxPooling2D((2, 2), name='pool1'),
        layers.Dropout(0.25, name='dropout1'),
        
        # Block 2
        layers.Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2'),
        layers.BatchNormalization(name='bn2'),
        layers.MaxPooling2D((2, 2), name='pool2'),
        layers.Dropout(0.25, name='dropout2'),
        
        # Block 3
        layers.Conv2D(128, (3, 3), activation='relu', padding='same', name='conv3'),
        layers.BatchNormalization(name='bn3'),
        layers.MaxPooling2D((2, 2), name='pool3'),
        layers.Dropout(0.25, name='dropout3'),
        
        # Classifier
        layers.Flatten(name='flatten'),
        layers.Dense(256, activation='relu', name='fc1'),
        layers.Dropout(dropout_rate, name='dropout_fc'),
        layers.Dense(num_classes, activation='softmax', name='predictions')
    ], name='baseline_cnn')
    
    return model

# Construction du mod√®le
if stats_3:
    print("\nüèóÔ∏è Construction du CNN Baseline (3 classes):")
    print("="*70)
    
    input_shape = (*CONFIG['img_size'], 3)
    baseline_cnn_3 = build_baseline_cnn(input_shape, num_classes=3, 
                                       dropout_rate=CONFIG['dropout_rate'])
    
    # Compilation
    baseline_cnn_3.compile(
        optimizer=optimizers.Adam(learning_rate=CONFIG['learning_rate']),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    baseline_cnn_3.summary()
    
    print("\n‚úì Mod√®le compil√© avec:")
    print(f"  - Optimiseur: Adam (lr={CONFIG['learning_rate']})")
    print(f"  - Loss: categorical_crossentropy")
    print(f"  - Dropout: {CONFIG['dropout_rate']}")
    print("="*70)

---

**√Ä CONTINUER:**

Les prochaines cellules √† impl√©menter:
1. ‚úÖ Training du baseline CNN (3 classes) avec callbacks
2. ‚úÖ √âvaluation + courbes accuracy/loss
3. ‚úÖ Matrice de confusion
4. ‚úÖ Transfer Learning (VGG16, ResNet50)
5. ‚úÖ Extension 4 classes
6. ‚úÖ Interpr√©tation avec LIME

**Note:** Le notebook sera ex√©cut√© apr√®s t√©l√©chargement du dataset. Pour l'instant, structure et m√©thodologie valid√©es ‚úì