# Classification d'images avec EfficientNetB5
Ce projet consiste à entraîner un modèle de deep learning pour classer des images en trois catégories (par exemple : tumeur cérébrale, sain, etc.) à l'aide d'**EfficientNetB5** et de **Keras/TensorFlow**.

Le jeu de données est supposé être structuré dans un dossier `data/` avec un sous-dossier par classe contenant les images.

**Phases couvertes :**
- Nettoyage des doublons
- Partition en train/val/test
- Prétraitement des données
- Entraînement avec callbacks
- Évaluation et visualisation des performances

## 1. Imports et dépendances

In [None]:

import os
import shutil
import random
import hashlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB5
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import confusion_matrix, classification_report


## 2. Détection et suppression de doublons

In [None]:

def file_hash(path, algo='md5', block_size=65536):
    """Return the hexadecimal digest of the file stored at *path*.

    The file is opened in binary mode and consumed ``block_size`` bytes at a
    time, so it never has to be held in memory as a whole.

    Args:
        path: Location of the file to hash.
        algo: Name of an algorithm accepted by ``hashlib.new``.
        block_size: Number of bytes requested on each read.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.new(algo)
    with open(path, 'rb') as stream:
        while True:
            chunk = stream.read(block_size)
            if not chunk:  # an empty read marks the end of the file
                break
            digest.update(chunk)
    return digest.hexdigest()

# Remove byte-identical files from the dataset, keeping one copy per content.
root_dir = 'data'
seen = {}        # content hash -> first path found with that content
to_remove = []   # later paths whose content was already seen

# os.listdir returns entries in arbitrary order; sorting makes the choice of
# which copy is kept identical from one run (and one machine) to the next.
for cls in sorted(os.listdir(root_dir)):
    cls_path = os.path.join(root_dir, cls)
    if not os.path.isdir(cls_path):
        continue
    for fname in sorted(os.listdir(cls_path)):
        path = os.path.join(cls_path, fname)
        if not os.path.isfile(path):
            continue
        h = file_hash(path)
        original = seen.get(h)
        if original is None:
            seen[h] = path
            continue
        to_remove.append(path)
        # The same image filed under two classes is a labelling conflict, not
        # just redundancy: report it instead of dropping one label silently.
        if os.path.dirname(original) != cls_path:
            print(f"Attention : {path} est identique à {original} (classe différente)")

# Deletion happens only after the full scan, so the scan itself never sees a
# directory that is being modified.
for dup in to_remove:
    os.remove(dup)
    print(f"Supprimé : {dup}")

print(f"Doublons supprimés : {len(to_remove)}")


## 3. Répartition des données en train/val/test

In [None]:

# Copy every class folder of `source_dir` into train/val/test sub-trees of
# `target_dir`, using a fixed 70/15/15 split per class.
source_dir = 'data'
target_dir = 'data_ml_efficient_net'
# Sorted so that classes are processed (and random numbers consumed) in the
# same order on every run; os.listdir itself gives no ordering guarantee.
classes = sorted(
    d for d in os.listdir(source_dir)
    if os.path.isdir(os.path.join(source_dir, d))
)
ratios = {'train': 0.7, 'val': 0.15, 'test': 0.15}

# Start from an empty target tree so files from a previous split cannot leak
# into the new one.
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)
for split in ratios:
    for cls in classes:
        os.makedirs(os.path.join(target_dir, split, cls), exist_ok=True)

# Split each class independently.
random.seed(42)
for cls in classes:
    cls_dir = os.path.join(source_dir, cls)
    # Sort before shuffling: the seed only yields a reproducible split when
    # the list being shuffled starts in a reproducible order. Non-file
    # entries are skipped because shutil.copy cannot copy a directory.
    files = sorted(
        f for f in os.listdir(cls_dir)
        if os.path.isfile(os.path.join(cls_dir, f))
    )
    random.shuffle(files)
    n = len(files)
    train, val = int(n * ratios['train']), int(n * ratios['val'])
    # Whatever is left after the truncated train/val counts goes to test.
    splits = {
        'train': files[:train],
        'val': files[train:train + val],
        'test': files[train + val:]
    }
    for split, file_list in splits.items():
        for f in file_list:
            shutil.copy(os.path.join(cls_dir, f), os.path.join(target_dir, split, cls, f))

print('Données réparties dans train/val/test')


In [None]:
from glob import glob

# Hash every file of each split so that identical content appearing in two
# splits (data leakage) can be detected.
split_hashes = {}
for split in ['train', 'val', 'test']:
    # '**/*' plus an isfile filter also covers files without an extension and
    # avoids trying to open directories whose name contains a dot.
    pattern = os.path.join(target_dir, split, '**', '*')
    split_hashes[split] = {
        file_hash(path)
        for path in glob(pattern, recursive=True)
        if os.path.isfile(path)
    }

# Compare each unordered pair of splits exactly once.
split_names = sorted(split_hashes)
for i, a in enumerate(split_names):
    for b in split_names[i + 1:]:
        inter = split_hashes[a] & split_hashes[b]
        if inter:
            print(f"Doublons entre {a} et {b} : {len(inter)} images")

In [None]:
# NOTE(review): no definition of count_images is visible in this notebook, so
# it is defined here from the way it is called below (a mapping of
# sub-directory -> count, plus a grand total). If a definition exists
# elsewhere, confirm it matches and drop this one.
def count_images(root):
    """Count the files found in each directory under *root*.

    Args:
        root: Directory to walk recursively.

    Returns:
        A ``(counts, total)`` tuple. ``counts`` maps the path (relative to
        *root*) of every directory that holds files, or that has no
        sub-directory, to its number of files; ``total`` is their sum.
    """
    counts = {}
    for dirpath, dirnames, filenames in os.walk(root):
        # Leaf directories are kept even when empty so an empty class shows
        # up with a count of 0 instead of disappearing from the report.
        if filenames or not dirnames:
            counts[os.path.relpath(dirpath, root)] = len(filenames)
    return counts, sum(counts.values())


counts, total = count_images(target_dir)

print(f"TOTAL images dans '{target_dir}': {total}\n")
print("Détail par sous-dossier :")
for subdir, n in sorted(counts.items()):
    print(f"  {subdir:30s} : {n}")

## 4. Chargement des images avec `ImageDataGenerator`

In [None]:

# Directory-backed generators for the three splits.
image_size = (224, 224)
batch_size = 32
train_dir = os.path.join(target_dir, 'train')
val_dir = os.path.join(target_dir, 'val')
test_dir = os.path.join(target_dir, 'test')

# No `rescale` here: per the Keras Applications documentation, the tf.keras
# EfficientNet models include input normalisation inside the network and
# expect raw pixel values in [0, 255]. Dividing by 255 beforehand would scale
# the images twice and mismatch the ImageNet-pretrained weights.

# Training data gets light augmentation (rotation, zoom, horizontal flip).
train_gen = ImageDataGenerator(rotation_range=15, zoom_range=0.2, horizontal_flip=True).flow_from_directory(
    train_dir, target_size=image_size, batch_size=batch_size, class_mode='categorical')

val_gen = ImageDataGenerator().flow_from_directory(
    val_dir, target_size=image_size, batch_size=batch_size, class_mode='categorical')

# shuffle=False keeps predictions aligned with `test_gen.classes`.
test_gen = ImageDataGenerator().flow_from_directory(
    test_dir, target_size=image_size, batch_size=batch_size, class_mode='categorical', shuffle=False)


## 5. Construction du modèle EfficientNetB5

In [None]:

# ImageNet-pretrained EfficientNetB5 backbone without its classification head.
# The input shape is derived from `image_size` so the model always matches the
# size the generators resize images to.
base_model = EfficientNetB5(weights='imagenet', include_top=False, input_shape=image_size + (3,))

# Freeze the first 95 backbone layers; every later layer is fine-tuned.
# NOTE(review): the later layers include BatchNormalization layers, which the
# Keras fine-tuning guide recommends keeping frozen — confirm the 95-layer
# boundary is intentional.
for layer in base_model.layers[:95]:
    layer.trainable = False

# New head: global average pooling -> 128-unit ReLU layer -> softmax with one
# output per class.
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(128, activation='relu')(x)
output = Dense(len(classes), activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=output)

# categorical_crossentropy matches the one-hot labels produced by
# class_mode='categorical' in the generators.
model.compile(optimizer=Adam(learning_rate=1e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


## 6. Entraînement du modèle

In [None]:

# Early-stopping patience must exceed the ReduceLROnPlateau patience. With
# equal values, a plateau in val_loss normally triggers both callbacks on the
# same epoch, so training stops before the lowered learning rate is ever
# used. A patience of 7 leaves room for two reductions (after 3 and 6 stalled
# epochs) before stopping.
# NOTE(review): the checkpoint tracks val_accuracy while early stopping
# restores weights on val_loss, so 'best_model.h5' and the in-memory model
# may come from different epochs.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7),
    ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True)
]

# Up to 50 epochs; the callbacks above decide when to stop earlier.
history = model.fit(train_gen, validation_data=val_gen, epochs=50, callbacks=callbacks)


## 7. Évaluation et visualisation des performances

## Évolution des métriques pendant l'entraînement

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Pull each per-epoch series out of the Keras History object.
metrics_log = history.history
train_accuracy = np.asarray(metrics_log['accuracy'])
val_accuracy = np.asarray(metrics_log['val_accuracy'])
train_loss = np.asarray(metrics_log['loss'])
val_loss = np.asarray(metrics_log['val_loss'])

# One x position per completed epoch, numbered from 1.
epochs = np.arange(1, train_accuracy.shape[0] + 1)

# (series, legend label, marker, colour) for each curve, in drawing order.
curves = (
    (train_accuracy, 'Exactitude (entraînement)', 'o', 'green'),
    (val_accuracy, 'Exactitude (validation)', 'o', 'blue'),
    (train_loss, 'Perte (entraînement)', 's', 'red'),
    (val_loss, 'Perte (validation)', 's', 'orange'),
)

# Accuracy and loss share a single set of axes.
plt.figure(figsize=(10, 6))
for series, caption, symbol, shade in curves:
    plt.plot(epochs, series, label=caption, marker=symbol, color=shade)

plt.title("Évolution de l'exactitude et de la perte pendant l'entraînement")
plt.xlabel("Époques")
plt.ylabel("Valeurs")
plt.legend(loc='center right')
plt.grid(True)
plt.tight_layout()
plt.show()

## Évaluation de la performance du modèle sur les données de test

In [None]:

# Overall loss and accuracy on the held-out test split.
loss, acc = model.evaluate(test_gen)
print(f"Test Accuracy: {acc:.4f} | Test Loss: {loss:.4f}")

# Per-image predictions; each row of probabilities is reduced to the index of
# its most likely class.
y_pred_probs = model.predict(test_gen)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = test_gen.classes

# Order the class names by their integer index so that position i in
# `class_names` is guaranteed to be the name of label i.
class_names = sorted(test_gen.class_indices, key=test_gen.class_indices.get)
label_ids = list(range(len(class_names)))

# Passing `labels` keeps one row/column per known class even if a class never
# appears in y_true or y_pred; otherwise the matrix would shrink and no longer
# line up with the tick labels / target_names.
cm = confusion_matrix(y_true, y_pred, labels=label_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title("Matrice de confusion")
plt.xlabel("Prédictions")
plt.ylabel("Vérités")
plt.show()

print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names, digits=4))
