# Import

In [None]:
import face_recognition
import numpy as np
import os
from tqdm import tqdm

from collections import defaultdict
from sklearn.cluster import DBSCAN

# Constantes

In [26]:
# Source directory holding the photos to analyse
# (only the source folder is defined here; no destination folder is set).
PATH_DIR_PHOTO = "data/data_reduit_10/"

# Liste des photos

In [27]:
# File names of the images found directly inside PATH_DIR_PHOTO.
# - Extensions are matched with their leading dot, so a name that merely ends
#   with the letters "jpg" or "png" (e.g. "scanjpg") is not taken for an image.
# - os.listdir() returns entries in arbitrary order; sorting keeps the photo
#   order (and therefore the photo ids assigned later) stable between runs.
photos = sorted(
    f for f in os.listdir(PATH_DIR_PHOTO)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

In [28]:
# Sanity check: number of image files found (shown as the cell output)
len(photos)

13

# Détection des visages et extraction de l'empreinte de chaque visage

In [29]:
# One record (dict) per detected face, with the keys
# 'photo_id', 'photo_name', 'location' and 'embedding'.
all_face_embeddings = []

# enumerate() supplies the photo index instead of a hand-maintained counter.
# tqdm wraps the list itself (not the enumerate object) so the progress bar
# still knows the total number of photos.
for photo_id, photo_name in enumerate(tqdm(photos)):
    photo_path = os.path.join(PATH_DIR_PHOTO, photo_name)
    image = face_recognition.load_image_file(photo_path)

    # 1. Locate every face in the image
    face_locations = face_recognition.face_locations(image)

    # 2. Compute the embedding of each located face
    face_encodings = face_recognition.face_encodings(image, face_locations)

    # 3. Keep one record per face for the clustering step;
    #    if no face was located, nothing is added for this photo.
    all_face_embeddings.extend(
        {
            'photo_id': photo_id,
            'photo_name': photo_name,
            'location': loc,
            'embedding': enc,
        }
        for loc, enc in zip(face_locations, face_encodings)
    )

100%|██████████| 13/13 [02:29<00:00, 11.53s/it]


# Clustering

In [43]:
# DBSCAN hyper-parameters -- adjust both according to your own tests.
DBSCAN_EPS = 0.5        # maximum distance for two embeddings to be neighbours
DBSCAN_MIN_SAMPLES = 2  # neighbourhood size required to form a core point

# Gather only the embeddings into a NumPy array (one entry per face)
embeddings = np.array([face["embedding"] for face in all_face_embeddings])

dbscan = DBSCAN(metric='euclidean', eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)

if len(embeddings) > 0:
    dbscan.fit(embeddings)
    # dbscan.labels_ holds one cluster number per embedding;
    # -1 marks the "outliers" that belong to no cluster.
    labels = dbscan.labels_
else:
    # No face was detected at all: fitting DBSCAN on an empty array raises,
    # so skip the fit and expose an empty label array instead.
    labels = np.array([], dtype=int)

# Attach its cluster number to each face, stored as a plain Python int
# rather than a NumPy scalar.
for face, label in zip(all_face_embeddings, labels):
    face['cluster_id'] = int(label)


In [44]:
# Group photo names by cluster id. One entry is appended per face, so a photo
# can appear several times in the same list.
clusters = defaultdict(list)
for face in all_face_embeddings:
    clusters[face['cluster_id']].append(face['photo_name'])


# Visualisation

In [45]:
# Keep only real cluster ids: the label -1 is removed, so faces carrying it
# are not counted in the total printed below.
unique_cluster_ids = set(labels) - {-1}
print(f"Nombre de visages distincts : {len(unique_cluster_ids)}")

# Iterate in sorted order: set iteration order is not guaranteed, and sorting
# keeps the printed report identical from one run to the next.
for c_id in sorted(unique_cluster_ids):
    print(f"\nCluster {c_id} :")
    photos_in_cluster = set(clusters[c_id])  # drop duplicate photo names
    for p in sorted(photos_in_cluster):
        print(f"  - {p}")


Nombre de visages distincts : 3

Cluster 0 :
  - IMG_2735.JPG
  - IMG_2732.JPG

Cluster 1 :
  - IMG_2737.JPG
  - IMG_2879.JPG

Cluster 2 :
  - IMG_2735.JPG
  - IMG_2859.JPG
