In [1]:
from PIL import Image
import os
import pandas as pd
import numpy as np
import torch
from tabulate import tabulate
from transformers import CLIPProcessor, CLIPModel
import time
from sklearn.cluster import DBSCAN
from collections import defaultdict

In [2]:
def get_image_paths(directory):
    allowed_extensions = {".jpg", ".jpeg", ".png"}
    image_paths = [os.path.join(directory, filename) for filename in os.listdir(directory) if os.path.splitext(filename)[1].lower() in allowed_extensions]
    return image_paths

In [30]:
def dms_to_decimal(dms, ref):
    degrees, minutes, seconds = dms
    decimal = degrees + minutes / 60 + seconds / 3600
    if ref in ['S', 'W']:
        decimal = -decimal
    return decimal

def extract_coordinates(gps_dict):
    lat = dms_to_decimal(gps_dict[2], gps_dict[1])
    lon = dms_to_decimal(gps_dict[4], gps_dict[3])
    return lat, lon

In [31]:
def create_df(image_paths):
    image_list = []
    for path in image_paths:
        image = Image.open(path)
        image_name = os.path.basename(path)
        exifdata = image._getexif()
        date_time, latitude, longitude = None, None, None
        if exifdata:
            for tag_id, value in exifdata.items():
                tag = Image.ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTime":
                    date_time = value
                elif tag == "GPSInfo":
                    gps_filtered = {k: value[k] for k in [1, 2, 3, 4] if k in value}
                    if gps_filtered:
                        lat, lon = extract_coordinates(gps_filtered)
                        latitude = lat
                        longitude = lon

            image_list.append((image_name, path, date_time, latitude, longitude))

        else:
            print("Aucune donnée EXIF trouvée.")

    df = pd.DataFrame(image_list, columns=["image_name", "path", "date_time", "latitude", "longitude"])

    return df

In [32]:
directory = "photos_victor"
image_paths = get_image_paths(directory)
device = "cuda" if torch.cuda.is_available() else "cpu"
df = create_df(image_paths)
#print(tabulate(df, headers="keys", tablefmt="psql"))

In [33]:
print(tabulate(df, headers="keys", tablefmt="psql"))

+----+--------------------------------+----------------------------------------------+---------------------+------------+-------------+
|    | image_name                     | path                                         | date_time           |   latitude |   longitude |
|----+--------------------------------+----------------------------------------------+---------------------+------------+-------------|
|  0 | IMG_20250105_144147.jpg        | photos_victor\IMG_20250105_144147.jpg        | 2025:01:05 14:41:47 |    46.8121 |    -71.2054 |
|  1 | IMG_20250105_144436.jpg        | photos_victor\IMG_20250105_144436.jpg        | 2025:01:05 14:44:36 |    46.8121 |    -71.2054 |
|  2 | IMG_20250105_144540.jpg        | photos_victor\IMG_20250105_144540.jpg        | 2025:01:05 14:45:40 |    46.812  |    -71.2056 |
|  3 | IMG_20250105_144750.jpg        | photos_victor\IMG_20250105_144750.jpg        | 2025:01:05 14:47:50 |    46.812  |    -71.2056 |
|  4 | IMG_20250105_145543_BURST1.jpg | photos_v

In [201]:
print(torch.cuda.is_available())

True


## Methode 1 : Clustering par similarité des voisins proches

In [202]:
def image_embedding(paths):
    clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-L-14-laion2B-s32B-b82K").to(device)
    clip_processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-L-14-laion2B-s32B-b82K")

    images = []

    for path in paths:
        image = Image.open(path).convert("RGB")
        images.append(image)

    # Prétraitement des images en batch
    image_inputs = clip_processor(images=images, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        image_embeddings = clip_model.get_image_features(**image_inputs)

    # Normalisation
    image_embeddings = image_embeddings / image_embeddings.norm(p=2, dim=-1, keepdim=True)
    image_embeddings = image_embeddings.cpu().numpy()

    return image_embeddings


In [203]:
embedded_images = image_embedding(image_paths)

In [204]:
def similarity_clustering(embeddings, paths, threshold=0.6, n_neighbors=2):
    image_names = [os.path.basename(path) for path in paths]
    N = len(image_names)
    clusters = {}
    cluster_id = 0
    current_cluster = []

    for i in range(N):
        current_image = image_names[i]
        has_strong_link = False

        for j in range(1, n_neighbors + 1):
            if i + j < N:
                sim = np.dot(embeddings[i], embeddings[i + j])
                if sim > threshold:
                    has_strong_link = True
                    break

        current_cluster.append(current_image)

        if not has_strong_link:
            clusters[f"cluster_{cluster_id}"] = current_cluster
            cluster_id += 1
            current_cluster = []

    # Gère le dernier cluster s'il reste des images non assignées
    if current_cluster:
        clusters[f"cluster_{cluster_id}"] = current_cluster

    return clusters

In [205]:
def photos_to_add(paths, embeddings, threshold=0.6):
    photos = []
    outliers = []
    all_photos = False

    sim_furthest = np.dot(embeddings[0], embeddings[-1])
    if sim_furthest > threshold:
        all_photos = True

    for i in range(1, len(paths)):
        sim = np.dot(embeddings[0], embeddings[i])
        print(f"Les photos {paths[0]} et {paths[i]} ont une similarité de {sim:.2f}")
        if sim > threshold :
            photos.append((paths[i], sim))
        elif all_photos:
            photos.append((paths[i], sim))
            outliers.append(paths[i])

    return photos, outliers

In [206]:
def neighbors_similarity_clustering(embeddings, paths, threshold=0.6, n_neighbors=3):
    N = len(paths)
    clusters = {}
    cluster_id = 0
    current_cluster = []
    already_clustered = set()
    all_outliers = []

    for i in range(N):
        current_img = paths[i]
        if current_img in all_outliers:
            continue

        end_idx = min(i + n_neighbors + 1, N)
        checking_paths = paths[i:end_idx]
        checking_embeddings = embeddings[i:end_idx]

        photos, outliers = photos_to_add(checking_paths, checking_embeddings, threshold)
        if outliers:
            all_outliers.append(outliers[0])

        if photos and current_img not in already_clustered:
            current_cluster.append(current_img)
            already_clustered.add(current_img)
            print(f"Ajout de {current_img} au cluster {cluster_id}")

        for elem in photos:
            if elem[0] not in already_clustered:
                current_cluster.append(elem[0])
                already_clustered.add(elem[0])
                print(f"Ajout de {elem[0]} au cluster {cluster_id}")

        if not photos and current_cluster:
            clusters[f"cluster_{cluster_id}"] = current_cluster
            cluster_id += 1
            current_cluster = []

        print(f"Nombre de photos similaires : {len(photos)}")

    print(len(current_cluster))
    if len(current_cluster) > 1:
        clusters[f"cluster_{cluster_id}"] = current_cluster

    other_cluster = []
    for path in image_paths:
        found = False
        for cluster_id, images in clusters.items():
            if path in images:
                found = True
                break
        if not found:
            other_cluster.append(path)
    if other_cluster:
        clusters["others"] = other_cluster

    return clusters

In [207]:
clusters = neighbors_similarity_clustering(embedded_images, image_paths)

Les photos photos_victor\IMG_20250105_144147.jpg et photos_victor\IMG_20250105_144436.jpg ont une similarité de 0.79
Les photos photos_victor\IMG_20250105_144147.jpg et photos_victor\IMG_20250105_144540.jpg ont une similarité de 0.66
Les photos photos_victor\IMG_20250105_144147.jpg et photos_victor\IMG_20250105_144750.jpg ont une similarité de 0.69
Ajout de photos_victor\IMG_20250105_144147.jpg au cluster 0
Ajout de photos_victor\IMG_20250105_144436.jpg au cluster 0
Ajout de photos_victor\IMG_20250105_144540.jpg au cluster 0
Ajout de photos_victor\IMG_20250105_144750.jpg au cluster 0
Nombre de photos similaires : 3
Les photos photos_victor\IMG_20250105_144436.jpg et photos_victor\IMG_20250105_144540.jpg ont une similarité de 0.69
Les photos photos_victor\IMG_20250105_144436.jpg et photos_victor\IMG_20250105_144750.jpg ont une similarité de 0.76
Les photos photos_victor\IMG_20250105_144436.jpg et photos_victor\IMG_20250105_145543_BURST1.jpg ont une similarité de 0.73
Ajout de photos_vic

In [208]:
print(clusters)

{'cluster_0': ['photos_victor\\IMG_20250105_144147.jpg', 'photos_victor\\IMG_20250105_144436.jpg', 'photos_victor\\IMG_20250105_144540.jpg', 'photos_victor\\IMG_20250105_144750.jpg', 'photos_victor\\IMG_20250105_145543_BURST1.jpg', 'photos_victor\\IMG_20250105_150144.jpg'], 'cluster_1': ['photos_victor\\IMG_20250105_161636.jpg', 'photos_victor\\IMG_20250105_165438.jpg', 'photos_victor\\IMG_20250105_165450.jpg', 'photos_victor\\IMG_20250105_165457.jpg'], 'cluster_2': ['photos_victor\\IMG_20250108_143517.jpg', 'photos_victor\\IMG_20250109_174849.jpg', 'photos_victor\\IMG_20250110_163528.jpg', 'photos_victor\\IMG_20250110_164345.jpg'], 'cluster_3': ['photos_victor\\IMG_20250111_152149.jpg', 'photos_victor\\IMG_20250111_152150.jpg', 'photos_victor\\IMG_20250111_154046.jpg'], 'cluster_4': ['photos_victor\\IMG_20250119_131124_BURST1.jpg', 'photos_victor\\IMG_20250119_131214_BURST1.jpg', 'photos_victor\\IMG_20250119_131317.jpg', 'photos_victor\\IMG_20250119_131334_BURST1.jpg', 'photos_victor\

In [209]:
for cluster_id, images in clusters.items():
    for image in images:
        print(f"{cluster_id} : {image}")
    print("\n")

cluster_0 : photos_victor\IMG_20250105_144147.jpg
cluster_0 : photos_victor\IMG_20250105_144436.jpg
cluster_0 : photos_victor\IMG_20250105_144540.jpg
cluster_0 : photos_victor\IMG_20250105_144750.jpg
cluster_0 : photos_victor\IMG_20250105_145543_BURST1.jpg
cluster_0 : photos_victor\IMG_20250105_150144.jpg


cluster_1 : photos_victor\IMG_20250105_161636.jpg
cluster_1 : photos_victor\IMG_20250105_165438.jpg
cluster_1 : photos_victor\IMG_20250105_165450.jpg
cluster_1 : photos_victor\IMG_20250105_165457.jpg


cluster_2 : photos_victor\IMG_20250108_143517.jpg
cluster_2 : photos_victor\IMG_20250109_174849.jpg
cluster_2 : photos_victor\IMG_20250110_163528.jpg
cluster_2 : photos_victor\IMG_20250110_164345.jpg


cluster_3 : photos_victor\IMG_20250111_152149.jpg
cluster_3 : photos_victor\IMG_20250111_152150.jpg
cluster_3 : photos_victor\IMG_20250111_154046.jpg


cluster_4 : photos_victor\IMG_20250119_131124_BURST1.jpg
cluster_4 : photos_victor\IMG_20250119_131214_BURST1.jpg
cluster_4 : photos_vi

In [210]:
stop

NameError: name 'stop' is not defined

## Methode 2 : Clustering par jour puis ensuite par similarités

In [349]:
def day_sorting(df):
    days = {}
    for index, row in df.iterrows():
        date = row["date_time"]
        if date:
            day = date.split(" ")[0]
            if day not in days:
                days[day] = []
            days[day].append(row["path"])

    return days

In [350]:
days_dict = day_sorting(df)
days_dict

{'2025:01:05': ['temp\\IMG_20250105_144147.jpg',
  'temp\\IMG_20250105_144436.jpg',
  'temp\\IMG_20250105_144540.jpg',
  'temp\\IMG_20250105_144750.jpg',
  'temp\\IMG_20250105_145543_BURST1.jpg',
  'temp\\IMG_20250105_150144.jpg',
  'temp\\IMG_20250105_161636.jpg',
  'temp\\IMG_20250105_165438.jpg',
  'temp\\IMG_20250105_165450.jpg',
  'temp\\IMG_20250105_165457.jpg'],
 '2025:01:07': ['temp\\IMG_20250107_180316.jpg'],
 '2025:01:08': ['temp\\IMG_20250108_143517.jpg'],
 '2025:01:09': ['temp\\IMG_20250109_174849.jpg'],
 '2025:01:10': ['temp\\IMG_20250110_163528.jpg',
  'temp\\IMG_20250110_164345.jpg'],
 '2025:01:11': ['temp\\IMG_20250111_152149.jpg',
  'temp\\IMG_20250111_152150.jpg',
  'temp\\IMG_20250111_154046.jpg'],
 '2025:01:13': ['temp\\IMG_20250113_201010.jpg'],
 '2025:01:14': ['temp\\IMG_20250114_193654.jpg'],
 '2025:01:19': ['temp\\IMG_20250119_131124_BURST1.jpg',
  'temp\\IMG_20250119_131214_BURST1.jpg',
  'temp\\IMG_20250119_131317.jpg',
  'temp\\IMG_20250119_131334_BURST1.jpg'

In [351]:
def days_embedding(days_dict):
    embeddings_dict = {}
    for day, images in days_dict.items():
        #print(f"Jour : {day}, Images : {images}")
        embeddings = image_embedding(images)
        embeddings_dict[day] = []
        for i, image in enumerate(images):
            embeddings_dict[day].append({
                'path' : image,
                'embedding' : embeddings[i]
            })
    return embeddings_dict

In [352]:
embeddings_dict = days_embedding(days_dict)
embeddings_dict

{'2025:01:05': [{'path': 'temp\\IMG_20250105_144147.jpg',
   'embedding': array([-3.58425779e-03, -1.48845926e-01,  4.41854894e-02,  1.80442706e-02,
           2.18128413e-02,  1.71009861e-02,  6.98811421e-03, -3.69332843e-02,
          -2.91799847e-03, -5.54098235e-03, -2.09506671e-03,  3.11359204e-02,
          -5.16058616e-02,  3.84400859e-02,  2.75507029e-02,  2.87331901e-02,
           6.26857392e-03,  8.99761729e-03,  3.35465074e-02, -4.20325296e-03,
           7.90882483e-03,  2.40337681e-02, -2.03703996e-02,  3.20118517e-02,
           2.09776349e-02,  1.09827965e-02, -3.73453237e-02, -2.02437416e-02,
          -4.15898254e-03,  1.62132329e-03,  9.40808281e-03, -4.71020006e-02,
          -2.58663222e-02, -1.55791966e-02,  7.48859392e-03,  2.92664114e-02,
           4.15957905e-02, -1.35781029e-02,  5.15265688e-02, -2.02230718e-02,
           3.00247837e-02, -2.35040890e-04,  1.55723859e-02, -3.90053005e-03,
          -1.26077235e-02, -2.69638319e-02,  4.13442310e-03,  1.9563576

### 2.1 Clustering par similarité des voisins proches

In [353]:
clustering_by_days = {}
for day, image_list in embeddings_dict.items():
    paths = [image['path'] for image in image_list]
    embeddings = np.array([image['embedding'] for image in image_list])
    print(f"Jour : {day}, Images : {paths}")
    clusters = neighbors_similarity_clustering(embeddings, paths)
    clustering_by_days[day] = clusters

Jour : 2025:01:05, Images : ['temp\\IMG_20250105_144147.jpg', 'temp\\IMG_20250105_144436.jpg', 'temp\\IMG_20250105_144540.jpg', 'temp\\IMG_20250105_144750.jpg', 'temp\\IMG_20250105_145543_BURST1.jpg', 'temp\\IMG_20250105_150144.jpg', 'temp\\IMG_20250105_161636.jpg', 'temp\\IMG_20250105_165438.jpg', 'temp\\IMG_20250105_165450.jpg', 'temp\\IMG_20250105_165457.jpg']
Jour : 2025:01:07, Images : ['temp\\IMG_20250107_180316.jpg']
Jour : 2025:01:08, Images : ['temp\\IMG_20250108_143517.jpg']
Jour : 2025:01:09, Images : ['temp\\IMG_20250109_174849.jpg']
Jour : 2025:01:10, Images : ['temp\\IMG_20250110_163528.jpg', 'temp\\IMG_20250110_164345.jpg']
Jour : 2025:01:11, Images : ['temp\\IMG_20250111_152149.jpg', 'temp\\IMG_20250111_152150.jpg', 'temp\\IMG_20250111_154046.jpg']
Jour : 2025:01:13, Images : ['temp\\IMG_20250113_201010.jpg']
Jour : 2025:01:14, Images : ['temp\\IMG_20250114_193654.jpg']
Jour : 2025:01:19, Images : ['temp\\IMG_20250119_131124_BURST1.jpg', 'temp\\IMG_20250119_131214_BURST

In [354]:
clustering_by_days

{'2025:01:05': {'cluster_0': ['IMG_20250105_144147.jpg',
   'IMG_20250105_144436.jpg',
   'IMG_20250105_144540.jpg',
   'IMG_20250105_144750.jpg',
   'IMG_20250105_145543_BURST1.jpg',
   'IMG_20250105_150144.jpg'],
  'cluster_1': ['IMG_20250105_161636.jpg',
   'IMG_20250105_165438.jpg',
   'IMG_20250105_165450.jpg',
   'IMG_20250105_165457.jpg']},
 '2025:01:07': {'cluster_0': ['IMG_20250107_180316.jpg']},
 '2025:01:08': {'cluster_0': ['IMG_20250108_143517.jpg']},
 '2025:01:09': {'cluster_0': ['IMG_20250109_174849.jpg']},
 '2025:01:10': {'cluster_0': ['IMG_20250110_163528.jpg'],
  'cluster_1': ['IMG_20250110_164345.jpg']},
 '2025:01:11': {'cluster_0': ['IMG_20250111_152149.jpg',
   'IMG_20250111_152150.jpg',
   'IMG_20250111_154046.jpg']},
 '2025:01:13': {'cluster_0': ['IMG_20250113_201010.jpg']},
 '2025:01:14': {'cluster_0': ['IMG_20250114_193654.jpg']},
 '2025:01:19': {'cluster_0': ['IMG_20250119_131124_BURST1.jpg',
   'IMG_20250119_131214_BURST1.jpg',
   'IMG_20250119_131317.jpg'],
 

### 2.2 Similarités entre toutes les photos d'une journée

In [355]:
def matrice_similarities(embeddings_dict):
    similarities_matrices = {}
    for day, image_list in embeddings_dict.items():
        paths = [image['path'] for image in image_list]
        embeddings = np.array([image['embedding'] for image in image_list])
        N = len(paths)
        similarities = np.zeros((N, N))

        for i in range(N):
            for j in range(N):
                sim = np.dot(embeddings[i], embeddings[j])
                similarities[i][j] = sim

        similarities_matrices[day] = ({'paths': paths, 'similarities': similarities})

    return similarities_matrices

In [356]:
def print_similarity_matrix(similarity_matrix, paths):
    N = len(paths)
    print("Matrice de similarité :")
    print("\t" + "\t".join([f"{path}" for path in paths]))
    for i in range(N):
        print(f"{paths[i]}\t" + "\t".join([f"{similarity_matrix[i][j]:.2f}" for j in range(N)]))
    print("\n")

In [357]:
similarities_matrices_dict = matrice_similarities(embeddings_dict)
for day, data in similarities_matrices_dict.items():
    print(f"Jour : {day}")
    similarities_matrices = data['similarities']
    image_paths = data['paths']
    print_similarity_matrix(similarities_matrices, image_paths)

Jour : 2025:01:05
Matrice de similarité :
	temp\IMG_20250105_144147.jpg	temp\IMG_20250105_144436.jpg	temp\IMG_20250105_144540.jpg	temp\IMG_20250105_144750.jpg	temp\IMG_20250105_145543_BURST1.jpg	temp\IMG_20250105_150144.jpg	temp\IMG_20250105_161636.jpg	temp\IMG_20250105_165438.jpg	temp\IMG_20250105_165450.jpg	temp\IMG_20250105_165457.jpg
temp\IMG_20250105_144147.jpg	1.00	0.79	0.66	0.69	0.70	0.67	0.38	0.49	0.45	0.46
temp\IMG_20250105_144436.jpg	0.79	1.00	0.69	0.76	0.73	0.80	0.43	0.54	0.51	0.53
temp\IMG_20250105_144540.jpg	0.66	0.69	1.00	0.76	0.60	0.60	0.46	0.51	0.45	0.46
temp\IMG_20250105_144750.jpg	0.69	0.76	0.76	1.00	0.76	0.70	0.39	0.48	0.48	0.50
temp\IMG_20250105_145543_BURST1.jpg	0.70	0.73	0.60	0.76	1.00	0.74	0.35	0.44	0.44	0.47
temp\IMG_20250105_150144.jpg	0.67	0.80	0.60	0.70	0.74	1.00	0.34	0.46	0.48	0.51
temp\IMG_20250105_161636.jpg	0.38	0.43	0.46	0.39	0.35	0.34	1.00	0.69	0.56	0.58
temp\IMG_20250105_165438.jpg	0.49	0.54	0.51	0.48	0.44	0.46	0.69	1.00	0.72	0.68
temp\IMG_20250105_165

In [362]:
def cluster_from_similarity_matrices(similarities_matrices_dict, threshold=0.6):
    clusters_by_day = {}

    for day, data in similarities_matrices_dict.items():
        paths = data["paths"]
        sim_matrix = data["similarities"]
        #print(f"Jour : {day}, Images : {paths}, Similarities : {sim_matrix}")


        # DBSCAN avec une métrique de distance pré-calculée
        model = DBSCAN(eps=1-threshold, min_samples=1, metric='precomputed')
        labels = model.fit_predict(sim_matrix)

        # Organisation des clusters
        clusters = {}
        for path, label in zip(paths, labels):
            cluster_name = f"cluster_{label}"
            if cluster_name not in clusters:
                clusters[cluster_name] = []
            clusters[cluster_name].append(path)

        clusters_by_day[day] = clusters

    return clusters_by_day

In [363]:
clusters_by_day = cluster_from_similarity_matrices(similarities_matrices_dict)
clusters_by_day

{'2025:01:05': {'cluster_0': ['temp\\IMG_20250105_144147.jpg',
   'temp\\IMG_20250105_144750.jpg',
   'temp\\IMG_20250105_145543_BURST1.jpg',
   'temp\\IMG_20250105_150144.jpg',
   'temp\\IMG_20250105_161636.jpg'],
  'cluster_-1': ['temp\\IMG_20250105_144436.jpg',
   'temp\\IMG_20250105_144540.jpg',
   'temp\\IMG_20250105_165438.jpg',
   'temp\\IMG_20250105_165450.jpg',
   'temp\\IMG_20250105_165457.jpg']},
 '2025:01:07': {'cluster_-1': ['temp\\IMG_20250107_180316.jpg']},
 '2025:01:08': {'cluster_-1': ['temp\\IMG_20250108_143517.jpg']},
 '2025:01:09': {'cluster_-1': ['temp\\IMG_20250109_174849.jpg']},
 '2025:01:10': {'cluster_0': ['temp\\IMG_20250110_163528.jpg',
   'temp\\IMG_20250110_164345.jpg']},
 '2025:01:11': {'cluster_-1': ['temp\\IMG_20250111_152149.jpg',
   'temp\\IMG_20250111_152150.jpg',
   'temp\\IMG_20250111_154046.jpg']},
 '2025:01:13': {'cluster_-1': ['temp\\IMG_20250113_201010.jpg']},
 '2025:01:14': {'cluster_-1': ['temp\\IMG_20250114_193654.jpg']},
 '2025:01:19': {'clu