In [68]:
from datetime import datetime
import hdbscan
from pathlib import Path
from typing import Optional
import pandas as pd
from loguru import logger
from sklearn.datasets import make_blobs


In [69]:
import pickle

n_components = 3
min_dist = 0.1
n_neighbors = 10
metric = "euclidean"
bbdd="flickr"
dino_model="base"
norm=True
scaler = "None"


# Load images
path = Path("./data/flickr/flickr_validated_imgs_7000")
image_extensions = ['.jpg', '.jpeg', '.png', '.bmp']
# Find all image files recursively and filter by extension (lowercase only)
image_paths = [img_path for img_path in path.rglob('*') if img_path.suffix.lower() in image_extensions]
# Convert to lowercase and remove duplicates (especially relevant for Windows)
unique_image_paths = {img_path.resolve().as_posix().lower(): img_path for img_path in image_paths}
images = list(unique_image_paths.values())

# Load cached embeddings already reduced based on params
data = pickle.load(
                open(str(f"cache_embeddings_red/bbdd_{bbdd}_dino_model_{dino_model}--norm_{norm}--scaler_{scaler}--dimred_umap--reduction_params_metric={metric}_n_components={n_components}_n_neighbors={n_neighbors}_min_dist={min_dist}.pkl"), "rb")
)

print(data)

             0         1         2
0    -1.957472  1.558334 -0.152311
1     3.503721  2.100925 -0.866593
2    -1.582763  1.074976 -1.674532
3    -1.069139  1.439669  2.243071
4    -0.807986  2.632015 -4.292572
...        ...       ...       ...
7067 -1.254056  2.357293 -4.528391
7068  2.932524  0.689199 -1.994015
7069 -3.407852 -2.284583  2.216865
7070  0.760709  0.634420  0.636125
7071  0.843254  1.535593  1.070764

[7072 rows x 3 columns]


In [70]:
# Aplicamos hdbscan

model = hdbscan.HDBSCAN(
        min_cluster_size=5,
        min_samples=3,
        cluster_selection_epsilon=0.09038653011927535,
        metric='chebyshev',
        cluster_selection_method='leaf',
        gen_min_span_tree=True,
        prediction_data=True
    )

labels =model.fit_predict(data)




In [71]:
labels

array([123, 255,  -1, ...,  63,  -1,  -1])

In [72]:
# Get number of clusters (excluding noise)
from sklearn.metrics import davies_bouldin_score, silhouette_score


n_clusters = len(set(labels)) - (1 if -1 in labels else 0)



score_silhouette = silhouette_score(data[labels != -1], labels[labels != -1])

score_davies = davies_bouldin_score(data[labels != -1], labels[labels != -1])


# Noise points
noise_points = (labels == -1).sum()
# Calcular la proporción de ruido
noise_ratio = noise_points / len(data)


In [73]:
print(score_silhouette)
print(score_davies)
print(noise_points)
print(noise_ratio)

0.5166675
0.5380218411429121
1548
0.21889140271493213


In [74]:
probs = model.probabilities_
labels = model.labels_
print([x for x in probs])
print(labels)


[1.0, 1.0, 0.0, 0.229765420187313, 1.0, 1.0, 1.0, 1.0, 0.6174614751520585, 0.8349419135551948, 0.8148313316622839, 0.0, 0.42277999784982145, 1.0, 0.0, 1.0, 1.0, 1.0, 0.7161001675112911, 1.0, 1.0, 0.0, 1.0, 1.0, 0.22277447580530854, 0.6644267858014746, 0.6607135196968614, 0.5352779269602578, 1.0, 1.0, 1.0, 1.0, 1.0, 0.959522901802734, 0.3126651161740465, 0.5352779269602578, 0.682762483051347, 0.0, 0.4081027966742253, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.8290503531623079, 1.0, 1.0, 0.92397961700838, 1.0, 0.0, 1.0, 1.0, 0.38214530483918124, 0.0, 1.0, 1.0, 1.0, 0.9638363409176764, 0.7273801304927022, 1.0, 1.0, 0.5452071703088619, 1.0, 1.0, 0.8959727779231949, 0.0, 0.9684218098115379, 0.42751646272007077, 1.0, 0.645030560029499, 1.0, 0.0, 0.0, 0.0, 0.5352779269602578, 0.0, 0.0, 0.0, 0.6510301013917847, 1.0, 0.8643185468239464, 0.0, 0.0, 0.28703969022265247, 0.991377750979801, 0.0, 1.0, 0.6044732014233473, 1.0, 0.8871407352580803, 1.0, 0.0, 1.0, 0.8856209150326797, 0.0, 0.5748911689227088, 0

In [75]:
probs = hdbscan.all_points_membership_vectors(model)

In [76]:
print(probs)

[[1.79349306e-307 2.15751362e-307 2.57562685e-307 ... 6.06083499e-307
  5.79453628e-307 5.64640890e-307]
 [5.71460822e-004 4.09370382e-004 4.50232269e-004 ... 1.01592403e-003
  1.03380809e-003 1.09490332e-003]
 [2.69282051e-005 2.89476678e-005 3.39141459e-005 ... 2.10177052e-004
  1.92056088e-004 1.82653437e-004]
 ...
 [8.22333609e-004 6.74056782e-004 8.48237932e-004 ... 6.91398788e-004
  6.74169989e-004 6.64286546e-004]
 [1.14358962e-004 8.78336153e-005 9.94239013e-005 ... 3.25718581e-004
  3.14306025e-004 3.07865404e-004]
 [2.27482259e-004 1.94314248e-004 2.19709353e-004 ... 5.57020444e-004
  5.39463862e-004 5.29499246e-004]]


In [77]:
import numpy as np

noise_indices = np.where(labels == -1)[0]

In [78]:
# Ejemplo: recopilamos el clúster con mayor probabilidad para cada punto ruido
# y guardamos la información en una lista de dicts.
records = []
for idx in noise_indices:
    # Vector de probabilidades del punto 'idx'
    prob_vector = probs[idx]

    # Ordenamos las probabilidades de mayor a menor
    sorted_cluster_indices = np.argsort(prob_vector)[::-1]
    
    # Tomamos los índices de los 2 clusters más probables
    top_clusters = sorted_cluster_indices[:1]
    top_probs = prob_vector[top_clusters]

    records.append({
        'point_index': idx,
        'cluster_1': top_clusters[0],
        'prob_1': top_probs[0]
    })

In [79]:
import pandas as pd

df_noise_probs = pd.DataFrame(records)
df_noise_probs.where(df_noise_probs["prob_1"] > 0.05).sort_values(by="cluster_1", ascending=False).head(50)  # ejemplo: mirar las primeras filas

Unnamed: 0,point_index,cluster_1,prob_1
409,1890.0,383.0,0.051718
1034,4756.0,374.0,0.058301
1352,6248.0,369.0,0.089522
124,535.0,367.0,0.050775
264,1232.0,366.0,0.079828
365,1633.0,364.0,0.056204
232,1107.0,364.0,0.059802
1498,6874.0,363.0,0.067172
1288,5981.0,346.0,0.050733
817,3778.0,344.0,0.05479


In [65]:
#Asignar al clúster con mayor probabilidad (ha de ser superior al 10%)

import numpy as np

# Supongamos que ya tienes:
# 1. labels: las etiquetas "duras" de HDBSCAN (model.labels_)
# 2. probs: la matriz de pertenencias blandas => all_points_membership_vectors(model)
#    donde probs.shape = (n_samples, n_clusters_sin_incluir_ruido)

# Indices de los puntos ruido
noise_indices = np.where(labels == -1)[0]

# Reasignamos cada punto ruido
for idx in noise_indices:
    prob_vector = probs[idx]          # Vector de pertenencia al resto de clusters
    best_cluster = np.argmax(prob_vector)    # Índice del clúster con mayor probabilidad
    best_prob = prob_vector[best_cluster]    # Probabilidad más alta

    # Si esa probabilidad supera el 10% (0.1), reasignamos la etiqueta
    if best_prob >= 0.05:
        labels[idx] = best_cluster
    # Si no, se mantiene en -1 (ruido)


In [80]:
from collections import Counter


cluster_images_dict = {}
for i, label in enumerate(labels):
            if label not in cluster_images_dict:
                cluster_images_dict[label] = []
            cluster_images_dict[label].append(images[i])
    
cluster_images_dict = dict(sorted(cluster_images_dict.items()))

print(cluster_images_dict)
label_counter = Counter(labels)
print(len(label_counter.keys()))
print(label_counter)

{-1: [PosixPath('data/flickr/flickr_validated_imgs_7000/teide_NA_773_23981914816__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/guadarrama_NA_477_26530879632__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/guadarrama_NA_4269_50814262142__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/snev_NA_5988_44228420820__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/guadarrama_NA_2738_39214735591__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/guadarrama_NA_3262_35831859810__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/guadarrama_NA_1506_29095677242__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/ordesa_NA_3797_16001131429__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/teide_NA_861_52557364868__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/snev_NA_2893_16914940602__b.jpg'), PosixPath('data/flickr/flickr_validated_imgs_7000/aiguestortes_NA_4458_50036516237__b.jpg'), PosixPath('data/flic

In [81]:

import os
import shutil

from pathlib import Path


def create_cluster_dirs(images_dict, copy_images=True):
    """
    Create a dir for every cluster given in dictionary of images. 
    Move images to those clusters dirs
    Create csv with images asigned to each cluster
    Create pdf with knn(30) closest images to centroid asigned to each cluster
    """
    # logger.info("Copying images from Data path to cluster dirs")
    # For every key (cluster index)
    
    # Get all images in dict format asigned to cluster
    images_dict_format = images_dict
    path_cluster = Path.cwd() / "clusters" / "imgs"
    if os.path.exists(path_cluster):
        shutil.rmtree(path_cluster)
    cluster_data = []
    try:
        for cluster_id, image_paths in images_dict_format.items():
            # Create folder if it doesnt exists
            cluster_dir = os.path.join(path_cluster, str(cluster_id)) 
            os.makedirs(cluster_dir, exist_ok=True)
            # For every path image, copy that image from its path to cluster folder
            for image_path in image_paths:
                cluster_data.append([cluster_id, image_path])
                if copy_images:
                    shutil.copy(image_path, cluster_dir)
        #Guardar el CSV con la información de imágenes y sus clusters
        csv_path = Path.cwd() / "clusters" / "cluster_images.csv"
        df = pd.DataFrame(cluster_data, columns=["cluster", "img"])
        df.sort_values(by="cluster").to_csv(csv_path, index=False)
    except (os.error) as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)

create_cluster_dirs(cluster_images_dict)