In [16]:
from datetime import datetime
import hdbscan
from pathlib import Path
from typing import Optional
import pandas as pd
from loguru import logger
from sklearn.datasets import make_blobs


In [17]:
import pickle

# --- Experiment configuration ---------------------------------------------
bbdd = "flickr"
dino_model = "base"

# Dimensionality-reduction parameters.
# They are interpolated into the cache file name below, so they select which
# pre-computed embedding file gets loaded.
dim_red = "umap"
n_components = 5
min_dist = 0.1
n_neighbors = 5
metric = "euclidean"
norm = True
scaler = "None"


# --- Collect image paths ----------------------------------------------------
path = Path("../../data/flickr/flickr_validated_imgs_7000")
image_extensions = ['.jpg', '.jpeg', '.png', '.bmp']
# Walk the folder recursively and keep files whose lower-cased suffix is an image extension
image_paths = [img_path for img_path in path.rglob('*') if img_path.suffix.lower() in image_extensions]
# De-duplicate on the lower-cased resolved path (especially relevant for Windows)
unique_image_paths = {img_path.resolve().as_posix().lower(): img_path for img_path in image_paths}
images = list(unique_image_paths.values())

# --- Load cached embeddings already reduced with the parameters above -------
cache_file = f"../../data/cache_embeddings_red/bbdd_{bbdd}_dino_model_{dino_model}--norm_{norm}--scaler_{scaler}--dimred_{dim_red}--reduction_params_metric={metric}_n_components={n_components}_n_neighbors={n_neighbors}_min_dist={min_dist}.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load cache files
# produced by this project.
# The context manager closes the file handle (the previous bare open() leaked it).
with open(cache_file, "rb") as cache_handle:
    data = pickle.load(cache_handle)

print(data)

             0         1         2         3         4
0     2.678764 -0.606406 -0.032275  1.172780  1.257389
1    -1.375359 -1.550557  1.989837 -1.964251 -0.160990
2     1.590983 -0.029678 -0.078153  0.727744 -1.106895
3    -1.348136 -0.000254 -0.272157  1.507219  2.934441
4     1.656070 -2.102522 -0.091459  0.681160 -3.783395
...        ...       ...       ...       ...       ...
7067  2.206087 -1.758617 -0.219990  0.834786 -3.871116
7068 -1.542105 -1.326776  1.557476 -1.618267  0.078575
7069  1.746041  1.510906 -2.133772 -0.924746  4.269979
7070 -1.516019  0.611899  0.199857  0.732062  0.496824
7071 -1.756593 -0.007219  1.514402  0.785210  0.752364

[7072 rows x 5 columns]


In [18]:
# Cluster the reduced embeddings with HDBSCAN

from sklearn.cluster import AgglomerativeClustering

clustering = "hdbscan"
eval_method = "silhouette"

# Keyword arguments handed to the HDBSCAN constructor
hdbscan_params = {
    "min_cluster_size": 4,
    "min_samples": 4,
    "cluster_selection_epsilon": 0.3016430007941225,
    "metric": 'manhattan',
    "cluster_selection_method": 'eom',
    "gen_min_span_tree": False,
    "prediction_data": True,
}
model = hdbscan.HDBSCAN(**hdbscan_params)

# Fit the model and keep the label it assigns to each row of `data`
labels = model.fit_predict(data)




In [19]:
labels

array([ 55, 240, 236, ..., 162,  72, 304])

In [20]:
# Number of clusters, not counting the noise label (-1)
from sklearn.metrics import davies_bouldin_score, silhouette_score

has_noise = -1 in labels
n_clusters = len(set(labels)) - (1 if has_noise else 0)

# Both scores are computed only on the points that were assigned to a cluster
clustered = labels != -1
score_silhouette = silhouette_score(data[clustered], labels[clustered])
score_davies = davies_bouldin_score(data[clustered], labels[clustered])

# Count of noise points and their share of the whole dataset
noise_points = (labels == -1).sum()
noise_ratio = noise_points / len(data)


In [21]:
# Show the two scores followed by the noise count and noise ratio, one per line
for value in (score_silhouette, score_davies, noise_points, noise_ratio):
    print(value)

# `score` is the metric selected by `eval_method`; anything other than
# "silhouette" falls back to the Davies-Bouldin score
score = score_silhouette if eval_method == "silhouette" else score_davies

0.5403887
0.5219882897837993
1458
0.2061651583710407


In [22]:
# Read the per-point probabilities and the labels stored on the fitted model
probs = model.probabilities_
labels = model.labels_
print(list(probs))
print(labels)


[1.0, 1.0, 1.0, 0.4977837999817597, 1.0, 1.0, 1.0, 1.0, 0.6439272567556262, 1.0, 1.0, 0.8080902553930078, 0.5033432169759048, 1.0, 0.0, 1.0, 1.0, 1.0, 0.4799196805173192, 1.0, 1.0, 0.9921940433515362, 0.7226628745696588, 1.0, 0.4176183563895836, 0.8617882829863961, 1.0, 0.4885346037423658, 0.26122487212988726, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.5200892511432592, 1.0, 1.0, 0.3400377340541912, 0.9344928196451039, 0.4734413948525408, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.9503108426422348, 1.0, 1.0, 1.0, 1.0, 0.6411631127675607, 0.1913018251320426, 1.0, 0.0, 1.0, 1.0, 0.9165305911765631, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.9787293644275519, 1.0, 1.0, 1.0, 0.19023161450798323, 0.0, 0.0, 0.0, 0.4422611239593375, 1.0, 0.0, 0.6387641353650036, 0.5279015612266693, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.247453849911632, 0.9838447251004722, 0.0, 1.0, 0.0, 0.7850129288418469, 0.933955793479852, 0.0, 1.0, 0.0, 0.5461734926764851, 0.0, 0.6603817850291327, 1.0, 1.0, 1.0, 1.0, 1.0, 0.733927754104

In [23]:
# probs = hdbscan.all_points_membership_vectors(model)

In [24]:
# print(probs)

In [25]:
# import numpy as np

# noise_indices = np.where(labels == -1)[0]

In [26]:
# # Ejemplo: recopilamos el clúster con mayor probabilidad para cada punto ruido
# # y guardamos la información en una lista de dicts.
# records = []
# for idx in noise_indices:
#     # Vector de probabilidades del punto 'idx'
#     prob_vector = probs[idx]

#     # Ordenamos las probabilidades de mayor a menor
#     sorted_cluster_indices = np.argsort(prob_vector)[::-1]
    
#     # Take the index of the single most probable cluster (the slice is [:1], not the top 2)
#     top_clusters = sorted_cluster_indices[:1]
#     top_probs = prob_vector[top_clusters]

#     records.append({
#         'point_index': idx,
#         'cluster_1': top_clusters[0],
#         'prob_1': top_probs[0]
#     })

In [27]:
# import pandas as pd

# df_noise_probs = pd.DataFrame(records)
# df_noise_probs.where(df_noise_probs["prob_1"] > 0.05).sort_values(by="prob_1", ascending=False).head(20)  

In [28]:
# #Asignar al clúster con mayor probabilidad (ha de ser superior al 10%)

# import numpy as np

# # Supongamos que ya tienes:
# # 1. labels: las etiquetas "duras" de HDBSCAN (model.labels_)
# # 2. probs: la matriz de pertenencias blandas => all_points_membership_vectors(model)
# #    donde probs.shape = (n_samples, n_clusters_sin_incluir_ruido)

# # Indices de los puntos ruido
# noise_indices = np.where(labels == -1)[0]

# # Reasignamos cada punto ruido
# for idx in noise_indices:
#     prob_vector = probs[idx]          # Vector de pertenencia al resto de clusters
#     best_cluster = np.argmax(prob_vector)    # Índice del clúster con mayor probabilidad
#     best_prob = prob_vector[best_cluster]    # Probabilidad más alta

#     # If that probability reaches the threshold, reassign the label (NOTE: the check below uses 0.01, i.e. 1%, not 10%)
#     if best_prob >= 0.01:
#         labels[idx] = best_cluster
#     # Si no, se mantiene en -1 (ruido)


In [29]:
from collections import Counter


# Group image paths under the label found at the same position in `labels`.
# NOTE(review): this assumes images[k] is the image behind row k of the cached
# embeddings — confirm the ordering used when the cache was built.
images_by_label = {}
for position, label in enumerate(labels):
    images_by_label.setdefault(label, []).append(images[position])

# Same mapping, with keys in ascending label order
cluster_images_dict = dict(sorted(images_by_label.items()))

print(cluster_images_dict)
label_counter = Counter(labels)
print(len(label_counter.keys()))
print(label_counter)

{-1: [PosixPath('../../data/flickr/flickr_validated_imgs_7000/guadarrama_NA_4269_50814262142__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/snev_NA_2110_41642742851__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/picos_NA_3258_50365711787__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/picos_NA_5943_33757987434__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/aiguestortes_NA_4792_16179367894__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/teide_NA_861_52557364868__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/snev_NA_2893_16914940602__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/aiguestortes_NA_4458_50036516237__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/guadarrama_NA_3864_22444368764__b.jpg'), PosixPath('../../data/flickr/flickr_validated_imgs_7000/sierra_nieves_NA_3376_30281427754__b.jpg'), PosixPath('../../data/flickr/flickr_validated

In [30]:

import os
import shutil

from pathlib import Path


def create_cluster_dirs(images_dict, copy_images=True):
    """
    Rebuild ``<cwd>/clusters/imgs`` with one sub-directory per cluster and
    write ``<cwd>/clusters/cluster_images.csv`` listing every (cluster, img) pair.

    Parameters
    ----------
    images_dict : dict
        Maps a cluster id to an iterable of image paths.
    copy_images : bool, default True
        When True, every image is copied into its cluster directory.
        When False, only the directories and the CSV are created.

    Notes
    -----
    * An existing ``clusters/imgs`` tree is deleted before anything is written.
    * Images are copied, never moved; the source files are left untouched.
    * An ``OSError`` raised while creating directories, copying or writing the
      CSV is reported with ``print`` and not re-raised (best-effort behaviour).
    """
    clusters_root = Path.cwd() / "clusters"
    path_cluster = clusters_root / "imgs"
    # Start from a clean tree so images from a previous run do not linger
    if path_cluster.exists():
        shutil.rmtree(path_cluster)
    cluster_data = []
    try:
        # Create the base folders up front so the CSV can be written even when
        # images_dict is empty (previously "clusters/" might not exist yet)
        path_cluster.mkdir(parents=True, exist_ok=True)
        for cluster_id, image_paths in images_dict.items():
            cluster_dir = path_cluster / str(cluster_id)
            cluster_dir.mkdir(parents=True, exist_ok=True)
            for image_path in image_paths:
                cluster_data.append([cluster_id, image_path])
                if copy_images:
                    shutil.copy(image_path, cluster_dir)
        # Save the CSV with every image and the cluster it belongs to
        csv_path = clusters_root / "cluster_images.csv"
        df = pd.DataFrame(cluster_data, columns=["cluster", "img"])
        df.sort_values(by="cluster").to_csv(csv_path, index=False)
    except OSError as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)

# Materialise the clustering on disk: per-cluster image folders plus the CSV
create_cluster_dirs(cluster_images_dict)

In [31]:
# Create metrics
from classes.multimodal_clustering_metric import MultiModalClusteringMetric

# Alias of the cluster -> image paths mapping (the constructor below is given
# cluster_images_dict directly)
cluster_images = cluster_images_dict

# Allowed category names: first column of the categories file, upper-cased
categories = pd.read_csv("../../data/categories/categories_stoten.csv", sep=";", header=None)
categories_list = categories.iloc[:, 0].astype(str).str.upper().tolist()

# LLaVA inference results; category_llava is upper-cased so it can be compared
# against categories_list
llava_results_df = pd.read_csv("../../data/inference/stoten_w_descriptions.csv", sep=";", header=0)
llava_results_df['category_llava'] = llava_results_df['category_llava'].apply(lambda x: x.upper())


# Any value that is not a known category, "NOT VALID" or "NOT RELEVANT" is
# relabelled as "BAD_INFERENCE"
llava_results_df["category_llava"] = llava_results_df["category_llava"].apply(
                    lambda cat: cat if cat in categories_list or cat == "NOT VALID" or cat == "NOT RELEVANT" else "BAD_INFERENCE"
                    )



# NOTE(review): all arguments are positional; the meaning of the literals 3 and 4
# is defined by MultiModalClusteringMetric.__init__, which is not visible here —
# confirm against the class before changing them.
lvm_lvlm_metric = MultiModalClusteringMetric("experiment_actual",
                                            3,
                                            categories_list,
                                            clustering,
                                            eval_method,
                                            score,
                                            model, 
                                            4, 
                                            cluster_images_dict, 
                                            llava_results_df)
# These three values were already passed to the constructor and are assigned
# again as attributes here
lvm_lvlm_metric.eval_method = eval_method
lvm_lvlm_metric.clustering = clustering
lvm_lvlm_metric.score = score

lvm_lvlm_metric.generate_stats()


quality_results = pd.DataFrame()
# Compute the quality metrics twice: with use_noise=True, then use_noise=False
for i in (True, False):
    # Calculate metrics
    results = lvm_lvlm_metric.calculate_clustering_quality(use_noise=i)
    # Join results (in columns)
    quality_results = pd.concat([quality_results, pd.DataFrame([results])], axis=1)

# Report the first (only) row of the selected metric columns
print(f"homogeneity_global: {quality_results['homogeneity_global'].iloc[0]}")
print(f"entropy_global: {quality_results['entropy_global'].iloc[0]}")
print(f"quality_metric: {quality_results['quality_metric'].iloc[0]}")

lvm_lvlm_metric.plot_cluster_categories_3()

homogeneity_global: 0.7062700391877449
entropy_global: 1.0518045276215306
quality_metric: 0.6714834830587436


In [32]:
# Attach the cluster id to every LLaVA inference row
inference_with_cluster = lvm_lvlm_metric.add_cluster_to_llava_inference().sort_values(by="cluster")
# One row per cluster with its predominant category and homogeneity
# (everything is kept for now; filtering happens later).
# The category column is renamed to mark it as coming from LLaVA.
stats_unique = (
    lvm_lvlm_metric.result_stats_df[["cluster", "predominant_category", "homogeneity_k"]]
    .drop_duplicates()
    .rename(columns={"predominant_category": "predominant_category_llava"})
)
# Bring the per-cluster category and homogeneity_k onto every image row
per_cluster_columns = ["cluster", "predominant_category_llava", "homogeneity_k"]
imgs_with_predominant_category = inference_with_cluster.merge(
    stats_unique[per_cluster_columns],
    on="cluster",
    how="left",
)
llava_classification = imgs_with_predominant_category[
    ["cluster", "img", "predominant_category_llava", "homogeneity_k"]
]
# Manual labelling, to be merged with the result above
manual_classification = pd.read_csv("../../data/inference/stoten_manual_annotation.csv", sep=";", header=0)

# WARNING: the manual annotation contains unclassified (NA) rows; they are set
# to "Other type" before upper-casing
manual_classification['manual_category'] = (
    manual_classification['manual_category']
    .fillna("Other type")
    .apply(lambda x: x.upper())
)

# Row counts of the manual (ground-truth) annotation per category
img_per_cat_manual = manual_classification.groupby(by="manual_category").count()
# Join both classifications on the image name
result = llava_classification.merge(
    manual_classification[["img", "manual_category"]],
    on="img",
    how="left",
)


print(result.head(10).to_string())

result.to_csv("clust-lvlm_vs_manual.csv", sep=";")


   cluster                                       img predominant_category_llava  homogeneity_k     manual_category
0       -1  sierra_nieves_NA_2090_31643484085__b.jpg         NATURE & LANDSCAPE       0.329218       FAUNA & FLORA
1       -1           snev_NA_4056_44971839574__b.jpg         NATURE & LANDSCAPE       0.329218  NATURE & LANDSCAPE
2       -1           snev_NA_3720_48385524436__b.jpg         NATURE & LANDSCAPE       0.329218           RELIGIOUS
3       -1           snev_NA_2893_17086368968__b.jpg         NATURE & LANDSCAPE       0.329218  NATURE & LANDSCAPE
4       -1          teide_NA_1475_36548571165__b.jpg         NATURE & LANDSCAPE       0.329218       FAUNA & FLORA
5       -1  sierra_nieves_NA_1119_52263662548__b.jpg         NATURE & LANDSCAPE       0.329218  NATURE & LANDSCAPE
6       -1           teide_NA_734_27496715612__b.jpg         NATURE & LANDSCAPE       0.329218       FAUNA & FLORA
7       -1          picos_NA_6139_20499269554__b.jpg         NATURE & LANDSCAPE 

In [14]:
# Sanity check: show manual annotations whose category is missing
print(manual_classification[manual_classification['manual_category'].isna()])


Empty DataFrame
Columns: [img, manual_category]
Index: []


In [15]:
# Sanity check: rows of the merged table with a missing value in any column
nan_rows = result[result.isna().any(axis=1)]
print(nan_rows)

Empty DataFrame
Columns: [cluster, img, predominant_category_llava, homogeneity_k, manual_category]
Index: []


In [53]:
# Generate a PDF with the 1:1 image comparison between the multimodal model and the "ground truth"

# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError
result = result.drop(columns=["predominant_category_llava", "homogeneity_k"], errors="ignore")
print(result)

      cluster                                       img     manual_category
0          -1  sierra_nieves_NA_2090_31643484085__b.jpg       FAUNA & FLORA
1          -1           snev_NA_4056_44971839574__b.jpg  NATURE & LANDSCAPE
2          -1           snev_NA_3720_48385524436__b.jpg           RELIGIOUS
3          -1           snev_NA_2893_17086368968__b.jpg  NATURE & LANDSCAPE
4          -1          teide_NA_1475_36548571165__b.jpg       FAUNA & FLORA
...       ...                                       ...                 ...
7067      315           snev_NA_4056_29427303196__b.jpg       RURAL TOURISM
7068      315  sierra_nieves_NA_2455_30082567198__b.jpg       RURAL TOURISM
7069      315  sierra_nieves_NA_2455_43275300374__b.jpg       RURAL TOURISM
7070      315  sierra_nieves_NA_4002_48912391348__b.jpg       RURAL TOURISM
7071      315          teide_NA_1827_27304774007__b.jpg               URBAN

[7072 rows x 3 columns]


In [62]:
import os
import random
from math import ceil
from pathlib import Path

import pandas as pd
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, Flowable
)
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors

# -------------------------------
# Flowable personalizado (igual que en tu código original)
# -------------------------------
class ImageAndCaption(Flowable):
    """
    Composite flowable: an image with a caption paragraph stacked underneath.
    """
    def __init__(self, image_flowable, caption_paragraph):
        super().__init__()
        self.image_flowable = image_flowable
        self.caption_paragraph = caption_paragraph
        # Placeholder size; the real values are computed in wrap()
        self.width = 0
        self.height = 0

    def wrap(self, availWidth, availHeight):
        """
        Return the (width, height) needed to fit the image plus its caption:
        the wider of the two parts, and the sum of both heights.
        """
        image_width, image_height = self.image_flowable.wrap(availWidth, availHeight)
        caption_width, caption_height = self.caption_paragraph.wrap(availWidth, availHeight)

        self.width = max(image_width, caption_width)
        self.height = image_height + caption_height
        return (self.width, self.height)

    def draw(self):
        """
        Draw the image at the top of the box and the caption right below it.
        """
        # Each part is wrapped again against the final box size before drawing
        _, image_height = self.image_flowable.wrap(self.width, self.height)
        image_y = self.height - image_height
        self.image_flowable.drawOn(self.canv, 0, image_y)

        _, caption_height = self.caption_paragraph.wrap(self.width, self.height)
        self.caption_paragraph.drawOn(self.canv, 0, image_y - caption_height)
# -------------------------------
# Generar PDF a partir de clusters
# -------------------------------
def create_pdf_from_clusters(path_save_pdf, dict_clusters):
    """
    Build a PDF in which every cluster gets its own 6x5 grid of images.

    Up to 30 images are shown per cluster. Clusters with more than 30 images
    are randomly sampled down to 30; clusters with fewer get empty cells so
    the grid is always complete. Entries that are not existing files are
    rendered as empty cells as well.

    The lists stored in ``dict_clusters`` are never modified: sampling and
    padding happen on a copy.

    Parameters
    ----------
    path_save_pdf : str
        Path (name) of the output PDF.
    dict_clusters : dict
        - key: cluster identifier (keys must be sortable).
        - value: sequence of image paths belonging to that cluster.
    """
    # Table layout (6 rows, 5 columns = 30 images per cluster)
    filas = 6
    columnas = 5
    max_imgs = filas * columnas  # 30

    # Create the PDF document
    doc = SimpleDocTemplate(path_save_pdf, pagesize=letter)
    story = []
    estilos = getSampleStyleSheet()

    # Caption text style
    estilo_normal = estilos['Normal']
    estilo_normal.fontSize = 6
    estilo_normal.leading = 8

    # Cluster title style
    estilo_titulo = estilos['Heading1']
    estilo_titulo.fontSize = 14
    estilo_titulo.leading = 10

    # Walk the clusters in ascending key order
    for cluster_id in sorted(dict_clusters.keys()):
        # Title of the cluster section
        story.append(Paragraph(f"Cluster: {cluster_id}", estilo_titulo))
        story.append(Spacer(1, 3))

        # Work on a copy: padding with `+=` directly on the caller's list
        # would leave None entries inside dict_clusters after the call
        imagenes_cluster = list(dict_clusters[cluster_id])

        # More than 30 images: keep a random sample of 30
        if len(imagenes_cluster) > max_imgs:
            imagenes_cluster = random.sample(imagenes_cluster, max_imgs)

        # Fewer than 30: pad with None so the grid is always full
        if len(imagenes_cluster) < max_imgs:
            imagenes_cluster = imagenes_cluster + [None] * (max_imgs - len(imagenes_cluster))

        # Build the table rows
        tabla_datos = []
        fila_actual = []

        for i, ruta_imagen in enumerate(imagenes_cluster, start=1):
            if ruta_imagen is not None and os.path.isfile(ruta_imagen):
                # Give reportlab a plain path rather than a pathlib.Path object
                ruta_str = os.fspath(ruta_imagen)
                # Fixed-size image flowable (80x80)
                img_flowable = Image(ruta_str, width=80, height=80)
                # Caption with the file name
                nombre_archivo = os.path.basename(ruta_str)
                caption = Paragraph(nombre_archivo, estilo_normal)

                # Combined flowable (image with caption below)
                celda = ImageAndCaption(img_flowable, caption)
            else:
                # Empty cell
                celda = Paragraph(" ", estilo_normal)

            fila_actual.append(celda)

            # Close the row every `columnas` cells
            if i % columnas == 0:
                tabla_datos.append(fila_actual)
                fila_actual = []

        # 6x5 table with fixed cell sizes
        tabla = Table(
            tabla_datos,
            colWidths=[90]*columnas,
            rowHeights=[100]*filas
        )
        tabla.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ]))

        story.append(tabla)
        story.append(Spacer(1, 12))

        # A page break is appended after every cluster
        story.append(PageBreak())

    # Finally, render the PDF
    doc.build(story)


# Render one image grid per cluster into clusters.pdf (in the current working directory)
create_pdf_from_clusters("clusters.pdf", cluster_images_dict)

