In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import umap
import plotly.express as px
import time
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import cuml
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
import random
from tqdm import tqdm


In [None]:
# Keep only the rows flagged as brand-image tweets (imagen_marca == 'SI').
df_entero = pd.read_parquet("./data/Silver/Cleaned_data_features_productos_categorias.parquet")
es_imagen_marca = df_entero['imagen_marca'] == 'SI'
df = df_entero[es_imagen_marca]
df

In [None]:
# Load the embedding model and its tokenizer.
model_name = 'intfloat/multilingual-e5-large-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Choose the device before loading so the dtype can follow it.
# The notebook prefers the second GPU ("cuda:1"); on a single-GPU machine that
# ordinal is invalid, so fall back to "cuda:0", and to the CPU without CUDA.
# (If a custom device_map is used instead, remove the explicit .to(device) calls.)
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

# Half precision is only requested on GPU; the CPU fallback keeps float32
# (NOTE(review): float16 on CPU is typically slow and some ops may be unsupported).
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model = AutoModel.from_pretrained(model_name, torch_dtype=model_dtype)

# Assigning the result avoids echoing the whole module repr as the cell output;
# eval() makes inference mode explicit.
model = model.to(device).eval()

In [None]:
# Pooling helper used to turn the model's token states into one vector per text.
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Average the hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are filled with 0.0 before the
    sum, and the sum is divided by the per-row total of ``attention_mask``.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

In [None]:
# Embed every tweet in batches and store the result on disk.

# Number of texts pushed through the model per forward pass
batch_size = 1024


def tokenize_batch(texts):
    """Tokenize a batch of texts into padded, truncated PyTorch tensors (max 512 tokens)."""
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )  # leftover author note: padding_side='left' (Qwen)


# The tweets are handed to the DataLoader as a plain Python list, in original order
dataset = df['tweets'].tolist()
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# One CPU tensor of normalized embeddings per batch
embeddings_array = []

# Inference only: no gradients are tracked
with torch.no_grad():
    for batch_texts in tqdm(dataloader, desc="Procesando batches"):
        batch_dict = tokenize_batch(batch_texts)
        # Move the inputs to the model's device (remove if a custom device_map is used)
        batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

        outputs = model(**batch_dict)

        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # L2-normalize each embedding; per the author's note, this makes the UMAP
        # picture look the same with cosine as with Euclidean distance
        embeddings_normalized = F.normalize(embeddings, p=2, dim=1)
        # Bring the batch back to the CPU before collecting it
        embeddings_array.append(embeddings_normalized.cpu())

# Stack all batches into one tensor, then expose it as a NumPy array
embedding_one_tensor = torch.cat(embeddings_array, dim=0)
embeddings_numpy = embedding_one_tensor.numpy()

# Persist the embeddings that UMAP consumes later
np.save('./data/Silver/embeddings.npy', embeddings_numpy)

In [None]:
# Free the model and the intermediate tensors once the embeddings are on disk.
import gc

# Drop the references first: collecting before `del` cannot reclaim anything
# that is still bound to a name. `embeddings` is included because it is the
# last batch's pooled tensor, created on the model's device by the loop above.
del embedding_one_tensor, embeddings_normalized, embeddings, outputs, batch_dict, model, tokenizer
gc.collect()
# Return the now-unused cached blocks held by PyTorch's CUDA allocator
torch.cuda.empty_cache()

In [None]:
# Reload the saved embeddings so the notebook can resume from this point.
embeddings_path = './data/Silver/embeddings.npy'
embeddings_numpy = np.load(embeddings_path)

In [None]:
# Project the embeddings to 3 dimensions with UMAP (cosine metric, fixed seed).
# (leftover author note on this line: min_dist=0.0, n_neighbors=1000)
reducer_visual = umap.UMAP(
    n_components=3,
    random_state=2013,
    min_dist=0.0,
    metric='cosine',
    n_neighbors=100,
)
# The same projection is used both for plotting and as the clustering input
embedding_UMAP = embedding_visual = reducer_visual.fit_transform(embeddings_numpy)

In [None]:
# # Hyperparameter search. Silhouette is not the best metric for this, but the library does not implement DBCV.

# min_cluster_sizes = [10, 50, 100, 200, 300, 400, 500, 1000]
# min_samples_list = [10, 50,  100, 200, 300, 400, 500, 1000]
# cluster_selection_epsilon_list = [0.0, 0.01, 0.05, 0.1, 0.5, 1.0]

# results = []

# count = 0
# # Grid search

# for min_cluster_size in min_cluster_sizes:
#     for min_samples in min_samples_list:
#       for cluster_selection_epsilon in cluster_selection_epsilon_list:
#         #biblioteca cuml
#         clusterer = cuml.cluster.hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
#                                     min_samples=min_samples, cluster_selection_epsilon=cluster_selection_epsilon,
#                                     metric='euclidean')
#         labels = clusterer.fit_predict(embedding_UMAP)

#         # Ignorar si todos son ruido
#         if len(set(labels)) <= 1 or (len(set(labels)) == 2 and -1 in labels):
#             continue

#         try:
#             score = cython_silhouette_score(
#                 X=embedding_UMAP,
#                 labels=labels,
#                 metric='euclidean',
#                 convert_dtype=True
#             )
#             results.append({
#                 'min_cluster_size': min_cluster_size,
#                 'min_samples': min_samples,
#                 'silhouette': score,
#                 'cluster_selection_epsilon':cluster_selection_epsilon,
#                 'n_clusters': len(set(labels)) - (1 if -1 in labels else 0),
#                 'noise_fraction': np.mean(labels == -1)
#             })
#             print("=")
#         except Exception as e:
#             pass
#         count += 1
#         print(count)
# # Mostrar los mejores resultados
# results_sorted = sorted(results, key=lambda x: x['silhouette'], reverse=True)
# for r in results_sorted[:5]:
#     print(r)

# pd.DataFrame(results)

In [None]:
# This part of the clustering was run on Colab and is tuned for it. With my local two-device setup there is no
# reproducibility for the GPU implementation of HDBSCAN.
# Therefore I copy the Colab output file "clusterizado_embeding_Colab.parquet" and continue in this notebook.

In [None]:
# Cluster the UMAP projection with HDBSCAN using the chosen hyperparameters.
hdbscan_model = cuml.cluster.hdbscan.HDBSCAN(
    min_cluster_size=400,
    min_samples=400,
    metric='euclidean',
    cluster_selection_epsilon=0.1,
)
labels = hdbscan_model.fit_predict(embedding_UMAP)

In [None]:
# Attach the 3-D UMAP coordinates and the HDBSCAN label to every tweet row.
# The new columns are named when they are built: writing into
# `columns.values` bypasses the Index's immutability and can leave column
# lookups out of sync with the displayed names.
umap_coords = pd.DataFrame(embedding_visual, columns=['UMAP_1', 'UMAP_2', 'UMAP_3'])

# Labels are stored as strings; the later cells compare against '5', "-1", etc.
cluster_labels = pd.Series(labels, name='clusters_hdbscan').astype(str)

# reset_index() (without drop) keeps the original row labels in an 'index'
# column and gives all three pieces the same positional index to align on.
df_embedding = pd.concat([df.reset_index(), umap_coords, cluster_labels], axis=1)

In [None]:
# Continue from the clustered frame produced on Colab; keep an untouched copy.
colab_clusters_path = "./data/Silver/clusterizado_embeding_Colab.parquet"
df_embedding = pd.read_parquet(colab_clusters_path)
df_backup = df_embedding.copy(deep=True)

In [None]:
import plotly.io as pio

pio.renderers.default = "notebook_connected"

# 3-D scatter of the UMAP projection coloured by HDBSCAN cluster.
# One large cluster is visible here; it gets split further in later cells.
fig = px.scatter_3d(
    df_embedding,
    x="UMAP_1",
    y="UMAP_2",
    z="UMAP_3",  # third dimension
    color='clusters_hdbscan',
    hover_data=['tweets', 'sentimiento'],
    height=1200,
    width=1800,
    # The cluster column is compared against strings throughout this notebook
    # ('5', "-1"), so the noise key must be the string "-1"; the int -1 never matched.
    color_discrete_map={"-1": "gray"},
)

# Default marker style for every trace
fig.update_traces(marker=dict(size=1, opacity=0.5))

# Fade the noise trace ("-1") so it does not dominate the picture
fig.update_traces(
    selector=dict(name="-1"),
    marker=dict(color="gray", opacity=0.1, size=1),
)

fig.show()
#fig.write_html("umap_plot.html")

In [None]:
labels = df_embedding.clusters_hdbscan.values

# Print up to 20 random tweets per cluster to see what each cluster is about.
# Labels are normalised with str() before the noise check: the column is
# filtered with string labels below (and elsewhere in the notebook), so the
# previous `label == -1` comparison against an int never skipped the noise.
# Sorting by the string form only makes the printing order stable between runs.
for label in sorted(set(labels), key=str):
    if str(label) == "-1":
        continue  # skip noise
    print(f"\nCluster {label}")

    # Tweets belonging to this cluster
    cluster_tweets = df_embedding['tweets'][df_embedding['clusters_hdbscan'] == str(label)].tolist()

    # Large clusters are shown through a random sample of 20 tweets
    if len(cluster_tweets) > 20:
        cluster_tweets = random.sample(cluster_tweets, 20)

    for tweet in cluster_tweets:
        print(f"- {tweet}")

In [None]:
# Rows assigned to cluster "5", the cluster that is re-clustered below.
in_cluster_5 = df_embedding['clusters_hdbscan'] == "5"
df = df_embedding[in_cluster_5]
df

In [None]:
# Split cluster 5: run HDBSCAN again on just its UMAP coordinates.
embedding_UMAP = df[['UMAP_1','UMAP_2','UMAP_3']].values

hdbscan_cluster_5 = cuml.cluster.hdbscan.HDBSCAN(
    min_cluster_size=500,
    min_samples=1000,
    metric='euclidean',
    cluster_selection_epsilon=0.0,
)
labels = hdbscan_cluster_5.fit_predict(embedding_UMAP)


In [None]:
# Attach the sub-cluster labels of cluster 5 as a new string column.
# The column is named when the Series is built instead of writing into
# `columns.values`, which bypasses the Index's immutability.
df_2 = df.copy()
sub_labels = pd.Series(labels, name='clusters_hdbscan_2').astype(str)

# reset_index() gives both pieces the same positional index to align on; the
# previous row labels are kept as an extra column (dropped as 'level_0' in a
# later cell).
df = pd.concat([df_2.reset_index(), sub_labels], axis=1)

# Optional column ordering, left disabled by the author:
# df = df[['index', 'User', 'tweets', 'search', 'fecha_captura',
#        'sentimiento', 'imagen_marca', 'sentimiento_producto',
#        'comparativa_producto', 'producto', 'categoria', 'UMAP_1', 'UMAP_2',
#        'UMAP_3', 'clusters_hdbscan', 'clusters_hdbscan_2']]
df

In [None]:
import plotly.express as px

# 3-D scatter of cluster 5 only, coloured by its sub-cluster label.
fig = px.scatter_3d(
    df,
    x="UMAP_1",
    y="UMAP_2",
    z="UMAP_3",  # third dimension
    color='clusters_hdbscan_2',
    hover_data=['tweets', 'sentimiento'],
    height=1200,
    width=1800,
    # clusters_hdbscan_2 was cast to str, so the noise key must be the
    # string "-1"; the int -1 never matched any label.
    color_discrete_map={"-1": "gray"},
)

# Default marker style for every trace
fig.update_traces(marker=dict(size=2, opacity=0.5))

# Fade the noise trace ("-1")
fig.update_traces(
    selector=dict(name="-1"),
    marker=dict(color="gray", opacity=0.1, size=3),
)

fig.show()

In [None]:
# Print up to 20 random tweets for each sub-cluster of cluster 5 (noise excluded).
for label in set(labels):
    if label != -1:
        print(f"\nCluster {label}")

        # Tweets whose sub-cluster matches this label (the column holds strings)
        in_cluster = df['clusters_hdbscan_2'] == str(label)
        cluster_tweets = df.loc[in_cluster, 'tweets'].tolist()

        # Large sub-clusters are shown through a random sample of 20 tweets
        if len(cluster_tweets) > 20:
            cluster_tweets = random.sample(cluster_tweets, 20)

        for tweet in cluster_tweets:
            print(f"- {tweet}")

In [None]:
# Remove the helper column that reset_index() left behind.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop raised
# KeyError on a second execution because 'level_0' was already gone.
df = df.drop(columns=['level_0'], errors='ignore')
df

In [None]:
# Give the three sub-clusters of cluster 5 readable names, then keep only the
# key column and the named sub-cluster for merging back.
subcluster_names = {
    '0': 'Humor y experiencias cotidianas',
    '1': 'Interacciones directas y quejas de consumidores',
    '2':'Críticas políticas y económicas',
}
df['clusters_hdbscan_2'] = df['clusters_hdbscan_2'].replace(subcluster_names)
df_embedding_merge = df[['index','clusters_hdbscan_2']]
df_embedding_merge

In [None]:
# Left-join the sub-cluster names onto the full frame by the 'index' column.
result = df_embedding.merge(df_embedding_merge, on="index", how='left')
result

In [None]:
# Same left join as in the preceding cell, recomputed from df_embedding.
result = df_embedding.merge(df_embedding_merge, on="index", how='left')
result

In [None]:
# Replace the generic cluster '5' label with its sub-cluster label.
# A boolean mask plus one .loc assignment does what the previous row-by-row
# iterrows()/.at loop did, without a Python-level pass over every row.
is_cluster_5 = result['clusters_hdbscan'] == '5'
result.loc[is_cluster_5, 'clusters_hdbscan'] = result.loc[is_cluster_5, 'clusters_hdbscan_2']



In [None]:
# Rename the remaining numeric cluster labels to descriptive names.
cluster_names = {
    "19": "Dulces y bollería",
    "6": "Helados",
    "12": "Hummus",
    "4" : "Compra alcohol",
    "11": "Pipas",
    "13": "Guacamole",
    "7": "Productos retirados",
    "10": "Cosmética y cuidado personal ",
    "0": "Deportes",
    "9": "Yogures y lácteos",
    "3": "Pizzas ",
    "2": "Alcohol y bebidas alcohólicas",
    "15": "Patatas fritas y tortillas",
    "8": "Cereales",
    "14": "Lasaña",
    "17": "Pan, tartas y repostería",
    "1": "Productos de temporada y Navidad ",
    "18": "Mochis",
    "16": "Platos preparados, ensaladas, salsas",
    "-1": "Desconectado",
}
renamed_clusters = result['clusters_hdbscan'].replace(cluster_names)
result['clusters_hdbscan'] = renamed_clusters

In [None]:
# Final 3-D scatter with the named clusters.
fig = px.scatter_3d(
    result,
    x="UMAP_1",
    y="UMAP_2",
    z="UMAP_3",
    color='clusters_hdbscan',
    hover_data=['tweets', 'sentimiento'],
    height=1200,
    width=1800,
    # Noise was renamed from "-1" to "Desconectado" in the renaming cell, so
    # that is the key to colour gray; the int -1 never matched any label.
    color_discrete_map={"Desconectado": "gray"},
)

# Default marker style for every trace
fig.update_traces(marker=dict(size=2, opacity=0.5))

# Fade the noise trace ("Desconectado")
fig.update_traces(
    selector=dict(name="Desconectado"),
    marker=dict(color="gray", opacity=0.05, size=2),
)

fig.show()

In [None]:
# Parse the capture date into a datetime column, then show the frame summary.
fecha_captura_parsed = pd.to_datetime(result['fecha_captura'])
result['fecha_captura'] = fecha_captura_parsed
result.info()

In [None]:
# Write the final clustered frame to the Gold layer (row index not stored).
gold_output_path = './data/Gold/Gold_embedding_imagen_marca.parquet'
result.to_parquet(gold_output_path, index=False)