In [1]:
import re
import math
import html

# URLs: http(s)://... o www....
URL_RE = re.compile(r'https?://\S+|www\.\S+')

# Menciones: @usuario (hasta el próximo espacio)
MENTION_RE = re.compile(r'@\S+')


HTML_TAG_RE = re.compile(r'<[^>]+>')

def strip_html(text: str) -> str:
    text = str(text)
    return HTML_TAG_RE.sub(" ", text)
# Emojis Unicode (rangos más comunes)
EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticonos
    u"\U0001F300-\U0001F5FF"  # símbolos y pictos
    u"\U0001F680-\U0001F6FF"  # transporte
    u"\U0001F1E0-\U0001F1FF"  # banderas
    u"\U00002700-\U000027BF"  # dingbats
    u"\U0001F900-\U0001F9FF"  # más emojis
    "]+",
    flags=re.UNICODE,
)

# Emoticonos ASCII tipo :) :( :D <3
ASCII_EMOTICON_RE = re.compile(
    r'((:|;|=|8|x|X)[\-o\']?(\)|\(|D|p|P|3|/|\\))|<3'
)

# RT al inicio típico de retweets
RT_RE = re.compile(r'^\s*rt\b', flags=re.IGNORECASE)

def clean_tweet(text):
    from html import unescape
    # Manejo NaN / None para que no se conviertan en "nan"
    if text is None:
        return ""
    # Si vienen floats tipo NaN de pandas
    if isinstance(text, float) and math.isnan(text):
        return ""

    # Asegurar string
    text = str(text)

    text = strip_html(text)

    # Decodificar entidades HTML (&amp;, &lt;, etc.)
    text = unescape(text)

    # Eliminar URLs
    text = URL_RE.sub(" ", text)

    # Eliminar @usuario
    text = MENTION_RE.sub(" ", text)

    # Eliminar emojis unicode
    text = EMOJI_RE.sub(" ", text)

    # Eliminar emoticonos ASCII
    text = ASCII_EMOTICON_RE.sub(" ", text)

    # Quitar el "RT" inicial típico de retweets
    text = RT_RE.sub(" ", text)

    # Quitar # pero dejar la palabra
    text = re.sub(r'#(\w+)', r'\1', text)

    # Pasar a minúsculas
    text = text.lower()

    # Quitar espacios repetidos
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df_fire_train = pd.read_csv(
    'event_type/fire_train.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)
df_flood_train = pd.read_csv(
    'event_type/flood_train.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_hurricane_train = pd.read_csv(
    'event_type/hurricane_train.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_earthquake_train = pd.read_csv(
    'event_type/earthquake_train.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_fire_test = pd.read_csv(
    'event_type/fire_test.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_flood_test = pd.read_csv(
    'event_type/flood_test.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_hurricane_test = pd.read_csv(
    'event_type/hurricane_test.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_earthquake_test = pd.read_csv(
    'event_type/earthquake_test.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_fire_dev = pd.read_csv(
    'event_type/fire_dev.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_flood_dev = pd.read_csv(
    'event_type/flood_dev.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

df_earthquake_dev = pd.read_csv(
    'event_type/earthquake_dev.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)
df_hurricane_dev = pd.read_csv(
    'event_type/hurricane_dev.tsv',
    sep='\t',          # separador = tab
    encoding='utf-8'
)

# TRAIN
df_fire_train["disaster_type"]       = "fire"
df_flood_train["disaster_type"]      = "flood"
df_hurricane_train["disaster_type"]  = "hurricane"
df_earthquake_train["disaster_type"] = "earthquake"

# DEV
df_fire_dev["disaster_type"]       = "fire"
df_flood_dev["disaster_type"]      = "flood"
df_hurricane_dev["disaster_type"]  = "hurricane"
df_earthquake_dev["disaster_type"] = "earthquake"

# TEST
df_fire_test["disaster_type"]       = "fire"
df_flood_test["disaster_type"]      = "flood"
df_hurricane_test["disaster_type"]  = "hurricane"
df_earthquake_test["disaster_type"] = "earthquake"


# Lista de todos los dataframes
all_train = [
    df_fire_train, df_flood_train, df_hurricane_train, df_earthquake_train,
]

all_dev = [
        df_fire_dev,   df_flood_dev,   df_hurricane_dev,   df_earthquake_dev,
]

all_test =[
        df_fire_test,  df_flood_test,  df_hurricane_test,  df_earthquake_test,
]

# Clean tweets
for df in all_train:
    df["tweet_text"] = df["tweet_text"].astype(str).apply(clean_tweet)

for df in all_dev:
    df["tweet_text"] = df["tweet_text"].astype(str).apply(clean_tweet)

for df in all_test:
    df["tweet_text"] = df["tweet_text"].astype(str).apply(clean_tweet)


#concat tweets by objective
df_all_train = pd.concat(
    [
        df_fire_train,
        df_flood_train,
        df_hurricane_train,
        df_earthquake_train,
    ],
    ignore_index=True
    )

df_all_dev = pd.concat(
    [
        df_fire_dev,
        df_flood_dev,
        df_hurricane_dev,
        df_earthquake_dev,
    ],
    ignore_index=True
    )

df_all_test = pd.concat(
    [
        df_fire_test,
        df_flood_test,
        df_hurricane_test,
        df_earthquake_test,
    ],
    ignore_index=True
    )

#vectorize
vectorizer = TfidfVectorizer(
    preprocessor=clean_tweet,   # tu función
    strip_accents='ascii',
    stop_words='english',
)

vectorizer.fit(df_all_train['tweet_text'])

X_all_train = vectorizer.transform(df_all_train['tweet_text'])




# Experimento de clusters con SBERT

In [6]:
!pip install sentence-transformers 

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.9.1-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-1.1.7-py3-none-any.whl.metadata (13 kB)
Collecting typing_extensions>=4.5.0 (from sentence-transformers)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
!pip install umap-learn 

Defaulting to user installation because normal site-packages is not writeable
Collecting umap-learn
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Using cached numba-0.62.1-cp313-cp313-win_amd64.whl.metadata (2.9 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting llvmlite<0.46,>=0.45.0dev0 (from numba>=0.51.2->umap-learn)
  Using cached llvmlite-0.45.1-cp313-cp313-win_amd64.whl.metadata (5.0 kB)
Using cached umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
Using cached numba-0.62.1-cp313-cp313-win_amd64.whl (2.7 MB)
Using cached llvmlite-0.45.1-cp313-cp313-win_amd64.whl (38.1 MB)
Using cached pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: llvmlite, numba, pynndescent, umap-learn

   ---------------------------------------- 0/4 [llvmlite]
   ---------------------------------------- 0/4 [llvmlite]
   --------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
!pip install --upgrade pip
!pip install --upgrade setuptools wheel
!pip install hdbscan


Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 15.3 MB/s  0:00:00



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: To modify pip, please run the following command:
C:\Python313\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting wheel
  Using cached wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Using cached wheel-0.45.1-py3-none-any.whl (72 kB)
Installing collected packages: wheel
Successfully installed wheel-0.45.1



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting hdbscan
  Using cached hdbscan-0.8.40.tar.gz (6.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml): started
  Building wheel for hdbscan (pyproject.toml): finished with status 'done'
  Created wheel for hdbscan: filename=hdbscan-0.8.40-cp313-cp313-win_amd64.whl size=673052 sha256=4a8f95ae4426c722b64da6c2b0b63b1fe4bc8fa9c104d7041669894ea0104c90
  Stored in directory: c:\users\lucciano\appdata\local\pip\cache\wheels\c3\72\af\f4d00f372c844119d65a75541288c9a23c23696de08a71b7ec
Successfully built hdbscan
Installing collected packag


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd

# Usas tu df_all_train ya generado después del cleaning
texts = df_all_train["tweet_text"].astype(str).tolist()


Crear embeddings con SBERT

Elección del modelo: all-mpnet-base-v2 supera consistentemente a las variantes RoBERTa porque ofrece mejor desempeño en similitud semántica, una arquitectura más moderna y eficiente, y embeddings más densos y uniformes que permiten formar clusters más claros y estables. Estos avances hacen que MPNet funcione de forma significativamente superior en tareas con texto corto, como tweets, titulares o comentarios, donde RoBERTa tiende a dispersar los vectores y depender excesivamente del vocabulario superficial.

In [11]:
from sentence_transformers import SentenceTransformer

# Opciones:
# model_name = "all-mpnet-base-v2"           # Mejor desempeño general
# model_name = "all-roberta-large-v1"        # SBERT basado en RoBERTa
# model_name = "paraphrase-MiniLM-L6-v2"     # Más ligero y rápido

model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name,device="cuda")

embeddings = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True   # recomendado para UMAP + HDBSCAN
)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


AssertionError: Torch not compiled with CUDA enabled

Reducir dimensionalidad con UMAP

In [None]:
import umap

umap_reducer = umap.UMAP(
    n_neighbors=15,      # estándar para clustering semántico
    n_components=10,     # 5–10 funciona bien para HDBSCAN
    metric="cosine",
    random_state=42
)

embeddings_umap = umap_reducer.fit_transform(embeddings)


  warn(


Clustering con HDBSCAN

In [None]:
import hdbscan

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=100,          # ajusta según tu dataset
    metric='euclidean',           # si usas embeddings reducidos
    cluster_selection_method='eom'
)

labels = clusterer.fit_predict(embeddings_umap)


Añadir los clusters al dataframe original

In [None]:
df_all_train["cluster"] = labels
df_all_train.head()


Unnamed: 0,tweet_id,tweet_text,class_label,disaster_type,cluster
0,735891446960623616,how came to the assistance of fort mcmurray wi...,other_relevant_information,fire,1
1,731202020296818688,red cross distributes $30m to fort mcmurray wi...,displaced_people_and_evacuations,fire,1
2,733665357236342784,interesting insights on the shifting communica...,other_relevant_information,fire,1
3,731963038429929472,oil sands producers helping workers affected b...,rescue_volunteering_or_donation_effort,fire,1
4,728674838034944001,ottawa to match red cross donations for fort m...,rescue_volunteering_or_donation_effort,fire,1


In [None]:
df_all_train["cluster"].value_counts().sort_index()


Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
-1,1558
0,1011
1,6387
2,5980
3,2748
4,687
5,3030
6,861
7,183
8,1415


# Métricas

In [None]:
df = df_all_train.copy()
df["cluster"] = labels      # labels = clusters SBERT (HDBSCAN o KMeans 21)
texts = df["tweet_text"].tolist()


Silhouette

In [None]:
from sklearn.metrics import silhouette_score

sil_global = silhouette_score(embeddings, df["cluster"])
print("Silhouette global:", sil_global)


Silhouette global: 0.008952842


El coeficiente Silhouette global obtuvo un valor de –0.019, lo que evidencia que los puntos se encuentran más próximos a clusters distintos del que fueron asignados, reflejando la ausencia de fronteras geométricas claras en el espacio embebido. Este resultado es coherente con la naturaleza de los datos: tweets breves, altamente ruidosos y con poca información contextual, además del uso de UMAP, que preserva vecindarios locales a costa de distorsionar distancias globales. En consecuencia, un Silhouette negativo no invalida el análisis, sino que confirma que el problema no presenta separabilidad natural y que las asignaciones de cluster responden a densidades locales más que a estructuras bien definidas.

Métricas globales respecto a class_label

In [None]:
from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_score,
    completeness_score,
    v_measure_score
)

y_true = df["class_label"]
y_pred = df["cluster"]

print("ARI:", adjusted_rand_score(y_true, y_pred))
print("AMI:", adjusted_mutual_info_score(y_true, y_pred))
print("Homogeneity:", homogeneity_score(y_true, y_pred))
print("Completeness:", completeness_score(y_true, y_pred))
print("V-Measure:", v_measure_score(y_true, y_pred))


ARI: 0.0063115633875253895
AMI: 0.07701530918955353
Homogeneity: 0.07256332334686244
Completeness: 0.08345652319779003
V-Measure: 0.07762964536018803


Las métricas ARI y AMI mostraron valores bajos (ARI = 0.032, AMI = 0.176), lo que indica que los clusters encontrados no se alinean con las 9 categorías humanitarias. Esto era esperable debido a la naturaleza del dataset: tweets extremadamente cortos, etiquetado conceptual que no siempre se refleja en el texto, y significativo ruido semántico. Aun así, el AMI de 0.176 representa una mejora sustancial respecto a TF-IDF, lo que confirma que los embeddings SBERT capturan mejor las relaciones semánticas, aunque siguen siendo insuficientes para reconstruir las etiquetas mediante clustering no supervisado. Valores igualmente bajos en Homogeneity (0.225), Completeness (0.149) y V-Measure (0.179) refuerzan esta conclusión, ya que muestran que los clusters contienen mezclas de múltiples clases y que cada etiqueta real se distribuye a través de numerosos grupos, evidenciando que la estructura conceptual del etiquetado no emerge espontáneamente en un proceso no supervisado.

Los resultados con un tamaño mínimo de cluster igual a 100 muestran un deterioro aún mayor en las métricas de alineación con las etiquetas reales. El ARI cayó a 0.0063 y el AMI a 0.0770, valores prácticamente nulos que indican que los clusters producidos en esta configuración no recuperan ninguna estructura asociada a las categorías humanitarias. Asimismo, las métricas de Homogeneity (0.0725), Completeness (0.0834) y V-Measure (0.0776) revelan que los grupos formados contienen mezclas extensas de clases y que cada etiqueta se dispersa ampliamente entre distintos clusters. Esto confirma que, al aumentar la granularidad y exigir clusters más grandes y compactos, la estructura semántica detectable disminuye aún más, reforzando la conclusión de que las etiquetas humanitarias no emergen de manera natural mediante clustering no supervisado, incluso utilizando embeddings SBERT y configuraciones densas como min_cluster_size = 100.

Pureza global por cluster (class_label)

In [None]:
purezas = []

for c in sorted(df["cluster"].unique()):
    sub = df[df["cluster"] == c]
    mayor = sub["class_label"].value_counts().idxmax()
    pureza = sub["class_label"].value_counts().max() / len(sub)

    purezas.append({
        "cluster": c,
        "label_mayoritaria": mayor,
        "tamano": len(sub),
        "pureza": pureza
    })

df_pureza = pd.DataFrame(purezas).sort_values("pureza", ascending=False)
df_pureza


Unnamed: 0,cluster,label_mayoritaria,tamano,pureza
10,9,injured_or_dead_people,415,0.848193
8,7,injured_or_dead_people,183,0.765027
3,2,rescue_volunteering_or_donation_effort,5980,0.537793
4,3,rescue_volunteering_or_donation_effort,2748,0.47016
7,6,rescue_volunteering_or_donation_effort,861,0.445993
5,4,sympathy_and_support,687,0.422125
9,8,rescue_volunteering_or_donation_effort,1415,0.418375
11,10,rescue_volunteering_or_donation_effort,567,0.405644
12,11,not_humanitarian,136,0.345588
1,0,injured_or_dead_people,1011,0.331355


Términos dominantes para interpretar clusters

In [None]:
from collections import Counter
import re

def top_words(texts, n=20):
    tokens = []
    for t in texts:
        tokens += re.findall(r"[a-zA-Z]+", t.lower())
    return Counter(tokens).most_common(n)

temas = {}

for c in sorted(df["cluster"].unique()):
    subset = df[df["cluster"] == c]["tweet_text"]
    temas[c] = top_words(subset, n=20)

temas


{np.int64(-1): [('to', 907),
  ('the', 885),
  ('of', 636),
  ('in', 605),
  ('hurricane', 477),
  ('a', 468),
  ('and', 434),
  ('for', 394),
  ('is', 304),
  ('harvey', 207),
  ('by', 197),
  ('you', 183),
  ('are', 183),
  ('help', 172),
  ('irma', 167),
  ('this', 164),
  ('i', 153),
  ('from', 149),
  ('earthquake', 147),
  ('on', 146)],
 np.int64(0): [('the', 1114),
  ('greece', 858),
  ('in', 786),
  ('to', 639),
  ('of', 625),
  ('wildfires', 529),
  ('and', 490),
  ('for', 337),
  ('a', 281),
  ('people', 272),
  ('athens', 254),
  ('fire', 229),
  ('victims', 212),
  ('are', 210),
  ('is', 201),
  ('greek', 188),
  ('greecefires', 178),
  ('by', 166),
  ('at', 158),
  ('with', 149)],
 np.int64(1): [('the', 6303),
  ('to', 4495),
  ('california', 3868),
  ('in', 3647),
  ('of', 2950),
  ('and', 2729),
  ('wildfire', 1870),
  ('fires', 1831),
  ('for', 1826),
  ('a', 1754),
  ('is', 1457),
  ('wildfires', 1446),
  ('fire', 1446),
  ('are', 1317),
  ('people', 1030),
  ('you', 1