In [10]:
#Evaluar hdbscan + umap

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import numpy as np
import hdbscan
from hdbscan.validity import validity_index

# 1. Load the parquet that stores the UMAP embedding columns and the HDBSCAN labels.
df = pd.read_parquet('parquets/no_wind/hdbscan_results/hdbscan3000_100_umap100_5_imputed.parquet')
# Feature column names. Not read by the evaluation below (it works on the
# 'umap_*' columns); the name is kept in case other cells rely on it.
features = ['zcrall', 'normpeakall', 'spectralTiltall', 'LHratioall',
            'cppall', 'acflow', 'oq', 'naq', 'h1h2']

# 2. Rebuild the embedding matrix from every column whose name starts with 'umap_'.
umap_cols = [col for col in df.columns if col.startswith('umap_')]
X_umap = df[umap_cols].values

# 3. Cluster assignment of every row.
cluster_labels = df['hdbscan_label'].values

# 4. Drop the rows labelled -1 (noise) before computing any metric.
mask = cluster_labels != -1
X_clustered = X_umap[mask]
labels_clustered = cluster_labels[mask]

# 5. Stratified subsample with a fixed maximum size.
#    train_test_split raises when the requested train size leaves fewer test
#    rows than there are classes (or no rows at all), so when the clustered set
#    is too small for the split every clustered point is evaluated instead.
MAX_EVAL_SAMPLES = 10000
n_clusters = len(np.unique(labels_clustered))
if len(labels_clustered) - MAX_EVAL_SAMPLES >= n_clusters:
    X_sampled, _, y_sampled, _ = train_test_split(
        X_clustered, labels_clustered,
        train_size=MAX_EVAL_SAMPLES,
        stratify=labels_clustered,
        random_state=42
    )
else:
    X_sampled, y_sampled = X_clustered, labels_clustered

# The sample is an absolute row count, not a fixed percentage, so the report
# states the actual size instead of a hard-coded "10%".
sample_desc = f"muestra n={len(y_sampled)} de {len(labels_clustered)}"

# 6. Metrics on the subsample.
# The embedding is cast to float64 only for the DBCV call
# (presumably required by validity_index; TODO confirm).
X_sampled_64 = X_sampled.astype(np.float64)
dbcv_score_sampled = validity_index(X_sampled_64, y_sampled, metric='euclidean')
print(f"DBCV Score ({sample_desc}): {dbcv_score_sampled:.2f}  (mayor mejor)")
sil_score = silhouette_score(X_sampled, y_sampled)
print(f"Silhouette Score ({sample_desc}): {sil_score:.4f}")
ch_score = calinski_harabasz_score(X_sampled, y_sampled)
print(f"Calinski-Harabasz Score: {ch_score:.2f}  (mayor mejor)")
db_score = davies_bouldin_score(X_sampled, y_sampled)
print(f"Davies-Bouldin Score: {db_score:.2f}  (menor mejor)")



DBCV Score (muestra 10%): 0.84  (mayor mejor)
Silhouette Score (muestra 10%): 0.7149
Calinski-Harabasz Score: 18193.12  (mayor mejor)
Davies-Bouldin Score: 0.29  (menor mejor)


In [None]:
#persistence

import pandas as pd
from sklearn.model_selection import train_test_split
import hdbscan
from hdbscan.validity import validity_index
import numpy as np

# 1. Load the parquet with the clustering results.
df = pd.read_parquet('parquets/no_wind/hdbscan_results/hdbscan1000_50_umap30_9_imputed.parquet')
# Feature column names; nothing else in the visible part of this cell reads the list.
features = ['zcrall', 'normpeakall', 'spectralTiltall', 'LHratioall', 
            'cppall', 'acflow', 'oq', 'naq', 'h1h2']
# 2. Rebuild X_umap
# NOTE(review): the cell stops here, but its saved output is a MemoryError for a
# (405132, 405132) float64 array. The statements that triggered that allocation
# are no longer in the cell; presumably a full pairwise computation over every
# row was attempted. Confirm, and subsample before re-adding it.





MemoryError: Unable to allocate 1.19 TiB for an array with shape (405132, 405132) and data type float64

: 

In [2]:
#Evaluar solo HDBSCAN

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 1. Read the parquet that carries the HDBSCAN labels.
df = pd.read_parquet('parquets/real_timestamps/hdbscan_results/hdbscan300_50_umap30_9_imputed.parquet')

# 2. Columns that make up the feature space used for the evaluation.
features = [
    'zcrall', 'normpeakall', 'spectralTiltall', 'LHratioall',
    'cppall', 'acflow', 'oq', 'naq', 'h1h2',
]
X = df[features].values

# 3. Scale the features with StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Cluster assignment of every row.
cluster_labels = df['hdbscan_label'].values

# 5. Keep only rows that belong to a cluster (label -1 marks noise).
mask = cluster_labels != -1
X_clustered, labels_clustered = X_scaled[mask], cluster_labels[mask]

# 6. Stratified 10% subsample so the metrics are quick to compute.
X_sampled, _, y_sampled, _ = train_test_split(
    X_clustered,
    labels_clustered,
    train_size=0.1,
    stratify=labels_clustered,
    random_state=42,
)

# 7. Metrics on the subsample, plus summary counts on the full label vector.
sil_score = silhouette_score(X_sampled, y_sampled)
ch_score = calinski_harabasz_score(X_sampled, y_sampled)
db_score = davies_bouldin_score(X_sampled, y_sampled)
n_clusters = np.unique(labels_clustered).size
noise_pct = 100 * np.sum(cluster_labels == -1) / len(cluster_labels)

# 8. Report, one line per quantity.
print(
    f"Silhouette Score (muestra 10%): {sil_score:.4f}",
    f"Calinski-Harabasz Score: {ch_score:.2f}  (mayor mejor)",
    f"Davies-Bouldin Score: {db_score:.2f}  (menor mejor)",
    f"Número de clusters: {n_clusters}",
    f"Porcentaje de ruido: {noise_pct:.2f}%",
    sep="\n",
)


Silhouette Score (muestra 10%): 0.0607
Calinski-Harabasz Score: 2839.86  (mayor mejor)
Davies-Bouldin Score: 1.70  (menor mejor)
Número de clusters: 7
Porcentaje de ruido: 0.18%


In [1]:
# Agrupar directamente por sujeto + semana real
import pandas as pd
df = pd.read_parquet('parquets/no_wind/hdbscan_results/hdbscan1000_50_umap30_5_0p1imputed.parquet')

# Keep only the rows assigned to a cluster (label -1 marks noise).
# .copy() makes df_valid an independent frame, so adding a column below does
# not write into a slice of df (which raises SettingWithCopyWarning).
df_valid = df[df['hdbscan_label'] != -1].copy()

# Grouping key: one value per subject and week, e.g. "<subject_id>_<week>".
df_valid['subject_week'] = df_valid['subject_id'] + "_" + df_valid['week']

# Count rows per (subject_week, cluster) and normalise each row to sum to 1,
# giving the fraction of each subject-week's rows that fall in every cluster.
cluster_dist = pd.crosstab(df_valid['subject_week'], df_valid['hdbscan_label'])
cluster_freq = cluster_dist.div(cluster_dist.sum(axis=1), axis=0)

# Attach the condition of each subject_week. The condition is the 'week' value,
# read from df_valid; the key already embeds 'week', so every key maps to
# exactly one value and the first occurrence is enough.
subject_week_to_condition = df_valid.drop_duplicates('subject_week').set_index('subject_week')['week']
cluster_freq['condition'] = cluster_freq.index.map(subject_week_to_condition)

cluster_freq.head()



hdbscan_label,0,1,2,3,4,condition
subject_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NF031_Control,0.237799,0.000941,0.003147,0.001284,0.756829,Control
NF038_Control,0.149371,0.000336,0.003302,0.001972,0.845018,Control
NF111_Control,0.002163,0.120472,0.871102,0.00623,3.3e-05,Control
NF126_Control,4e-05,0.150981,0.846608,0.002138,0.000234,Control
NF134_Control,6.1e-05,0.254863,0.743926,0.00109,6.1e-05,Control


In [2]:
from scipy.spatial.distance import jensenshannon

# Mean cluster-frequency vector for every condition.
means = cluster_freq.groupby('condition').mean()
print(means)

# One row of the means table per condition.
pre_vec, post_vec, control_vec = (means.loc[name] for name in ('Pre', 'Post', 'Control'))

# Jensen-Shannon distance for each pair of conditions.
js_pre_control = jensenshannon(pre_vec, control_vec)
js_post_control = jensenshannon(post_vec, control_vec)
js_post_pre = jensenshannon(post_vec, pre_vec)

print(
    f"JS(Pre || Control): {js_pre_control:.4f}",
    f"JS(Post || Control): {js_post_control:.4f}",
    f"JS(Post || Pre): {js_post_pre:.4f}",
    sep="\n",
)
    


hdbscan_label         0         1         2         3         4
condition                                                      
Control        0.077887  0.105519  0.493617  0.002543  0.320435
Post           0.000081  0.175433  0.819173  0.004847  0.000466
Pre            0.000112  0.124530  0.866647  0.008227  0.000484
JS(Pre || Control): 0.4039
JS(Post || Control): 0.4014
JS(Post || Pre): 0.0522
