In [None]:
import hdbscan
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import umap

# Load the imputed feature table and pick the columns used for clustering.
df = pd.read_parquet("parquets/features_selected_imputed.parquet")
features = ['zcrall', 'normpeakall', 'spectralTiltall', 'LHratioall', 'periodicity', 'cppall', 'acflow', 'oq', 'naq', 'h1h2']

# Subsample 10% of the rows; the fixed seed keeps the subsample reproducible.
df = df.sample(frac=0.1, random_state=42)
print("Tamaño del dataframe después de sub muestreo:", df.shape)

# Standardise the selected features before clustering.
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# HDBSCAN clustering directly on the scaled features.
clusterer = hdbscan.HDBSCAN(min_cluster_size=1000)
cluster_labels = clusterer.fit_predict(X_scaled)
df['hdbscan_label'] = cluster_labels

# Create the destination folder before writing: to_parquet does not create
# missing parent directories, and failing here would happen only after the
# expensive clustering step has already run.
from pathlib import Path
out_file = Path("parquets/hdbscan_results/hdbscan1000_imputed.parquet")
out_file.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(out_file)

# Label -1 is reported as noise, so it is excluded from the cluster count.
n_noise = list(cluster_labels).count(-1)
n_clusters = len(set(cluster_labels)) - (1 if n_noise > 0 else 0)
print("Número de clusters encontrados:", n_clusters)
print("Número de puntos de ruido:", n_noise)
print(df.head())



In [2]:
# Impute missing feature values (median) and save the imputed table

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split

# 1. Load the data
df = pd.read_parquet("parquets/real_timestamps/features_selected.parquet")

features = ['zcrall','normpeakall','spectralTiltall','LHratioall',
            'periodicity','cppall','acflow','oq','naq','h1h2']

X = df[features]

# 2. Impute missing values with the per-column median
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# fit_transform returns a bare array, so the frame built from it must be given
# df's own index. Column assignment aligns on index labels: with the default
# RangeIndex, any df whose index is not exactly 0..n-1 would receive NaN (or
# values from the wrong rows) instead of the imputed data.
X_imputed_df = pd.DataFrame(X_imputed, columns=features, index=df.index)
df[features] = X_imputed_df[features]
df.to_parquet("parquets/real_timestamps/features_selected_imputed.parquet")

In [None]:
# UMAP + HDBSCAN fitted on a balanced subsample, then applied to the full dataset

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import umap
import hdbscan
from hdbscan.validity import validity_index

# === 0. Load data and define the feature columns ===
df = pd.read_parquet("parquets/no_wind/features_selected_imputed.parquet")
features = ['zcrall', 'normpeakall', 'LHratioall', 'spectralTiltall',
            'cppall', 'acflow', 'oq', 'naq', 'h1h2']

# Output file. Its folder is created up front (the grid-search cell does the
# same for this folder) so the final to_parquet cannot fail on a missing
# directory after the expensive UMAP/HDBSCAN work has finished.
from pathlib import Path
out_file = Path('parquets/no_wind/hdbscan_results/hdbscan1000_50_umap100_9_imputed.parquet')
out_file.parent.mkdir(parents=True, exist_ok=True)

# === 1. Balanced subsample for training: n_per_grp rows per 'week' group ===
# Rows are drawn without replacement, so sample() raises ValueError if any
# group holds fewer than n_per_grp rows.
# NOTE(review): the original comment listed the groups as 'Control', 'Pre',
# 'Post' — confirm that these are the values of the 'week' column.
n_per_grp = 200_000
dfs = [
    sub.sample(n=n_per_grp, random_state=42, replace=False)
    for _, sub in df.groupby('week')
]
df_bal = pd.concat(dfs).reset_index(drop=True)
df_bal = shuffle(df_bal, random_state=42)

# === 2. Global scaling ===
# The scaler is fitted on the full dataset and then applied to the subsample,
# so both share the same standardisation.
scaler = StandardScaler()
X_scaled_full = scaler.fit_transform(df[features])         # whole dataset
X_scaled_sample = scaler.transform(df_bal[features])       # subsample only

# === 3. UMAP: fit on the subsample, transform the whole dataset ===
umap_model = umap.UMAP(
    n_neighbors=100,
    min_dist=0.1,
    n_components=9,
    random_state=42,
)
X_umap_sample = umap_model.fit_transform(X_scaled_sample)
X_umap_full = umap_model.transform(X_scaled_full)
print("UMAP fit/transform completado. Dimensiones:", X_umap_full.shape)

# === 4. HDBSCAN on the subsample embedding ===
# prediction_data=True is set because approximate_predict is called on this
# clusterer in the next step.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=1000,
    min_samples=50,
    prediction_data=True,
).fit(X_umap_sample)

# === 5. Approximate labels (and membership strengths) for the full dataset ===
labels_full, strengths_full = hdbscan.approximate_predict(clusterer, X_umap_full)

# === 6. Attach embedding coordinates, labels and strengths to the full frame ===
for i in range(X_umap_full.shape[1]):
    df[f'umap_{i}'] = X_umap_full[:, i]
df['hdbscan_label'] = labels_full
df['cluster_confidence'] = strengths_full

# === 7. Reports ===
# The cluster count comes from the subsample fit; the noise count (label -1)
# comes from the approximate labels of the full dataset.
print("Clusters encontrados (en muestra):", clusterer.labels_.max() + 1)
print("Ruido en total:", np.sum(labels_full == -1))

# === 8. Save ===
df.to_parquet(out_file, index=False)
print("Guardado exitosamente")



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import umap
import hdbscan
from hdbscan.validity import validity_index
from itertools import product
from pathlib import Path
import time

# === 0. Load data ===
df = pd.read_parquet("parquets/no_wind/features_selected_imputed.parquet")

features = ['zcrall', 'normpeakall', 'LHratioall', 'spectralTiltall',
            'cppall', 'acflow', 'oq', 'naq', 'h1h2']

# === 1. Balanced subsample per 'week' group (reused by every grid point) ===
# Rows are drawn without replacement, so sample() raises ValueError if any
# group holds fewer than n_per_grp rows.
n_per_grp = 200_000
dfs = [sub.sample(n=n_per_grp, random_state=42, replace=False)
       for _, sub in df.groupby('week')]
df_bal = shuffle(pd.concat(dfs).reset_index(drop=True), random_state=42)

# === 2. Global scaling (fit on the full dataset, applied to the subsample) ===
scaler = StandardScaler()
X_scaled_full   = scaler.fit_transform(df[features])
X_scaled_sample = scaler.transform(df_bal[features])

# === 3. Hyper-parameter grid ===
# product() varies the last factors fastest, so all HDBSCAN settings for one
# UMAP configuration are visited consecutively; the embedding cache in the
# loop below relies on that ordering.
grid = list(product(
    [5, 9],          # n_components
    [30, 100],       # n_neighbors
    [0.05, 0.1],     # min_dist
    [1000, 3000],    # min_cluster_size
    [50, 100]        # min_samples
))

# Results folder
out_path = Path("parquets/no_wind/hdbscan_results")
out_path.mkdir(parents=True, exist_ok=True)

scores_csv = 'gridsearch_umap_hdbscan_scores.csv'
max_eval_points = 10_000   # cap on the number of points used for the metrics

results = []
umap_key = None            # UMAP settings of the embedding currently held

for n_comp, n_neigh, m_dist, min_c, min_s in grid:
    t0 = time.time()
    print(f"\n▶️  UMAP(nc={n_comp}, nn={n_neigh}, md={m_dist}) | "
          f"HDB(min_c={min_c}, min_s={min_s})")

    # 3.1 UMAP: fit on the subsample, transform the full dataset.
    # The embedding depends only on (n_comp, n_neigh, m_dist), so it is
    # recomputed only when those change instead of once per HDBSCAN setting.
    # Consequence: runtime_sec includes the UMAP cost only for the first grid
    # point of each UMAP configuration.
    current_key = (n_comp, n_neigh, m_dist)
    if current_key != umap_key:
        umap_model = umap.UMAP(
            n_neighbors=n_neigh,
            min_dist=m_dist,
            n_components=n_comp,
            random_state=42,
            low_memory=True
        )
        X_umap_sample = umap_model.fit_transform(X_scaled_sample)
        X_umap_full   = umap_model.transform(X_scaled_full)
        umap_key = current_key

    # 3.2 HDBSCAN on the subsample embedding
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_c,
        min_samples=min_s,
        prediction_data=True
    ).fit(X_umap_sample)

    # 3.3 Approximate labels (and membership strengths) for the full dataset
    labels_full, strengths_full = hdbscan.approximate_predict(clusterer, X_umap_full)

    # 3.4 Metrics on a label-stratified sample of at most max_eval_points
    # non-noise points (noise is label -1).
    mask_clustered = labels_full != -1
    X_clustered = X_umap_full[mask_clustered]
    y_clustered = labels_full[mask_clustered]

    # The metrics stay NaN for degenerate clusterings: the scores below need
    # at least two distinct labels, and a single bad grid point must not
    # abort the whole search.
    dbcv = sil = ch = db = np.nan
    if np.unique(y_clustered).size >= 2:
        if len(y_clustered) > max_eval_points:
            X_sampled, _, y_sampled, _ = train_test_split(
                X_clustered, y_clustered,
                train_size=max_eval_points, stratify=y_clustered, random_state=42
            )
        else:
            # train_size must be smaller than the number of samples, so a
            # small clustered set is scored in full instead of being split.
            X_sampled, y_sampled = X_clustered, y_clustered

        if np.unique(y_sampled).size >= 2:
            dbcv  = validity_index(X_sampled.astype(np.float64), y_sampled, metric='euclidean')
            sil   = silhouette_score(X_sampled, y_sampled)
            ch    = calinski_harabasz_score(X_sampled, y_sampled)
            db    = davies_bouldin_score(X_sampled, y_sampled)

    # 3.5 Save a parquet with the embedding, labels and strengths.
    # NOTE(review): there is no '_' between the min_dist token and 'imputed';
    # the name is kept unchanged so existing files and readers still match.
    file_name = f"hdbscan{min_c}_{min_s}_umap{n_neigh}_{n_comp}_{str(m_dist).replace('.','p')}imputed.parquet"
    df_out = df.copy()
    for i in range(n_comp):
        df_out[f'umap_{i}'] = X_umap_full[:, i]
    df_out['hdbscan_label']      = labels_full
    df_out['cluster_confidence'] = strengths_full
    df_out.to_parquet(out_path / file_name, index=False)

    # 3.6 Record the results for this grid point
    n_clusters = clusterer.labels_.max() + 1
    noise_ratio_full = np.mean(labels_full == -1)
    results.append({
        'n_components': n_comp,
        'n_neighbors': n_neigh,
        'min_dist': m_dist,
        'min_cluster_size': min_c,
        'min_samples': min_s,
        'clusters_found': n_clusters,
        'noise_ratio_sample': np.mean(clusterer.labels_ == -1),
        'noise_ratio_full': noise_ratio_full,
        'DBCV': dbcv,
        'Silhouette': sil,
        'Calinski_Harabasz': ch,
        'Davies_Bouldin': db,
        'runtime_sec': round(time.time() - t0, 1)
    })

    # Persist after every grid point so partial results survive an
    # interruption or a crash later in the search.
    pd.DataFrame(results).to_csv(scores_csv, index=False)

    print(f"  ➜ DBCV={dbcv:.3f}  Sil={sil:.3f}  CH={ch:.1f}  DB={db:.2f} "
          f"| clusters={n_clusters}  ruido_full={noise_ratio_full:.2%}")

# === 4. Save the final results CSV ===
results_df = pd.DataFrame(results)
results_df.to_csv(scores_csv, index=False)
print("\n✅ Grid-search finalizado. Resultados guardados en 'gridsearch_umap_hdbscan_scores.csv'")


In [13]:
from sklearn.model_selection import train_test_split
# Score DBCV on a 10% label-stratified subsample.
# NOTE(review): X_valid, labels_valid, np and validity_index are not defined in
# this cell; they must already exist in the kernel. X_valid / labels_valid are
# presumably the non-noise points and their labels — confirm which cell
# creates them before relying on a fresh Restart & Run All.
X_sampled, _, y_sampled, _ = train_test_split(
    X_valid,
    labels_valid,
    random_state=42,
    stratify=labels_valid,
    train_size=0.1,
)

# The subsample is cast to float64 before it is handed to validity_index.
X_sampled_64 = X_sampled.astype(np.float64)
dbcv_score_sampled = validity_index(
    X_sampled_64,
    y_sampled,
    metric='euclidean',
)


In [2]:
# Write the accumulated grid-search records to CSV (no index column).
# NOTE(review): `results` is not defined in this cell; it has to come from the
# grid-search cell, which performs this same save once its loop finishes.
# Presumably kept to rescue partial results after an interrupted run — confirm.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf='gridsearch_umap_hdbscan_scores.csv', index=False)