In [1]:
import pandas as pd
import numpy as np
import umap
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# --- UMAP embedding of the full "no_wind" feature table ---------------------
# The neighbourhood size lives in one constant so that the output filename is
# always derived from the value actually used. (Previously the model ran with
# n_neighbors=50 but the result was written to "..._umap100.parquet", while the
# plotting cells read "..._umap50.parquet".)
N_NEIGHBORS = 50

df = pd.read_parquet("parquets/no_wind/features_selected_imputed.parquet")
features = ['zcrall', 'normpeakall', 'spectralTiltall', 'LHratioall', 'periodicity',
            'cppall', 'acflow', 'oq', 'naq', 'h1h2']

# NOTE(review): no random_state is passed here (unlike the balanced run), so
# the embedding will presumably differ between runs — confirm whether that is
# intended before comparing results across executions.
umap_model = umap.UMAP(n_neighbors=N_NEIGHBORS, min_dist=0.1,
                       metric='euclidean', low_memory=True)

# Standardise the selected features before fitting UMAP with a euclidean metric.
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the embedding and store its two columns next to the original rows.
X_umap = umap_model.fit_transform(X_scaled)
df['umap_X'] = X_umap[:, 0]
df['umap_Y'] = X_umap[:, 1]
df.to_parquet(
    f"parquets/no_wind/umap_results/features_selected_imputed_umap{N_NEIGHBORS}.parquet"
)


  from .autonotebook import tqdm as notebook_tqdm


In [50]:
import pandas as pd
import numpy as np
import umap
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

df = pd.read_parquet("parquets/features_selected_imputed.parquet")
features = ['zcrall', 'normpeakall', 'spectralTiltall', 'LHratioall', 'periodicity',
            'cppall', 'acflow', 'oq', 'naq', 'h1h2']

# 1) Balanced subsample: the same number of rows from every 'week' group.
n_per_grp = 30_000
by_week = df.groupby('week')
group_sizes = by_week.size()
group_sizes = group_sizes[group_sizes > 0]   # ignore groups with no rows
# sample(replace=False) raises ValueError when a group has fewer rows than
# requested, so cap the request at the smallest group. This keeps the groups
# balanced and is identical to the fixed size when every group is big enough.
n_sample = min(n_per_grp, int(group_sizes.min()))
if n_sample < n_per_grp:
    print(f"Smallest 'week' group has {n_sample} rows; "
          f"sampling {n_sample} per group instead of {n_per_grp}.")

dfs = [
    sub.sample(n=n_sample, random_state=42, replace=False)
    for _, sub in by_week
    if len(sub) > 0
]
df_bal = pd.concat(dfs).reset_index(drop=True)
df_bal = shuffle(df_bal, random_state=42)      # shuffle the rows

# 2) Scaling
scaler = StandardScaler()
X_bal = scaler.fit_transform(df_bal[features])

# 3) Fit UMAP on the balanced set
um = umap.UMAP(n_neighbors=30, min_dist=0.1,
               metric='euclidean', random_state=42)
emb_bal = um.fit_transform(X_bal)
df_bal['umap_X'], df_bal['umap_Y'] = emb_bal[:, 0], emb_bal[:, 1]
df_bal.to_parquet("parquets/umap_results/features_selected_imputed_umap30_bal.parquet")


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# --- load data -------------------------------------------------------------
df_bal = pd.read_parquet("parquets/no_wind/umap_results/features_selected_imputed_umap50.parquet")
df_hdbscan = pd.read_parquet('parquets/no_wind/hdbscan_results/hdbscan1000_50_umap30_5_0p1imputed.parquet')
# Column assignment aligns on the index.
# NOTE(review): this assumes both parquet files carry the same index — confirm.
df_bal['hdbscan_label'] = df_hdbscan['hdbscan_label']

df_control = df_bal[df_bal['week'] == 'Control']
df_pre     = df_bal[df_bal['week'] == 'Pre']
df_post    = df_bal[df_bal['week'] == 'Post']

# One shared, sorted list of cluster labels. Without an explicit hue_order each
# scatterplot call builds its colour levels from its own subset, so a cluster
# missing from one week would shift the colours in that panel and the single
# legend taken from the first panel would no longer describe the others.
hue_order = sorted(df_bal['hdbscan_label'].dropna().unique())

# --- plotting --------------------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

panels = zip(axes, [df_control, df_pre, df_post], ['Control', 'Pre', 'Post'])
for ax, data, title in panels:
    sns.scatterplot(
        data=data, x='umap_X', y='umap_Y',
        ax=ax, hue='hdbscan_label', hue_order=hue_order,
        palette='Set2', alpha=0.5,
        # Only the first panel draws a legend; it is lifted to the figure below.
        legend='auto' if ax is axes[0] else False,
    )
    ax.set_title(title)

# --- single, compact legend ------------------------------------------------
# 1. Take handles/labels from the first axis' legend
handles, labels = axes[0].get_legend_handles_labels()

# 2. Build a small figure-level legend
fig.legend(
    handles, labels,
    loc='lower center',                  # anchor point of the legend box
    bbox_to_anchor=(0.5, 1.02),          # just above the drawing area
    ncol=max(1, min(5, len(labels))),    # several columns when there are many classes
    frameon=False,
    fontsize='x-small',                  # smaller than the default
    title='hdbscan_label'                # legend title
)

# 3. Remove the now-redundant legend from the first axis (if one was drawn)
first_legend = axes[0].get_legend()
if first_legend is not None:
    first_legend.remove()

plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): the recorded output of this cell is a MemoryError. The failing
# statement is not visible here; if it persists, consider loading only the
# columns that are plotted via pd.read_parquet(..., columns=[...]).
df_vis = pd.read_parquet("parquets/umap_results/features_selected_imputed_umap30.parquet")
df = pd.read_parquet('parquets/hdbscan_results/hdbscan1000_200_umap30_9_imputed.parquet')

# Copy cluster labels and clinical scores onto the embedding frame.
# Column assignment aligns on the index.
# NOTE(review): assumes both files share the same index — confirm.
df_vis['hdbscan_label'] = df['hdbscan_label']
df_vis['CAPE_overall'] = df['CAPE_overall']
df_vis['vrqol_total'] = df['vrqol_total']
df_pre = df_vis[df_vis['week'] == 'Pre']
df_post = df_vis[df_vis['week'] == 'Post']

# Shared hue levels so that a given CAPE_overall value has the same colour in
# both panels; otherwise each call derives its levels from its own subset.
cape_levels = sorted(df_vis['CAPE_overall'].dropna().unique())

fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
for ax, data, title in zip(axes, [df_pre, df_post], ['Pre', 'Post']):
    sns.scatterplot(
        data=data, x='umap_X', y='umap_Y',
        ax=ax, hue='CAPE_overall', hue_order=cape_levels,
        palette='Set2', alpha=0.6, s=3,
        # Both panels now share one colour mapping, so one legend is enough.
        legend='auto' if ax is axes[0] else False,
    )
    ax.set_title(title)
plt.show()

MemoryError: 

In [None]:
import pandas as pd
import plotly.graph_objects as go

# 1. Load the UMAP embedding and the HDBSCAN result, each re-indexed 0..n-1
df = pd.read_parquet(
    "parquets/no_wind/umap_results/features_selected_imputed_umap50.parquet"
).reset_index(drop=True)
hdbscan = pd.read_parquet(
    'parquets/no_wind/hdbscan_results/hdbscan1000_50_umap30_5_0p1imputed.parquet'
).reset_index(drop=True)

# 2. Attach the cluster label (matched on the fresh positional index) and a
#    calendar-day column (the date part of 'ts', without the time of day)
df = df.assign(
    hdbscan_label=hdbscan['hdbscan_label'],
    day=lambda frame: pd.to_datetime(frame['ts']).dt.date,
)
# 3. Función para crear figura con slider para un sujeto
def _padded_range(values, pad_frac=0.05):
    """Return [min, max] of ``values`` widened by ``pad_frac`` of the span on each side."""
    lo, hi = float(values.min()), float(values.max())
    pad = (hi - lo) * pad_frac or 1.0   # avoid a zero-width range for a single point
    return [lo - pad, hi + pad]


def _day_trace(df_day, cmin, cmax):
    """Build the scatter trace for the rows of a single day.

    ``cmin``/``cmax`` pin the colour scale so a cluster label maps to the same
    colour in every frame instead of being rescaled per day.
    """
    # Hover shows both the timestamp and the cluster label of each point.
    hover = (df_day["ts"].astype(str)
             + "<br>cluster: " + df_day["hdbscan_label"].astype(str))
    return go.Scatter(
        x=df_day["umap_X"],
        y=df_day["umap_Y"],
        mode="markers",
        marker=dict(
            color=df_day["hdbscan_label"],
            colorscale="Viridis",
            cmin=cmin,
            cmax=cmax,
            size=5,
            colorbar=dict(title="Cluster"),
        ),
        text=hover,
        hoverinfo="text",
    )


def create_slider_plot(subject_id):
    """Create an animated UMAP scatter for one subject, one frame per day.

    Reads the notebook-level ``df`` (columns used: ``subject_id``, ``day``,
    ``ts``, ``umap_X``, ``umap_Y``, ``hdbscan_label``).

    Parameters
    ----------
    subject_id : value compared against ``df['subject_id']``.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a "Play" button and a slider whose steps are the days on
        which the subject has rows.

    Raises
    ------
    ValueError
        If ``df`` contains no rows for ``subject_id``.
    """
    df_sub = df[df['subject_id'] == subject_id].copy()
    if df_sub.empty:
        raise ValueError(f"No rows found for subject_id={subject_id!r}")
    unique_days = sorted(df_sub['day'].unique())

    # Colour limits come from the whole table so that colours are comparable
    # across days and across the figures of different subjects.
    cmin = float(df["hdbscan_label"].min())
    cmax = float(df["hdbscan_label"].max())

    fig_dict = {
        "data": [],
        "layout": {},
        "frames": []
    }

    # Layout. Axis ranges are fixed to the subject's full extent: they are not
    # recomputed between animation frames, so without this the view would stay
    # at the first day's extent and later days could fall outside it.
    fig_dict["layout"]["xaxis"] = {"title": "UMAP 1",
                                   "range": _padded_range(df_sub["umap_X"])}
    fig_dict["layout"]["yaxis"] = {"title": "UMAP 2",
                                   "range": _padded_range(df_sub["umap_Y"])}
    fig_dict["layout"]["hovermode"] = "closest"
    fig_dict["layout"]["title"] = f"UMAP por día - {subject_id}"
    fig_dict["layout"]["height"] = 800
    fig_dict["layout"]["updatemenus"] = [{
        "type": "buttons",
        "buttons": [{
            "label": "Play",
            "method": "animate",
            "args": [None, {"frame": {"duration": 1000, "redraw": True},
                            "fromcurrent": True}]
        }]
    }]

    # Slider setup
    sliders_dict = {
        "active": 0,
        "steps": [],
        "x": 0.1,
        "len": 0.9,
        "xanchor": "left",
        "y": 0,
        "yanchor": "top",
        "pad": {"b": 10},
        "currentvalue": {"prefix": "Día: "}
    }

    # The initial view shows the first day
    first_day = unique_days[0]
    fig_dict["data"] = [_day_trace(df_sub[df_sub['day'] == first_day], cmin, cmax)]

    # One frame and one slider step per day; the frame name links the two
    for day in unique_days:
        df_day = df_sub[df_sub['day'] == day]
        frame = go.Frame(
            data=[_day_trace(df_day, cmin, cmax)],
            name=str(day)
        )
        fig_dict["frames"].append(frame)

        slider_step = {
            "args": [
                [str(day)],
                {"frame": {"duration": 0, "redraw": True},
                 "mode": "immediate"}
            ],
            "label": str(day),
            "method": "animate"
        }
        sliders_dict["steps"].append(slider_step)

    fig_dict["layout"]["sliders"] = [sliders_dict]

    return go.Figure(fig_dict)

# 4. Build one animated figure per subject (same order as the IDs listed)
fig_nf, fig_pf = map(create_slider_plot, ("NF134", "PF134"))

# 5. Render both figures
fig_nf.show()
fig_pf.show()



In [33]:
# Independent copy of the rows belonging to subject "NF134", then its (rows, columns).
# NOTE(review): the variable is called nf031 but the filter selects "NF134" —
# confirm which subject is intended.
nf031 = df.loc[df['subject_id'].eq("NF134")].copy()
print(nf031.shape)


(61689, 19)
