In [None]:

# --------------------------- IMPORTS ---------------------------
import os
import json
import numpy as np
import pandas as pd

# ---------------------- CONFIGURACIÓN BÁSICA -------------------
base_power = "Powerdata_filtrado"                # Ruta a tus NPZ TF reales
base_temporal = "power_data_temporal_fil"        # Ruta a NPZ temporales
output_features_xlsx = "features_completas.xlsx"
output_features_original_xlsx = "features_completas_original.xlsx"
output_dict_xlsx = "features_diccionario.xlsx"
REGISTRY_PATH = "features_registry_num.json"
EPS = 1e-12

# ---------------------- BANDAS DE FRECUENCIA -------------------
band_plan = [
    (10,    50,    1),
    (50,    100,   1),
    (100,   300,   4),
    (300,   800,   5),
    (800,   1500,  7),
    (1500,  3000,  6),
    (3000,  5000,  0),
    (5000,  11025, 0),
]

def construir_bandas(plan):
    edges = []
    for i, (f0, f1, n) in enumerate(plan):
        sub = np.linspace(float(f0), float(f1), int(n) + 1)
        if i > 0:
            sub = sub[1:]  # evitar duplicar borde
        edges.extend(sub.tolist())
    return np.array(edges, dtype=float)

band_edges = construir_bandas(band_plan)
assert len(band_edges) - 1 == 24, f"Esperaba 36 bandas, obtuve {len(band_edges)-1}"

def obtener_rangos(bordes):
    return [(bordes[i], bordes[i + 1]) for i in range(len(bordes) - 1)]

band_ranges = obtener_rangos(band_edges)   # [(f0,f1), ...] 36 bandas

# ------------------- DEFINICIÓN DE FEATURES -------------------
GLOBAL_FEATURE_NAMES = [
    # temporales (1–9)
    "mean_time",
    "std_time",
    "entropy_time",
    "energy_time",
    "skewness_time",
    "kurtosis_time",
    "time_peak_1",
    "time_peak_2",
    "duration_time",

    # TF / frecuencia global (10–17)
    "tf_energy_total",
    "freq_centroid",
    "freq_max",
    "spectral_entropy",
    "spectral_rolloff70",
    "low_high_ratio",
    "tf_energy_mean",
    "tf_entropy_2d",
]

# 6 features por banda (todas según el PDF actualizado)
BAND_METRICS = [
    "energy",         # E_B
    "energy_norm",    # E_B / sum(E_B)
    "entropy_time",   # entropía temporal dentro de la banda
    "std_time",       # desviación estándar temporal W_B(t)
    "freq_dom",       # frecuencia dominante en la banda
    "freq_70",        # frecuencia donde se acumula el 70% de la energía de la banda
]

# nombres finales de features por banda
BAND_FEATURE_NAMES = []
for (f0, f1) in band_ranges:
    label = f"{int(round(f0))}_{int(round(f1))}"  # por ej. "1500_1750"
    for metric in BAND_METRICS:
        BAND_FEATURE_NAMES.append(f"band_{label}_{metric}")

ALL_FEATURE_NAMES = GLOBAL_FEATURE_NAMES + BAND_FEATURE_NAMES
assert len(ALL_FEATURE_NAMES) == 161, f"Se esperaban 233 features, hay {len(ALL_FEATURE_NAMES)}"

# ---------------------- FEATURES TEMPORALES ----------------------
def features_temporales(signal):
    """
    Features 1–9 (señal temporal x[t]):
    - mean_time
    - std_time
    - entropy_time
    - energy_time
    - skewness_time
    - kurtosis_time
    - time_peak_1
    - time_peak_2
    - duration_time
    """
    feats = {}
    x = np.asarray(signal, dtype=float)
    N = len(x)

    if N == 0:
        for name in GLOBAL_FEATURE_NAMES[:9]:
            feats[name] = 0.0
        feats["time_peak_1"] = 0
        feats["time_peak_2"] = 0
        feats["duration_time"] = 0
        return feats

    mu = np.mean(x)
    sigma = np.std(x)
    feats["mean_time"] = float(mu)
    feats["std_time"] = float(sigma)

    # Entropía temporal (p_i = |x[i]| / sum|x|)
    ax = np.abs(x)
    s = ax.sum()
    if s > 0:
        p = ax / s
        mask = p > 0
        feats["entropy_time"] = float(-(p[mask] * np.log2(p[mask] + EPS)).sum())
    else:
        feats["entropy_time"] = 0.0

    # Energía temporal (x^2)
    feats["energy_time"] = float(np.sum(x**2))

    # Skewness y kurtosis
    feats["skewness_time"] = float(np.mean((x - mu)**3) / (sigma**3 + EPS))
    feats["kurtosis_time"] = float(np.mean((x - mu)**4) / (sigma**4 + EPS))

    # Dos picos temporales más intensos (índices)
    if N >= 2:
        idx_top2 = np.argpartition(np.abs(x), -2)[-2:]
        idx_top2.sort()
        feats["time_peak_1"] = int(idx_top2[0])
        feats["time_peak_2"] = int(idx_top2[1])
    else:
        feats["time_peak_1"] = 0
        feats["time_peak_2"] = 0

    # Duración (número de muestras)
    feats["duration_time"] = int(N)
    return feats

# ---------------------- FEATURES TF / FRECUENCIA (CWT) ----------------------
def features_power(power, freqs):
    """
    Features 10–17 globales + 6*36 features por banda.

    power: matriz |W(a,t)|   (escalas x tiempo)
    freqs: vector de frecuencia equivalente por escala (Hz), 
           mismo tamaño que número de filas de power.
    """
    feats = {}
    P = np.asarray(power, dtype=float)   # CWT magnitudes
    F = np.asarray(freqs, dtype=float)

    if P.size == 0 or F.size == 0:
        for name in GLOBAL_FEATURE_NAMES[9:]:
            feats[name] = 0.0
        for name in BAND_FEATURE_NAMES:
            feats[name] = 0.0
        return feats

    # ---------- 10. Energía total TF ----------
    feats["tf_energy_total"] = float(np.sum(P))

    # ---------- 16. Energía media 2D ----------
    feats["tf_energy_mean"] = float(np.mean(P))

    # ---------- 17. Entropía 2D ----------
    total_P = np.sum(P)
    if total_P > 0:
        q = (P / (total_P + EPS)).flatten()
        mask = q > 0
        feats["tf_entropy_2d"] = float(-(q[mask] * np.log2(q[mask] + EPS)).sum())
    else:
        feats["tf_entropy_2d"] = 0.0

    # Energía por escala (para centroide, fmax, entropía espectral, rolloff, ratio)
    Ef = np.sum(P, axis=1)   # sum sobre tiempo
    Ef_sum = np.sum(Ef)

    if Ef_sum > 0:
        # ---------- 11. Frecuencia media (centroide) ----------
        feats["freq_centroid"] = float(np.sum(F * Ef) / (Ef_sum + EPS))

        # ---------- 12. Frecuencia de máxima energía ----------
        feats["freq_max"] = float(F[np.argmax(Ef)])

        # ---------- 13. Entropía espectral global ----------
        pf = Ef / (Ef_sum + EPS)
        mask_pf = pf > 0
        feats["spectral_entropy"] = float(-(pf[mask_pf] * np.log2(pf[mask_pf] + EPS)).sum())

        # ---------- 14. Rolloff 70% global ----------
        cum_E = np.cumsum(Ef)
        idx_roll = np.argmax(cum_E >= 0.7 * Ef_sum)
        feats["spectral_rolloff70"] = float(F[idx_roll])

        # ---------- 15. Ratio <3kHz / ≥3kHz ----------
        low_mask = F < 3000
        high_mask = ~low_mask
        E_low = np.sum(Ef[low_mask])
        E_high = np.sum(Ef[high_mask])
        feats["low_high_ratio"] = float(E_low / (E_high + EPS))
    else:
        feats["freq_centroid"] = 0.0
        feats["freq_max"] = 0.0
        feats["spectral_entropy"] = 0.0
        feats["spectral_rolloff70"] = 0.0
        feats["low_high_ratio"] = 0.0

    # ---------------------- FEATURES POR BANDA ----------------------
    n_bands = len(band_ranges)

    # Primero calculamos E_B y datos intermedios para cada banda
    band_data = []  # lista de dicts para cada banda
    for i, (f0, f1) in enumerate(band_ranges):
        # última banda incluye el límite superior
        if i < n_bands - 1:
            idx = np.where((F >= f0) & (F < f1))[0]
        else:
            idx = np.where((F >= f0) & (F <= f1))[0]

        label = f"{int(round(f0))}_{int(round(f1))}"

        if idx.size == 0:
            band_data.append({
                "label": label,
                "E_B": 0.0,
                "W_B": None,
                "F_band": None,
                "EfB": None,
            })
            continue

        B = P[idx, :]                    # (n_escalas_en_banda x T)
        W_B = np.sum(B, axis=0)          # W_B(t) = sum_{a in B} |W(a,t)|
        E_B = float(np.sum(W_B))         # energía total de la banda
        F_band = F[idx]                  # frecuencias de las escalas en la banda
        EfB = np.sum(B, axis=1)          # energía por "frecuencia" dentro de banda

        band_data.append({
            "label": label,
            "E_B": E_B,
            "W_B": W_B,
            "F_band": F_band,
            "EfB": EfB,
        })

    # Suma total de energía en bandas (para normalización)
    total_E_bands = sum(d["E_B"] for d in band_data)

    # Ahora calculamos las 6 métricas por banda
    for d in band_data:
        label = d["label"]
        base = f"band_{label}_"
        E_B = d["E_B"]
        W_B = d["W_B"]
        F_band = d["F_band"]
        EfB = d["EfB"]

        if (W_B is None) or (E_B <= 0):
            # banda vacía o sin energía
            feats[base + "energy"] = 0.0
            feats[base + "energy_norm"] = 0.0
            feats[base + "entropy_time"] = 0.0
            feats[base + "std_time"] = 0.0
            feats[base + "freq_dom"] = 0.0
            feats[base + "freq_70"] = 0.0
            continue

        # 1) Energía en banda
        feats[base + "energy"] = E_B

        # 2) Energía normalizada en banda
        if total_E_bands > 0:
            feats[base + "energy_norm"] = float(E_B / (total_E_bands + EPS))
        else:
            feats[base + "energy_norm"] = 0.0

        # 3) Entropía temporal dentro de la banda
        p_t = W_B / (E_B + EPS)
        mask_t = p_t > 0
        H_B = -(p_t[mask_t] * np.log2(p_t[mask_t] + EPS)).sum()
        feats[base + "entropy_time"] = float(H_B)

        # 4) Desviación estándar temporal de la banda
        mu_B = float(np.mean(W_B))
        sigma_B = float(np.sqrt(np.mean((W_B - mu_B)**2)))
        feats[base + "std_time"] = sigma_B

        # 5) Frecuencia dominante en la banda (frecuencia con mayor energía integrada)
        EfB_sum = np.sum(EfB)
        if EfB_sum > 0:
            idx_max = np.argmax(EfB)
            freq_dom = float(F_band[idx_max])

            # 6) Frecuencia donde se acumula el 70% de la energía de la banda
            cumB = np.cumsum(EfB)
            idx70 = np.argmax(cumB >= 0.7 * EfB_sum)
            freq_70 = float(F_band[idx70])
        else:
            freq_dom = 0.0
            freq_70 = 0.0

        feats[base + "freq_dom"] = freq_dom
        feats[base + "freq_70"] = freq_70

    return feats

# ----------------- RECORRIDO DE ARCHIVOS -----------------
records = []

for tipo in ["pacientes", "sujetos"]:
    power_dir = os.path.join(base_power, tipo)
    temp_dir = os.path.join(base_temporal, tipo)

    if not os.path.exists(power_dir):
        continue

    for iddsi_folder in os.listdir(power_dir):
        folder_tf = os.path.join(power_dir, iddsi_folder)
        folder_temp = os.path.join(temp_dir, iddsi_folder)
        if not os.path.isdir(folder_tf):
            continue

        for archivo in os.listdir(folder_tf):
            if not archivo.endswith(".npz"):
                continue

            data_power = np.load(os.path.join(folder_tf, archivo))

            # CWT magnitude / energy
            power = data_power["cwt_mag"].astype(float)
            freqs = data_power["freqs"].astype(float)

            sujeto = str(data_power["sujeto"])
            iddsi = int(data_power["iddsi"])
            sorbo = int(data_power["sorbo"])
            t_ini = float(data_power["t_ini"])
            t_fin = float(data_power["t_fin"])

            file_temp = os.path.join(folder_temp, archivo)
            if os.path.exists(file_temp):
                data_temp = np.load(file_temp)
                temporal_signal = data_temp["signal_fil"].astype(float)
            else:
                temporal_signal = np.zeros(power.shape[1])

            # features temporales y CWT
            feats_t = features_temporales(temporal_signal)
            feats_p = features_power(power, freqs)

            merged = {**feats_t, **feats_p}
            merged.update({
                "tipo": tipo,
                "sujeto": sujeto,
                "iddsi": iddsi,
                "sorbo": sorbo,
                "duracion": float(t_fin - t_ini),
                "archivo": archivo,
            })

            records.append(merged)

if not records:
    raise RuntimeError("No se encontraron NPZ. Revisa tus carpetas.")

df = pd.DataFrame(records)

# ------------------------ MATRICES DE SALIDA ------------------------
metadata_cols = ["tipo", "sujeto", "iddsi", "sorbo", "duracion", "archivo"]

# Garantizar que todas las columnas de features existan
for name in ALL_FEATURE_NAMES:
    if name not in df.columns:
        df[name] = 0.0

df_original = df[metadata_cols + ALL_FEATURE_NAMES].copy()

# ID → nombre
id_to_name = {i + 1: name for i, name in enumerate(ALL_FEATURE_NAMES)}

# Guardar JSON
with open(REGISTRY_PATH, "w", encoding="utf-8") as fh:
    json.dump({str(k): v for k, v in id_to_name.items()}, fh,
              ensure_ascii=False, indent=2)
print("Registro JSON simple guardado:", REGISTRY_PATH)

# Renombrar columnas de features a IDs numéricos
rename_map = {name: str(i + 1) for i, name in enumerate(ALL_FEATURE_NAMES)}
df_ids = df_original.rename(columns=rename_map)

# ------------------------ GUARDAR XLSX ------------------------
try:
    try:
        import xlsxwriter
        engine = "xlsxwriter"
    except Exception:
        import openpyxl
        engine = "openpyxl"

    with pd.ExcelWriter(output_features_xlsx, engine=engine) as xw:
        df_ids.to_excel(xw, index=False, sheet_name="features_ids")

    with pd.ExcelWriter(output_features_original_xlsx, engine=engine) as xw:
        df_original.to_excel(xw, index=False, sheet_name="features_original")

    dict_rows = [{"ID": fid, "Feature": fname} for fid, fname in id_to_name.items()]
    df_dict = pd.DataFrame(dict_rows).sort_values(by="ID")
    with pd.ExcelWriter(output_dict_xlsx, engine=engine) as xw:
        df_dict.to_excel(xw, index=False, sheet_name="Diccionario")

    print("\n Todos los archivos XLSX generados correctamente.\n")

except Exception as e:
    print("Error generando XLSX:", e)


Registro JSON simple guardado: features_registry_num.json

✅ Todos los archivos XLSX generados correctamente.



In [4]:
import numpy as np

data = np.load("power_data_temporal_fil/pacientes/IDDSI0/P1S1.npz")
print("Claves del archivo:", data.files)


Claves del archivo: ['signal_fil', 'sujeto', 'iddsi', 'sorbo', 'fs', 't_ini', 't_fin', 'downsample_factor', 'filtro_wavelet']


In [35]:
# -------------------- SEPARACIÓN EN 3 MATRICES POR IDDSI --------------------
iddsi_targets = [0, 2, 4]
split_prefix = "features_IDDSI"

# SOLO features + tipo
feature_cols = [c for c in df_ids.columns if c not in metadata_cols]
cols_keep = ["tipo"] + feature_cols   # Dejamos tipo como único metadato útil

# -------------------- EXPORTAR XLSX INDIVIDUALES --------------------
for val in iddsi_targets:

    # Filtrar filas por IDDSI
    sub = df_ids[df_ids["iddsi"] == val]

    if sub.empty:
        print(f"⚠️ No hay registros para IDDSI={val}; no se genera archivo XLSX.")
        continue

    # Mantener SOLO "tipo" + features
    sub_clean = sub[cols_keep]

    # Convertir a matriz (incluye columna tipo)
    matrix = sub_clean.to_numpy()

    # Guardar en XLSX SIN headers NI índice
    filename = f"{split_prefix}{val}.xlsx"
    with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
        pd.DataFrame(matrix).to_excel(writer, index=False, header=False, sheet_name=f"IDDSI_{val}")

print("✅ Archivos XLSX por IDDSI generados con SOLO 'tipo' + features.")

# -------------------- EXCEL UNIFICADO --------------------
try:
    with pd.ExcelWriter("features_por_IDDSI.xlsx", engine="xlsxwriter") as xw:
        for val in iddsi_targets:
            sub = df_ids[df_ids["iddsi"] == val]
            if sub.empty:
                continue

            sub_clean = sub[cols_keep]
            matrix = sub_clean.to_numpy()

            pd.DataFrame(matrix).to_excel(
                xw,
                index=False,
                header=False,
                sheet_name=f"IDDSI_{val}"
            )

    print("✅ Excel unificado 'features_por_IDDSI.xlsx' generado SOLO con 'tipo' + features.")

except Exception as e:
    print(f"⚠️ Error al crear Excel por IDDSI: {e}")


✅ Archivos XLSX por IDDSI generados con SOLO 'tipo' + features.
✅ Excel unificado 'features_por_IDDSI.xlsx' generado SOLO con 'tipo' + features.


APLICAR Wilcoxon rank-sum test

No es necesario normalizar pero puede servir para probar con otros TEST estadisticos

In [36]:
import pandas as pd
import numpy as np

def normalizar_minmax(path_in, path_out):
    # cargar matriz (sin header)
    df = pd.read_excel(path_in, header=None)

    # asignar nombres de columnas
    df.columns = ["tipo"] + [f"f{i}" for i in range(1, df.shape[1])]

    tipo = df["tipo"]
    X = df.drop(columns=["tipo"]).astype(float)

    # Min–Max: x' = (x - min) / (max - min)
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    X_norm = (X - X_min) / (X_max - X_min + 1e-12)

    # reconstruir
    df_norm = pd.DataFrame(X_norm, columns=X.columns)
    df_norm.insert(0, "tipo", tipo)

    # guardar sin header ni index
    df_norm.to_excel(path_out, index=False, header=False)
    print(f"✔ Matriz normalizada Min–Max guardada en: {path_out}")

normalizar_minmax("features_IDDSI0.xlsx", "IDDSI0_minmax.xlsx")
normalizar_minmax("features_IDDSI2.xlsx", "IDDSI2_minmax.xlsx")
normalizar_minmax("features_IDDSI4.xlsx", "IDDSI4_minmax.xlsx")


  df_norm.insert(0, "tipo", tipo)


✔ Matriz normalizada Min–Max guardada en: IDDSI0_minmax.xlsx


  df_norm.insert(0, "tipo", tipo)


✔ Matriz normalizada Min–Max guardada en: IDDSI2_minmax.xlsx
✔ Matriz normalizada Min–Max guardada en: IDDSI4_minmax.xlsx


  df_norm.insert(0, "tipo", tipo)


In [37]:
import pandas as pd

# Cargar las 3 matrices SIN encabezados
df0 = pd.read_excel("IDDSI0_minmax.xlsx", header=None)
df2 = pd.read_excel("IDDSI2_minmax.xlsx", header=None)
df4 = pd.read_excel("IDDSI4_minmax.xlsx", header=None)

print("IDDSI0 shape:", df0.shape)
print("IDDSI2 shape:", df2.shape)
print("IDDSI4 shape:", df4.shape)

def poner_nombres(df):
    n_cols = df.shape[1]
    feature_names = [str(i) for i in range(1, n_cols)]  # "1","2",...,
    df.columns = ["tipo"] + feature_names
    return df

df0 = poner_nombres(df0)
df2 = poner_nombres(df2)
df4 = poner_nombres(df4)

print(df0.head())


IDDSI0 shape: (78, 162)
IDDSI2 shape: (85, 162)
IDDSI4 shape: (84, 162)
        tipo         1         2         3         4         5         6  \
0  pacientes  0.545256  0.540855  0.509711  0.436656  0.596018  0.118797   
1  pacientes  0.372540  0.873900  0.069792  0.429370  0.574819  0.102579   
2  pacientes  0.000000  0.625789  0.113890  0.248278  0.715432  0.195645   
3  pacientes  0.077033  0.200435  0.279025  0.039140  0.425240  0.015178   
4  pacientes  0.524012  0.113030  0.948785  0.130428  0.430494  0.075512   

          7         8         9  ...       152       153  154  155       156  \
0  0.169732  0.133006  0.343434  ...  0.719403  0.050355  0.0    0  0.291524   
1  0.069751  0.028602  0.037808  ...  0.415397  0.074359  0.0    0  0.088455   
2  0.081641  0.041019  0.046146  ...  0.413057  0.083441  0.0    0  0.078988   
3  0.152162  0.114659  0.056505  ...  0.490103  0.025334  0.0    0  0.069488   
4  0.425140  0.399712  1.000000  ...  0.947866  0.038039  0.0    0  0.3

In [38]:
import numpy as np
import pandas as pd
from scipy.stats import ranksums   

def ranksum_por_matriz(df_iddsi):

    # Separar grupos
    df_pac = df_iddsi[df_iddsi["tipo"] == "pacientes"]
    df_san = df_iddsi[df_iddsi["tipo"] == "sujetos"]

    print(f"n pacientes = {len(df_pac)}, n sanos = {len(df_san)}")

    feature_cols = [c for c in df_iddsi.columns if c != "tipo"]
    resultados = []

    for col in feature_cols:
        x = df_pac[col].values
        y = df_san[col].values

        # Evitar errores si falta algún dato
        if len(x) == 0 or len(y) == 0:
            continue

        # Test Wilcoxon rank-sum (según la doc de ranksums)
        stat, p = ranksums(x, y)

        resultados.append({
            "feature_ID": col,
            "Z_statistic": stat,         # ranksums entrega estadístico tipo Z
            "p_value": p,
            "median_pacientes": np.median(x),
            "median_sanos": np.median(y),
            "mean_pacientes": np.mean(x),
            "mean_sanos": np.mean(y),
            "significativo_0_05": p < 0.05
        })

    # Convertir a tabla ordenada por p-value
    res_df = pd.DataFrame(resultados)
    res_df = res_df.sort_values("p_value").reset_index(drop=True)
    return res_df


In [39]:
res0 = ranksum_por_matriz(df0)
res2 = ranksum_por_matriz(df2)
res4 = ranksum_por_matriz(df4)

res0.head()
res2.head()
res4.head()


n pacientes = 33, n sanos = 45
n pacientes = 45, n sanos = 40
n pacientes = 39, n sanos = 45


Unnamed: 0,feature_ID,Z_statistic,p_value,median_pacientes,median_sanos,mean_pacientes,mean_sanos,significativo_0_05
0,16,5.376898,7.578013e-08,0.203041,0.12239,0.287373,0.114797,True
1,123,4.30959,1.635572e-05,0.087098,0.047921,0.189834,0.059912,True
2,35,-4.188509,2.807929e-05,0.392718,0.592689,0.369632,0.634751,True
3,117,4.13918,3.485495e-05,0.098697,0.054044,0.190306,0.064071,True
4,24,4.022583,5.756328e-05,0.084641,0.045608,0.144834,0.053295,True


In [40]:
# Ordenar resultados por p-value ascendente (por si acaso)
res0_sorted = res0.sort_values(by="p_value", ascending=True)
res2_sorted = res2.sort_values(by="p_value", ascending=True)
res4_sorted = res4.sort_values(by="p_value", ascending=True)

# Guardar Excel con orden aplicado
with pd.ExcelWriter("Ranksum_por_IDDSI.xlsx", engine="xlsxwriter") as writer:
    res0_sorted.to_excel(writer, index=False, sheet_name="IDDSI_0")
    res2_sorted.to_excel(writer, index=False, sheet_name="IDDSI_2")
    res4_sorted.to_excel(writer, index=False, sheet_name="IDDSI_4")

print("✔ Excel generado con features ordenados por p-value (más significativos arriba).")


✔ Excel generado con features ordenados por p-value (más significativos arriba).


mRMR test

In [41]:
import pandas as pd
from feature_engine.selection import MRMR
from sklearn.model_selection import train_test_split


In [42]:
# Archivo con los resultados del RankSum
ranksum_file = "Ranksum_por_IDDSI.xlsx"

# Cargar todas las hojas
dfs_ranksum = pd.read_excel(ranksum_file, sheet_name=None)

# Listas de features significativos (feature_ID donde significativo_0_05 == True)
sig_0 = dfs_ranksum["IDDSI_0"].loc[
    dfs_ranksum["IDDSI_0"]["significativo_0_05"] == True, "feature_ID"
].tolist()

sig_2 = dfs_ranksum["IDDSI_2"].loc[
    dfs_ranksum["IDDSI_2"]["significativo_0_05"] == True, "feature_ID"
].tolist()

sig_4 = dfs_ranksum["IDDSI_4"].loc[
    dfs_ranksum["IDDSI_4"]["significativo_0_05"] == True, "feature_ID"
].tolist()

print("IDDSI 0 - N° features significativos:", len(sig_0))
print("IDDSI 2 - N° features significativos:", len(sig_2))
print("IDDSI 4 - N° features significativos:", len(sig_4))


IDDSI 0 - N° features significativos: 50
IDDSI 2 - N° features significativos: 67
IDDSI 4 - N° features significativos: 64


funcion generica para no repetir por iddsi

In [44]:
def procesar_iddsi_mrmr(nombre_archivo_matriz, sig_features_ids, max_features=20):
    """
    Procesa una matriz de features IDDSI con:
    - Renombrado de columnas a '1', '2', ..., '233'
    - Filtrado por RankSum (sig_features_ids)
    - Split train/test
    - Selección mRMR
    
    Retorna:
        X_train_mrmr, X_test_mrmr, y_train, y_test, selected_features, mrmr_object
    """
    
    # ----------------------------------------------------------
    # 1) Cargar matriz IDDSI
    # ----------------------------------------------------------
    df = pd.read_excel(nombre_archivo_matriz)
    
    # Etiquetas
    y = df["pacientes"]
    
    # Features
    X = df.drop(columns=["pacientes"])
    
    print("\nForma original de X:", X.shape)
    
    # ----------------------------------------------------------
    # 2) Renombrar columnas a "1", "2", ..., "N"
    # ----------------------------------------------------------
    n_features = X.shape[1]
    new_cols = [str(i) for i in range(1, n_features + 1)]
    X.columns = new_cols
    
    print("Primeros nombres de columnas renombradas:", X.columns[:10].tolist())
    
    # ----------------------------------------------------------
    # 3) Filtrar features significativos del RankSum
    # ----------------------------------------------------------
    cols_sig = [str(i) for i in sig_features_ids]  # convertir IDs a strings
    X_sig = X[cols_sig]
    
    print("Forma X_sig (solo significativos):", X_sig.shape)
    
    # ----------------------------------------------------------
    # 4) Train / Test split
    # ----------------------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X_sig,
        y,
        test_size=0.3,
        random_state=0,
        stratify=y
    )
    
    print("Train:", X_train.shape, " Test:", X_test.shape)
    
    # ----------------------------------------------------------
    # 5) Crear y ejecutar mRMR
    # ----------------------------------------------------------
    mrmr_sel = MRMR(
        variables=None,
        method="FCQ",      # método recomendado para comenzar
        max_features=max_features,
        regression=False,  # clasificación
        random_state=0
    )
    
    mrmr_sel.fit(X_train, y_train)
    
    # ----------------------------------------------------------
    # 6) Obtener features seleccionados (CORREGIDO)
    # ----------------------------------------------------------
    selected_features = mrmr_sel.get_feature_names_out()  # YA NO SE USA .tolist()
    
    print("\nFeatures seleccionados por mRMR:")
    print(selected_features)
    
    # Transformar matrices
    X_train_mrmr = mrmr_sel.transform(X_train)
    X_test_mrmr  = mrmr_sel.transform(X_test)
    
    print("Forma X_train_mrmr:", X_train_mrmr.shape)
    print("Forma X_test_mrmr:", X_test_mrmr.shape)
    
    # ----------------------------------------------------------
    # 7) Retornar todo lo útil
    # ----------------------------------------------------------
    return {
        "X_train_mrmr": X_train_mrmr,
        "X_test_mrmr": X_test_mrmr,
        "y_train": y_train,
        "y_test": y_test,
        "selected_features": selected_features,
        "mrmr_object": mrmr_sel
    }


PARA IDDSI 0

In [45]:
resultado_iddsi0 = procesar_iddsi_mrmr(
    nombre_archivo_matriz="IDDSI0_minmax.xlsx",
    sig_features_ids=sig_0,
    max_features=15
)



Forma original de X: (77, 161)
Primeros nombres de columnas renombradas: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
Forma X_sig (solo significativos): (77, 50)
Train: (53, 50)  Test: (24, 50)

Features seleccionados por mRMR:
['117', '111', '61', '123', '67', '55', '138', '144', '26', '150', '156', '71', '38', '94', '35']
Forma X_train_mrmr: (53, 15)
Forma X_test_mrmr: (24, 15)


In [46]:
X0_train_mrmr = resultado_iddsi0["X_train_mrmr"]
X0_test_mrmr  = resultado_iddsi0["X_test_mrmr"]
y0_train      = resultado_iddsi0["y_train"]
y0_test       = resultado_iddsi0["y_test"]
features_0    = resultado_iddsi0["selected_features"]


IDDSI 2


In [47]:
resultado_iddsi2 = procesar_iddsi_mrmr(
    nombre_archivo_matriz="IDDSI2_minmax.xlsx",  # tu archivo de matriz IDDSI 2
    sig_features_ids=sig_2,                      # lista de features significativos RankSum
    max_features=15                               # puedes usar 10, 15, 20, 30...
)



Forma original de X: (84, 161)
Primeros nombres de columnas renombradas: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
Forma X_sig (solo significativos): (84, 67)
Train: (58, 67)  Test: (26, 67)

Features seleccionados por mRMR:
['144', '138', '35', '45', '34', '61', '29', '25', '55', '40', '158', '67', '131', '7', '8']
Forma X_train_mrmr: (58, 15)
Forma X_test_mrmr: (26, 15)


In [48]:
X2_train_mrmr = resultado_iddsi2["X_train_mrmr"]
X2_test_mrmr  = resultado_iddsi2["X_test_mrmr"]
y2_train      = resultado_iddsi2["y_train"]
y2_test       = resultado_iddsi2["y_test"]
features_2    = resultado_iddsi2["selected_features"]


IDDSI 4

In [49]:
resultado_iddsi4 = procesar_iddsi_mrmr(
    nombre_archivo_matriz="IDDSI4_minmax.xlsx",   # tu matriz normalizada IDDSI 4
    sig_features_ids=sig_4,                       # los features significativos del RankSum
    max_features=15                                # puedes cambiar 10, 15, 20, etc.
)



Forma original de X: (83, 161)
Primeros nombres de columnas renombradas: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
Forma X_sig (solo significativos): (83, 64)
Train: (58, 64)  Test: (25, 64)

Features seleccionados por mRMR:
['16', '123', '35', '117', '24', '111', '2', '33', '34', '30', '27', '94', '131', '40', '41']
Forma X_train_mrmr: (58, 15)
Forma X_test_mrmr: (25, 15)


In [19]:
X4_train_mrmr = resultado_iddsi4["X_train_mrmr"]
X4_test_mrmr  = resultado_iddsi4["X_test_mrmr"]
y4_train      = resultado_iddsi4["y_train"]
y4_test       = resultado_iddsi4["y_test"]
features_4    = resultado_iddsi4["selected_features"]
