In [None]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    average_precision_score
)
from sklearn.feature_selection import SelectFromModel
from imblearn.combine import SMOTETomek
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


# Merged survey table used by every cell below. The location is machine-specific.
merged_csv_path = r"C:\Users\khrva\Downloads\Hkan-20251107T040042Z-1-001\Hkan\merge_listo.csv"
data = pd.read_csv(merged_csv_path)


def random_forest_binario_v6(data, target_column, feature_subset):
    """Train, evaluate and plot a binary random forest for one 1/2-coded target.

    Only rows whose ``target_column`` equals 1 or 2 are kept; code 2 is
    relabelled as class 0, so class 1 is the positive class. Predictors are
    one-hot encoded with ``pd.get_dummies`` and remaining NaNs are set to 0.

    The data is split 70/30 (stratified) BEFORE any resampling. SMOTE + Tomek
    links is then applied to the training part only, so the hold-out report,
    confusion matrix, ROC and precision-recall curves are computed on real
    observations with the original class proportions. Resampling before the
    split would let synthetic points derived from test rows into training and
    make every metric optimistic.

    Args:
        data: DataFrame holding the target and (some of) the predictors.
        target_column: Name of the target column, coded 1 / 2.
        feature_subset: Sequence of predictor column names; names that are not
            columns of ``data`` are skipped.

    Returns:
        None. Results are printed and shown as matplotlib figures.
    """
    # Local import: only the cross-validation step needs it, and the alias
    # avoids clashing with sklearn's Pipeline used elsewhere in the notebook.
    from imblearn.pipeline import Pipeline as ImbPipeline

    df = data.dropna(subset=[target_column]).copy()
    # list(...) lets callers pass a tuple or an Index as well as a list.
    df = df[[c for c in list(feature_subset) + [target_column] if c in df.columns]]

    df[target_column] = pd.to_numeric(df[target_column], errors='coerce')
    df = df[df[target_column].isin([1, 2])]
    df[target_column] = df[target_column].replace({2: 0})

    X = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True).fillna(0)
    y = df[target_column].astype(int)

    print(f"\n{'='*70}")
    print(f" Distribución original de clases en {target_column}:")
    print(y.value_counts())

    # Hold out the test set first so it never influences the resampler.
    X_train_raw, X_test, y_train_raw, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("\n Aplicando balanceo híbrido SMOTE + Tomek Links...")
    smt = SMOTETomek(random_state=42)
    X_train, y_train = smt.fit_resample(X_train_raw, y_train_raw)
    # The counts printed here describe the resampled TRAINING set only.
    print(" Distribución tras balanceo:")
    print(pd.Series(y_train).value_counts())

    rf_model = RandomForestClassifier(
        n_estimators=400,
        max_depth=14,
        min_samples_split=3,
        min_samples_leaf=1,
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1
    )

    # Keep the features whose importance is at least the median importance.
    selector = SelectFromModel(rf_model, threshold="median")
    selector.fit(X_train, y_train)
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)

    rf_model.fit(X_train_sel, y_train)

    # Cross-validate on the un-resampled training rows; the imblearn pipeline
    # resamples inside each training fold only, so validation folds stay real.
    # (Feature selection was fitted on the whole training set, so this score
    # is still slightly optimistic with respect to the selection step.)
    cv_model = ImbPipeline([
        ('balance', SMOTETomek(random_state=42)),
        ('rf', rf_model),
    ])
    scores = cross_val_score(
        cv_model, selector.transform(X_train_raw), y_train_raw, cv=3, scoring='f1'
    )
    print(f"\n F1 promedio (CV=3): {scores.mean():.3f}")

    y_pred = rf_model.predict(X_test_sel)
    y_prob = rf_model.predict_proba(X_test_sel)[:, 1]

    print(f"\n Reporte de clasificación ({target_column}):")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Importances come from the final model, which only saw selected columns.
    feature_importances = pd.DataFrame({
        'Feature': X.columns[selector.get_support()],
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    print("\n 15 Características más importantes:")
    print(feature_importances.head(15))

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importances.head(15), palette='viridis')
    plt.title(f'Importancia de Variables - {target_column} (Top 15)')
    plt.tight_layout()
    plt.show()

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
    plt.title(f'Matriz de Confusión - {target_column}')
    plt.xlabel('Predicho')
    plt.ylabel('Real')
    plt.tight_layout()
    plt.show()

    auc_score = roc_auc_score(y_test, y_prob)
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC (AUC = {auc_score:.3f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.title(f'Curva ROC - {target_column}')
    plt.xlabel('Tasa de Falsos Positivos')
    plt.ylabel('Tasa de Verdaderos Positivos')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.show()

    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    ap_score = average_precision_score(y_test, y_prob)
    plt.figure(figsize=(6, 5))
    plt.plot(recall, precision, color='green', lw=2, label=f'AP = {ap_score:.3f}')
    plt.title(f'Curva Precisión-Recall - {target_column}')
    plt.xlabel('Recall')
    plt.ylabel('Precisión')
    plt.legend(loc="lower left")
    plt.tight_layout()
    plt.show()


# Predictor codes for the diabetes model (target DIQ010). Order is preserved
# because it determines the column order handed to the model.
features_diabetes = [
    'BMXWT', 'RIDAGEYR', 'CDQ010', 'WTDRD1',
    'ALQ120Q', 'DR1TSUGR', 'DR1ISODI', 'DR2TKCAL',
    'CDQ001', 'SLD010H', 'DR2TPROT', 'DR1TCARB',
    'DR1TCHOL', 'BPQ080', 'DR1TSFAT', 'DR1TPROT',
    'DR1TFIBE', 'DR2TTFAT', 'DTD190Q', 'SMAQUEX',
    'BPQ020', 'DR2TFIBE',
]

# Predictor codes for the hypertension model (target BPQ020).
features_hipertension = [
    'RIDAGEYR', 'BMXWT', 'BPQ080', 'CDQ010',
    'SLD010H', 'CDQ001', 'DR1TPROT', 'DR2IPROT',
    'WTDRD1', 'DR2ISFAT', 'DR2TTFAT', 'DR1TSUGR',
    'DR2TCHOL', 'DR1TFIBE', 'DR1TCHOL', 'DR1TRET',
    'DR1TCARB', 'DR1TTFAT', 'DIQ010', 'DR2ITFAT',
    'DR1ISODI', 'DR2TKCAL', 'DR1TSFAT', 'DR1ISFAT',
]

# Run the full train/evaluate/plot routine once per target, diabetes first.
for target_code, predictors in (
    ('DIQ010', features_diabetes),       # Diabetes
    ('BPQ020', features_hipertension),   # Hypertension
):
    random_forest_binario_v6(data, target_code, predictors)


**CSV**

In [36]:
# (column code, exported column name) pairs, in export order.
code_name_pairs = [
    ('RIDAGEYR', 'Edad_Adjudicada'),
    ('BMXWT', 'Peso_kg'),
    ('DR1TSUGR', 'Azucares_totales_gm'),
    ('DR1ISODI', 'Sodio_mg'),
    ('DR2TKCAL', 'Energia_kcal'),
    ('WTDRD1', 'Peso_muestra_dia1'),
    ('ALQ120Q', 'Frecuencia_alcohol_12m'),
    ('DR1TCHOL', 'Colesterol_mg_DR1'),
    ('DR2TPROT', 'Proteina_g_DR2'),
    ('DR1TCARB', 'Carbohidratos_g'),
    ('DR1TSFAT', 'Grasas_saturadas_g'),
    ('DR2TTFAT', 'Grasa_total_g_DR2'),
    ('DR2IPROT', 'Proteina_g_DR2I'),
    ('DR2TCHOL', 'Colesterol_mg_DR2'),
    ('SLD010H', 'Horas_suenio'),
    ('BPQ080', 'Medico_dijo_colesterol_alto'),
    ('CDQ010', 'Dificultad_respirar_escaleras'),
    ('CDQ001', 'Dolor_malestar_pecho_SI'),
]
selected_codes = [code for code, _ in code_name_pairs]
descriptions = dict(code_name_pairs)

# Split the requested codes into those present in `data` and those absent.
existing, missing = [], []
for code in selected_codes:
    if code in data.columns:
        existing.append(code)
    else:
        missing.append(code)

print(f"Columnas encontradas: {len(existing)}")
if missing:
    print("Columnas faltantes (no están en el DataFrame):", missing)

# Copy the available columns and give them their readable export names.
readable_names = {code: descriptions.get(code, code) for code in existing}
df_dataoficial = data[existing].copy().rename(columns=readable_names)

out_path = r"C:\Users\khrva\Downloads\Hkan-20251107T040042Z-1-001\Hkan\DataOficial.csv"
df_dataoficial.to_csv(out_path, index=False)
n_rows, n_cols = df_dataoficial.shape
print(f"Guardado: {out_path} ({n_rows} filas, {n_cols} columnas)")


Columnas encontradas: 18
Guardado: C:\Users\khrva\Downloads\Hkan-20251107T040042Z-1-001\Hkan\DataOficial.csv (10537 filas, 18 columnas)


**Formulario**

In [51]:
import os, joblib
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Fitted pipelines are cached on disk, relative to the current working directory.
MODEL_DIR = os.path.join(".", "models")
os.makedirs(MODEL_DIR, exist_ok=True)
DIAB_PATH = os.path.join(MODEL_DIR, "diabetes_pipeline.joblib")
HIP_PATH = os.path.join(MODEL_DIR, "hipertension_pipeline.joblib")

# Column codes shown in the form and used as model inputs, in display order.
fields_codes = [
    'RIDAGEYR','BMXWT','DR1TSUGR','DR1ISODI','DR2TKCAL','WTDRD1','ALQ120Q',
    'DR1TCHOL','DR2TPROT','DR1TCARB','DR1TSFAT','DR2TTFAT','DR2IPROT',
    'DR2TCHOL','SLD010H','BPQ080','CDQ010','CDQ001'
]

# Human-readable caption for each code; these become the widget descriptions.
labels = {
    'RIDAGEYR': 'Edad (evaluación adjudicada)',
    'BMXWT': 'Peso (kg)',
    'DR1TSUGR': 'Azúcares totales diario (gm)',
    'DR1ISODI': 'Sodio diario(mg)',
    'DR2TKCAL': 'Energía diario(kcal)',
    # WTDRD1 is the dietary day-1 sample weight (the CSV export in this
    # notebook names it 'Peso_muestra_dia1'); it is not a waist measurement,
    # so the caption must not ask the user for one.
    # NOTE(review): a survey weight is not something a person can report about
    # themselves — consider dropping this field from the form and the models.
    'WTDRD1': 'Peso muestral dietético día 1',
    'ALQ120Q': 'Frecuencia alcohol semanal (últimos 12 meses)',
    'DR1TCHOL': 'Colesterol (mg)',
    'DR2TPROT': 'Proteína diario(gm)',
    'DR1TCARB': 'Carbohidratos diario(g)',
    'DR1TSFAT': 'Ácidos grasos saturados (gm)',
    'DR2TTFAT': 'Grasa total (gm)',
    'DR2IPROT': 'Proteína diario (gm) - DR2',
    'DR2TCHOL': 'Colesterol (mg) - DR2',
    'SLD010H': 'Horas de sueño',
    'BPQ080': 'Médico dijo: colesterol alto (SI/NO)',
    'CDQ010': 'Dificultad para respirar en escaleras/inclinaciones (SI/NO)',
    'CDQ001': 'Alguna vez dolor/malestar en el pecho (SI/NO)'
}

def train_pipeline(X, y):
    """Fit a preprocessing + random-forest pipeline on ``X`` / ``y``.

    Numeric columns are median-imputed. Every other column is imputed with its
    most frequent value and then one-hot encoded; categories unseen at fit
    time are ignored at prediction time.

    Args:
        X: DataFrame of predictors.
        y: Target labels aligned with ``X``.

    Returns:
        Tuple ``(fitted_pipeline, feature_names)`` where ``feature_names`` is
        the list of ``X`` columns in their original order.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    numeric_cols, categorical_cols = [], []
    for column in X.columns:
        if is_numeric(X[column]):
            numeric_cols.append(column)
        else:
            categorical_cols.append(column)

    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    preprocessor = ColumnTransformer(
        [
            ('num', SimpleImputer(strategy='median'), numeric_cols),
            ('cat', Pipeline(categorical_steps), categorical_cols),
        ],
        remainder='drop',
    )
    forest = RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced',
    )

    model = Pipeline([('pre', preprocessor), ('rf', forest)])
    model.fit(X, y)
    return model, X.columns.tolist()

def ensure_models():
    """Return the diabetes and hypertension model bundles, training if needed.

    When both cache files exist they are loaded and returned as-is. Otherwise
    both models are (re)trained from ``data`` and written to the cache.

    Each bundle written here is a dict with keys ``'pipeline'`` (the fitted
    pipeline) and ``'features'`` (its input column names). A bundle is
    ``None`` when its target column is absent from ``data`` or when 10 or
    fewer usable rows remain.

    Returns:
        Tuple ``(diabetes_bundle, hypertension_bundle)``.
    """
    # NOTE: joblib files are pickles; only load cache files this notebook wrote.
    if os.path.exists(DIAB_PATH) and os.path.exists(HIP_PATH):
        return joblib.load(DIAB_PATH), joblib.load(HIP_PATH)

    def build_xy(target):
        """Select the form fields plus ``target`` and recode the target 2 -> 0."""
        frame = data.dropna(subset=[target]).copy()
        keep = [c for c in fields_codes + [target] if c in frame.columns]
        frame = frame[keep]
        frame[target] = pd.to_numeric(frame[target], errors='coerce')
        frame = frame[frame[target].isin([1, 2])]
        frame[target] = frame[target].replace({2: 0}).astype(int)
        return frame.drop(columns=[target]), frame[target]

    jobs = (('DIQ010', DIAB_PATH), ('BPQ020', HIP_PATH))

    # Build both training sets before fitting anything.
    datasets = []
    for target, _ in jobs:
        if target in data.columns:
            datasets.append(build_xy(target))
        else:
            datasets.append((pd.DataFrame(columns=[]), pd.Series(dtype=int)))

    bundles = []
    for (features, target_values), (_, cache_path) in zip(datasets, jobs):
        bundle = None
        if features.shape[0] > 10:
            pipe, feature_names = train_pipeline(features, target_values)
            bundle = {'pipeline': pipe, 'features': feature_names}
            joblib.dump(bundle, cache_path)
        bundles.append(bundle)

    diab, hip = bundles
    return diab, hip

# Load the cached model bundles, or train them now. Either value is None when
# ensure_models() could not train that model.
diab_model, hip_model = ensure_models()

def categorize(p):
    """Map a probability to a risk band.

    Returns "Bajo" when ``p`` < 0.33, "Medio" when ``p`` < 0.66, and "Alto"
    otherwise (including when ``p`` is NaN, since both comparisons fail).
    """
    for upper_bound, band in ((0.33, "Bajo"), (0.66, "Medio")):
        if p < upper_bound:
            return band
    return "Alto"

def _make_field_widget(code):
    """Create the input widget for one field code.

    A numeric column of ``data`` gets a FloatText that starts at NaN; any
    other code (non-numeric or absent column) gets an empty Text box.
    """
    caption = labels.get(code, code)
    caption_style = {'description_width': '250px'}
    numeric_field = code in data.columns and pd.api.types.is_numeric_dtype(data[code])
    if numeric_field:
        return widgets.FloatText(
            description=caption,
            value=float(np.nan),
            layout=widgets.Layout(width='520px'),
            style=caption_style,
        )
    return widgets.Text(
        description=caption,
        value="",
        layout=widgets.Layout(width='260px'),
        style=caption_style,
    )


# One widget per field, keyed by code and kept in form order.
widgets_map = {code: _make_field_widget(code) for code in fields_codes}

btn = widgets.Button(description="Calcular riesgo")
out = widgets.Output()

def _parse_field_value(raw):
    """Convert one widget value into the value fed to the pipelines.

    Blank or whitespace-only text and NaN become ``np.nan`` so the pipeline's
    imputers fill them in. Anything ``float()`` accepts becomes a float.
    Yes/no words become 1/0. Any other text is passed through as a string.
    """
    blank_text = isinstance(raw, str) and not raw.strip()
    nan_number = isinstance(raw, float) and np.isnan(raw)
    if blank_text or nan_number:
        return np.nan
    try:
        return float(raw)
    except (TypeError, ValueError):
        # Not numeric: fall through to the yes/no words below.
        pass
    if isinstance(raw, str):
        answer = raw.strip().lower()
        # NOTE(review): ensure_models() uses the feature columns of `data` as
        # stored, without recoding. Confirm that yes/no features there are
        # really coded 1/0; if they use another coding (e.g. 1/2), "no" must
        # map to that code instead of 0.
        # ('1' and '0' need no entry here: float() already handles them.)
        if answer in ('si', 'sí', 's', 'yes', 'y'):
            return 1
        if answer in ('no', 'n', 'nope'):
            return 0
    return str(raw)


def _print_risk(model, disease_name, unavailable_message, row):
    """Print probability and risk band for one model, or a notice if absent.

    The single-row input is built from the model's own feature list, so the
    columns always match what the pipeline was fitted on. A feature the form
    does not collect is sent as NaN and handled by the pipeline's imputers.
    """
    if not model:
        print(unavailable_message)
        return
    feats = model['features']
    x_one = pd.DataFrame([{name: row.get(name, np.nan) for name in feats}])
    prob = float(model['pipeline'].predict_proba(x_one)[:, 1][0])
    print(f"{disease_name} — Probabilidad: {prob:.3f} — Categoría: {categorize(prob)}")


def on_click(b):
    """Button callback: read the form and print both risk estimates.

    Args:
        b: The clicked button (unused; required by the ipywidgets callback).
    """
    with out:
        clear_output()
        row = {code: _parse_field_value(widget.value)
               for code, widget in widgets_map.items()}
        _print_risk(diab_model, "Diabetes",
                    "No hay modelo de diabetes disponible.", row)
        _print_risk(hip_model, "Hipertensión",
                    "No hay modelo de hipertensión disponible.", row)

# Wire the button to its callback, then stack the inputs, the button and the
# output area vertically in field order.
btn.on_click(on_click)

items = list(map(widgets_map.__getitem__, fields_codes))
form = widgets.VBox(children=items + [btn, out])
display(form)


VBox(children=(FloatText(value=nan, description='Edad (evaluación adjudicada)', layout=Layout(width='520px'), …