# MODELO HÍBRIDO (RF+XGBoost)

## 1. Importación de librerías

In [13]:
import random
import warnings, joblib, numpy as np, pandas as pd
from pathlib import Path
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import (classification_report, roc_auc_score,
                             average_precision_score, confusion_matrix)

warnings.filterwarnings("ignore")

## 2. Semilla de reproducibilidad

In [16]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

## 3. Carga de data

In [22]:
# ---------- 1. Cargar datos ----------
X = pd.read_csv(
        r"/work/vae_embeddings.csv",
        index_col=0).values
y = pd.read_csv(
        r"/work/y_labels.csv",
        index_col=0).values.ravel().astype(int)

# ---------- 2. Split 80/20 ----------
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=42)

## 3. Búsqueda de mejores hiperparámetros

In [25]:
# ---------- 3. Construir Pipeline (SMOTE + RF) ----------
pipe = Pipeline(steps=[
        ("smote", SMOTE(random_state=42)),
        ("rf", RandomForestClassifier(
                n_estimators=100,      # fijo para ir rápido
                random_state=42,
                n_jobs=-1))
])

# ---------- 4. Espacio de hiperparámetros (se refieren al paso "rf__") ----------
param_dist = {
    "rf__max_depth":         [10, 20, None],
    "rf__max_features":      ["sqrt", 0.5],
    "rf__min_samples_leaf":  [1, 2],
    "rf__min_samples_split": [2, 4]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=10,                 # ↔ equilibrio velocidad/exploración
        scoring="average_precision",
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=42)

# ---------- 5. Ajustar ----------
search.fit(X_train, y_train)
best_model = search.best_estimator_

# ---------- 6. Evaluar en test ----------
probs = best_model.predict_proba(X_test)[:, 1]
preds = (probs >= 0.5).astype(int)

print("\n  Mejores hiperparámetros:")
for k, v in search.best_params_.items():
    print(f"   {k}: {v}")
print(f"AUCPR CV (mean): {search.best_score_:.4f}")

print("\n=== Reporte en Test ===")
print(classification_report(y_test, preds, digits=4))
print("AUROC :", round(roc_auc_score(y_test, probs), 4))
print("AUCPR :", round(average_precision_score(y_test, probs), 4))
print("Confusion:\n", confusion_matrix(y_test, preds))

# ---------- 7. Guardar modelo ----------
out = Path(r"/work/rf_vae_smote_opt.pkl")
joblib.dump(best_model, out)



Fitting 3 folds for each of 10 candidates, totalling 30 fits

  Mejores hiperparámetros:
   rf__min_samples_split: 4
   rf__min_samples_leaf: 1
   rf__max_features: sqrt
   rf__max_depth: 20
AUCPR CV (mean): 0.4733

=== Reporte en Test ===
              precision    recall  f1-score   support

           0     0.7288    0.6719    0.6992        64
           1     0.4878    0.5556    0.5195        36

    accuracy                         0.6300       100
   macro avg     0.6083    0.6137    0.6093       100
weighted avg     0.6421    0.6300    0.6345       100

AUROC : 0.6842
AUCPR : 0.5166
Confusion:
 [[43 21]
 [16 20]]


['/work/rf_vae_smote_opt.pkl']

## 4. Implementación de RF

In [31]:
# --------------------------- Árbol CART mínimo ---------------------------
class SimpleTree:
    class Node:
        __slots__ = ("feat","val","left","right","pred")
        def __init__(self,pred):
            self.feat=None; self.val=None
            self.left=None; self.right=None
            self.pred=pred
    def __init__(self,max_depth, min_samples_split, max_features):
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.max_features=max_features
        self.root=None
    @staticmethod
    def _gini(y):
        m=len(y)
        if m==0: return 0
        _,cnt=np.unique(y,return_counts=True)
        p=cnt/m
        return 1.0-np.sum(p**2)
    def _best_split(self,X,y,idx):
        Sg=self._gini(y[idx]); best_gain=0; best=None
        feats=np.random.choice(X.shape[1], self.max_features, replace=False)
        for f in feats:
            sidx=idx[np.argsort(X[idx,f])]
            y_sorted=y[sidx]
            left_cnt=defaultdict(int); right_cnt=Counter(y_sorted)
            n_left=0; n_tot=len(sidx)
            for i in range(1,n_tot):
                cls=y_sorted[i-1]
                left_cnt[cls]+=1; right_cnt[cls]-=1; n_left+=1
                if X[sidx[i],f]==X[sidx[i-1],f]: continue
                n_right=n_tot-n_left
                gl=1-np.sum([(c/n_left)**2 for c in left_cnt.values()])
                gr=1-np.sum([(c/n_right)**2 for c in right_cnt.values()])
                gain=Sg-(n_left/n_tot)*gl-(n_right/n_tot)*gr
                if gain>best_gain:
                    best_gain=gain
                    best=(f,(X[sidx[i-1],f]+X[sidx[i],f])/2)
        return best_gain,best
    def _build(self,X,y,idx,depth):
        pred=Counter(y[idx]).most_common(1)[0][0]
        node=self.Node(pred)
        if depth>=self.max_depth or len(idx)<self.min_samples_split:
            return node
        gain,best=self._best_split(X,y,idx)
        if best is None or gain==0: return node
        f,val=best
        mask=X[idx,f]<val
        left_idx=idx[mask]; right_idx=idx[~mask]
        if len(left_idx)==0 or len(right_idx)==0: return node
        node.feat,node.val=f,val
        node.left=self._build(X,y,left_idx,depth+1)
        node.right=self._build(X,y,right_idx,depth+1)
        return node
    def fit(self,X,y,idx):
        self.root=self._build(X,y,idx,0)
    def _pred_row(self,x,node):
        if node.left is None: return node.pred
        return self._pred_row(x,node.left) if x[node.feat]<node.val else self._pred_row(x,node.right)
    def predict(self,X):
        return np.array([self._pred_row(x,self.root) for x in X])

# --------------------------- Random Forest manual ---------------------------
class RandomForestScratch:
    def __init__(self, n_estimators=100, max_depth=5,
                 min_samples_split=2, min_samples_leaf=1,
                 max_features="sqrt", bootstrap=True,
                 class_weight=None, random_state=None):
        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.max_features=max_features
        self.bootstrap=bootstrap
        self.class_weight=class_weight
        self.random_state=random_state
        self.trees=[]
        if random_state is not None:
            np.random.seed(random_state)
    def _sample_weights(self,y):
        if self.class_weight!='balanced': return np.ones_like(y,float)
        cnt=np.bincount(y); total=len(y)
        w=total/(len(cnt)*cnt); return w[y]
    def fit(self, X, y):
        n, d = X.shape
        mtry = int(np.log2(d)) if self.max_features == "log2" else int(np.sqrt(d))
        self.trees = []
        sample_weights = self._sample_weights(y)

        for i in range(self.n_estimators):
            if self.bootstrap:
                idx = np.random.choice(np.arange(n), size=n, replace=True)
            else:
                idx = np.random.permutation(n)

            tree = SimpleTree(self.max_depth, self.min_samples_split, mtry)
            tree.fit(X, y, idx)
            self.trees.append(tree)
            print(f"      Árbol {i+1}/{self.n_estimators} entrenado", end="\r")

        print(f"Entrenamiento de {self.n_estimators} árboles completado")
        return self
    def predict(self,X):
        votes=np.vstack([t.predict(X) for t in self.trees])
        return np.apply_along_axis(lambda col: Counter(col).most_common(1)[0][0],0,votes)
    def predict_proba(self,X):
        votes=np.vstack([t.predict(X) for t in self.trees])
        return votes.mean(axis=0)

def print_tree(node, depth=0):
    prefix = "│   " * depth + ("├── " if depth > 0 else "")
    if node.left is None:
        print(f"{prefix}Predicción: {node.pred}")
    else:
        print(f"{prefix}if X[{node.feat}] < {node.val:.4f}:")
        print_tree(node.left, depth + 1)
        print(f"{'│   ' * depth}else:")
        print_tree(node.right, depth + 1)


## 5. Entrenamiento de RF

In [43]:
# --------------------------- Hiperparámetros óptimos ---------------------------
from collections import Counter, defaultdict

params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=4,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    class_weight=None,
    random_state=SEED
)

# ---------------------- VALIDACIÓN CRUZADA CON SMOTE (3 folds) ----------------------
auprc_scores, auroc_scores = [], []
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

print("\n Validación cruzada 3-fold con SMOTE")
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train), 1):
    # ① Smote solo en cada fold de entrenamiento
    sm = SMOTE(random_state=SEED)
    X_tr_res, y_tr_res = sm.fit_resample(X_train[tr_idx], y_train[tr_idx])

    rf = RandomForestScratch(**params)
    rf.fit(X_tr_res, y_tr_res)

    prob_val = rf.predict_proba(X_train[va_idx])
    auprc_scores.append(average_precision_score(y_train[va_idx], prob_val))
    auroc_scores.append(roc_auc_score(y_train[va_idx], prob_val))

    print(f"   Fold {fold}/3  AUCPR={auprc_scores[-1]:.4f}  AUROC={auroc_scores[-1]:.4f}")

print("\n Validación terminada")
print(f"CV AUCPR (3-fold): {np.mean(auprc_scores):.4f} ± {np.std(auprc_scores):.4f}")
print(f"CV AUROC (3-fold): {np.mean(auroc_scores):.4f} ± {np.std(auroc_scores):.4f}")


 Validación cruzada 3-fold con SMOTE
Entrenamiento de 100 árboles completado
   Fold 1/3  AUCPR=0.5401  AUROC=0.6373
Entrenamiento de 100 árboles completado
   Fold 2/3  AUCPR=0.4152  AUROC=0.5610
Entrenamiento de 100 árboles completado
   Fold 3/3  AUCPR=0.4107  AUROC=0.5672

 Validación terminada
CV AUCPR (3-fold): 0.4553 ± 0.0600
CV AUROC (3-fold): 0.5885 ± 0.0346


## 6. Evaluación de RF

In [49]:
# ----------------------------- ENTRENAMIENTO FINAL -----------------------------
sm = SMOTE(random_state=SEED)
X_tr_res, y_tr_res = sm.fit_resample(X_train, y_train)

rf_final = RandomForestScratch(**params).fit(X_tr_res, y_tr_res)
# ----------------------------- EVALUACIÓN TEST -----------------------------
prob_test = rf_final.predict_proba(X_test)
thr = 0.42      # mismo umbral
pred_test = (prob_test >= thr).astype(int)

df_probs = pd.DataFrame({
    "prob_RF": prob_test,
    "y_true":  y_test,
    "pred":    pred_test,
    "acierto": pred_test == y_test
})
df_probs.to_csv("probs_rf_test.csv", index=False)
print("\n✅ Probabilidades guardadas en probs_rf_test.csv")

print("\n=== Reporte Final (umbral 0.42) ===")
print(classification_report(y_test, pred_test, digits=4))
print("AUROC :", round(roc_auc_score(y_test, prob_test), 4))
print("AUCPR :", round(average_precision_score(y_test, prob_test), 4))
print("Matriz de Confusión:\n", confusion_matrix(y_test, pred_test))

Entrenamiento de 100 árboles completado

✅ Probabilidades guardadas en probs_rf_test.csv

=== Reporte Final (umbral 0.42) ===
              precision    recall  f1-score   support

           0     0.7778    0.5469    0.6422        64
           1     0.4727    0.7222    0.5714        36

    accuracy                         0.6100       100
   macro avg     0.6253    0.6345    0.6068       100
weighted avg     0.6680    0.6100    0.6167       100

AUROC : 0.6914
AUCPR : 0.489
Matriz de Confusión:
 [[35 29]
 [10 26]]


## 7.  Segmentación de zona gris

In [58]:


# ------------------ 1. Datos y zona gris ------------------
LOW, HIGH = 0.35, 0.55
CONF_LIM = 0.10

probs_rf_file = r"/work/probs_rf_test.csv"
probs_rf = pd.read_csv(probs_rf_file)["prob_RF"].values

mask_gris = (probs_rf >= LOW) & (probs_rf <= HIGH)
Xg, yg = X_test[mask_gris], y_test[mask_gris]
print(f" Ejemplos en zona gris TEST: {Xg.shape[0]} / {X_test.shape[0]}")


 Ejemplos en zona gris TEST: 32 / 100


## 8. Entrenamiento con XGBoost

In [79]:
from xgboost import XGBClassifier
# ------------------ 2. Submodelo XGBoost ------------------
scale_pos = sum(yg == 0) / sum(yg == 1)
xgb = XGBClassifier(
    objective="binary:logistic",
    max_depth=3,
    learning_rate=0.1,
    n_estimators=50,
    gamma=0,
    subsample=0.8,
    scale_pos_weight=scale_pos,
    eval_metric="logloss",
    random_state=SEED,
)
xgb.fit(Xg, yg)

# ------------------ 3. Random Forest base ------------------
rf_final = joblib.load(
    r"/work/rf_vae_smote_opt.pkl"
)
prob_rf_test = rf_final.predict_proba(X_test)[:, 1]
pred_hibrido = (prob_rf_test >= 0.5).astype(int)

# ------------------ 4. Inferencia zona gris + prob_hibrido ------------------
prob_xgb_gris = xgb.predict_proba(Xg)[:, 1]
override = (prob_xgb_gris >= 0.5 + CONF_LIM) | (prob_xgb_gris <= 0.5 - CONF_LIM)
preds_xgb = (prob_xgb_gris >= 0.5).astype(int)

pred_hibrido[mask_gris] = np.where(override, preds_xgb, pred_hibrido[mask_gris])

#  Probabilidad híbrida final (para AUROC/AUPRC híbridos)
prob_hibrido = prob_rf_test.copy()
prob_hibrido[mask_gris] = np.where(override, prob_xgb_gris, prob_rf_test[mask_gris])


## 9. Evaluación y test

In [82]:

# ------------------ 5. Reporte ------------------
print("\n=== Reporte híbrido final (XGB zona gris) ===")
print(classification_report(y_test, pred_hibrido, digits=4))

print("AUROC híbrido :", round(roc_auc_score(y_test, prob_hibrido), 4))
print("AUPRC híbrido :", round(average_precision_score(y_test, prob_hibrido), 4))

print("Confusion:\n", confusion_matrix(y_test, pred_hibrido))

# ------------------ 6. Guardado ------------------
out_dir = r"/work/hibrido_xgb_zonagris"
import os
os.makedirs(out_dir, exist_ok=True)
joblib.dump(xgb, os.path.join(out_dir, "xgb_submodelo.pkl"))
with open(os.path.join(out_dir, "zonagris_info.txt"), "w") as f:
    f.write(f"LOW={LOW}\nHIGH={HIGH}\nCONF_LIM={CONF_LIM}\n")


=== Reporte híbrido final (XGB zona gris) ===
              precision    recall  f1-score   support

           0     0.8727    0.7500    0.8067        64
           1     0.6444    0.8056    0.7160        36

    accuracy                         0.7700       100
   macro avg     0.7586    0.7778    0.7614       100
weighted avg     0.7905    0.7700    0.7741       100

AUROC híbrido : 0.8566
AUPRC híbrido : 0.774
Confusion:
 [[48 16]
 [ 7 29]]


## 