# Modelado — Clasificación de Viabilidad (tabular)
Notebook para entrenamiento, validación y selección del mejor modelo con datos tabulares procesados.

**Entrada:** `data/processed/startups_sintetico_1000_processed.csv`

**Métricas:** ROC AUC (principal), F1, Precisión, Recall, Accuracy.


In [None]:

# === Imports y setup ===
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
# Single seed reused by the splits, the CV splitter, and the estimators below
# (each receives it through its own random_state argument) so runs are repeatable.
RANDOM_STATE = 42
# Also seed NumPy's global RNG for any code that draws from it directly.
np.random.seed(RANDOM_STATE)

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, accuracy_score,
                             confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Load the processed table written by the exploration/cleaning notebook.
processed_path = Path("data/processed/startups_sintetico_1000_processed.csv")
# Raise explicitly instead of asserting: assertions are stripped under
# `python -O`, which would hide this hint about which notebook to run first.
if not processed_path.exists():
    raise FileNotFoundError("No se encuentra el procesado. Ejecuta el notebook de exploración/limpieza primero.")
df = pd.read_csv(processed_path, encoding="utf-8")
print(df.shape)
# Last expression of the cell: the notebook renders the first three rows.
df.head(3)


In [None]:

# === Build X, y ===
TARGET_COL = "viabilidad"
# Explicit raise instead of assert so the check survives `python -O`.
if TARGET_COL not in df.columns:
    raise KeyError("Falta la columna objetivo 'viabilidad'")

# Avoid leakage and noise: skip the free-text column, and skip the raw
# `presencia_redes` when `intensidad_redes` (presumably its robust version) exists.
drop_cols = []
if "descripcion" in df.columns:
    drop_cols.append("descripcion")
if "presencia_redes" in df.columns and "intensidad_redes" in df.columns:
    drop_cols.append("presencia_redes")

X = df.drop(columns=[TARGET_COL] + drop_cols)
# The target is cast to int so downstream metrics see integer class labels.
y = df[TARGET_COL].astype(int)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# NOTE(review): num_cols is only reported here; X itself keeps every remaining
# column, numeric or not. Confirm the processed CSV is fully numeric.
print("X shape:", X.shape, "| Num feats:", len(num_cols))


In [None]:

# === 70/15/15 split and CV ===
# Stage 1: hold back 30% of the rows, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=RANDOM_STATE,
)
# Stage 2: halve the held-back pool into validation and test (15% each overall).
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)
print("Splits ->", X_train.shape, X_valid.shape, X_test.shape)

# One stratified, shuffled 5-fold splitter so every consumer sees the same folds.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

def evaluate_model(clf, Xv, yv, name="model"):
    """Score a fitted binary classifier on (Xv, yv), print a summary, return the metrics.

    Continuous scores are taken from ``predict_proba`` (column 1) when the
    estimator has it, otherwise from ``decision_function``, otherwise from the
    hard labels returned by ``predict``. ROC AUC is computed on those scores;
    the remaining metrics are computed on labels obtained by thresholding them.

    Args:
        clf: Fitted estimator exposing ``predict_proba``, ``decision_function``
            or ``predict``.
        Xv: Feature matrix to score.
        yv: True binary labels aligned with ``Xv``.
        name: Label used as the prefix of the printed summary line.

    Returns:
        dict with keys ``roc_auc``, ``f1``, ``precision``, ``recall`` and
        ``accuracy``.
    """
    if hasattr(clf, "predict_proba"):
        scores = clf.predict_proba(Xv)[:, 1]
        threshold = 0.5
    elif hasattr(clf, "decision_function"):
        # decision_function returns signed margins whose class boundary is 0;
        # cutting them at 0.5 would mislabel samples with margins in [0, 0.5).
        scores = clf.decision_function(Xv)
        threshold = 0.0
    else:
        # Hard labels: 0.5 simply maps them back to themselves.
        scores = clf.predict(Xv)
        threshold = 0.5
    yhat = (np.asarray(scores) >= threshold).astype(int)
    # zero_division=0 on every ratio metric: same value as the default when no
    # positive is predicted/present, without the undefined-metric warning.
    metrics = {
        "roc_auc": roc_auc_score(yv, scores),
        "f1": f1_score(yv, yhat, zero_division=0),
        "precision": precision_score(yv, yhat, zero_division=0),
        "recall": recall_score(yv, yhat, zero_division=0),
        "accuracy": accuracy_score(yv, yhat),
    }
    print(f"[{name}] AUC={metrics['roc_auc']:.3f} | F1={metrics['f1']:.3f} | P={metrics['precision']:.3f} | R={metrics['recall']:.3f} | Acc={metrics['accuracy']:.3f}")
    return metrics


## Baselines

In [None]:

# No-skill reference: a DummyClassifier with the "stratified" strategy.
dummy = DummyClassifier(strategy="stratified", random_state=RANDOM_STATE)
dummy.fit(X_train, y_train)
_ = evaluate_model(dummy, X_valid, y_valid, "Dummy")

# Linear baseline: scale features without centering, then logistic regression.
logit_steps = [
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(max_iter=250, random_state=RANDOM_STATE)),
]
logit = Pipeline(logit_steps)
logit.fit(X_train, y_train)
_ = evaluate_model(logit, X_valid, y_valid, "LogReg")


## Modelos candidatos

In [None]:

# Each candidate is built, fitted on the training split, and then scored on
# the validation split, in the same order as the printed report.

# Single decision tree with default hyperparameters.
tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
tree.fit(X_train, y_train)
_ = evaluate_model(tree, X_valid, y_valid, "DecisionTree")

# Random forest of 300 trees, trained on all available cores.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
_ = evaluate_model(rf, X_valid, y_valid, "RandomForest")

# Histogram-based gradient boosting with up to 300 iterations.
hgb = HistGradientBoostingClassifier(
    max_iter=300,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
)
hgb.fit(X_train, y_train)
_ = evaluate_model(hgb, X_valid, y_valid, "HGB")


## Comparación con CV (ROC AUC)

In [None]:

# Cross-validated ROC AUC on the training split only, using the shared `cv`
# splitter; prints mean ± standard deviation across folds for each candidate.
cv_candidates = {
    "LogReg": logit,
    "Tree": tree,
    "RF": rf,
    "HGB": hgb,
}
for name, model in cv_candidates.items():
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
    )
    print(f"{name:7s} CV AUC: {scores.mean():.3f} ± {scores.std():.3f}")


## Tuning básico

In [None]:

# --- HGB: lightweight randomized search (20 sampled configurations) ---
param_dist_hgb = {
    "learning_rate": [0.03, 0.05, 0.1, 0.2],
    "max_depth": [None, 3, 5, 7],
    "l2_regularization": [0.0, 0.1, 0.5, 1.0],
    "max_iter": [200, 300, 500],
}
hgb_base = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
rs_hgb = RandomizedSearchCV(
    estimator=hgb_base,
    param_distributions=param_dist_hgb,
    n_iter=20,
    scoring="roc_auc",
    cv=cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rs_hgb.fit(X_train, y_train)
print("Best HGB:", rs_hgb.best_params_)
# Check the search winner on the validation split.
_ = evaluate_model(rs_hgb.best_estimator_, X_valid, y_valid, "HGB_tuned")

# --- RF: small exhaustive grid (2 x 3 x 2 combinations) ---
param_grid_rf = {
    "n_estimators": [200, 400],
    "max_depth": [None, 6, 12],
    "max_features": ["sqrt", 0.5],
}
rf_base = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
gs_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
gs_rf.fit(X_train, y_train)
print("Best RF:", gs_rf.best_params_)
_ = evaluate_model(gs_rf.best_estimator_, X_valid, y_valid, "RF_tuned")


## Selección final y evaluación en test

In [None]:

# Pick the tuned candidate with the highest validation ROC AUC.
finals = {"HGB_tuned": rs_hgb.best_estimator_, "RF_tuned": gs_rf.best_estimator_}
best_name, best_model, best_auc = None, None, -np.inf
for name, m in finals.items():
    met = evaluate_model(m, X_valid, y_valid, name)
    if met["roc_auc"] > best_auc:
        best_auc, best_name, best_model = met["roc_auc"], name, m

print(f"Mejor en valid: {best_name} AUC={best_auc:.3f}")

# Retrain on train+valid. Fit a clone rather than the selected object itself:
# that object is the search's own best_estimator_, and refitting it in place
# would make any later re-run of the validation cells score a model that has
# already seen the validation rows. best_model is rebound to the refit clone;
# the entries in `finals` stay fitted on the training split only.
from sklearn.base import clone

best_model = clone(best_model).fit(
    pd.concat([X_train, X_valid]), pd.concat([y_train, y_valid])
)

# Test scores: probabilities when available, else margins, else hard labels.
if hasattr(best_model, "predict_proba"):
    scores_test = best_model.predict_proba(X_test)[:, 1]
    threshold = 0.5
elif hasattr(best_model, "decision_function"):
    # Signed margins have their class boundary at 0, not 0.5.
    scores_test = best_model.decision_function(X_test)
    threshold = 0.0
else:
    scores_test = best_model.predict(X_test)
    threshold = 0.5

# Both current finalists expose predict_proba, so the cut is 0.5 as the
# variable name and the printed header state.
y_pred05 = (scores_test >= threshold).astype(int)

print("=== Test @0.5 ===")
print("AUC:", roc_auc_score(y_test, scores_test))
# zero_division=0 keeps the same values as the default while silencing the
# undefined-metric warning, matching how evaluate_model computes precision.
print("F1:", f1_score(y_test, y_pred05, zero_division=0), "Prec:", precision_score(y_test, y_pred05, zero_division=0), "Rec:", recall_score(y_test, y_pred05, zero_division=0), "Acc:", accuracy_score(y_test, y_pred05))
print("Matriz de confusión\n", confusion_matrix(y_test, y_pred05))

# Curves
_ = RocCurveDisplay.from_predictions(y_test, scores_test)
plt.title("ROC — Test")
plt.show()

_ = PrecisionRecallDisplay.from_predictions(y_test, scores_test)
plt.title("Precision-Recall — Test")
plt.show()
