In [None]:
# 2_modeling.ipynb
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from src.preprocessing import build_preprocessing_pipeline

In [None]:
# Load the processed dataset from disk into a DataFrame.
processed_csv = 'data/processed/heart_raw_processed.csv'
df = pd.read_csv(processed_csv)

In [None]:

# Separate the label column from the predictor columns.
target_col = 'target'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:

# Stratified hold-out split: 30% of the rows go to the test set, with a
# fixed seed so the partition is reproducible.
split_kwargs = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [None]:

# Build the preprocessing step from the project helper.
# NOTE(review): the full frame (including 'target') is passed here, while the
# pipelines are later fitted on X only; presumably the helper excludes the
# target column itself -- confirm against src.preprocessing.
preprocessor, num_cols, cat_cols = build_preprocessing_pipeline(df)

# Candidate classifiers, keyed by the short name used in the results dict.
classifiers = {
    'logreg': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'rf': RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced'),
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases; kept as-is to avoid changing behavior on older versions.
    'xgb': xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42),
}

# Model candidates: each pipeline gets its OWN unfitted copy of the
# preprocessor. Pipeline does not copy its steps, so reusing one instance
# would mean that fitting any pipeline silently refits the transformer
# embedded in all the others.
models = {
    name: Pipeline([('pre', clone(preprocessor)), ('clf', clf)])
    for name, clf in classifiers.items()
}


In [None]:

# Quick baseline: fit every candidate pipeline on the training split and
# score it on the test split.
# Metrics computed from hard label predictions, in reporting order.
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

results = {}
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    # Second column of predict_proba is used as the score for the AUC.
    probs = pipe.predict_proba(X_test)[:, 1]

    scores = {metric: score_fn(y_test, preds) for metric, score_fn in label_metrics.items()}
    scores['auc'] = roc_auc_score(y_test, probs)
    scores['confusion'] = confusion_matrix(y_test, preds)

    results[name] = scores
    print(name, results[name])

In [None]:
# Pick one candidate to tune (here: the XGBoost pipeline) and run a
# randomized search over its classifier hyperparameters.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Keys use the `clf__` prefix so they are routed to the pipeline's 'clf' step.
param_dist = {
    'clf__n_estimators': [50, 100, 200, 400],
    'clf__max_depth': [3, 4, 6, 8],
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'clf__subsample': [0.6, 0.8, 1.0],
}

xgb_pipe = models['xgb']
search_kwargs = dict(
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
    random_state=42,
    verbose=1,
)
rs = RandomizedSearchCV(xgb_pipe, **search_kwargs)
rs.fit(X_train, y_train)

print("Best params:", rs.best_params_)
best_model = rs.best_estimator_

In [None]:
# Final evaluation of the tuned model on the held-out test split.
# Uses the metric functions imported at the top of the notebook rather than
# re-importing sklearn.metrics under an alias in the middle of the notebook.
preds = best_model.predict(X_test)
probs = best_model.predict_proba(X_test)[:, 1]

final_metrics = {
    'accuracy': accuracy_score(y_test, preds),
    'precision': precision_score(y_test, preds),
    'recall': recall_score(y_test, preds),
    'f1': f1_score(y_test, preds),
    'auc': roc_auc_score(y_test, probs),
    'confusion': confusion_matrix(y_test, preds),
}
# Record the tuned model next to the baselines so the metrics written to disk
# include the model that is actually saved (re-running this cell just
# overwrites the same key).
results['xgb_tuned'] = final_metrics

print("Accuracy", final_metrics['accuracy'])
print("Precision", final_metrics['precision'])
print("Recall", final_metrics['recall'])
print("F1", final_metrics['f1'])
print("AUC", final_metrics['auc'])
print("Confusion:\n", final_metrics['confusion'])

In [None]:
# Save the tuned model. The output directory is created first so the cell
# does not fail with FileNotFoundError on a fresh checkout.
model_path = Path('models/best_model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print("Saved model to models/best_model.joblib")


def _to_jsonable(obj):
    """Fallback encoder for values the json module cannot serialize natively.

    NumPy arrays become nested lists and NumPy scalars become native Python
    numbers, so e.g. confusion matrices are stored as real JSON arrays instead
    of their multi-line string repr. Anything else falls back to ``str``.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    return str(obj)


# Save metrics.
metrics_path = Path('reports/metrics.json')
metrics_path.parent.mkdir(parents=True, exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(results, f, indent=2, default=_to_jsonable)