In [1]:
import pandas as pd
data = pd.read_csv('../data/processed/data.csv')
data.head()


Unnamed: 0,race,gender,age,admission_type_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,metformin-pioglitazone,change,diabetesMed,discharge_segment,target,numchange,nummeds,diag_1_group,diag_2_group,diag_3_group
0,Caucasian,Female,5,6,1,1,Pediatrics-Endocrinology,41,0,1,...,No,No,No,Otherwise,0,0,0,Diabetes,Other,Other
1,Caucasian,Female,15,1,7,3,,59,0,18,...,No,Ch,Yes,Discharged to home,0,1,1,Other,Diabetes,Other
2,AfricanAmerican,Female,25,1,7,2,,11,5,13,...,No,No,Yes,Discharged to home,0,0,1,Other,Diabetes,Other
3,Caucasian,Male,35,1,7,2,,44,1,16,...,No,Ch,Yes,Discharged to home,0,1,1,Other,Diabetes,Circulatory
4,Caucasian,Male,45,1,7,1,,51,0,8,...,No,Ch,Yes,Discharged to home,0,0,2,Neoplasms,Neoplasms,Diabetes


In [2]:
data.columns

Index(['race', 'gender', 'age', 'admission_type_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'discharge_segment',
       'target', 'numchange', 'nummeds', 'diag_1_group', 'diag_2_group',
       'diag_3_group'],
      dtype='object')

In [3]:
TOP_10_FEATURES = [
    'number_inpatient',      # Most important
    'discharge_segment',     # Generó las vars 2 y 3 más importantes
    'number_emergency',      # Visitas de emergencia
    'diag_1_group',          # Diagnóstico primario (Circulatory/Resp/Diabetes)
    'diag_2_group',          # Diagnóstico secundario
    'diabetesMed',           # Indicador general de medicación
    'insulin',               # Insulina (Se redujo la dosis de insulina)
    'number_diagnoses',      # Cantidad de diagnósticos
    'age',                   # Edad 
    'time_in_hospital'       # Tiempo en hospital
]
data_top_10 = data[TOP_10_FEATURES + ['target']]
data_top_10.head()

Unnamed: 0,number_inpatient,discharge_segment,number_emergency,diag_1_group,diag_2_group,diabetesMed,insulin,number_diagnoses,age,time_in_hospital,target
0,0,Otherwise,0,Diabetes,Other,No,No,1,5,1,0
1,0,Discharged to home,0,Other,Diabetes,Yes,Up,9,15,3,0
2,1,Discharged to home,0,Other,Diabetes,Yes,No,6,25,2,0
3,0,Discharged to home,0,Other,Diabetes,Yes,Up,7,35,2,0
4,0,Discharged to home,0,Neoplasms,Neoplasms,Yes,Steady,5,45,1,0


In [4]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import numpy as np

X = data_top_10.drop(columns=['target'])
y = data_top_10['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

pipeline_10 = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

pipeline_10.fit(X_train, y_train)

y_pred = pipeline_10.predict(X_test)
y_pred_proba = pipeline_10.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18069
           1       0.41      0.02      0.03      2285

    accuracy                           0.89     20354
   macro avg       0.65      0.51      0.49     20354
weighted avg       0.84      0.89      0.84     20354

ROC AUC Score: 0.6440779467669097


In [5]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """
    Función objetivo para optimizar los hiperparámetros de XGBoost dentro de un Pipeline.
    """
    
    # Definir el espacio de búsqueda de hiperparámetros
    # Usamos el prefijo 'xgb__' porque el modelo está dentro de un Pipeline llamado 'xgb'
    param_grid = {
    'xgb__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'xgb__learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
    'xgb__max_depth': trial.suggest_int('max_depth', 3, 10),
    'xgb__min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    'xgb__gamma': trial.suggest_float('gamma', 0, 5),
    'xgb__subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'xgb__colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'xgb__reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1.0),
    'xgb__reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1.0)
}

        # Importante para data desbalanceada (scale_pos_weight)
        # Se suele calcular como: (total_negativos / total_positivos)
        # 'classifier__scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0) 
    

    # Actualizar los parámetros del Pipeline existente
    pipeline_10.set_params(**param_grid)

    # Evaluación con Validación Cruzada (Cross-Validation)
    # cv=5 lo cambie a 5 para probar mejor robustez
    cv_scores = cross_val_score(
        pipeline_10, 
        X_train, 
        y_train, 
        cv=5, 
        scoring='roc_auc', 
        n_jobs=-1
    )

    # Retornar el promedio de los scores (Optuna intentará maximizar esto)
    return cv_scores.mean()

In [6]:
# Crear el estudio
study = optuna.create_study(direction='maximize', study_name='xgboost_optimization_10')

# Iniciar la optimización (esto puede tardar unos minutos dependiendo de tu hardware)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("-" * 50)
print("¡Optimización terminada!")
print(f"Mejor AUC ROC obtenido: {study.best_value:.4f}")
print("Mejores Hiperparámetros encontrados:")
print(study.best_params)

[I 2025-11-26 23:47:38,953] A new study created in memory with name: xgboost_optimization_10


  0%|          | 0/100 [00:00<?, ?it/s]

  'xgb__reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1.0),
  'xgb__reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1.0)


[I 2025-11-26 23:47:47,891] Trial 0 finished with value: 0.6501057742435538 and parameters: {'n_estimators': 805, 'learning_rate': 0.18553680166325637, 'max_depth': 3, 'min_child_weight': 9, 'gamma': 3.0256109122762833, 'subsample': 0.6232227666625001, 'colsample_bytree': 0.7192280558466821, 'reg_alpha': 0.020094070727246116, 'reg_lambda': 0.00021335377348689782}. Best is trial 0 with value: 0.6501057742435538.


  'xgb__reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1.0),
  'xgb__reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1.0)


[W 2025-11-26 23:47:50,600] Trial 1 failed with parameters: {'n_estimators': 131, 'learning_rate': 0.10492005759901918, 'max_depth': 7, 'min_child_weight': 3, 'gamma': 1.1911384599598125, 'subsample': 0.7083213716212626, 'colsample_bytree': 0.8191524671524981, 'reg_alpha': 0.0007297479456546492, 'reg_lambda': 0.00011779407727161156} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\asjer\AppData\Local\Programs\Python\Python313\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\asjer\AppData\Local\Temp\ipykernel_25360\2840193074.py", line 33, in objective
    cv_scores = cross_val_score(
        pipeline_10,
    ...<4 lines>...
        n_jobs=-1
    )
  File "c:\Users\asjer\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_param_validation.py", line 218, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\asjer\AppData\Local\Program

KeyboardInterrupt: 

In [None]:
# Obtener los mejores parámetros del estudio
best_params = study.best_params

# Ajustar las claves del diccionario para que coincidan con el Pipeline
# (Añadimos el prefijo 'xgboost__' a cada clave que nos dio Optuna)
pipeline_params = {f'xgb__{k}': v for k, v in best_params.items()}

# 3. Configurar el pipeline con los ganadores
pipeline_10.set_params(**pipeline_params)

# 4. Entrenar modelo definitivo
pipeline_10.fit(X_train, y_train)

# 5. Generar predicciones
# predict() nos da clases (0 o 1)
y_pred = pipeline_10.predict(X_test)
# predict_proba() nos da la probabilidad (0.0 a 1.0), necesaria para el AUC y la curva ROC
y_pred_proba = pipeline_10.predict_proba(X_test)[:, 1]

print("Modelo entrenado y predicciones generadas.")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Modelo entrenado y predicciones generadas.


In [None]:

y_pred_proba = pipeline_10.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18069
           1       0.64      0.01      0.02      2285

    accuracy                           0.89     20354
   macro avg       0.76      0.50      0.48     20354
weighted avg       0.86      0.89      0.84     20354

ROC AUC Score: 0.657798255241608
