In [3]:
# --- Librerías principales ---
import os
import json
import gzip
import pickle
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    confusion_matrix, balanced_accuracy_score, f1_score, 
    precision_score, recall_score
)

In [4]:
def load_data(csv_file):
    """Read a zip-compressed CSV file into a DataFrame.

    Parameters
    ----------
    csv_file : path or file-like
        Location of the zip archive; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as ``pd.read_csv`` produces it.
    """
    return pd.read_csv(csv_file, compression="zip")


# Load both raw splits from their zip-compressed CSV files.
df_train_raw, df_test_raw = map(
    load_data,
    ("../files/input/train_data.csv.zip", "../files/input/test_data.csv.zip"),
)

print("✅ Datos cargados correctamente:")
display(df_train_raw.head())

✅ Datos cargados correctamente:


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [5]:
def data_clean(data):
    """Return a cleaned copy of a raw credit-default table.

    Steps, in order:
      1. rename ``"default payment next month"`` to ``"default"``;
      2. drop the ``ID`` column;
      3. discard rows whose ``EDUCATION`` or ``MARRIAGE`` equals 0;
      4. cap ``EDUCATION`` at 4 (every value above 4 becomes 4).

    The input frame is never modified: each step returns a new object, so
    re-running the calling cell always starts from the same raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing at least ``ID``, ``EDUCATION`` and ``MARRIAGE``.

    Returns
    -------
    pandas.DataFrame
        Cleaned table; the original row index of the surviving rows is kept.

    Raises
    ------
    KeyError
        If ``ID``, ``EDUCATION`` or ``MARRIAGE`` is missing from ``data``.
    """
    renamed = data.rename(
        columns={"default payment next month": "default"}
    ).drop(columns="ID")

    # 0 is filtered out for both columns before capping EDUCATION.
    has_known_codes = (renamed["EDUCATION"] != 0) & (renamed["MARRIAGE"] != 0)

    # clip(upper=4) is the vectorised equivalent of `4 if x > 4 else x`
    # (NaN stays NaN), and assign() avoids writing into a filtered slice.
    return renamed.loc[has_known_codes].assign(
        EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4)
    )

# Apply the cleaning step to both splits and preview the training result.
df_train, df_test = (data_clean(raw) for raw in (df_train_raw, df_test_raw))

print("✅ Limpieza aplicada:")
display(df_train.head())

✅ Limpieza aplicada:


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [6]:
def split_data(data_train, data_test):
    """Separate the ``default`` target from the features in both splits.

    Parameters
    ----------
    data_train, data_test : pandas.DataFrame
        Tables that each contain a ``default`` column.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where each ``x_*`` is the
        frame without ``default`` and each ``y_*`` is the ``default`` column.
    """
    target = "default"
    parts = []
    for frame in (data_train, data_test):
        parts += [frame.drop(columns=target), frame[target]]
    x_train, y_train, x_test, y_test = parts
    return x_train, y_train, x_test, y_test

# Feature/target split for both cleaned frames.
x_train, y_train, x_test, y_test = split_data(df_train, df_test)

print(
    "✅ División realizada correctamente:",
    f"x_train: {x_train.shape}, y_train: {y_train.shape}",
    sep="\n",
)


✅ División realizada correctamente:
x_train: (20953, 23), y_train: (20953,)


In [7]:
def create_pipeline(estimator):
    """Build a preprocessing + model pipeline around ``estimator``.

    ``EDUCATION``, ``SEX`` and ``MARRIAGE`` are one-hot encoded (categories
    unseen during fit are ignored at transform time); every other column is
    passed through unchanged.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Final step of the pipeline, registered under the name ``"estimator"``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``"preprocessor"`` and ``"estimator"``.
    """
    one_hot_columns = ['EDUCATION', 'SEX', 'MARRIAGE']
    encoder = OneHotEncoder(handle_unknown='ignore')

    column_encoder = ColumnTransformer(
        transformers=[('cat', encoder, one_hot_columns)],
        remainder='passthrough',
    )

    return Pipeline(
        steps=[('preprocessor', column_encoder), ('estimator', estimator)]
    )

# Seed the forest: an unseeded RandomForestClassifier makes every grid-search
# fit (and therefore the best params and metrics) change between runs.
pipeline = create_pipeline(RandomForestClassifier(random_state=42))
print("✅ Pipeline creado correctamente:")
pipeline

✅ Pipeline creado correctamente:


0,1,2
,steps,"[('preprocessor', ...), ('estimator', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
def make_grid_search(pipeline):
    """Wrap ``pipeline`` in a 10-fold grid search scored by balanced accuracy.

    The grid varies the ``estimator`` step's ``n_estimators``, ``max_depth``
    and ``min_samples_leaf`` (3 values each) with ``min_samples_split`` and
    ``max_features`` fixed, i.e. 27 candidate settings.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline whose final step is named ``"estimator"``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Unfitted search object (all cores, ``verbose=2``).
    """
    forest_grid = {
        'estimator__n_estimators': [50, 100, 200],
        'estimator__max_depth': [None, 10, 20],
        'estimator__min_samples_split': [10],
        'estimator__min_samples_leaf': [1, 2, 5],
        'estimator__max_features': ['sqrt'],
    }
    search_options = dict(
        cv=10,
        scoring='balanced_accuracy',
        n_jobs=-1,
        verbose=2,
    )
    return GridSearchCV(estimator=pipeline, param_grid=forest_grid, **search_options)

# fit() returns the search object itself, so build and fit in one expression.
grid_search = make_grid_search(pipeline).fit(x_train, y_train)

print("✅ GridSearch finalizado", "Mejores parámetros:", sep="\n")
grid_search.best_params_


Fitting 10 folds for each of 27 candidates, totalling 270 fits
✅ GridSearch finalizado
Mejores parámetros:


{'estimator__max_depth': None,
 'estimator__max_features': 'sqrt',
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 10,
 'estimator__n_estimators': 200}

In [9]:
def check_estimator(estimator, x, y, dataset):
    """Predict on ``x`` and summarise classification quality against ``y``.

    Parameters
    ----------
    estimator : object with a ``predict`` method
        Fitted model (or search object) used to produce predictions.
    x : array-like
        Features passed to ``estimator.predict``.
    y : array-like
        True labels the predictions are compared with.
    dataset : str
        Label stored under the ``"dataset"`` key (e.g. ``"train"``).

    Returns
    -------
    tuple
        ``(metrics, y_pred)`` where ``metrics`` is a dict with keys
        ``type``, ``dataset``, ``precision``, ``balanced_accuracy``,
        ``recall`` and ``f1_score`` (scores rounded to 4 decimals) and
        ``y_pred`` is the raw prediction output.
    """
    y_pred = estimator.predict(x)

    # Listed in the order the keys must appear in the resulting dict.
    scorers = (
        ("precision", precision_score),
        ("balanced_accuracy", balanced_accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )

    metrics = {"type": "metrics", "dataset": dataset}
    for label, scorer in scorers:
        metrics[label] = round(scorer(y, y_pred), 4)
    return metrics, y_pred

# Score the fitted search object on both splits; y_pred_test feeds the confusion matrix.
metrics_train, y_pred_train = check_estimator(grid_search, x_train, y_train, "train")
metrics_test, y_pred_test = check_estimator(grid_search, x_test, y_test, "test")

print("✅ Métricas entrenamiento:", metrics_train, sep="\n")
print("✅ Métricas prueba:", metrics_test, sep="\n")


✅ Métricas entrenamiento:
{'type': 'metrics', 'dataset': 'train', 'precision': 0.9718, 'balanced_accuracy': 0.8397, 'recall': 0.6853, 'f1_score': 0.8038}
✅ Métricas prueba:
{'type': 'metrics', 'dataset': 'test', 'precision': 0.6693, 'balanced_accuracy': 0.6758, 'recall': 0.4056, 'f1_score': 0.5051}


In [11]:
# Confusion matrix on the test split (rows: true labels, columns: predictions).
cm = confusion_matrix(y_test, y_pred_test)
print("✅ Matriz de confusión:", cm, sep="\n")


✅ Matriz de confusión:
[[6691  382]
 [1133  773]]


In [9]:
%pip install optuna-integration[sklearn]
%pip install optuna

from optuna.integration import OptunaSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

import optuna

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

pipeline = create_pipeline(RandomForestClassifier(random_state=42))

param_distributions = {
    "estimator__n_estimators": optuna.distributions.IntDistribution(100, 800),
    "estimator__max_depth": optuna.distributions.IntDistribution(3, 30),
    "estimator__min_samples_split": optuna.distributions.IntDistribution(2, 50),
    "estimator__min_samples_leaf": optuna.distributions.IntDistribution(1, 20),
    "estimator__max_features": optuna.distributions.CategoricalDistribution(["sqrt", "log2", None]),
    "estimator__class_weight": optuna.distributions.CategoricalDistribution(["balanced", "balanced_subsample", None]),
}

optuna_search = OptunaSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    cv=cv,
    scoring="balanced_accuracy",
    n_trials=60,                    # sube si tienes tiempo
    n_jobs=-1,
    random_state=42,
    refit=True
)

optuna_search.fit(x_train, y_train)
best_estimator = optuna_search.best_estimator_
optuna_search.best_params_, optuna_search.best_score_


Note: you may need to restart the kernel to use updated packages.


  optuna_search = OptunaSearchCV(
[I 2025-11-09 08:43:47,366] A new study created in memory with name: no-name-a268f5ac-cfa8-4452-9d5c-7568b708a9ae


Note: you may need to restart the kernel to use updated packages.


[I 2025-11-09 08:50:16,441] Trial 14 finished with value: 0.6561219263273734 and parameters: {'estimator__n_estimators': 174, 'estimator__max_depth': 24, 'estimator__min_samples_split': 20, 'estimator__min_samples_leaf': 6, 'estimator__max_features': 'log2', 'estimator__class_weight': None}. Best is trial 14 with value: 0.6561219263273734.
[I 2025-11-09 08:54:47,246] Trial 15 finished with value: 0.7113426596937695 and parameters: {'estimator__n_estimators': 191, 'estimator__max_depth': 20, 'estimator__min_samples_split': 50, 'estimator__min_samples_leaf': 16, 'estimator__max_features': 'sqrt', 'estimator__class_weight': 'balanced_subsample'}. Best is trial 15 with value: 0.7113426596937695.
[I 2025-11-09 08:56:46,251] Trial 5 finished with value: 0.6014150582656635 and parameters: {'estimator__n_estimators': 503, 'estimator__max_depth': 3, 'estimator__min_samples_split': 23, 'estimator__min_samples_leaf': 10, 'estimator__max_features': 'sqrt', 'estimator__class_weight': None}. Best is

: 

: 