In [1]:
import pandas as pd
import numpy as np

test  =pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip"
)

train = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip"
)

# Paso 1. Limpieza de los datasets

# Renombrar la columna "default payment next month" a "default"
train.rename(columns={"default payment next month": "default"}, inplace=True)
test.rename(columns={"default payment next month": "default"}, inplace=True)

# Remover la columna "ID"
train.drop(columns=["ID"], inplace=True)
test.drop(columns=["ID"], inplace=True)

# Eliminar registros con información no disponible
train.dropna(inplace=True)
test.dropna(inplace=True)

# Agrupar valores de EDUCATION > 4 en la categoría "others" (valor 4)
train["EDUCATION"] = train["EDUCATION"].apply(lambda x: 4 if x>4 else x)
test["EDUCATION"] = test["EDUCATION"].apply(lambda x: 4 if x>4 else x)
train['EDUCATION'] = train['EDUCATION'].apply(lambda x: x if x > 0 else np.nan)
test['EDUCATION'] = test['EDUCATION'].apply(lambda x: x if x > 0 else np.nan)

train['MARRIAGE'] = train['MARRIAGE'].apply(lambda x: x if x > 0 else np.nan)
test['MARRIAGE'] = test['MARRIAGE'].apply(lambda x: x if x > 0 else np.nan)

In [2]:
# Paso 2. División de los datasets

# Variable objetivo (target)
x_train = train.drop(columns=['default'])
y_train = train["default"]

x_test = test.drop(columns=['default'])
y_test = test["default"]


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

categorical = ['SEX', 'EDUCATION', 'MARRIAGE']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical)
    ],
    remainder='passthrough',  # Mantiene el resto sin modificar
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [4]:
from sklearn.model_selection import GridSearchCV

# Paso 4. Optimización de hiperparámetros usando validación cruzada

# Definir los hiperparámetros a explorar

from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


param_grid = {
  'classifier__n_estimators': [200],
  'classifier__max_depth': [45],  
  'classifier__min_samples_split': [8], 
   'classifier__max_features': ['sqrt']
}

model = GridSearchCV(
    pipeline, param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)

model.fit(x_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [5]:
print("\Parámetros:", model.best_params_)
print("Precisión:", round(model.best_score_, 4))
best_pipeline = model.best_estimator_

\Parámetros: {'classifier__max_depth': 45, 'classifier__max_features': 'sqrt', 'classifier__min_samples_split': 8, 'classifier__n_estimators': 200}
Precisión: 0.6569


  print("\Parámetros:", model.best_params_)


In [6]:
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

import pickle
import os
import gzip

dir_models = "../files/models"
os.makedirs(dir_models, exist_ok=True)

if not os.path.exists("../files/output"):
    os.makedirs("../files/output")

with gzip.open("../files/models/model.pkl.gz","wb") as file:
    pickle.dump(model,file)



In [7]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

def evaluate_and_save_metrics(model, X_train, X_test, y_train, y_test):

    #Calcular predicciones
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Calcular métricas para el conjunto de entrenamiento
    metrics_train = {
        "type": "metrics",
        "dataset": "train",
        "precision": precision_score(y_train, y_train_pred, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(y_train, y_train_pred),
        "recall": recall_score(y_train, y_train_pred, zero_division=0),
        "f1_score": f1_score(y_train, y_train_pred, zero_division=0)
    }

    #Calcular métricas para el conjunto de prueba
    metrics_test = {
        "type": "metrics",
        "dataset": "test",
        "precision": precision_score(y_test, y_test_pred, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_test_pred),
        "recall": recall_score(y_test, y_test_pred, zero_division=0),
        "f1_score": f1_score(y_test, y_test_pred, zero_division=0)
    }

    #Crear carpeta
    output_dir = "../files/output"
    os.makedirs(output_dir, exist_ok=True)

    #Guardar las métricas en JSON
    output_path = os.path.join(output_dir,"metrics.json")
    with open(output_path,"w") as f:
        f.write(json.dumps(metrics_train) + '\n')
        f.write(json.dumps(metrics_test) + '\n')

In [8]:
from sklearn.metrics import confusion_matrix

def evaluate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test):

    #Calcular predicciones
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Calcular matrices de confusión
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    #Convertir las confusion matrices a JSON
    def format_confusion_matrix(cm, dataset_type):
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1])
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1])
            }
        }
    
    metrics = [
        format_confusion_matrix(cm_train, "train"),
        format_confusion_matrix(cm_test, "test")
    ]

    #Salvar las matrices de confusion en el archivo JSON
    output_path = "../files/output/metrics.json"
    with open(output_path, "a") as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')

#Orquestador
def run(model, X_train, X_test, y_train, y_test):
    import os
    os.makedirs('../files/output', exist_ok=True)

    evaluate_and_save_metrics(model, X_train, X_test, y_train, y_test)

    evaluate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test)

run(model, x_train, x_test, y_train, y_test)