In [1]:
import pickle
import gzip
import os
import json

import pandas as pd
import numpy as np

from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    balanced_accuracy_score, confusion_matrix
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [2]:
def limpiar_dataset(df_inicial):
    """Return a cleaned copy of a raw credit-default dataset.

    Cleaning steps, applied in this order:

    1. Drop the ``ID`` column.
    2. Rename ``default payment next month`` to ``flag_default``
       (a no-op if that column is absent).
    3. Drop every row that contains at least one missing value.
    4. Drop rows whose ``EDUCATION`` or ``MARRIAGE`` value is 0.
    5. Collapse every ``EDUCATION`` value greater than 4 into 4.

    The input frame is never modified. Surviving rows keep their original
    index labels (the index is not reset).

    Parameters
    ----------
    df_inicial : pandas.DataFrame
        Raw data; must contain the ``ID``, ``EDUCATION`` and ``MARRIAGE``
        columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the cleaning steps applied.

    Raises
    ------
    KeyError
        If ``ID``, ``EDUCATION`` or ``MARRIAGE`` is missing from the input.
    """
    # Each call in the chain returns a new frame, so the caller's data is
    # left untouched without needing inplace=True mutations.
    df_procesado = (
        df_inicial
        .drop(columns='ID')
        .rename(columns={'default payment next month': 'flag_default'})
        .dropna()
    )

    filas_validas = (df_procesado['EDUCATION'] != 0) & (df_procesado['MARRIAGE'] != 0)

    # Take an explicit copy of the filtered rows: writing into a boolean-filtered
    # frame directly is what triggers pandas' SettingWithCopyWarning.
    df_procesado = df_procesado.loc[filas_validas].copy()

    # Cap EDUCATION at 4 (same result as assigning 4 wherever EDUCATION > 4).
    df_procesado['EDUCATION'] = df_procesado['EDUCATION'].clip(upper=4)

    return df_procesado


In [3]:
def separar_features(df_train, df_test):
    """Split the train and test frames into features and target.

    The target is the ``flag_default`` column; the features are every other
    column. Neither input frame is modified.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that each contain a ``flag_default`` column.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` in that order, where the ``x``
        items are DataFrames and the ``y`` items are Series.
    """
    objetivo = "flag_default"

    def _partir(df):
        # Features first, then the target, for a single frame.
        return df.drop(columns=objetivo), df[objetivo]

    x_tr, y_tr = _partir(df_train)
    x_te, y_te = _partir(df_test)

    return x_tr, y_tr, x_te, y_te


In [4]:
def ensamblar_pipeline():
    """Build the unfitted preprocessing + neural-network pipeline.

    Steps, in order:

    * ``prep``: a ``ColumnTransformer`` that one-hot encodes ``SEX``,
      ``EDUCATION`` and ``MARRIAGE`` (unknown categories are ignored at
      transform time) and standard-scales the 20 numeric columns. Columns
      not listed in either group are not passed to a transformer.
    * ``sel_features``: ``SelectKBest`` scored with ``f_classif``.
    * ``reductor``: ``PCA`` with default settings.
    * ``modelo_nn``: ``MLPClassifier(max_iter=15000, random_state=17)``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline on every call.
    """
    columnas_categoricas = ['SEX', 'EDUCATION', 'MARRIAGE']

    # The numeric columns grouped by family. The PAY_* status columns jump
    # from PAY_0 straight to PAY_2, so PAY_0 is listed by hand.
    columnas_numericas = (
        ["LIMIT_BAL", "AGE", "PAY_0"]
        + [f"PAY_{mes}" for mes in range(2, 7)]
        + [f"BILL_AMT{mes}" for mes in range(1, 7)]
        + [f"PAY_AMT{mes}" for mes in range(1, 7)]
    )

    codificador = OneHotEncoder(handle_unknown="ignore")
    escalador = StandardScaler()
    preprocesamiento = ColumnTransformer(
        transformers=[
            ("cat_cols", codificador, columnas_categoricas),
            ("num_cols", escalador, columnas_numericas),
        ]
    )

    pasos = [
        ("prep", preprocesamiento),
        ("sel_features", SelectKBest(score_func=f_classif)),
        ("reductor", PCA()),
        ("modelo_nn", MLPClassifier(max_iter=15000, random_state=17)),
    ]

    return Pipeline(steps=pasos)


In [5]:
def ajustar_parametros(pipeline_base):
    """Wrap ``pipeline_base`` in an unfitted ``GridSearchCV``.

    The grid holds a single candidate (every hyperparameter list has one
    value), so the search amounts to a 10-fold cross-validated fit of that
    one configuration, scored by balanced accuracy and run on all available
    cores.

    Parameters
    ----------
    pipeline_base : estimator
        Pipeline exposing steps named ``reductor``, ``sel_features`` and
        ``modelo_nn`` (the names the grid keys refer to).

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The configured search object; ``fit`` has not been called on it.
    """
    grid_parametros = dict(
        reductor__n_components=[None],
        sel_features__k=[20],
        modelo_nn__hidden_layer_sizes=[(50, 30, 40, 60)],
        modelo_nn__alpha=[0.26],
        modelo_nn__learning_rate_init=[0.001],
    )

    opciones_busqueda = {
        "cv": 10,
        "scoring": 'balanced_accuracy',
        "n_jobs": -1,
        "verbose": 2,
    }

    return GridSearchCV(pipeline_base, grid_parametros, **opciones_busqueda)


In [6]:
def generar_metricas(modelo_fit, x_data, y_data, nombre_ds):
    """Predict on ``x_data`` and summarise the classification quality.

    Parameters
    ----------
    modelo_fit : fitted estimator
        Any object exposing ``predict``.
    x_data : array-like
        Features handed to ``modelo_fit.predict``.
    y_data : array-like
        True labels that the predictions are compared against.
    nombre_ds : str
        Value stored under the ``"dataset"`` key of the report.

    Returns
    -------
    tuple
        ``(predicciones, reporte)``: the raw predictions and a dict with the
        keys ``type``, ``dataset``, ``precision``, ``balanced_accuracy``,
        ``recall`` and ``f1_score``; every score is rounded to 4 decimals.
    """
    predicciones = modelo_fit.predict(x_data)

    # (report key, scoring function) pairs, in the order they appear in the report.
    calculadoras = (
        ("precision", precision_score),
        ("balanced_accuracy", balanced_accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )

    reporte = {"type": "metrics", "dataset": nombre_ds}
    for clave, funcion in calculadoras:
        reporte[clave] = round(funcion(y_data, predicciones), 4)

    return predicciones, reporte


In [7]:
def matriz_confusion(y_real, y_estimado, etiqueta):
    """Build a JSON-serialisable confusion-matrix record for a binary 0/1 target.

    Parameters
    ----------
    y_real : array-like
        True labels (expected to be 0 or 1).
    y_estimado : array-like
        Predicted labels (expected to be 0 or 1).
    etiqueta : str
        Value stored under the ``"dataset"`` key of the record.

    Returns
    -------
    dict
        ``{"type": "cm_matrix", "dataset": etiqueta,
        "true_0": {"predicted_0": ..., "predicted_1": ...},
        "true_1": {"predicted_0": ..., "predicted_1": ...}}`` with plain
        Python ``int`` counts.
    """
    # Pin the label order to [0, 1] so the matrix is always 2x2. Without it,
    # data where only one class appears (e.g. a model that predicts a single
    # class on a single-class subset) yields a 1x1 matrix and the indexing
    # below raises IndexError.
    cm = confusion_matrix(y_real, y_estimado, labels=[0, 1])

    return {
        "type": "cm_matrix",
        "dataset": etiqueta,
        "true_0": {"predicted_0": int(cm[0][0]), "predicted_1": int(cm[0][1])},
        "true_1": {"predicted_0": int(cm[1][0]), "predicted_1": int(cm[1][1])}
    }


In [8]:
def exportar_modelo_comprimido(modelo, ruta_salida):
    """Pickle ``modelo`` into a gzip-compressed file at ``ruta_salida``.

    Missing parent directories are created first. An existing file at the
    same path is overwritten.

    Parameters
    ----------
    modelo : object
        Any picklable object.
    ruta_salida : str
        Destination path. May be a bare file name, in which case the file is
        written to the current working directory.
    """
    carpeta = os.path.dirname(ruta_salida)
    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if carpeta:
        os.makedirs(carpeta, exist_ok=True)

    with gzip.open(ruta_salida, 'wb') as f:
        pickle.dump(modelo, f)


In [9]:
def exportar_jsonl(lista_registros, ruta_archivo):
    """Write ``lista_registros`` as JSON Lines: one JSON document per line.

    Missing parent directories are created first. An existing file at the
    same path is overwritten; an empty iterable produces an empty file.

    Parameters
    ----------
    lista_registros : iterable
        JSON-serialisable records, written in iteration order.
    ruta_archivo : str
        Destination path. May be a bare file name, in which case the file is
        written to the current working directory.
    """
    carpeta = os.path.dirname(ruta_archivo)
    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if carpeta:
        os.makedirs(carpeta, exist_ok=True)

    # Explicit encoding so the output does not depend on the platform locale.
    with open(ruta_archivo, "w", encoding="utf-8") as f:
        for registro in lista_registros:
            f.write(json.dumps(registro) + '\n')


In [10]:
# End-to-end driver: load -> clean -> split -> build pipeline -> grid search
# -> save model -> compute and export metrics. All paths are relative to the
# notebook's working directory.

# Load the zipped train/test CSV files from ../files/input.
print("Cargando datos...")
df_tr = pd.read_csv("../files/input/train_data.csv.zip")
df_te = pd.read_csv("../files/input/test_data.csv.zip")

# Apply the same cleaning rules to both splits. The names are rebound to the
# cleaned frames, so the raw frames are no longer reachable after this point.
print("Limpiando datasets...")
df_tr = limpiar_dataset(df_tr)
df_te = limpiar_dataset(df_te)

# Separate the flag_default target from the feature columns.
x_train, y_train, x_test, y_test = separar_features(df_tr, df_te)

print("Creando pipeline...")
pipeline_final = ensamblar_pipeline()

# The grid has a single candidate, so this is a 10-fold cross-validated fit
# of one configuration (the captured output reports 10 fits in total).
print("Optimizando hiperparámetros...")
busqueda_modelo = ajustar_parametros(pipeline_final)
# NOTE(review): GridSearchCV.fit is expected to return the search object
# itself, so modelo_entrenado and busqueda_modelo are presumably the same
# object — confirm.
modelo_entrenado = busqueda_modelo.fit(x_train, y_train)

# Persist the whole search object (not only the best pipeline), gzip-pickled.
print("Guardando modelo...")
exportar_modelo_comprimido(busqueda_modelo, '../files/models/model.pkl.gz')

# Score the fitted model on both splits; the predictions are reused below
# for the confusion matrices.
print("Calculando métricas...")
pred_train, metrics_train = generar_metricas(modelo_entrenado, x_train, y_train, "train")
pred_test, metrics_test = generar_metricas(modelo_entrenado, x_test, y_test, "test")

cm_train = matriz_confusion(y_train, pred_train, 'train')
cm_test = matriz_confusion(y_test, pred_test, 'test')

# Despite the .json extension, the file holds one JSON object per line, in
# this order: train metrics, test metrics, train matrix, test matrix.
exportar_jsonl(
    [metrics_train, metrics_test, cm_train, cm_test],
    '../files/output/metrics.json'
)



Cargando datos...
Limpiando datasets...
Creando pipeline...
Optimizando hiperparámetros...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Guardando modelo...
Calculando métricas...
