In [38]:
# flake8: noqa: E501
#
# En este dataset se desea pronosticar el default (pago) del cliente el próximo
# mes a partir de 23 variables explicativas.
#
#   LIMIT_BAL: Monto del credito otorgado. Incluye el credito individual y el
#              credito familiar (suplementario).
#         SEX: Genero (1=male; 2=female).
#   EDUCATION: Educacion (0=N/A; 1=graduate school; 2=university; 3=high school; 4=others).
#    MARRIAGE: Estado civil (0=N/A; 1=married; 2=single; 3=others).
#         AGE: Edad (years).
#       PAY_0: Historia de pagos pasados. Estado del pago en septiembre, 2005.
#       PAY_2: Historia de pagos pasados. Estado del pago en agosto, 2005.
#       PAY_3: Historia de pagos pasados. Estado del pago en julio, 2005.
#       PAY_4: Historia de pagos pasados. Estado del pago en junio, 2005.
#       PAY_5: Historia de pagos pasados. Estado del pago en mayo, 2005.
#       PAY_6: Historia de pagos pasados. Estado del pago en abril, 2005.
#   BILL_AMT1: Historia de pagos pasados. Monto a pagar en septiembre, 2005.
#   BILL_AMT2: Historia de pagos pasados. Monto a pagar en agosto, 2005.
#   BILL_AMT3: Historia de pagos pasados. Monto a pagar en julio, 2005.
#   BILL_AMT4: Historia de pagos pasados. Monto a pagar en junio, 2005.
#   BILL_AMT5: Historia de pagos pasados. Monto a pagar en mayo, 2005.
#   BILL_AMT6: Historia de pagos pasados. Monto a pagar en abril, 2005.
#    PAY_AMT1: Historia de pagos pasados. Monto pagado en septiembre, 2005.
#    PAY_AMT2: Historia de pagos pasados. Monto pagado en agosto, 2005.
#    PAY_AMT3: Historia de pagos pasados. Monto pagado en julio, 2005.
#    PAY_AMT4: Historia de pagos pasados. Monto pagado en junio, 2005.
#    PAY_AMT5: Historia de pagos pasados. Monto pagado en mayo, 2005.
#    PAY_AMT6: Historia de pagos pasados. Monto pagado en abril, 2005.
#
# La variable "default payment next month" corresponde a la variable objetivo.
#
# El dataset ya se encuentra dividido en conjuntos de entrenamiento y prueba
# en la carpeta "files/input/".
#
# Los pasos que debe seguir para la construcción de un modelo de
# clasificación están descritos a continuación.
#

In [39]:
# flake8: noqa: E501
from __future__ import annotations

import json
import gzip
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Config global
SEED = 21
RNG = np.random.default_rng(SEED)

# Rutas
INPUT_DIR = Path("../files/input")
MODEL_DIR = Path("../files/models")
OUTPUT_DIR = Path("../files/output")

MODEL_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_PATH = INPUT_DIR / "train_data.csv.zip"
TEST_PATH = INPUT_DIR / "test_data.csv.zip"

pd.set_option("display.max_columns", 200)


In [40]:
# Carga de datos
train_df = pd.read_csv(TRAIN_PATH, compression="zip")
test_df = pd.read_csv(TEST_PATH, compression="zip")

# Vista rápida
display(train_df.head(3))
display(train_df.shape, test_df.shape)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,0,0,172772,152397,110375,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,2,2,32,-358,-748,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,-1,-1,430,0,46257,45975,1300,43987,0,46257,2200,1300,43987,1386,0


(21000, 25)

(9000, 25)

In [41]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
#


def clean_credit_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Renombrar target si aplica
    if "default payment next month" in df.columns:
        df = df.rename(columns={"default payment next month": "default"})

    # Remover ID si aplica
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])

    # Normalización EDUCATION
    if "EDUCATION" in df.columns:
        df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4

    # Normalización MARRIAGE
    # 0 = N/A -> NaN
    if "MARRIAGE" in df.columns:
        df.loc[df["MARRIAGE"] == 0, "MARRIAGE"] = np.nan

    # Validación SEX (esperado 1,2); otros -> NaN
    if "SEX" in df.columns:
        df.loc[~df["SEX"].isin([1, 2]), "SEX"] = np.nan

    # Eliminar registros con información no disponible
    df = df.dropna(axis=0).reset_index(drop=True)

    # Tipos
    cat_cols = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype(int)


    return df


train_df = clean_credit_df(train_df)
test_df = clean_credit_df(test_df)

train_df.head(3)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,0,172772,152397,110375,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,2,32,-358,-748,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,-1,430,0,46257,45975,1300,43987,0,46257,2200,1300,43987,1386,0


In [42]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.


TARGET = "default"
FEATURES = [c for c in train_df.columns if c != TARGET]

X_train = train_df[FEATURES]
y_train = train_df[TARGET].astype(int)

X_test = test_df[FEATURES]
y_test = test_df[TARGET].astype(int)

X_train.shape, X_test.shape

((20962, 23), (8984, 23))

In [43]:
# Definir columnas categóricas y numéricas
cat_cols = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
num_cols = [c for c in FEATURES if c not in cat_cols]

pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)


rf = RandomForestClassifier(
    n_estimators=1200,
    random_state=SEED,
    n_jobs=-1,
    bootstrap=True,
    class_weight="balanced",
)

pipe = Pipeline(steps=[("pre", pre), ("rf", rf)])

In [44]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

# Grid más permisivo para subir el score en train
param_grid = {
    "rf__n_estimators": [600],
    "rf__max_depth": [None],
    "rf__min_samples_leaf": [2], 
    "rf__min_samples_split": [2],
    "rf__max_features": ["sqrt"],
    "rf__criterion": ["gini"],
    "rf__bootstrap": [False],    
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    refit=True,
    verbose=1,
)

gs.fit(X_train, y_train)

best_model = gs.best_estimator_
best_params = gs.best_params_
best_score = gs.best_score_

print("Mejores hiperparámetros:", best_params)
print("Balanced accuracy CV (media):", round(best_score, 4))

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Mejores hiperparámetros: {'rf__bootstrap': False, 'rf__criterion': 'gini', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 600}
Balanced accuracy CV (media): 0.6838


In [45]:
import pickle, gzip
from pathlib import Path

model_path = MODEL_DIR / "model.pkl.gz"
with gzip.open(model_path, "wb") as f:
    pickle.dump(gs, f, protocol=pickle.HIGHEST_PROTOCOL)

model_path, model_path.exists(), type(gs).__name__

(PosixPath('../files/models/model.pkl.gz'), True, 'GridSearchCV')

In [46]:
from typing import Tuple, Dict
import json
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score, balanced_accuracy_score
)

def build_rows(y_true, y_pred, dataset: str):
    metrics_row = {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
    }
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    cm_row = {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
    }
    return metrics_row, cm_row

best_model = gs.best_estimator_
yhat_tr = best_model.predict(X_train)
yhat_te = best_model.predict(X_test)

m_train, cm_train = build_rows(y_train, yhat_tr, "train")
m_test, cm_test = build_rows(y_test, yhat_te, "test")

# Orden requerido por el autograder
records = [m_train, m_test, cm_train, cm_test]

metrics_path = OUTPUT_DIR / "metrics.json"
with open(metrics_path, "w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

metrics_path, metrics_path.exists()

(PosixPath('../files/output/metrics.json'), True)

In [47]:
pd.DataFrame(records)

Unnamed: 0,type,dataset,precision,balanced_accuracy,recall,f1_score,true_0,true_1
0,metrics,train,0.954931,0.993133,1.0,0.976946,,
1,metrics,test,0.576971,0.694114,0.483736,0.526256,,
2,cm_matrix,train,,,,,"{'predicted_0': 16014, 'predicted_1': 223}","{'predicted_0': 0, 'predicted_1': 4725}"
3,cm_matrix,test,,,,,"{'predicted_0': 6402, 'predicted_1': 676}","{'predicted_0': 984, 'predicted_1': 922}"
