In [2]:
# Cargamos las librerías
import os
import pandas as pd
import gzip
import json
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Cargar los archivos
train_dataset = pd.read_csv("../files/input/train_data.csv.zip", index_col=False)
test_dataset = pd.read_csv("../files/input/test_data.csv.zip", index_col=False)

In [4]:
# Paso 1.
# - Renombre la columna "default payment next month" a "default".
train_dataset.rename(columns={"default payment next month": "default"}, inplace=True)
test_dataset.rename(columns={"default payment next month": "default"}, inplace=True)

# # - Remueva la columna "ID".
train_dataset.drop(columns={"ID"}, inplace=True)
test_dataset.drop(columns={"ID"}, inplace=True)

# # - Elimine los registros con informacion no disponible.
test_dataset.dropna()
train_dataset.dropna()

# Esta sería la otra forma de eliminar los NA
# train_dataset = train_dataset.loc[train_dataset["MARRIAGE"] != 0]
# train_dataset = train_dataset.loc[train_dataset["EDUCATION"] != 0]
# test_dataset = test_dataset.loc[test_dataset["MARRIAGE"] != 0]
# test_dataset = test_dataset.loc[test_dataset["EDUCATION"] != 0]


# # - Para la columna EDUCATION, valores > 4 indican niveles superiores
# #   de educación, agrupe estos valores en la categoría "others".
train_dataset["EDUCATION"] = train_dataset["EDUCATION"].apply(lambda x: 4 if x > 4 else x)
test_dataset["EDUCATION"] = train_dataset["EDUCATION"].apply(lambda x: 4 if x > 4 else x)


In [5]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.


x_train = train_dataset.drop(columns=["default"])
y_train = train_dataset["default"]
x_test = test_dataset.drop(columns=["default"])
y_test = test_dataset["default"]


In [6]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Ajusta un modelo de bosques aleatorios (rando forest).
#

categorical_features = ["EDUCATION", "MARRIAGE", "SEX"]

transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(), categorical_features),
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("transformer", transformer),
    ("rf", RandomForestClassifier(random_state=42))
])



In [7]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

param_grid = {
"rf__n_estimators": [100, 200],
"rf__max_depth": [5, 10, None],
"rf__min_samples_split": [2, 5],
'rf__min_samples_leaf': [1, 2], 
"rf__max_features": [25],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv = 10,
    scoring="balanced_accuracy",
    n_jobs=-1,        
)

grid_search.fit(x_train, y_train)



print("\nMejores parámetros encontrados:")
print(grid_search.best_params_)
print("Mejor f1 ponderado (Validación cruzada):", grid_search.best_score_)


Mejores parámetros encontrados:
{'rf__max_depth': None, 'rf__max_features': 25, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
Mejor f1 ponderado (Validación cruzada): 0.6583434673796368


In [2]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

def save_estimator(estimator):
    models_path = "files/models"
    os.makedirs(models_path, exist_ok=True)

    model_file = os.path.join(models_path, "model.pkl.gz")

    with gzip.open(model_file, "wb") as file:
        pickle.dump(estimator, file)


In [53]:
import os
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

# Paso 6: Calcular métricas
def calc_metrics(model, x_train, y_train, x_test, y_test):
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    metrics = [
        {
            'dataset': 'train',
            'precision': precision_score(y_train, y_train_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
            'recall': recall_score(y_train, y_train_pred, zero_division=0),
            'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
        },
        {
            'dataset': 'test',
            'precision': precision_score(y_test, y_test_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
        },
    ]

    return metrics


# Guardar las métricas en JSONL
def save_metrics(metrics):
    output_path = "files/output"
    os.makedirs(output_path, exist_ok=True)
    metrics_file = os.path.join(output_path, "metrics.json")

    with open(metrics_file, "w", encoding="utf-8") as f:
        for metric in metrics:
            json.dump(metric, f)
            f.write("\n")

    print("✅ Métricas guardadas en:", metrics_file)


# 1. Crear pipeline
pipeline = make_pipeline()

# 2. Hacer Grid Search
grid_search = make_grid_search(pipeline, x_train, y_train)

# 3. Guardar el mejor modelo entrenado
model = grid_search.best_estimator_

# 4. Calcular métricas
results = calc_metrics(model, x_train, y_train, x_test, y_test)

# 5. Mostrar métricas
for res in results:
    print(f"=== Métricas ({res['dataset']}) ===")
    print(f"Precisión:            {res['precision']:.3f}")
    print(f"Exactitud balanceada: {res['balanced_accuracy']:.3f}")
    print(f"Sensibilidad (Recall):{res['recall']:.3f}")
    print(f"F1 Score:             {res['f1_score']:.3f}")
    print()

# 6. Guardar métricas en JSON
save_metrics(results)



=== Métricas (train) ===
Precisión:            0.998
Exactitud balanceada: 0.978
Sensibilidad (Recall):0.956
F1 Score:             0.977

=== Métricas (test) ===
Precisión:            0.645
Exactitud balanceada: 0.672
Sensibilidad (Recall):0.405
F1 Score:             0.498

✅ Métricas guardadas en: files/output\metrics.json


In [55]:
from sklearn.metrics import confusion_matrix

def matriz_confusion_dict(model, x_train, y_train, x_test, y_test):
    # Predicciones
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Matrices de confusión
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    # Estructura en diccionario
    matriz_confusion = [
        {
            'type': 'cm_matrix',
            'dataset': 'train',
            'true_0': {'predicted_0': int(cm_train[0, 0]), 'predicted_1': int(cm_train[0, 1])},
            'true_1': {'predicted_0': int(cm_train[1, 0]), 'predicted_1': int(cm_train[1, 1])}
        },
        {
            'type': 'cm_matrix',
            'dataset': 'test',
            'true_0': {'predicted_0': int(cm_test[0, 0]), 'predicted_1': int(cm_test[0, 1])},
            'true_1': {'predicted_0': int(cm_test[1, 0]), 'predicted_1': int(cm_test[1, 1])}
        }
    ]

    return matriz_confusion

cm_results = matriz_confusion_dict(model, x_train, y_train, x_test, y_test)

for cm in cm_results:
    print(f"=== Matriz de confusión ({cm['dataset']}) ===")
    print(f"        Pred_0   Pred_1")
    print(f"True_0    {cm['true_0']['predicted_0']}        {cm['true_0']['predicted_1']}")
    print(f"True_1    {cm['true_1']['predicted_0']}        {cm['true_1']['predicted_1']}")
    print()




=== Matriz de confusión (train) ===
        Pred_0   Pred_1
True_0    16266        7
True_1    208        4519

=== Matriz de confusión (test) ===
        Pred_0   Pred_1
True_0    6666        425
True_1    1136        773

