In [5]:

## SOLUCIÓN

# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
# - Renombre la columna "default payment next month" a "default"
# - Remueva la columna "ID".

import zipfile
import os
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score
import gzip
import pickle
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score


# Vamos a traernos la data de train y de test del dataset

train_path = "../files/input/train_data.csv.zip"
test_path = "../files/input/test_data.csv.zip"
extract_path = "../files/input/"

# Ahora, extraigamos la data del dataset 

if not os.path.exists("../files/input/train_default_of_credit_card_clients.csv"):
  with zipfile.ZipFile(train_path, 'r') as zip_ref:
      zip_ref.extractall(extract_path)

if not os.path.exists("../files/input/test_default_of_credit_card_clients.csv"):
  with zipfile.ZipFile(test_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


# Ahora, leamos la data
train = pd.read_csv("../files/input/train_default_of_credit_card_clients.csv")
test = pd.read_csv("../files/input/test_default_of_credit_card_clients.csv")

# - Renombre la columna "default payment next month" a "default".
train = train.rename(columns={"default payment next month": "default"})
test = test.rename(columns={"default payment next month": "default"})

# - Remueva la columna "ID".
train = train.drop(columns=["ID"])
test = test.drop(columns=["ID"])

# - Elimine los registros con informacion no disponible.
## ---Reemplazar valores 0 en EDUCATION por NaN (información no disponible)
train["EDUCATION"] = train["EDUCATION"].replace(0, np.nan)
test["EDUCATION"] = test["EDUCATION"].replace(0, np.nan)

## ---Eliminar filas con valores faltantes
train = train.dropna()
test = test.dropna()

# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
train.loc[train["EDUCATION"] > 4, "EDUCATION"] = 4
test.loc[test["EDUCATION"] > 4, "EDUCATION"] = 4

map_edu = {4: "others"}
train["EDUCATION"] = train["EDUCATION"].clip(upper=4).map(map_edu)

# Eliminemos los duplicados como procesamiento extra
train = train.drop_duplicates()
test = test.drop_duplicates()


In [6]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

# Para train
X_train = train.drop(columns=["default"])  # Eliminamos la columna target
y_train = train["default"]                  # Solo la columna target

# Para test
X_test = test.drop(columns=["default"])
y_test = test["default"]

In [9]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Ajusta un modelo de bosques aleatorios (rando forest).

categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]

numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

categorical_pipeline = Pipeline(steps=[
   ("imputer", SimpleImputer(strategy="most_frequent")),
   ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

numeric_pipeline = Pipeline(steps=[
   ("imputer",SimpleImputer(strategy="median")),
   ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
   ("cat", categorical_pipeline, categorical_cols),
   ("num", numeric_pipeline, numeric_cols)
])

clf_pipeline = Pipeline(steps=[
   ("preprocessor", preprocessor),
   ("rf", RandomForestClassifier(random_state=42))
])

clf_pipeline.fit(X_train, y_train)

y_pred = clf_pipeline.predict(X_test)


In [11]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [10, 20, 30, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2']
}


grid_search = GridSearchCV(
   estimator=clf_pipeline,
   param_grid=param_grid,
   cv=5,
   scoring='balanced_accuracy',
   n_jobs=-1,
   verbose=2
)

grid_search.fit(X_train, y_train)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor balanced accuracy (CV):", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Balanced Accuracy en test:",
      balanced_accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Mejores hiperparámetros: {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 10, 'rf__n_estimators': 300}
Mejor balanced accuracy (CV): 0.6590823360700611
Balanced Accuracy en test: 0.6739637681336459


In [19]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.


with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(grid_search, f)

In [22]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

# Crear carpeta de salida si no existe
os.makedirs("../files/output", exist_ok=True)

# Predicciones
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Función para calcular métricas
def compute_metrics(y_true, y_pred, dataset_name):
    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0)
    }

# Calcular métricas
metrics = [
    compute_metrics(y_train, y_train_pred, "train"),
    compute_metrics(y_test, y_test_pred, "test")
]

# Guardar en JSON (cada fila un diccionario)
with open("../files/output/metrics.json", "w") as f:
    for m in metrics:
        f.write(json.dumps(m) + "\n")

print("Métricas guardadas en ../files/output/metrics.json")


Métricas guardadas en ../files/output/metrics.json


In [23]:
# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicte_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicte_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
#

def compute_confusion(y_true, y_pred, dataset_name):
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": int(cm[0,0]), "predicted_1": int(cm[0,1])},
        "true_1": {"predicted_0": int(cm[1,0]), "predicted_1": int(cm[1,1])}
    }

conf_matrices = [
    compute_confusion(y_train, y_train_pred, "train"),
    compute_confusion(y_test, y_test_pred, "test")
]

# --- Guardar métricas + matrices en JSON ---
with open("../files/output/metrics.json", "w") as f:
    for m in metrics + conf_matrices:   # concatenamos las dos listas
        f.write(json.dumps(m) + "\n")

print("Métricas y matrices de confusión guardadas en ../files/output/metrics.json")



Métricas y matrices de confusión guardadas en ../files/output/metrics.json
