In [19]:
#imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score
import gzip
import json
import pickle

In [12]:
#paso 1: Cargar y procesar data
def load_train_test_data(
    train_path="../files/input/train_data.csv.zip",
    test_path="../files/input/test_data.csv.zip",
):
    """Read the train and test CSV files into DataFrames.

    Parameters
    ----------
    train_path : str
        Location of the training CSV. Defaults to the path this notebook
        has always used.
    test_path : str
        Location of the test CSV. Defaults to the path this notebook
        has always used.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(df_train, df_test)`` exactly as read from disk.
    """
    # index_col=False stops pandas from using the first column as the index.
    df_train = pd.read_csv(train_path, index_col=False)
    df_test = pd.read_csv(test_path, index_col=False)
    return df_train, df_test


def _clean_credit_frame(df):
    """Return a cleaned copy of one raw frame; the input is left untouched."""
    cleaned = (
        df.drop(columns=["ID"])  # drop the identifier column
        .rename(columns={"default payment next month": "default"})
        .dropna()  # discard rows with missing values
    )
    # Collapse every EDUCATION code above 4 into the value 4.
    cleaned = cleaned.assign(EDUCATION=cleaned["EDUCATION"].clip(upper=4))
    # A code of 0 in MARRIAGE or EDUCATION is treated as "not available".
    return cleaned.query("MARRIAGE != 0 and EDUCATION != 0")


def process_data(df_train, df_test):
    """Clean the train and test frames without mutating the inputs.

    For each frame: drop the ``ID`` column, rename
    ``"default payment next month"`` to ``"default"``, drop rows containing
    NaN, cap ``EDUCATION`` at 4, and remove rows where ``MARRIAGE`` or
    ``EDUCATION`` equals 0.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Raw frames. They must contain the ``ID``, ``EDUCATION`` and
        ``MARRIAGE`` columns.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        New cleaned ``(df_train, df_test)`` frames. The original row index
        is kept (it is not reset).

    Raises
    ------
    KeyError
        If a frame has no ``ID`` column.
    """
    # Working on copies keeps the raw frames intact, so the function can be
    # called again on the same inputs (the previous in-place drop could not).
    return _clean_credit_frame(df_train), _clean_credit_frame(df_test)

# Load the raw frames and clean them in one step.
train, test = process_data(*load_train_test_data())

In [14]:
# Paso 2: Dividir los datos en conjuntos de entrenamiento y prueba
def make_train_test_split(df_train, df_test):
    """Separate features from the ``default`` target for both frames.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where each ``x_*`` is the
        frame without the ``default`` column and each ``y_*`` is that column.
    """
    target = "default"
    features_train, labels_train = df_train.drop(columns=[target]), df_train[target]
    features_test, labels_test = df_test.drop(columns=[target]), df_test[target]
    return features_train, labels_train, features_test, labels_test

x_train, y_train, x_test, y_test = make_train_test_split(train, test)

# Report the dimensions of each split as a sanity check.
shape_report = [
    "Dimensiones de los conjuntos:",
    f"X_train: {x_train.shape}",
    f"X_test: {x_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_test: {y_test.shape}",
]
print("\n".join(shape_report))

Dimensiones de los conjuntos:
X_train: (20953, 23)
X_test: (8979, 23)
y_train: (20953,)
y_test: (8979,)


In [None]:
# Paso 3: Crear el pipeline con preprocesamiento y modelo
# Columns that are one-hot encoded; every other column is passed through as-is.
categorical_features = ["EDUCATION", "MARRIAGE", "SEX"]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore" encodes a category that was not seen during
        # fit as all zeros instead of raising, so a rare category missing from
        # a cross-validation fold (or from the training set) cannot break
        # transform/predict.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough",
)

# Preprocessing followed by a random forest; random_state fixes the seed so
# results are reproducible across runs.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)




In [None]:
# Paso 4: Búsqueda en grid para hiperparámetros
def grid_search(pipeline, X_train, y_train):
    """Fit a cross-validated grid search over the random-forest settings.

    Parameters
    ----------
    pipeline : estimator
        Pipeline containing a step named ``"classifier"``.
    X_train, y_train
        Training features and target passed to ``fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # The "classifier__" prefix routes each parameter to the pipeline step
    # named "classifier".
    search_space = {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [10, None],
        "classifier__min_samples_split": [10],
        "classifier__min_samples_leaf": [2, 4],
        "classifier__max_features": [25],
    }

    searcher = GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        cv=10,  # 10-fold cross-validation
        scoring="balanced_accuracy",
        n_jobs=-1,  # use all available cores
        verbose=1,
    )
    searcher.fit(X_train, y_train)
    return searcher

# Run the grid search on the training split; `model` is the fitted GridSearchCV object.
model = grid_search(pipeline, x_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [26]:
# Paso 5: Guardar el modelo

# Asegurar que el directorio existe
# Make sure the target directory exists before writing.
models_dir = "../files/models"
os.makedirs(models_dir, exist_ok=True)

# Persist the whole GridSearchCV object (not only best_estimator_) as a
# gzip-compressed pickle.
with gzip.open("../files/models/model.pkl.gz", mode="wb") as model_file:
    pickle.dump(model, model_file)

# 

In [24]:
def _metrics_record(dataset, y_true, y_pred):
    """Build the scalar-metrics record for one dataset split."""
    return {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
    }


def _confusion_record(dataset, y_true, y_pred):
    """Build the confusion-matrix record for one dataset split."""
    # Fixing labels guarantees a 2x2 matrix even if one class is absent from
    # y_true/y_pred. Assumes the target is encoded as 0/1, as the
    # "true_0"/"true_1" keys imply.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(cm[0][0]), "predicted_1": int(cm[0][1])},
        "true_1": {"predicted_0": int(cm[1][0]), "predicted_1": int(cm[1][1])},
    }


def save_metrics(
    model,
    x_train,
    y_train,
    x_test,
    y_test,
    output_path="../files/output/metrics.json",
):
    """Evaluate ``model`` on both splits and write the results as JSON lines.

    Four records are written, one JSON object per line, in this order:
    train metrics, test metrics, train confusion matrix, test confusion
    matrix.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x_train, y_train, x_test, y_test
        Features and true labels for the train and test splits.
    output_path : str
        Destination file. The default uses the same ``../files`` root as the
        input data and the saved model; parent directories are created if
        they are missing.
    """
    splits = (
        ("train", y_train, model.predict(x_train)),
        ("test", y_test, model.predict(x_test)),
    )
    records = [_metrics_record(*split) for split in splits]
    records += [_confusion_record(*split) for split in splits]

    # Create the output directory so a fresh checkout does not fail here.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as file:
        for record in records:
            file.write(json.dumps(record) + "\n")
# Compute the train/test metrics for the fitted grid-search model and write them to disk.
save_metrics(model, x_train, y_train, x_test, y_test)