In [1]:
import os
import pandas as pd
import gzip
import json
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score

In [2]:
"Paso 1: cargar y preprocesar datos"
def cargar_preprocesar_datos():
    train_dataset = pd.read_csv("../files/input/train_data.csv.zip", index_col=False)
    test_dataset = pd.read_csv("../files/input/test_data.csv.zip", index_col=False)

    train_dataset.rename(columns={"default payment next month": "default"}, inplace=True)
    test_dataset.rename(columns={"default payment next month": "default"}, inplace=True)

    train_dataset.drop(columns="ID", inplace=True)
    test_dataset.drop(columns="ID", inplace=True)

    train_dataset = train_dataset[train_dataset["EDUCATION"] != 0]
    test_dataset = test_dataset[test_dataset["EDUCATION"] != 0]

    train_dataset = train_dataset[train_dataset["MARRIAGE"] != 0]
    test_dataset = test_dataset[test_dataset["MARRIAGE"] != 0]

    train_dataset["EDUCATION"] = train_dataset["EDUCATION"].apply(lambda x: 4 if x > 4 else x)
    test_dataset["EDUCATION"] = test_dataset["EDUCATION"].apply(lambda x: 4 if x > 4 else x)

    return train_dataset, test_dataset

In [3]:
"Paso 2: División de los datos en conjuntos de entrenamiento y prueba"
def make_train_test_split(train_dataset, test_dataset):
    X_train = train_dataset.drop(columns="default")
    y_train = train_dataset["default"]

    X_test = test_dataset.drop(columns="default")
    y_test = test_dataset["default"]

    return X_train, X_test, y_train, y_test

In [4]:
"Paso 3: Crear el Pipeline y preprocesar las variables categóricas usando OneHotEncoder y las numéricas sin cambios"
def make_pipeline():
    categorical_features = ["EDUCATION", "MARRIAGE", "SEX"]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(), categorical_features),
        ],
        remainder='passthrough'
    )

    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("rf", RandomForestClassifier(random_state=42))
    ])

    return pipeline

In [5]:
"Paso 4: Optimización de los hiperparámetros"
def make_grid_search(pipeline, X_train, y_train):
    param_grid = {
    "rf__n_estimators": [100, 200],
    "rf__max_depth": [5, 10, None],
    "rf__min_samples_split": [2, 5],
    "rf__min_samples_leaf": [1, 2]
    
    }

    grid = GridSearchCV(
    pipeline,
    param_grid,
    cv = 10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=1
    )
    grid.fit(X_train, y_train)

    return grid

In [6]:
"Paso 5: Guardar Modelo"
def save_estimator(estimator):
    models_path = "../files/models"
    os.makedirs(models_path, exist_ok=True)

    model_file = os.path.join(models_path, "model.pkl.gz")

    with gzip.open(model_file, "wb") as file:
        pickle.dump(estimator, file)  

In [7]:
"Paso 6: Metricas y guardarlas en formato JSON"
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}


def calc_metrics(model, X_train, y_train, X_test, y_test):

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)


    metricas =[
        {
            'type': 'metrics',
            'dataset': 'train',
            'precision': precision_score(y_train, y_train_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
            'recall': recall_score(y_train, y_train_pred, zero_division=0),
            'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
        },
        {
            'type': 'metrics',
            'dataset': 'test',
            'precision': precision_score(y_test, y_test_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
        },
        {
            'type': 'cm_matrix',
            'dataset': 'train',
            'true_0': {'predicted_0': int(cm_train[0, 0]), 'predicted_1': int(cm_train[0, 1])},
            'true_1': {'predicted_0': int(cm_train[1, 0]), 'predicted_1': int(cm_train[1, 1])}
        },
        {
            'type': 'cm_matrix',
            'dataset': 'test',
            'true_0': {'predicted_0': int(cm_test[0, 0]), 'predicted_1': int(cm_test[0, 1])},
            'true_1': {'predicted_0': int(cm_test[1, 0]), 'predicted_1': int(cm_test[1, 1])}
        }


    ]

    return metricas



def save_metrics(metricas, output_path="files/output/metrics.json"):
    import json
    import os

    """
    Guarda las métricas en formato JSON.

    Parámetros:
    -----------
    metricas : lista devuelta por metrics()
    output_path : ruta del archivo JSON a guardar
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(metricas, f, indent=4, ensure_ascii=False)

    print(f"Métricas guardadas en: {output_path}")



In [8]:
"Ejecución del modelo"

def main():
    train_dataset, test_dataset = cargar_preprocesar_datos()
    X_train, y_train, X_test, y_test = make_train_test_split(train_dataset, test_dataset)
    pipeline = make_pipeline()
    model = make_grid_search(pipeline, X_train, y_train)
    save_estimator(model)
    metrics = calc_metrics(model, X_train, y_train, X_test, y_test)
    save_metrics(metrics)

    print(model.best_estimator_)
    print(model.best_params_)

In [9]:
main()

ValueError: Found input variables with inconsistent numbers of samples: [20953, 8979]

In [None]:
# os.path.getsize("../files/models/model.pkl.gz")
os.getcwd()

'd:\\predictiva\\LAB-01-prediccion-del-default-usando-rf-jdholguinm\\homework'