# *Laboratorio 1* #

In [1]:
import pandas as pd  #  type: ignore
import numpy as np  #  type: ignore
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
train_path = "../files/input/train_data.csv.zip"
test_path = "../files/input/test_data.csv.zip"

train_df = pd.read_csv(train_path, compression="zip")
test_df = pd.read_csv(test_path, compression="zip")

In [3]:
train_df.shape, test_df.shape

((21000, 25), (9000, 25))

### Paso 1: Carga y limpieza de datos ###

Realice la limpieza de los datasets:
- Renombre la columna "default payment next month" a "default".
- Remueva la columna "ID".
- Elimine los registros con informacion no disponible.
- Para la columna EDUCATION, valores > 4 indican niveles superiores
  de educación, agrupe estos valores en la categoría "others".
- Renombre la columna "default payment next month" a "default"
- Remueva la columna "ID".

In [4]:
# ======================================================
# === Paso 1: Carga y limpieza de datos ===============
# ======================================================

def limpieza(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={"default payment next month": "default"})
    df = df.drop(columns=["ID"])
    df = df.dropna()
    df["EDUCATION"] = df["EDUCATION"].replace({5: 4, 6: 4})
    return df

train_df = limpieza(train_df)
test_df = limpieza(test_df)

### Paso 2: Dividir los datasets ###
- Divida los datasets en x_train, y_train, x_test, y_test.

In [5]:
# ====================================================================================
# === Paso 2: División de datos en conjuntos de entrenamiento y prueba ===============
# ====================================================================================

def split_datasets(train_data, test_data):

    # train
    X_train = train_data.drop(columns=["default"])
    y_train = train_data["default"]

    #test
    X_test = test_data.drop(columns=["default"])
    y_test = test_data["default"]

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = split_datasets(train_df, test_df)

### Paso 3: creación pipeline ###
Cree un pipeline para el modelo de clasificación. Este pipeline debe
contener las siguientes capas:
- Transforma las variables categoricas usando el método
  one-hot-encoding.
- Ajusta un modelo de bosques aleatorios (rando forest).

In [7]:
# ======================================================
# === Paso 3: Pipeline del modelo ======================
# ======================================================

def make_pipeline(cat_cols, random_state=477):

    encoder = OneHotEncoder(handle_unknown="ignore")
    preprocessor = ColumnTransformer(
        transformers=[("cat", encoder, cat_cols)],
        remainder="passthrough"
    )

    rf = RandomForestClassifier(random_state=random_state)

    pipeline = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", rf)
    ])

    return pipeline


cat_cols = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
pipeline = make_pipeline(cat_cols)
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Paso 4: Optimizacion de parámetros ###

- Optimice los hiperparametros del pipeline usando validación cruzada.
- Use 10 splits para la validación cruzada. Use la función de precision
- balanceada para medir la precisión del modelo.

In [10]:
# ======================================================
# === Paso 4: Optimización de hiperparámetros ==========
# ======================================================

def optimize_hyperparameters(pipeline, X_train, y_train):

    param_grid = {
        "model__n_estimators": [100, 200, 300],
        "model__max_depth": [None, 5, 10, 15],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ['sqrt']
    }

    scorer = make_scorer(balanced_accuracy_score)

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='balanced_accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train, y_train)

    return grid_search

grid_search = optimize_hyperparameters(pipeline, X_train, y_train)



Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=  18.5s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=  18.6s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=  18.7s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=  18.7s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=  19.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=  3

### Paso 5: Guardar el modelo ###

- Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
- Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

In [11]:
# ======================================================
# === Paso 5: Guardar modelo ====================
# ======================================================

def save_estimator(estimator):
    models_path = "files/models"
    os.makedirs(models_path, exist_ok=True)

    with gzip.open("files/models/model.pkl.gz", "wb") as file:
        pickle.dump(estimator, file)     
    print(f"Modelo guardado en: {'files/models/model.pkl.gz'}")


def load_estimator(output_path):
    """Cargar modelo comprimido"""
    import gzip, pickle
    if not os.path.exists(output_path):
        return None
    with gzip.open(output_path, "rb") as f:
        return pickle.load(f)

# Paso 6.
- Calcule las metricas de precision, precision balanceada, recall,
y f1-score para los conjuntos de entrenamiento y prueba.
- Guardelas en el archivo files/output/metrics.json. Cada fila
 del archivo es un diccionario con las metricas de un modelo.
Este diccionario tiene un campo para indicar si es el conjunto
de entrenamiento o prueba. Por ejemplo:

- {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
- {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

In [None]:
# ======================================================
# === Paso 6: Matrices de confusión ====================
# ======================================================

from typing import Tuple, Dict
import json
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score, balanced_accuracy_score
)

def build_rows(y_true, y_pred, dataset: str):
    metrics_row = {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
    }
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    cm_row = {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
    }
    return metrics_row, cm_row

best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

m_train, cm_train = build_rows(y_train, y_train_pred, "train")
m_test, cm_test = build_rows(y_test, y_test_pred, "test")

# Orden requerido por el autograder
records = [m_train, m_test, cm_train, cm_test]

from pathlib import Path

metrics_path = Path("../files/output/metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

metrics_path, metrics_path.exists()

(PosixPath('../files/output/metrics.json'), True)