In [1]:
# Zipped CSV inputs, given relative to the notebook's working directory.
train_path = "../files/input/train_data.csv.zip"
test_path = "../files/input/test_data.csv.zip"

# Columns that are one-hot encoded by the preprocessing step of the pipeline.
categorical_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
]

In [2]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
# - Renombre la columna "default payment next month" a "default"
# - Remueva la columna "ID".

In [3]:
def clean_data(df):
    """Return a cleaned copy of a raw credit-default frame.

    Steps, in order:
      1. rename "default payment next month" to "default";
      2. drop the "ID" column when it is present;
      3. drop every row that contains a missing value;
      4. when an "EDUCATION" column exists, cast it to int and cap it at 4,
         so every level above 4 falls into category 4 ("others").

    The frame passed in is not modified.
    """
    renamed = df.rename(columns={"default payment next month": "default"})
    without_id = renamed.drop(columns=['ID']) if 'ID' in renamed.columns else renamed
    complete = without_id.dropna()

    if 'EDUCATION' not in complete.columns:
        return complete

    # Capping at 4 is the same as overwriting every value greater than 4 with 4.
    education = complete['EDUCATION'].astype(int).clip(upper=4)
    return complete.assign(EDUCATION=education)

In [4]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

In [5]:
def load_clean_data(test_path: str, train_path: str):
    """Read both zipped CSV files, clean them, and split features from target.

    Note the argument order (test first, then train) differs from the return
    order.

    Args:
        test_path: path to the zip-compressed test CSV.
        train_path: path to the zip-compressed train CSV.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` where each ``x_*`` is the
        cleaned frame without the "default" column and each ``y_*`` is that
        column cast to int.
    """
    import pandas as pd

    # Name of the target column after clean_data has renamed it.
    target_col = 'default'

    splits = {}
    for split_name, csv_path in (('test', test_path), ('train', train_path)):
        frame = clean_data(
            pd.read_csv(csv_path, index_col=False, compression='zip')
        )
        features = frame.drop(columns=[target_col])
        target = frame[target_col].astype(int)
        splits[split_name] = (features, target)

    data_train, target_train = splits['train']
    data_test, target_test = splits['test']
    return data_train, target_train, data_test, target_test

In [6]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Ajusta un modelo de bosques aleatorios (random forest).

In [7]:
def make_pipeline(estimator, categorical_features=None):
    """Build a two-step pipeline: one-hot preprocessing followed by ``estimator``.

    Args:
        estimator: the classifier placed in the final "classifier" step.
        categorical_features: columns to one-hot encode. When omitted, the
            notebook-level ``categorical_cols`` list is used, which keeps the
            original one-argument call working unchanged.

    Returns:
        A ``Pipeline`` with steps "preprocessor" (a ``ColumnTransformer``) and
        "classifier".
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    if categorical_features is None:
        # Fall back to the notebook configuration.
        categorical_features = categorical_cols

    transformer = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' is passed so that categories absent from
            # the fitted data do not raise at transform time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        remainder='passthrough',  # every other column is forwarded unchanged
    )

    return Pipeline(
        steps=[
            ('preprocessor', transformer),
            ('classifier', estimator),
        ],
        verbose=False,
    )

In [8]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

In [9]:
def make_grid_search(estimator, param_grid, cv=10, scoring='balanced_accuracy', n_jobs=None):
    """Wrap ``estimator`` in a cross-validated grid search.

    Args:
        estimator: the estimator (here, a pipeline) to tune.
        param_grid: mapping of parameter names to candidate values.
        cv: cross-validation setting forwarded to ``GridSearchCV`` (default 10).
        scoring: metric used to rank candidates (default balanced accuracy).
        n_jobs: parallelism setting forwarded to ``GridSearchCV``.

    Returns:
        The unfitted ``GridSearchCV`` instance.
    """
    from sklearn.model_selection import GridSearchCV

    search_options = dict(
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    return GridSearchCV(estimator=estimator, **search_options)


In [10]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

In [11]:
def save_estimator(estimator):
    """Pickle ``estimator`` into a gzip file at ../files/models/model.pkl.gz.

    The target directory is created when missing; an existing file is
    overwritten.
    """
    import gzip
    import os
    import pickle

    folder, filename = '../files/models', 'model.pkl.gz'
    os.makedirs(folder, exist_ok=True)

    with gzip.open(os.path.join(folder, filename), 'wb') as sink:
        pickle.dump(estimator, sink)

In [12]:
def load_estimator():
    """Load the model stored at ../files/models/model.pkl.gz.

    Returns:
        The unpickled object, or ``None`` when the file does not exist.
    """
    import os
    import gzip
    import pickle

    model_path = os.path.join('../files/models', 'model.pkl.gz')

    if os.path.exists(model_path):
        # Unpickling can run arbitrary code: only load files this notebook wrote.
        with gzip.open(model_path, 'rb') as source:
            return pickle.load(source)

    return None

In [13]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}
#

In [14]:
def train_estimator(estimator):
    """Fit ``estimator`` and persist whichever model wins against the saved one.

    The candidate is fitted on the training split. When a model is already
    stored on disk, both models are scored on the test split and the stored
    model is kept if either:

      * its balanced accuracy is strictly higher than the candidate's, or
      * its balanced accuracy exceeds both 90% of the candidate's and 0.673,
        and its precision is at least the candidate's.

    In every other case the freshly fitted candidate is saved.

    NOTE(review): the winner is chosen using the test split, so metrics later
    reported on that split are likely optimistic - confirm this is intended.

    Args:
        estimator: an unfitted estimator exposing ``fit`` and ``predict``.
    """
    from sklearn.metrics import balanced_accuracy_score, precision_score

    # Thresholds of the "close enough" rule.
    # NOTE(review): 0.673 is presumably a target score for this exercise - confirm.
    relative_floor = 0.90
    absolute_floor = 0.673

    x_train, y_train, x_test, y_test = load_clean_data(test_path, train_path)

    estimator.fit(x_train, y_train)
    best_estimator = load_estimator()

    if best_estimator is not None:
        # Predict once per model; both metrics reuse the same predictions.
        saved_pred = best_estimator.predict(x_test)
        current_pred = estimator.predict(x_test)

        saved_bas = balanced_accuracy_score(y_true=y_test, y_pred=saved_pred)
        current_bas = balanced_accuracy_score(y_true=y_test, y_pred=current_pred)

        if saved_bas > current_bas:
            estimator = best_estimator

        elif saved_bas > relative_floor * current_bas and saved_bas > absolute_floor:
            saved_ps = precision_score(y_true=y_test, y_pred=saved_pred)
            current_ps = precision_score(y_true=y_test, y_pred=current_pred)

            # Ties go to the model already on disk.
            if saved_ps >= current_ps:
                estimator = best_estimator

    save_estimator(estimator)

In [15]:
def train_rf_estimator():
    """Grid-search a random-forest pipeline and hand it to ``train_estimator``.

    The grid has 2 x 3 x 2 x 2 = 24 candidates, evaluated with 10-fold
    cross-validation on balanced accuracy using all available cores.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(random_state=42)

    # Keys use the "<step>__<parameter>" form to address the pipeline's
    # "classifier" step.
    search_space = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2],
    }

    train_estimator(
        make_grid_search(
            estimator=make_pipeline(forest),
            param_grid=search_space,
            cv=10,
            scoring='balanced_accuracy',
            n_jobs=-1,
        )
    )

# Run the grid search defined above; the selected model is written to disk.
train_rf_estimator()    

In [16]:
def eval_metrics(estimator, x_train, y_train, x_test, y_test):
    """Score ``estimator`` on both splits and build the four report records.

    Args:
        estimator: fitted model exposing ``predict``.
        x_train, y_train: training features and labels.
        x_test, y_test: test features and labels.

    Returns:
        ``(metrics_train, metrics_test, cm_train, cm_test)``. The metrics
        records hold precision, balanced accuracy, recall and F1 as floats.
        The confusion-matrix records hold integer counts for labels 0 and 1.
    """
    from sklearn.metrics import (
        precision_score,
        balanced_accuracy_score,
        recall_score,
        f1_score,
        confusion_matrix,
    )

    # Predict once per split; every record below reuses these arrays.
    predicted = {
        "train": (y_train, estimator.predict(x_train)),
        "test": (y_test, estimator.predict(x_test)),
    }

    def build_metrics(dataset):
        y_true, y_pred = predicted[dataset]
        return {
            "type": "metrics",
            "dataset": dataset,
            "precision": float(precision_score(y_true, y_pred, zero_division=0)),
            "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
            "recall": float(recall_score(y_true, y_pred, zero_division=0)),
            "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
        }

    def build_confusion(dataset):
        y_true, y_pred = predicted[dataset]
        counts = confusion_matrix(y_true.astype(int), y_pred.astype(int), labels=[0, 1])
        # Rows are true labels and columns are predicted labels.
        (tn, fp), (fn, tp) = counts.tolist()
        return {
            "type": "cm_matrix",
            "dataset": dataset,
            "true_0": {"predicted_0": tn, "predicted_1": fp},
            "true_1": {"predicted_0": fn, "predicted_1": tp},
        }

    # Keep this order: metrics for train and test, then matrices for train and test.
    return (
        build_metrics("train"),
        build_metrics("test"),
        build_confusion("train"),
        build_confusion("test"),
    )

In [17]:
def save_report(
   metrics_train,
   metrics_test,
   cm_train,
   cm_test,
):
    """Write the four report records to ../files/output/metrics.json.

    The file holds one JSON object per line, in this order: train metrics,
    test metrics, train confusion matrix, test confusion matrix. An existing
    file is overwritten.

    Args:
        metrics_train, metrics_test: JSON-serialisable metrics records.
        cm_train, cm_test: JSON-serialisable confusion-matrix records.
    """
    import json
    import os

    path = '../files/output/metrics.json'

    # exist_ok avoids the check-then-create race and matches save_estimator.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # newline='\n' keeps line endings identical on every platform.
    with open(path, 'w', encoding='utf-8', newline='\n') as f:
        for record in (metrics_train, metrics_test, cm_train, cm_test):
            f.write(json.dumps(record) + '\n')


In [18]:
def print_report(metrics_train, metrics_test, cm_train=None, cm_test=None):
    """Print a compact summary of the metrics.

    Each metric is shown as the test value followed by the train value in
    parentheses. When ``cm_test`` is given, the test confusion matrix is
    printed as well.

    Args:
        metrics_train, metrics_test: mappings with the keys
            "balanced_accuracy", "precision", "recall" and "f1_score".
        cm_train: accepted for call compatibility; nothing is printed from it.
        cm_test: optional mapping with "true_0" / "true_1" rows, each holding
            "predicted_0" and "predicted_1" counts.
    """
    rule = "-" * 80

    def fmt(name, test_val, train_val):
        return f"{name:>20}: {test_val:.4f} ({train_val:.4f})"

    print(rule)
    print("Metrics summary (test (train))")
    print(rule)
    for label, key in (
        ("Balanced Accuracy", "balanced_accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-score", "f1_score"),
    ):
        print(fmt(label, metrics_test[key], metrics_train[key]))

    # Only the test matrix is rendered, so only cm_test gates this section.
    if cm_test:
        print(rule)
        print("Confusion matrix (test):")
        for row in ("true_0", "true_1"):
            cells = cm_test[row]
            print(f" {row} -> predicted_0: {cells['predicted_0']}, predicted_1: {cells['predicted_1']}")
    print(rule)

In [19]:
def check_estimator():
    """Evaluate the saved model, write the metrics report, and print a summary.

    Raises:
        FileNotFoundError: when no saved model is available, instead of the
            opaque ``AttributeError`` that calling ``predict`` on ``None``
            would produce.
    """
    # Fail fast before reading the datasets if there is nothing to evaluate.
    estimator = load_estimator()
    if estimator is None:
        raise FileNotFoundError(
            "No saved model found; run train_rf_estimator() before check_estimator()."
        )

    x_train, y_train, x_test, y_test = load_clean_data(test_path, train_path)

    metrics_train, metrics_test, cm_train, cm_test = eval_metrics(
        estimator,
        x_train,
        y_train,
        x_test,
        y_test,
    )
    save_report(
        metrics_train,
        metrics_test,
        cm_train,
        cm_test,
    )
    print_report(
        metrics_train,
        metrics_test,
        cm_train,
        cm_test,
    )

# Evaluate the saved model, write metrics.json, and print the summary shown below.
check_estimator()


--------------------------------------------------------------------------------
Metrics summary (test (train))
--------------------------------------------------------------------------------
   Balanced Accuracy: 0.6745 (0.8986)
           Precision: 0.6681 (0.9932)
              Recall: 0.4028 (0.7988)
            F1-score: 0.5026 (0.8854)
--------------------------------------------------------------------------------
Confusion matrix (test):
 true_0 -> predicted_0: 6709, predicted_1: 382
 true_1 -> predicted_0: 1140, predicted_1: 769
--------------------------------------------------------------------------------


In [20]:
# Inspect the persisted grid search. The model is unpickled once and reused
# for both printouts instead of being read from disk twice.
grid_search = load_estimator()

# Despite its name, this holds the refitted best pipeline (best_estimator_),
# not a dictionary of parameters.
best_params = grid_search.best_estimator_
print(best_params)
print('_______________________________')
print(grid_search.get_params())

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE'])])),
                ('classifier',
                 RandomForestClassifier(min_samples_leaf=2, min_samples_split=5,
                                        n_estimators=200, random_state=42))])
_______________________________
{'cv': 10, 'error_score': nan, 'estimator__memory': None, 'estimator__steps': [('preprocessor', ColumnTransformer(remainder='passthrough',
                  transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['SEX', 'EDUCATION', 'MARRIAGE'])])), ('classifier', RandomForestClassifier(random_state=42))], 'estimator__transform_input': None, 'estimator__ver

def eval_metricts(
    estimator,
    x_train,
    y_train,
    x_test,
    y_test
):
    """Return precision, balanced accuracy, recall and F1 for both splits.

    The misspelled name is kept so existing references keep working.
    ``eval_metrics`` is the variant used by ``check_estimator``.

    Args:
        estimator: fitted model exposing ``predict``.
        x_train, y_train: training features and labels.
        x_test, y_test: test features and labels.

    Returns:
        An 8-tuple ``(precision_train, precision_test, accuracy_train,
        accuracy_test, recall_train, recall_test, f1_train, f1_test)``, where
        "accuracy" is the balanced accuracy.
    """
    from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

    y_pred_train = estimator.predict(x_train)
    y_pred_test = estimator.predict(x_test)

    # The previous version also called confusion_matrix here without importing
    # it, so every call raised NameError. Its result was never used, so the
    # call is removed.
    scores = []
    for scorer in (precision_score, balanced_accuracy_score, recall_score, f1_score):
        scores.append(scorer(y_train, y_pred_train))
        scores.append(scorer(y_test, y_pred_test))

    return tuple(scores)
