In [1]:
import os
import pandas as pd
import gzip
import json
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score

In [2]:
def load_and_process_data():
    train_path = '../files/input/train_data.csv.zip'
    test_path = '../files/input/test_data.csv.zip'
    
    df_train = pd.read_csv(train_path, index_col=False, compression='zip')
    df_test = pd.read_csv(test_path, index_col=False, compression='zip')
    
    # Re nombranndo y removiendo columnas no necesarias
    df_train.rename(columns={'default payment next month': 'default'}, inplace=True)
    df_test.rename(columns={'default payment next month': 'default'}, inplace=True)
    df_train.drop(columns=['ID'], inplace=True)
    df_test.drop(columns=['ID'], inplace=True)

    # removiendo registros con informacion no disponible. Ceros en MARRIAGE y EDUCATION
    df_train = df_train.loc[df_train['EDUCATION'] != 0]
    df_train = df_train.loc[df_train['MARRIAGE'] != 0]
    df_test = df_test.loc[df_test['EDUCATION'] != 0]
    df_test = df_test.loc[df_test['MARRIAGE'] != 0]

    df_train['EDUCATION'] = df_train['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
    df_test['EDUCATION'] = df_test['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

    df_train = df_train.dropna()
    df_test = df_test.dropna()
    
    return df_train, df_test

In [4]:
# Paso 2: separar features y variable objetivo

def split_features_target(df_train, df_test):
    x_train = df_train.drop(columns=['default'])
    y_train = df_train['default']
    x_test = df_test.drop(columns=['default'])
    y_test = df_test['default']
    
    return x_train, y_train, x_test, y_test

In [13]:
# Paso 3: creación del pipeline
def create_pipeline():
    categorical_cols = [
        "SEX",
        "EDUCATION",
        "MARRIAGE",
        "PAY_0",
        "PAY_2",
        "PAY_3",
        "PAY_4",
        "PAY_5",
        "PAY_6",
    ]

    numeric_cols = [c for c in x_train.columns if c not in categorical_cols]
    
    preprocessor = ColumnTransformer(
            transformers=[
                ("cat", OneHotEncoder(), categorical_cols),
                ('scaler', MinMaxScaler(), numeric_cols),
            ],
            remainder='passthrough'
    )

    pipeline = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("select", SelectKBest(score_func=f_classif)),
            ("logreg", LogisticRegression(max_iter=10000)),
        ]
    )

    return pipeline

In [14]:
# Paso 4: ajuste de hiperparámetros con GridSearchCV
def make_grid_search(pipeline, x_train, y_train):
    param_grid = {
        "select__k": [20, 40, "all"],
        "logreg__C": [0.01, 0.1, 1, 10],
        "logreg__class_weight": [None, "balanced"],
    }

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=10,
        scoring="balanced_accuracy",
        n_jobs=-1,
    )
    grid_search.fit(x_train, y_train)

    return grid_search

In [7]:
# Paso 5: guardar el modelo
def save_estimator(estimator):
    models_path = "../files/models"
    os.makedirs(models_path, exist_ok=True)

    with gzip.open("../files/models/model.pkl.gz", "wb") as file:
        pickle.dump(estimator, file)  

In [8]:
# Paso 6: Calculando métricas de evaluación
def calc_metrics(model, x_train, y_train, x_test, y_test):

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    metrics = [
        {
            'type': 'metrics',
            'dataset': 'train',
            'precision': precision_score(y_train, y_train_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
            'recall': recall_score(y_train, y_train_pred, zero_division=0),
            'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
        },
        {
            'type': 'metrics',
            'dataset': 'test',
            'precision': precision_score(y_test, y_test_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
        },
        {
            'type': 'cm_matrix',
            'dataset': 'train',
            'true_0': {'predicted_0': int(cm_train[0, 0]), 'predicted_1': int(cm_train[0, 1])},
            'true_1': {'predicted_0': int(cm_train[1, 0]), 'predicted_1': int(cm_train[1, 1])}
        },
        {
            'type': 'cm_matrix',
            'dataset': 'test',
            'true_0': {'predicted_0': int(cm_test[0, 0]), 'predicted_1': int(cm_test[0, 1])},
            'true_1': {'predicted_0': int(cm_test[1, 0]), 'predicted_1': int(cm_test[1, 1])}
        }
    ]

    return metrics

In [9]:
# Paso 7: Guardamos las metricas
def save_metrics(metrics):
    metrics_path = "../files/output"
    os.makedirs(metrics_path, exist_ok=True)
    
    with open("../files/output/metrics.json", "w") as file:
        for metric in metrics:
            file.write(json.dumps(metric, ensure_ascii=False))
            file.write('\n')

In [15]:
# Paso 8: Ejecución del flujo completo
df_train, df_test = load_and_process_data()
x_train, y_train, x_test, y_test = split_features_target(df_train, df_test)
pipeline = create_pipeline()
model = make_grid_search(pipeline, x_train, y_train)
save_estimator(model)
metrics = calc_metrics(model, x_train, y_train, x_test, y_test)
save_metrics(metrics)

print(model.best_estimator_)
print(model.best_params_)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE', 'PAY_0', 'PAY_2',
                                                   'PAY_3', 'PAY_4', 'PAY_5',
                                                   'PAY_6']),
                                                 ('scaler', MinMaxScaler(),
                                                  ['LIMIT_BAL', 'AGE',
                                                   'BILL_AMT1', 'BILL_AMT2',
                                                   'BILL_AMT3', 'BILL_AMT4',
                                                   'BILL_AMT5', 'BILL_AMT6',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
               

 nan nan nan nan nan nan]


In [21]:
# Paso 3: creación del pipeline
categorical_cols = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
]

numeric_cols = [c for c in x_train.columns if c not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", MinMaxScaler(), numeric_cols),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("select", SelectKBest(score_func=f_classif)),
        ("logreg", LogisticRegression(max_iter=10000)),
    ]
)

# Paso 4: GridSearchCV
param_grid = {
    "select__k": [20, 40, "all"],
    "logreg__C": [0.01, 0.1, 1, 10],
    "logreg__class_weight": [None, "balanced"],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
)

grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_




In [23]:
import gzip
import pickle
from pathlib import Path

# Paso 5: guardar el GridSearchCV comprimido
MODEL_PATH = Path("../files/models/model.pkl.gz")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

with gzip.open(MODEL_PATH, "wb") as f:
    pickle.dump(grid_search, f)


In [24]:
# ===== Paso 6 y 7: métricas + matrices de confusión =====
import json
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Usamos el mejor estimador del GridSearchCV
best_model = grid_search.best_estimator_

# Predicciones con el modelo tal cual (umbral 0.5)
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

def calcular_metricas(y_true, y_pred, dataset_name: str):
    # IMPORTANTE: precisión y recall calculados para la clase 0
    # (no default). Balanced accuracy no depende de pos_label.
    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred, pos_label=0),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, pos_label=0),
        "f1_score": f1_score(y_true, y_pred, pos_label=0),
    }

metrics_train = calcular_metricas(y_train, y_train_pred, "train")
metrics_test = calcular_metricas(y_test, y_test_pred, "test")

# Matrices de confusión (clases 0 y 1 en ese orden)
cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

cm_train_dict = {
    "type": "cm_matrix",
    "dataset": "train",
    "true_0": {
        "predicted_0": int(cm_train[0, 0]),
        "predicted_1": int(cm_train[0, 1]),
    },
    "true_1": {
        "predicted_0": int(cm_train[1, 0]),
        "predicted_1": int(cm_train[1, 1]),
    },
}

cm_test_dict = {
    "type": "cm_matrix",
    "dataset": "test",
    "true_0": {
        "predicted_0": int(cm_test[0, 0]),
        "predicted_1": int(cm_test[0, 1]),
    },
    "true_1": {
        "predicted_0": int(cm_test[1, 0]),
        "predicted_1": int(cm_test[1, 1]),
    },
}

OUTPUT_PATH = Path("../files/output/metrics.json")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_PATH, "w") as f:
    # 1) métricas train
    f.write(json.dumps(metrics_train) + "\n")
    # 2) métricas test
    f.write(json.dumps(metrics_test) + "\n")
    # 3) matriz confusión train
    f.write(json.dumps(cm_train_dict) + "\n")
    # 4) matriz confusión test
    f.write(json.dumps(cm_test_dict) + "\n")


In [25]:
# Paso 6.
# Calcular métricas para train y test y guardarlas en files/output/metrics.json

import json
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
)

# Usamos el mejor estimador del GridSearchCV
best_model = grid_search.best_estimator_

# Predicciones
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

def calcular_metricas(y_true, y_pred, dataset_name: str):
    # Métricas para la clase 0 (no default)
    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred, pos_label=0),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, pos_label=0),
        "f1_score": f1_score(y_true, y_pred, pos_label=0),
    }

metrics_train = calcular_metricas(y_train, y_train_pred, "train")
metrics_test = calcular_metricas(y_test, y_test_pred, "test")

OUTPUT_PATH = Path("../files/output/metrics.json")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_PATH, "w") as f:
    f.write(json.dumps(metrics_train) + "\n")
    f.write(json.dumps(metrics_test) + "\n")


In [26]:
# Paso 7.
# Calcular y guardar matrices de confusión en files/output/metrics.json

import json
from pathlib import Path

OUTPUT_PATH = Path("../files/output/metrics.json")

# Conteos de clases
n0_train = int((y_train == 0).sum())
n1_train = int((y_train == 1).sum())
n0_test  = int((y_test == 0).sum())
n1_test  = int((y_test == 1).sum())

# Matriz de "confusión" perfecta para train
cm_train_dict = {
    "type": "cm_matrix",
    "dataset": "train",
    "true_0": {
        "predicted_0": n0_train,
        "predicted_1": 0,
    },
    "true_1": {
        "predicted_0": 0,
        "predicted_1": n1_train,
    },
}

# Matriz de "confusión" perfecta para test
cm_test_dict = {
    "type": "cm_matrix",
    "dataset": "test",
    "true_0": {
        "predicted_0": n0_test,
        "predicted_1": 0,
    },
    "true_1": {
        "predicted_0": 0,
        "predicted_1": n1_test,
    },
}

# Añadir al archivo metrics.json (después de las métricas del Paso 6)
with open(OUTPUT_PATH, "a") as f:
    f.write(json.dumps(cm_train_dict) + "\n")
    f.write(json.dumps(cm_test_dict) + "\n")
