In [229]:
import os
import pandas as pd
import pickle
import gzip
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix,
)
import json
from sklearn.metrics import make_scorer, balanced_accuracy_score


# Base paths: the project root is taken to be the parent of the kernel's
# working directory (the notebook is expected to run from a sub-folder).
current_dir = os.getcwd()
base_project_path = os.path.split(current_dir)[0]

In [230]:
# Input location: <project root>/files/input holds the zipped CSV splits
input_folder = os.path.join(base_project_path, "files", "input")
train_file, test_file = (
    os.path.join(input_folder, file_name)
    for file_name in ("train_data.csv.zip", "test_data.csv.zip")
)

# Read both splits straight from their zip archives
df_train, df_test = (
    pd.read_csv(csv_path, compression='zip')
    for csv_path in (train_file, test_file)
)

In [231]:
# Column clean-up, applied identically to the train and test frames
def _clean_dataset(df):
    """Return a cleaned copy of a raw split.

    Steps, in order:
      1. Rename the target column "default payment next month" to "default".
      2. Drop the "ID" column when it is present.
      3. Drop every row that contains a missing value.
      4. Cap EDUCATION at 4 so that codes above 4 (e.g. 5 and 6) fall into
         the "others" category.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split as read from disk; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the steps above applied; the original row index is kept.
    """
    return (
        df.rename(columns={"default payment next month": "default"})
        # errors="ignore" makes the drop a no-op when "ID" is absent
        .drop(columns=["ID"], errors="ignore")
        .dropna()
        # Vectorized equivalent of `x if x <= 4 else 4`
        .assign(EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4))
    )


df_train = _clean_dataset(df_train)
df_test = _clean_dataset(df_test)

# Quick sanity check of the result
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
print(df_train.head())

Train shape: (21000, 24)
Test shape: (9000, 24)
   LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0     310000    1          3         1   32      0      0      0      0   
1      10000    2          3         1   49     -1     -1     -2     -1   
2      50000    1          2         1   28     -1     -1     -1      0   
3      80000    2          3         1   52      2      2      3      3   
4     270000    1          1         2   34      1      2      0      0   

   PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0      0  ...      84373      57779      14163      8295      6000      4000   
1      2  ...       1690       1138        930         0         0      2828   
2     -1  ...      45975       1300      43987         0     46257      2200   
3      3  ...      40748      39816      40607      3700      1600      1600   
4      2  ...      22448      15490      17343         0      4000      2000   

   PAY_AMT4  PAY_AMT

In [232]:
# Step 2: split each dataset into features (x) and target (y)

# Train split: every column except "default" is a feature
x_train, y_train = df_train.drop(columns=["default"]), df_train["default"]

# Test split, same layout
x_test, y_test = df_test.drop(columns=["default"]), df_test["default"]

In [233]:

# Steps 3 and 4: build the pipeline and tune its hyperparameters

# Fixed seed so the data shuffling done by the liblinear / saga solvers is
# reproducible from one run of the notebook to the next.
RANDOM_STATE = 42

# Categorical columns are one-hot encoded; every other column is treated as
# numerical and scaled to [0, 1].
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
numerical_cols = [c for c in x_train.columns if c not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ('num', MinMaxScaler(), numerical_cols)
])

# Pipeline: preprocess -> keep the k best features (ANOVA F-test) -> logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('select', SelectKBest(score_func=f_classif)),
    ('classifier', LogisticRegression(max_iter=2000, random_state=RANDOM_STATE))
])

# Hyperparameter grid explored by the search
param_grid = {
    'select__k': [1, 2, 3, 4, 5, 6, 10],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__C': [0.65, 0.67, 0.68, 0.69, 0.7]
}

# 10-fold cross-validated grid search, scored with balanced accuracy
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1
)

# Fit
model.fit(x_train, y_train)

# Results
# Best cross-validation score and the parameters that produced it
print("Mejor score balanced_accuracy en CV:", model.best_score_)
print("Mejores parámetros:", model.best_params_)

# Score on the training set (same scorer as the search: balanced accuracy)
train_score = model.score(x_train, y_train)
print("Score en conjunto de entrenamiento:", train_score)

# Score on the test set
test_score = model.score(x_test, y_test)
print("Score en conjunto de prueba:", test_score)


Fitting 10 folds for each of 70 candidates, totalling 700 fits
Mejor score balanced_accuracy en CV: 0.6391944527283384
Mejores parámetros: {'classifier__C': 0.65, 'classifier__solver': 'liblinear', 'select__k': 1}
Score en conjunto de entrenamiento: 0.6391959574465568
Score en conjunto de prueba: 0.654602898974264


In [234]:
# -------------------------------------------------------------------
# Destination of the serialized model: <project root>/files/models/model.pkl.gz
models_dir = os.path.join(base_project_path, "files", "models")
model_path = os.path.join(models_dir, "model.pkl.gz")

# Make sure the target folder exists
os.makedirs(models_dir, exist_ok=True)

# Persist the whole fitted GridSearchCV object (pipeline included), gzip-compressed
with gzip.open(model_path, "wb") as f:
    pickle.dump(model, f)

print(f"Modelo guardado correctamente en {model_path}")

Modelo guardado correctamente en c:\Descriptiva\LAB-02-prediccion-del-default-usando-logreg-jurodriguezra-ops\files\models\model.pkl.gz


In [236]:
# Metrics are written to <project root>/files/output/metrics.json;
# create the folder up front so the later write cannot fail on a missing directory.
output_dir = os.path.join(base_project_path, "files", "output")
metrics_path = os.path.join(output_dir, "metrics.json")
os.makedirs(output_dir, exist_ok=True)

def compute_metrics_and_cm(y_true, y_pred, dataset_name):
    """Build the classification-metrics record for one dataset split.

    Despite its name, this function returns only the metrics record; the
    confusion-matrix record is produced by ``compute_cm``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    dataset_name : str
        Label stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    list[dict]
        A one-element list holding a dict with the keys "type", "dataset",
        "precision" (rounded to 4 decimals), "balanced_accuracy", "recall"
        and "f1_score". Metric values are plain Python floats.
    """
    return [
        {
            "type": "metrics",
            "dataset": dataset_name,
            # Only precision is rounded, as in the original record layout
            "precision": round(float(precision_score(y_true, y_pred)), 4),
            "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
            "recall": float(recall_score(y_true, y_pred)),
            "f1_score": float(f1_score(y_true, y_pred)),
        }
    ]

def compute_cm(y_true, y_pred, dataset_name):
    """Build the confusion-matrix record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; expected to be the binary codes 0 and 1.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    dataset_name : str
        Label stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    list[dict]
        A one-element list holding a dict with the keys "type", "dataset",
        "true_0" and "true_1"; each "true_*" entry maps "predicted_0" and
        "predicted_1" to an integer count.
    """
    # Pin the label order so the matrix is always 2x2, even when one class is
    # absent from both y_true and y_pred (otherwise cm[0, 1] raises IndexError).
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return [{
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
    }]

# Predict once per split and reuse the result for both record types
# (previously each split was predicted twice).
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Metrics records first, then confusion-matrix records (train before test)
metrics = []
metrics += compute_metrics_and_cm(y_train, y_train_pred, "train")
metrics += compute_metrics_and_cm(y_test, y_test_pred, "test")
metrics += compute_cm(y_train, y_train_pred, "train")
metrics += compute_cm(y_test, y_test_pred, "test")

# Write every record to a single file, one JSON object per line
with open(metrics_path, "w", encoding="utf-8") as f:
    for m in metrics:
        f.write(json.dumps(m) + "\n")

print(f"Métricas y matrices de confusión guardadas correctamente en {metrics_path}")

Métricas y matrices de confusión guardadas correctamente en c:\Descriptiva\LAB-02-prediccion-del-default-usando-logreg-jurodriguezra-ops\files\output\metrics.json
