In [1]:
import os
import json
import gzip
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    balanced_accuracy_score, confusion_matrix
)
from sklearn.discriminant_analysis import StandardScaler


In [2]:
def _load_clean_dataset(csv_zip_path):
    """Read one zip-compressed CSV split and apply the shared cleaning rules.

    Steps, in order: rename the target column to ``default``, drop the ``ID``
    column, drop rows containing missing values, cap ``EDUCATION`` at 4 (values
    above 4 are grouped as "others"), and discard rows where ``EDUCATION`` or
    ``MARRIAGE`` equals 0.

    Parameters
    ----------
    csv_zip_path : str
        Path to a zip-compressed CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The row index is kept as-is (it is not reset).
    """
    cleaned = (
        pd.read_csv(csv_zip_path, compression='zip')
        .rename(columns={'default payment next month': 'default'})
        .drop(columns=['ID'])
        .dropna()
        # Vectorised equivalent of mapping every value above 4 to 4.
        .assign(EDUCATION=lambda frame: frame['EDUCATION'].clip(upper=4))
    )
    valid_rows = (cleaned['EDUCATION'] != 0) & (cleaned['MARRIAGE'] != 0)
    return cleaned[valid_rows]


# Both splits go through exactly the same cleaning so they cannot drift apart,
# and no frame is mutated in place.
train_df = _load_clean_dataset('../files/input/train_data.csv.zip')
test_df = _load_clean_dataset('../files/input/test_data.csv.zip')

print("Datos listos. Tamaños:")
print("Entrenamiento:", train_df.shape)
print("Prueba:", test_df.shape)


Datos listos. Tamaños:
Entrenamiento: (20953, 24)
Prueba: (8979, 24)


In [3]:
# Separate the predictors from the target column for each split.
target_col = 'default'

X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

X_test = test_df.drop(columns=[target_col])
y_test = test_df[target_col]

print("Conjuntos creados:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test:  {X_test.shape},  y_test:  {y_test.shape}")


Conjuntos creados:
X_train: (20953, 23), y_train: (20953,)
X_test:  (8979, 23),  y_test:  (8979,)


In [4]:
# Categorical columns are one-hot encoded; every remaining column of X_train
# is treated as numeric and standardised.
cat_features = ["SEX", "EDUCATION", "MARRIAGE"]
num_features = list(filter(lambda col: col not in cat_features, X_train.columns))

# NOTE(review): StandardScaler reaches this cell through the
# `sklearn.discriminant_analysis` import in the imports cell; its documented
# location is `sklearn.preprocessing` -- consider importing it from there.
preprocess_steps = [
    ("cat_encoder", OneHotEncoder(), cat_features),
    ("num_scaler", StandardScaler(), num_features),
]
transformer = ColumnTransformer(transformers=preprocess_steps)

# Full pipeline: preprocessing -> PCA -> univariate feature selection -> SVM.
# The step names are referenced by the hyper-parameter grid, so they must not
# be renamed without updating the grid keys as well.
pipeline_steps = [
    ("preprocess", transformer),
    ("pca", PCA()),
    ("feature_select", SelectKBest(f_classif)),
    ("svc_model", SVC(random_state=12)),
]
model_pipeline = Pipeline(steps=pipeline_steps)


In [5]:
# Hyper-parameter grid explored during cross-validation. Keys follow the
# "<pipeline step>__<parameter>" convention.
search_params = dict(
    pca__n_components=[20, 21],
    feature_select__k=[12],
    svc_model__kernel=["rbf"],
    svc_model__gamma=[0.1],
)

# 10-fold cross-validated grid search scored on balanced accuracy; with
# refit=True the best candidate is refitted and exposed as best_estimator_.
grid = GridSearchCV(
    model_pipeline,
    search_params,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    refit=True,
    verbose=1,
)

# Run the search on the training split.
grid.fit(X_train, y_train)

best_pipeline = grid.best_estimator_
print(f"Mejores hiperparámetros: {grid.best_params_}")


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Mejores hiperparámetros: {'feature_select__k': 12, 'pca__n_components': 20, 'svc_model__gamma': 0.1, 'svc_model__kernel': 'rbf'}


In [6]:
# Persist the whole fitted GridSearchCV object (not just the best estimator)
# as a gzip-compressed pickle.
model_dir = "../files/models"
model_path = "../files/models/model.pkl.gz"

os.makedirs(model_dir, exist_ok=True)

with gzip.open(model_path, "wb") as handle:
    pickle.dump(grid, handle)

print("Modelo guardado en ../files/models/model.pkl.gz")


Modelo guardado en ../files/models/model.pkl.gz


In [7]:
def _metrics_record(dataset, y_true, y_pred):
    """Build the metrics.json record for one split.

    Parameters
    ----------
    dataset : str
        Label stored under the ``"dataset"`` key (``"train"`` or ``"test"``).
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.

    Returns
    -------
    dict
        Record with keys ``type``, ``dataset``, ``precision``,
        ``balanced_accuracy``, ``recall`` and ``f1_score``, in that order.
    """
    return {
        "type": "metrics",
        "dataset": dataset,
        "precision": precision_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
    }


# Predictions from the refitted best pipeline on both splits.
y_pred_train = best_pipeline.predict(X_train)
y_pred_test = best_pipeline.predict(X_test)

os.makedirs("../files/output", exist_ok=True)

# One record per split, built by the same helper so the two cannot diverge.
results = [
    _metrics_record("train", y_train, y_pred_train),
    _metrics_record("test", y_test, y_pred_test),
]

# Show the metrics as a table.
display(pd.DataFrame(results))

# metrics.json holds one JSON object per line; mode "w" discards any
# previous content of the file.
with open("../files/output/metrics.json", "w") as file:
    file.writelines(json.dumps(entry) + "\n" for entry in results)

print("Métricas guardadas en ../files/output/metrics.json")


Unnamed: 0,type,dataset,precision,balanced_accuracy,recall,f1_score
0,metrics,train,0.702692,0.664692,0.375661,0.489588
1,metrics,test,0.673675,0.6681,0.386674,0.491333


Métricas guardadas en ../files/output/metrics.json


In [8]:
def _cm_record(dataset, matrix):
    """Convert a 2x2 confusion matrix into its metrics.json record.

    Parameters
    ----------
    dataset : str
        Label stored under the ``"dataset"`` key (``"train"`` or ``"test"``).
    matrix : numpy.ndarray
        2x2 confusion matrix computed with ``labels=[0, 1]``, i.e. rows are
        true classes and columns are predicted classes.

    Returns
    -------
    dict
        Record with keys ``type``, ``dataset``, ``true_0`` and ``true_1``;
        counts are converted to plain ``int`` so they are JSON serialisable.
    """
    tn, fp, fn, tp = matrix.ravel()
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(tn), "predicted_1": int(fp)},
        "true_1": {"predicted_0": int(fn), "predicted_1": int(tp)},
    }


# Confusion matrices for both splits, with a fixed label order.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])

conf_results = [
    _cm_record("train", cm_train),
    _cm_record("test", cm_test),
]

# Rewrite the whole file (metric records from `results`, then the confusion
# matrices) instead of appending: appending made this cell non-idempotent,
# duplicating the cm_matrix lines every time it was re-run. On a clean
# top-to-bottom run the file content is the same four lines as before.
with open("../files/output/metrics.json", "w") as file:
    for row in results + conf_results:
        file.write(json.dumps(row) + "\n")

print("Matriz de confusión (Entrenamiento):\n", cm_train)
print("Matriz de confusión (Prueba):\n", cm_test)


Matriz de confusión (Entrenamiento):
 [[15477   751]
 [ 2950  1775]]
Matriz de confusión (Prueba):
 [[6716  357]
 [1169  737]]
