In [1]:
import pandas as pd
import numpy as np
import os
import gzip
import pickle
import json
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix

# Load the test and train splits; both are zip-compressed CSV files.
ruta_test = "../files/input/test_data.csv.zip"
ruta_train = "../files/input/train_data.csv.zip"

# Both files are read with the same options, so keep them in one place.
opciones_lectura = {"index_col": False, "compression": "zip"}
df_test = pd.read_csv(ruta_test, **opciones_lectura)
df_train = pd.read_csv(ruta_train, **opciones_lectura)


In [2]:
# Rename the target column to the shorter "default" in both frames.
nombres_nuevos = {'default payment next month': 'default'}
for marco in (df_test, df_train):
    marco.rename(columns=nombres_nuevos, inplace=True)

In [3]:
# Drop the ID column from both frames.
# Rebinding to the result of drop() (instead of mutating with inplace=True),
# combined with errors="ignore", makes this cell safe to re-run: a second
# execution is a no-op rather than a KeyError on the already-missing column.
df_test = df_test.drop(columns=["ID"], errors="ignore")
df_train = df_train.drop(columns=["ID"], errors="ignore")

In [4]:
# Keep only the rows whose MARRIAGE and EDUCATION codes are both non-zero.
filas_train = df_train["MARRIAGE"].ne(0) & df_train["EDUCATION"].ne(0)
filas_test = df_test["MARRIAGE"].ne(0) & df_test["EDUCATION"].ne(0)
df_train = df_train[filas_train]
df_test = df_test[filas_test]

In [5]:
# Collapse every EDUCATION code above 4 into 4.
# clip(upper=4) is the vectorized equivalent of the per-row
# `4 if x > 4 else x` lambda. assign() returns a new frame, so nothing is
# written into a frame that was obtained by boolean filtering (the pattern that
# triggers pandas' SettingWithCopyWarning), and the cell stays idempotent.
df_train = df_train.assign(EDUCATION=df_train["EDUCATION"].clip(upper=4))
df_test = df_test.assign(EDUCATION=df_test["EDUCATION"].clip(upper=4))


In [6]:
# Split each frame into the target vector (y) and the predictor matrix (X).
y_train = df_train["default"]
X_train = df_train.drop(columns="default")
y_test = df_test["default"]
X_test = df_test.drop(columns="default")

In [7]:
# Columns treated as categorical; every other predictor column is numeric.
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]
numeric_cols = list(
    filter(lambda nombre: nombre not in categorical_cols, X_train.columns)
)


In [8]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numeric ones.
codificador_categorico = ('cat_encoder', OneHotEncoder(), categorical_cols)
escalador_numerico = ('num_scaler', StandardScaler(), numeric_cols)
preprocessing = ColumnTransformer(
    transformers=[codificador_categorico, escalador_numerico]
)

In [9]:
# Pipeline stages, in order: preprocessing -> PCA -> SelectKBest (f_classif)
# -> SVM classifier.
pasos_modelo = [
    ("preprocessing", preprocessing),
    ("pca", PCA()),
    ("feature_selection", SelectKBest(score_func=f_classif)),
    ("svm_classifier", SVC(kernel="rbf", random_state=12345, max_iter=-1)),
]
modelo_pipeline = Pipeline(steps=pasos_modelo)

In [10]:
# Hyperparameter grid. Only the number of PCA components has more than one
# candidate; every other entry is pinned to a single value.
grid_params = dict(
    pca__n_components=[20, X_train.shape[1] - 2],
    feature_selection__k=[12],
    svm_classifier__kernel=['rbf'],
    svm_classifier__gamma=[0.1],
)

# Cross-validated grid search (cv=10) scored on balanced accuracy; refit=True
# so the search object itself can be used for prediction afterwards.
modelo_grid_search = GridSearchCV(
    estimator=modelo_pipeline,
    param_grid=grid_params,
    scoring="balanced_accuracy",
    cv=10,
    refit=True,
    n_jobs=-1,
)

# Run the search on the training split.
modelo_grid_search.fit(X_train, y_train)


In [11]:
# Persist the fitted search object as a gzip-compressed pickle.
directorio_modelos = "../files/models"
os.makedirs(directorio_modelos, exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as archivo_modelo:
    pickle.dump(modelo_grid_search, archivo_modelo)

def guardar_metricas(modelo, X_train, X_test, y_train, y_test):
    """Compute classification metrics for the train and test partitions.

    Despite its name, this function does not write anything to disk; it only
    builds and returns the metric records.

    Args:
        modelo: Fitted estimator exposing ``predict``.
        X_train: Predictors of the training partition.
        X_test: Predictors of the test partition.
        y_train: True labels of the training partition.
        y_test: True labels of the test partition.

    Returns:
        A list with two dicts ("train" first, then "test"). Each dict has the
        keys ``type`` (always ``'metrics'``), ``dataset``, ``precision``,
        ``balanced_accuracy``, ``recall`` and ``f1_score``.
    """

    def _evaluar(nombre, X, y):
        # One prediction pass per partition, reused by all four metrics.
        predicho = modelo.predict(X)
        return {
            'type': 'metrics',
            'dataset': nombre,
            'precision': precision_score(y, predicho, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y, predicho),
            'recall': recall_score(y, predicho, zero_division=0),
            'f1_score': f1_score(y, predicho, zero_division=0),
        }

    return [
        _evaluar("train", X_train, y_train),
        _evaluar("test", X_test, y_test),
    ]


In [12]:
def guardar_matrices_confusion(modelo, X_train, X_test, y_train, y_test):
    """Build confusion-matrix records for the train and test partitions.

    Despite its name, this function does not write anything to disk; it only
    builds and returns the records.

    Args:
        modelo: Fitted estimator exposing ``predict``.
        X_train: Predictors of the training partition.
        X_test: Predictors of the test partition.
        y_train: True labels of the training partition (expected to be 0/1).
        y_test: True labels of the test partition (expected to be 0/1).

    Returns:
        A list with two dicts ("train" first, then "test"). Each dict has the
        keys ``type`` (always ``'cm_matrix'``), ``dataset``, ``true_0`` and
        ``true_1``; the last two map ``predicted_0`` / ``predicted_1`` to
        plain ``int`` counts.
    """
    matrices = []
    for dataset, X, y in [("train", X_train, y_train), ("test", X_test, y_test)]:
        # Pin the label order so the matrix is always 2x2. Without `labels`,
        # a partition in which only one class appears (in both the true and
        # the predicted labels) yields a 1x1 matrix and the indexing below
        # raises IndexError.
        cm = confusion_matrix(y, modelo.predict(X), labels=[0, 1])
        matrices.append({
            'type': 'cm_matrix',
            'dataset': dataset,
            # int() converts the numpy integers so json.dumps can serialize them.
            'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
            'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])}
        })
    return matrices

In [13]:
# Write the results as JSON Lines: one record per line, metric records first,
# followed by the confusion-matrix records.
os.makedirs("../files/output", exist_ok=True)
output_path = "../files/output/metrics.json"
with open(output_path, 'w') as salida:
    registros = guardar_metricas(
        modelo_grid_search, X_train, X_test, y_train, y_test
    )
    registros = registros + guardar_matrices_confusion(
        modelo_grid_search, X_train, X_test, y_train, y_test
    )
    for registro in registros:
        salida.write(json.dumps(registro) + '\n')
