In [9]:
import pandas as pd  #  type: ignore
import numpy as np  #  type: ignore
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif



In [10]:
# Both splits are stored as zip-compressed CSV files; each one is read into its own DataFrame.
train_path, test_path = (
    "../files/input/train_data.csv.zip",
    "../files/input/test_data.csv.zip",
)

train_df, test_df = (
    pd.read_csv(csv_path, compression="zip") for csv_path in (train_path, test_path)
)

In [11]:
# ======================================================
# === Paso 1: Carga y limpieza de datos ===============
# ======================================================

def limpieza(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a credit-default frame.

    Steps, in order:
      1. Rename ``"default payment next month"`` to ``"default"``.
      2. Drop the ``"ID"`` column when it is present.
      3. Drop every row that contains a missing value.
      4. Drop rows whose ``MARRIAGE`` or ``EDUCATION`` equals 0.
      5. Cap ``EDUCATION`` at 4, so any value above 4 becomes 4.

    The input frame is never modified, and the function is idempotent:
    applying it to its own output returns an equal frame, so the calling
    cell can be re-run without a ``KeyError`` on the already removed
    ``"ID"`` column.

    Args:
        df: Frame with at least the ``MARRIAGE`` and ``EDUCATION`` columns.

    Returns:
        A new DataFrame; surviving rows keep their original index labels.

    Raises:
        KeyError: If ``MARRIAGE`` or ``EDUCATION`` is missing.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        # errors="ignore": tolerate frames whose ID column was already removed.
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    keep = (cleaned["MARRIAGE"] != 0) & (cleaned["EDUCATION"] != 0)
    # Explicit copy so the column assignment below never targets a view of `df`.
    cleaned = cleaned.loc[keep].copy()

    # Vectorized equivalent of `x if x <= 4 else 4`.
    cleaned["EDUCATION"] = cleaned["EDUCATION"].clip(upper=4)
    return cleaned

# Apply the same cleaning routine to both splits, rebinding each name to its cleaned frame.
train_df, test_df = limpieza(train_df), limpieza(test_df)

In [12]:
# ====================================================================================
# === Paso 2: División de datos en conjuntos de entrenamiento y prueba ===============
# ====================================================================================

def split_datasets(train_data, test_data, target="default"):
    """Separate the feature columns from the target column for both splits.

    Args:
        train_data: Training DataFrame that contains the ``target`` column.
        test_data: Test DataFrame that contains the ``target`` column.
        target: Name of the label column. Defaults to ``"default"``, the
            value that was previously hard-coded.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where each ``X_*`` is the
        corresponding frame without the ``target`` column and each ``y_*``
        is that column as a Series.

    Raises:
        KeyError: If ``target`` is missing from either frame.
    """
    # train
    X_train = train_data.drop(columns=[target])
    y_train = train_data[target]

    # test
    X_test = test_data.drop(columns=[target])
    y_test = test_data[target]

    return X_train, y_train, X_test, y_test

# Unpack features and target for each split; the later cells reuse these four names.
X_train, y_train, X_test, y_test = split_datasets(train_df, test_df)

# Paso 3.
Cree un pipeline para el modelo de clasificación. Este pipeline debe
contener las siguientes capas:
- Transforma las variables categóricas usando el método
  one-hot-encoding.
- Descompone la matriz de entrada usando PCA. El PCA usa todas las componentes.
- Estandariza la matriz de entrada.
- Selecciona las K columnas más relevantes de la matriz de entrada.
- Ajusta una red neuronal tipo MLP (perceptrón multicapa).

In [13]:
# ======================================================
# === Paso 3: Pipeline del modelo ======================
# ======================================================

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC


def make_pipeline(cat_cols, num_cols=None):
    """Build the unfitted preprocessing + MLP classification pipeline.

    Steps, in execution order:
      1. ``preprocess``: a ColumnTransformer that one-hot encodes
         ``cat_cols`` (categories unseen during fit are ignored) and
         standardizes ``num_cols``.
      2. ``feature_selection``: SelectKBest scored with ``f_classif``;
         ``k`` is not set here and is expected to be tuned by the caller.
      3. ``pca``: PCA with its default number of components.
      4. ``classifier``: ``MLPClassifier(max_iter=15000, random_state=21)``.

    NOTE(review): the notebook's "Paso 3" text lists PCA before
    standardization and feature selection, whereas this pipeline scales
    inside the ColumnTransformer and selects features before PCA.
    Confirm which order is intended before changing it, since reordering
    changes the fitted model.

    Args:
        cat_cols: Column names to one-hot encode.
        num_cols: Column names to standardize. When omitted, every column
            of the notebook-level ``X_train`` that is not in ``cat_cols``
            is used (the original behavior). Passing it explicitly removes
            the dependency on that global.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.

    Raises:
        NameError: If ``num_cols`` is omitted and ``X_train`` is not
            defined in the notebook namespace.
    """
    if num_cols is None:
        # Backward-compatible fallback: derive the numeric columns from the
        # notebook-level training frame, exactly as the function used to do.
        num_cols = [col for col in X_train.columns if col not in cat_cols]

    # --- 1. One-hot encoding for categorical columns, scaling for the rest ---
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", StandardScaler(), list(num_cols)),
        ]
    )

    # --- 2. Univariate feature selection ---
    selector = SelectKBest(score_func=f_classif)

    # --- 3. PCA ---
    pca = PCA()

    # --- 4. Classifier: multi-layer perceptron with a fixed seed ---
    classifier = MLPClassifier(max_iter=15000, random_state=21)

    # The Pipeline chains preprocessing, selection, decomposition and the model.
    return Pipeline(steps=[
        ("preprocess", preprocessor),
        ("feature_selection", selector),
        ("pca", pca),
        ("classifier", classifier),
    ])

    

# Columns treated as categorical and therefore one-hot encoded by the pipeline.
# The PAY_* columns are not listed here (an earlier variant of this cell included them).
cat_cols = ["SEX", "EDUCATION", "MARRIAGE"]

# Build the pipeline and fit it once on the training split.
# The fit call is the cell's last expression, so the notebook displays the fitted pipeline.
pipeline = make_pipeline(cat_cols)
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('feature_selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...x7f147659f370>
,k,10

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,15000
,shuffle,True


# Paso 4.
Optimice los hiperparámetros del pipeline usando validación cruzada.
Use 10 splits para la validación cruzada. Use la función de precisión
balanceada para medir la precisión del modelo.

In [14]:
# ======================================================
# === Paso 4: Optimización de hiperparámetros ==========
# ======================================================

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score
import joblib
def optimize_hyperparameters(pipeline, X_train, y_train):
    """Run a 10-fold grid search over the pipeline and return the fitted search.

    Each hyperparameter currently lists a single candidate value, so the
    search evaluates exactly one configuration. Candidates are ranked by
    balanced accuracy, and the best one is refit on the data passed in.

    Args:
        pipeline: Pipeline exposing the ``pca``, ``feature_selection`` and
            ``classifier`` steps addressed by the grid keys.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    search_space = dict(
        pca__n_components=[None],
        feature_selection__k=[20],
        classifier__hidden_layer_sizes=[(50, 30, 40, 60)],
        classifier__alpha=[0.26],
        classifier__learning_rate_init=[0.001],
    )

    searcher = GridSearchCV(
        pipeline,                      # preprocessing + classifier
        search_space,                  # candidate values per hyperparameter
        scoring="balanced_accuracy",
        cv=10,                         # 10 cross-validation splits
        n_jobs=-1,
        refit=True,
    )

    # fit() returns the search object itself.
    return searcher.fit(X_train, y_train)


# Run the cross-validated search on the training split.
# `model` is bound to the very same fitted search object as `grid_search`.
grid_search = model = optimize_hyperparameters(pipeline, X_train, y_train)

# Report the selected hyperparameters and the cross-validated score they achieved.
print("Mejores par√°metros:", grid_search.best_params_)
print("Mejor balanced accuracy (CV):", grid_search.best_score_)


Mejores par√°metros: {'classifier__alpha': 0.26, 'classifier__hidden_layer_sizes': (50, 30, 40, 60), 'classifier__learning_rate_init': 0.001, 'feature_selection__k': 20, 'pca__n_components': None}
Mejor balanced accuracy (CV): 0.6550339201577555


# Paso 5.
Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

In [15]:
# ======================================================
# === Paso 5: Guardar modelo ====================
# ======================================================

import os
import gzip
import pickle

# The final model is the fitted GridSearchCV object itself, not only its best estimator.
model = grid_search

def save_estimator(estimator, models_path="../files/models", filename="model.pkl.gz"):
    """Pickle ``estimator`` into a gzip-compressed file and print its location.

    Args:
        estimator: Any picklable object (here, the fitted search object).
        models_path: Destination folder; created, including parents, when it
            does not exist. Defaults to the folder that was previously
            hard-coded.
        filename: Name of the compressed pickle inside ``models_path``.

    Returns:
        None. The destination path is reported on stdout.

    Raises:
        OSError: If the folder cannot be created or the file cannot be written.
        pickle.PicklingError: If ``estimator`` cannot be pickled.
    """
    os.makedirs(models_path, exist_ok=True)
    output_path = os.path.join(models_path, filename)

    # gzip.open in binary mode compresses the pickle stream as it is written.
    # Pickle files can execute code on load: only unpickle this file from a trusted source.
    with gzip.open(output_path, "wb") as file:
        pickle.dump(estimator, file)

    print(f"‚úÖ Modelo guardado en: {output_path}")

# Persist the fitted search object to disk.
save_estimator(model)


‚úÖ Modelo guardado en: ../files/models/model.pkl.gz


# Paso 6.
Calcule las métricas de precisión, precisión balanceada, recall,
y f1-score para los conjuntos de entrenamiento y prueba.
Guárdelas en el archivo files/output/metrics.json. Cada fila
del archivo es un diccionario con las métricas de un modelo.
Este diccionario tiene un campo para indicar si es el conjunto
de entrenamiento o prueba. Por ejemplo:
## 
{'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
{'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

# Paso 7.
Calcule las matrices de confusión para los conjuntos de entrenamiento y
prueba. Guárdelas en el archivo files/output/metrics.json. Cada fila
del archivo es un diccionario con las métricas de un modelo.
Este diccionario tiene un campo para indicar si es el conjunto
de entrenamiento o prueba. Por ejemplo:
#
{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
{'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}

In [16]:
# ======================================================
# === Paso 6 y 7: Métricas y Matrices de Confusión =====
# ======================================================

import os
import json
from pathlib import Path
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score,
    f1_score, balanced_accuracy_score
)

def build_rows(y_true, y_pred, dataset: str):
    """Build the metrics record and the confusion-matrix record for one split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        dataset: Split label stored in both records (e.g. ``"train"``).

    Returns:
        ``(metrics_row, cm_row)``: two JSON-serializable dicts. The first
        holds precision, balanced accuracy, recall and F1 as plain floats;
        the second holds the 2x2 confusion-matrix counts as plain ints.
    """
    # --- Scores (precision/recall/F1 fall back to 0 when undefined) ---
    precision = precision_score(y_true, y_pred, zero_division=0)
    balanced = balanced_accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics_row = {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision),
        "balanced_accuracy": float(balanced),
        "recall": float(recall),
        "f1_score": float(f1),
    }

    # --- Confusion matrix: rows are true labels, columns are predictions, ordered [0, 1] ---
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    (true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = counts.tolist()

    cm_row = {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(true0_pred0), "predicted_1": int(true0_pred1)},
        "true_1": {"predicted_0": int(true1_pred0), "predicted_1": int(true1_pred1)},
    }

    return metrics_row, cm_row


# --- The refit best estimator of the grid search predicts both splits ---
best_model = grid_search.best_estimator_
y_train_pred, y_test_pred = best_model.predict(X_train), best_model.predict(X_test)

# --- One metrics record and one confusion-matrix record per split ---
m_train, cm_train = build_rows(y_train, y_train_pred, "train")
m_test, cm_test = build_rows(y_test, y_test_pred, "test")

# --- File layout: metrics rows first, confusion matrices after ---
records = [m_train, m_test, cm_train, cm_test]

output_dir = Path("../files/output")
output_dir.mkdir(parents=True, exist_ok=True)
metrics_path = output_dir / "metrics.json"

# One JSON object per line (JSON Lines); the file is overwritten on every run.
with metrics_path.open("w", encoding="utf-8") as f:
    f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)

print(f"‚úÖ M√©tricas guardadas en: {metrics_path}")


‚úÖ M√©tricas guardadas en: ../files/output/metrics.json
