In [101]:
# flake8: noqa: E501
#
# En este dataset se desea pronosticar el default (pago) del cliente el próximo
# mes a partir de 23 variables explicativas.
#
#   LIMIT_BAL: Monto del credito otorgado. Incluye el credito individual y el
#              credito familiar (suplementario).
#         SEX: Genero (1=male; 2=female).
#   EDUCATION: Educacion (0=N/A; 1=graduate school; 2=university; 3=high school; 4=others).
#    MARRIAGE: Estado civil (0=N/A; 1=married; 2=single; 3=others).
#         AGE: Edad (years).
#       PAY_0: Historia de pagos pasados. Estado del pago en septiembre, 2005.
#       PAY_2: Historia de pagos pasados. Estado del pago en agosto, 2005.
#       PAY_3: Historia de pagos pasados. Estado del pago en julio, 2005.
#       PAY_4: Historia de pagos pasados. Estado del pago en junio, 2005.
#       PAY_5: Historia de pagos pasados. Estado del pago en mayo, 2005.
#       PAY_6: Historia de pagos pasados. Estado del pago en abril, 2005.
#   BILL_AMT1: Historia de pagos pasados. Monto a pagar en septiembre, 2005.
#   BILL_AMT2: Historia de pagos pasados. Monto a pagar en agosto, 2005.
#   BILL_AMT3: Historia de pagos pasados. Monto a pagar en julio, 2005.
#   BILL_AMT4: Historia de pagos pasados. Monto a pagar en junio, 2005.
#   BILL_AMT5: Historia de pagos pasados. Monto a pagar en mayo, 2005.
#   BILL_AMT6: Historia de pagos pasados. Monto a pagar en abril, 2005.
#    PAY_AMT1: Historia de pagos pasados. Monto pagado en septiembre, 2005.
#    PAY_AMT2: Historia de pagos pasados. Monto pagado en agosto, 2005.
#    PAY_AMT3: Historia de pagos pasados. Monto pagado en julio, 2005.
#    PAY_AMT4: Historia de pagos pasados. Monto pagado en junio, 2005.
#    PAY_AMT5: Historia de pagos pasados. Monto pagado en mayo, 2005.
#    PAY_AMT6: Historia de pagos pasados. Monto pagado en abril, 2005.
#
# La variable "default payment next month" corresponde a la variable objetivo.
#
# El dataset ya se encuentra dividido en conjuntos de entrenamiento y prueba
# en la carpeta "files/input/".
#
# Los pasos que debe seguir para la construcción de un modelo de
# clasificación están descritos a continuación.
#
#



In [102]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
# - Renombre la columna "default payment next month" a "default"
# - Remueva la columna "ID".
#

import pandas as pd
import os

def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
    return pd.read_csv(file_path, compression='zip')

data_train = load_data('../files/input/train_data.csv.zip')
data_test  = load_data('../files/input/test_data.csv.zip')

print("Training Data:")
data_train.head()

print("\nTesting Data:")
data_test.head()


Training Data:

Testing Data:


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [103]:
def clean_data(df):
    df = df.copy()
    df = df.rename(columns = {"default payment next month":"default"})
    df = df.drop(columns=["ID"])
    df = df.dropna()
    df = df[(df["EDUCATION"] != 0) & (df["MARRIAGE"] != 0)]
    df['EDUCATION'] = df['EDUCATION'].apply(lambda x: 4 if x >= 4 else x).astype('category')

    return df

data_train_cleaned = clean_data(data_train)
data_test_cleaned  = clean_data(data_test)  

data_test_cleaned

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0
2,200000,2,3,2,34,0,0,2,0,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,250000,1,1,2,29,0,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,50000,2,3,3,23,1,2,0,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,20000,1,2,1,44,-2,-2,-2,-2,-2,...,2882,9235,1719,2890,2720,2890,9263,1824,1701,0
8996,360000,1,1,2,35,-1,-1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
8997,150000,1,1,2,35,-1,-1,-1,-1,-1,...,780,0,0,9054,0,783,0,0,0,0
8998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1


In [104]:
#
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.
#

def split_data(data):
    X = data.drop(columns=["default"])
    y = data["default"]
    return X, y

X_train, y_train = split_data(data_train_cleaned)
X_test, y_test   = split_data(data_test_cleaned)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((20953, 23), (20953,), (8979, 23), (8979,))

In [114]:
#
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando componentes principales.
#   El pca usa todas las componentes.
# - Escala la matriz de entrada al intervalo [0, 1].
# - Selecciona las K columnas mas relevantes de la matrix de entrada.
# - Ajusta una red neuronal tipo MLP.
#


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer


def create_pipeline():
    numeric = [
        "LIMIT_BAL", "AGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
        "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
        "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"
    ]
    categorical = ["SEX", "EDUCATION", "MARRIAGE"]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown='ignore'), categorical),
            ('scaler', StandardScaler(), numeric)
        ])
    
    classifier = MLPClassifier(max_iter=15000, random_state=21)
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif)), 
        ('pca', PCA()),
        ('classifier', classifier ), 
        ])
    return pipeline

pipeline = create_pipeline()
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('feature_selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...001EE9F1CB7E0>
,k,10

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,15000
,shuffle,True


In [117]:
#
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.
#
from sklearn.model_selection import GridSearchCV

def optimize_hyperparameters(pipeline, X_train, y_train):
    param_grid = {
        "pca__n_components": [None],
        "feature_selection__k": [20], 
        'classifier__hidden_layer_sizes': [(50, 30, 40, 60)],
        'classifier__alpha': [0.26],
        "classifier__learning_rate_init": [0.001],     
    }

    gridSearch = GridSearchCV(
        pipeline,
        param_grid,
        cv=10,
        scoring='balanced_accuracy',
        n_jobs=-1,
        verbose=2,
        refit=True
    )

    gridSearch.fit(X_train, y_train)
    return gridSearch

grid_search = optimize_hyperparameters(pipeline, X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)
best_pipeline = grid_search.best_estimator_
print("Best Pipeline:", best_pipeline)
best_accuracy = grid_search.best_score_
print("Best Balanced Accuracy from CV:", best_accuracy)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best Hyperparameters: {'classifier__alpha': 0.26, 'classifier__hidden_layer_sizes': (50, 30, 40, 60), 'classifier__learning_rate_init': 0.001, 'feature_selection__k': 20, 'pca__n_components': None}
Best Pipeline: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE']),
                                                 ('scaler', StandardScaler(),
                                                  ['LIMIT_BAL', 'AGE', 'PAY_0',
                                                   'PAY_2', 'PAY_3', 'PAY_4',
                                                   'PAY_5', 'PAY_6',
                                                   'BILL_AMT1', 'BILL_AMT2',
                            

In [None]:
#
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.
#
import pickle
import gzip
import os

ROOT = os.path.abspath("..")

def save_model(model):
    model_dir = os.path.join(ROOT, "files", "models")
    os.makedirs(model_dir, exist_ok=True)

    model_path = os.path.join(model_dir, "model.pkl.gz")
    with gzip.open(model_path, "wb") as f:
        pickle.dump(model, f)

    print("Modelo guardado en:", model_path)

save_model(grid_search)

✅ Modelo guardado en: c:\Users\Victus\Desktop\Prueba Tecnica\LAB-04-prediccion-del-default-usando-mlp-JPortoL\files\models\model.pkl.gz
Modelo guardado en 'files/models/model.pkl.gz'


In [None]:
#
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}
#
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

def save_metrics(model, x_train, y_train, x_test, y_test):
    out_dir = os.path.join(ROOT, "files", "output")
    os.makedirs(out_dir, exist_ok=True)

    metrics_file = os.path.join(out_dir, "metrics.json")

    metrics = []

    y_train_pred = model.predict(x_train)
    metrics.append({
        "type": "metrics",
        "dataset": "train",
        "precision": float(precision_score(y_train, y_train_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_train, y_train_pred)),
        "recall": float(recall_score(y_train, y_train_pred)),
        "f1_score": float(f1_score(y_train, y_train_pred)),
    })

    y_test_pred = model.predict(x_test)
    metrics.append({
        "type": "metrics",
        "dataset": "test",
        "precision": float(precision_score(y_test, y_test_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_test, y_test_pred)),
        "recall": float(recall_score(y_test, y_test_pred)),
        "f1_score": float(f1_score(y_test, y_test_pred)),
    })

    with open(metrics_file, "w", encoding="utf-8") as f:
        for m in metrics:
            f.write(json.dumps(m) + "\n")

    print("Metrics guardadas en:", metrics_file)

save_metrics(grid_search, X_train, y_train, X_test, y_test)



✅ Metrics guardadas en: c:\Users\Victus\Desktop\Prueba Tecnica\LAB-04-prediccion-del-default-usando-mlp-JPortoL\files\output\metrics.json


In [121]:
#
# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicte_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicte_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
#
from sklearn.metrics import confusion_matrix

def save_confusion_matrices(model, x_train, y_train, x_test, y_test):
    out_dir = os.path.join(ROOT, "files", "output")
    metrics_file = os.path.join(out_dir, "metrics.json")

    metrics = []

    with open(metrics_file, "r", encoding="utf-8") as f:
        for line in f:
            metrics.append(json.loads(line))

    cm_train = confusion_matrix(y_train, model.predict(x_train))
    metrics.append({
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {
            "predicted_0": int(cm_train[0][0]),
            "predicted_1": int(cm_train[0][1])
        },
        "true_1": {
            "predicted_0": int(cm_train[1][0]),
            "predicted_1": int(cm_train[1][1])
        }
    })

    cm_test = confusion_matrix(y_test, model.predict(x_test))
    metrics.append({
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {
            "predicted_0": int(cm_test[0][0]),
            "predicted_1": int(cm_test[0][1])
        },
        "true_1": {
            "predicted_0": int(cm_test[1][0]),
            "predicted_1": int(cm_test[1][1])
        }
    })

    with open(metrics_file, "w", encoding="utf-8") as f:
        for m in metrics:
            f.write(json.dumps(m) + "\n")

    print("Confusion matrices agregadas a:", metrics_file)

save_confusion_matrices(grid_search, X_train, y_train, X_test, y_test)

Confusion matrices agregadas a: c:\Users\Victus\Desktop\Prueba Tecnica\LAB-04-prediccion-del-default-usando-mlp-JPortoL\files\output\metrics.json
