In [1]:
# 1.1 Load the training data
import pandas as pd  #  type: ignore

# NOTE(review): no index column is declared, so the CSV's leading unnamed
# column is loaded as a regular "Unnamed: 0" column (visible in the preview).
train_path = "../files/input/train_data.csv.zip"
train_data = pd.read_csv(train_path, compression="zip")

# Preview the first rows.
train_data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [2]:
# 1.2 Load the test data

# NOTE(review): as with the training file, the leading unnamed CSV column is
# loaded as a regular "Unnamed: 0" column (visible in the preview).
test_path = "../files/input/test_data.csv.zip"
test_data = pd.read_csv(test_path, compression="zip")

# Preview the first rows.
test_data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [3]:
# Step 1: clean both datasets with the same, re-runnable routine.

def clean_dataset(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a credit-default dataset.

    The input frame is never modified. The steps are:

    1. Rename "default payment next month" to "default".
    2. Drop the "ID" column.
    3. Drop every row that contains a missing value.
    4. Collapse EDUCATION values greater than 4 into category 4 ("others").
    5. Drop rows whose EDUCATION or MARRIAGE equals 0.

    The function is idempotent: applying it to an already-cleaned frame
    returns an equal frame instead of raising, so the cell can be re-run
    without restarting the kernel.

    Parameters
    ----------
    raw : pd.DataFrame
        Frame containing at least the EDUCATION and MARRIAGE columns.

    Returns
    -------
    pd.DataFrame
        A new, independent frame (original row labels are kept).
    """
    cleaned = (
        raw
        .rename(columns={'default payment next month': 'default'})
        # errors="ignore" keeps a second run from failing once ID is gone.
        .drop(columns=['ID'], errors='ignore')
        .dropna()
    )
    # Values above 4 are capped at 4; values of 4 or below are left unchanged.
    cleaned = cleaned.assign(EDUCATION=cleaned["EDUCATION"].clip(upper=4))
    has_valid_codes = (cleaned["EDUCATION"] != 0) & (cleaned["MARRIAGE"] != 0)
    # Explicit copy so later assignments never target a slice of another frame.
    return cleaned.loc[has_valid_codes].copy()


train_data = clean_dataset(train_data)
test_data = clean_dataset(test_data)

2. Dividir los datos

In [4]:
# Split each dataset into a feature matrix and a target vector.
# x_* holds every column except "default"; y_* holds the "default" column.
target_column = 'default'

x_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
x_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

3. Construir el pipeline

In [5]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categóricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando PCA. El PCA usa todas las componentes.
# - Estandariza la matriz de entrada.
# - Selecciona las K columnas más relevantes de la matriz de entrada.
# - Ajusta una máquina de vectores de soporte (SVM).

import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report
)

# Column groups: categorical columns are one-hot encoded, numeric columns are
# standardized. Any other column of the input frame is not listed here and is
# therefore not routed to either transformer.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
numerical_features = (
    ['LIMIT_BAL', 'AGE']
    + [f'PAY_{month}' for month in (0, 2, 3, 4, 5, 6)]
    + [f'BILL_AMT{month}' for month in range(1, 7)]
    + [f'PAY_AMT{month}' for month in range(1, 7)]
)

# NOTE(review): the task description above lists standardization after PCA;
# here the numeric columns are standardized inside the ColumnTransformer,
# i.e. before PCA. Confirm this ordering is intended.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
standard_scaler = StandardScaler(with_mean=True, with_std=True)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features),
        ('num', standard_scaler, numerical_features),
    ]
)

# Full model: preprocessing -> PCA -> univariate feature selection -> RBF SVM.
svm_classifier = SVC(kernel="rbf", random_state=12345, max_iter=-1)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("pca", PCA()),
        ("selector", SelectKBest(f_classif)),
        ("classifier", svm_classifier),
    ]
)

# Baseline fit with default hyperparameters.
pipeline.fit(x_train, y_train)

# Pipeline.score delegates to the final estimator's score(); for a classifier
# that is mean accuracy, even though the printed label reads "Precisión".
test_score = pipeline.score(x_test, y_test)
print("Modelo entrenado. Precisión en test:", test_score)


Modelo entrenado. Precisión en test: 0.8264840182648402


4. Optimizar los hiperparámetros con validación cruzada

In [6]:
# Tune the pipeline hyperparameters with 10-fold cross-validation, scoring
# each candidate by balanced accuracy.

# NOTE(review): the upper PCA candidate is derived from the raw column count
# of x_train, not from the number of features after one-hot encoding —
# confirm this is the intended bound.
upper_n_components = x_train.shape[1] - 2

param_grid = {
    "pca__n_components": [20, upper_n_components],
    "selector__k": [12],
    "classifier__kernel": ['rbf'],
    "classifier__gamma": [0.1],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,  # use every available core
)
grid_search.fit(x_train, y_train)

# Keep the winning pipeline for the evaluation cells, then report its parameters.
best_model = grid_search.best_estimator_
print("Mejores parámetros: ", grid_search.best_params_)

Mejores parámetros:  {'classifier__gamma': 0.1, 'classifier__kernel': 'rbf', 'pca__n_components': 20, 'selector__k': 12}


5. Evaluación del modelo

In [None]:
# Predictions of the tuned model on both splits (reused by the next cell).
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

# Metric label -> scoring function, in the order they are reported.
metric_functions = {
    "Accuracy": accuracy_score,
    "Balanced Accuracy": balanced_accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1_Score": f1_score,
}

# Nested dict {split: {metric: value}}; values are cast to plain Python floats.
metrics = {
    split_name: {
        label: float(score_fn(y_true, y_pred))
        for label, score_fn in metric_functions.items()
    }
    for split_name, y_true, y_pred in (
        ("Train", y_train, y_train_pred),
        ("Test", y_test, y_test_pred),
    )
}
print(metrics)

6. Matriz de confusión

In [None]:
# Print and plot the confusion matrix of each split, using the predictions
# computed in the evaluation cell.
split_names = ("Train", "Test")
true_labels = (y_train, y_test)
predicted_labels = (y_train_pred, y_test_pred)

for dataset, y_true, y_pred in zip(split_names, true_labels, predicted_labels):
    cm = confusion_matrix(y_true, y_pred)
    print(f"Matriz de Confusión ({dataset}):\n", cm)
    # Axis tick labels come from the classes seen by the tuned estimator.
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=grid_search.best_estimator_.classes_,
    )
    disp.plot()

7. Guardar el modelo

In [9]:
# Save the fitted grid search, gzip-compressed, as "files/models/model.pkl.gz".

import gzip
import pickle
from pathlib import Path

model_path = Path('../files/models/model.pkl.gz')

# gzip.open does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

# The whole GridSearchCV object is persisted (not only best_estimator_).
with gzip.open(model_path, 'wb') as file:
    pickle.dump(grid_search, file)

8. Calcular las métricas, matriz de confusión y guardarlas en json

In [None]:
import json
# Compute precision, balanced accuracy, recall and F1 for one dataset.
def compute_metrics(model, X, y, dataset):
    """Score ``model`` on ``(X, y)`` and return a JSON-serializable record.

    The scores are computed with the dedicated scikit-learn scorers rather
    than by looking up the ``'1'`` entry of a ``classification_report`` dict,
    so the result does not depend on how the class labels are stringified.
    This matches how the evaluation cell computes the same metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix accepted by ``model.predict``.
    y : true binary labels; the positive class is the scorers' default
        ``pos_label`` (1).
    dataset : str
        Tag stored under the ``'dataset'`` key (e.g. ``'train'`` or ``'test'``).

    Returns
    -------
    dict
        Keys: ``type``, ``dataset``, ``precision``, ``balanced_accuracy``,
        ``recall``, ``f1_score``. Scores are plain Python floats.
    """
    y_pred = model.predict(X)

    # Record layout expected in metrics.json.
    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': float(precision_score(y, y_pred)),
        'balanced_accuracy': float(balanced_accuracy_score(y, y_pred)),
        'recall': float(recall_score(y, y_pred)),
        'f1_score': float(f1_score(y, y_pred)),
    }

# Compute the confusion matrix for one dataset as a JSON-serializable record.
def compute_confusion_matrix(model, X, y, dataset):
    """Return the 2x2 confusion matrix of ``model`` on ``(X, y)`` as a dict.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix accepted by ``model.predict``.
    y : true binary labels encoded as 0 / 1.
    dataset : str
        Tag stored under the ``'dataset'`` key (e.g. ``'train'`` or ``'test'``).

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': ..., 'true_0': {...}, 'true_1': {...}}``
        where each inner dict holds ``predicted_0`` and ``predicted_1`` counts
        as plain Python ints.
    """
    # Pinning labels=[0, 1] guarantees a 2x2 matrix even when one class is
    # absent from both y and the predictions; otherwise the indexing below
    # would raise IndexError.
    cm = confusion_matrix(y, model.predict(X), labels=[0, 1])
    (true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = cm

    # Record layout expected in metrics.json.
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {'predicted_0': int(true0_pred0), 'predicted_1': int(true0_pred1)},
        'true_1': {'predicted_0': int(true1_pred0), 'predicted_1': int(true1_pred1)}
    }

from pathlib import Path

# Evaluate the tuned pipeline selected by the grid search.
best_estimator = grid_search.best_estimator_

# Metric records for the train and test splits.
metrics_list = [
    compute_metrics(best_estimator, x_train, y_train, 'train'),
    compute_metrics(best_estimator, x_test, y_test, 'test')
]

# Confusion-matrix records for the train and test splits.
cm_train = compute_confusion_matrix(best_estimator, x_train, y_train, 'train')
cm_test = compute_confusion_matrix(best_estimator, x_test, y_test, 'test')

# Output order: train metrics, test metrics, train matrix, test matrix.
all_results = metrics_list + [cm_train, cm_test]

metrics_path = Path('../files/output/metrics.json')

# open() does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
metrics_path.parent.mkdir(parents=True, exist_ok=True)

# Write one JSON object per line (JSON Lines layout).
with open(metrics_path, 'w', encoding='utf-8') as file:
    for result in all_results:
        file.write(json.dumps(result) + '\n')