In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
import gzip
import pickle
import json

In [2]:
# Input data: the zipped CSV files holding the train and test splits.
train_csv, test_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../files/input/train_data.csv.zip",
        "../files/input/test_data.csv.zip",
    )
)

## Paso 1: Limpiar los datos

In [8]:
def clean_data(df):
    """Return a cleaned copy of a raw credit-default frame.

    Steps applied, in order:
      1. Rename "default payment next month" to "default".
      2. Drop the "ID" column.
      3. Remove records with unavailable information: rows containing
         missing values, and rows where EDUCATION or MARRIAGE is 0.
      4. Group every EDUCATION value above 4 into category 4 ("others").

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the "ID", "EDUCATION" and "MARRIAGE"
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        # NaN compares unequal to 0, so missing values would otherwise pass
        # the "!= 0" filters below and reach the model.
        .dropna()
    )

    has_known_values = (cleaned["EDUCATION"] != 0) & (cleaned["MARRIAGE"] != 0)
    # Explicit copy so the assignment below targets an independent frame.
    cleaned = cleaned.loc[has_known_values].copy()

    # Values above 4 are collapsed into 4; values <= 4 are unchanged.
    cleaned["EDUCATION"] = cleaned["EDUCATION"].clip(upper=4)

    return cleaned

In [9]:
# Apply the same cleaning routine to both raw splits.
df_train, df_test = map(clean_data, (train_csv, test_csv))

## Paso 2: Dividir los datasets

In [10]:
# Split each dataset into features (x_*) and the "default" target (y_*).

x_train = df_train.copy()
y_train = x_train.pop("default")

x_test = df_test.copy()
y_test = x_test.pop("default")

## Paso 3: Crear un pipeline para el modelo de clasificación

In [11]:
# Build the classification pipeline. The assignment lists these layers:
# - one-hot encode the categorical variables,
# - decompose the input matrix with principal components (all components),
# - scale the input matrix to the [0, 1] interval,
# - select the K most relevant columns of the input matrix,
# - fit an MLP neural network.
# NOTE(review): the steps below standardize the non-categorical columns with
# StandardScaler (which does not map to [0, 1]) and run SelectKBest before
# PCA, which differs from the order listed above — confirm this is intended.

categorical = ["SEX", "EDUCATION", "MARRIAGE"]
numeric = [column for column in x_train.columns if column not in categorical]

# Encoder for the categorical columns, configured with handle_unknown="ignore".
categorical_trans = OneHotEncoder(handle_unknown="ignore")

# Scaler applied to every remaining (non-categorical) column.
numerical_trans = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_trans, categorical),
        ("scaler", numerical_trans, numeric),
    ],
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", SelectKBest(score_func=f_classif)),
        ("pca", PCA()),
        # Fixed random_state keeps the fitted network reproducible.
        ("classifier", MLPClassifier(max_iter=15000, random_state=17)),
    ],
)

## Paso 4: Optimizar los hiperparámetros del pipeline.

In [12]:
# Tune the pipeline hyperparameters with cross-validation: 10 splits,
# scored with balanced accuracy. Each grid entry currently holds a single
# candidate value, so the search evaluates exactly one configuration.

param = dict(
    pca__n_components=[None],
    feature_selection__k=[20],
    classifier__hidden_layer_sizes=[(50, 30, 40, 60)],
    classifier__alpha=[0.26],
    classifier__learning_rate_init=[0.001],
)

grid_search = GridSearchCV(
    pipeline,
    param,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


0,1,2
,estimator,Pipeline(step...m_state=17))])
,param_grid,"{'classifier__alpha': [0.26], 'classifier__hidden_layer_sizes': [(50, ...)], 'classifier__learning_rate_init': [0.001], 'feature_selection__k': [20], ...}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[['cat', OneHotEncoder...nown='ignore'), ...], ['scaler', StandardScaler(), ...]]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...002BA7A36A020>
,k,20

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,hidden_layer_sizes,"(50, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.26
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,15000
,shuffle,True


## Paso 5: Guardar el modelo.

In [13]:
# Persist the fitted search object, gzip-compressed, as
# "files/models/model.pkl.gz" (the gzip library handles the compression).

os.makedirs("../files/models/", exist_ok=True)

with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(grid_search, model_file)

## Paso 6: Calcular las métricas

In [14]:
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba.

def calculate_metrics(y_true, y_pred, name):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    name : str
        Value stored under the "dataset" key (the callers in this notebook
        pass "train" or "test").

    Returns
    -------
    dict
        Record with keys "type", "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score".
    """
    record = {"type": "metrics", "dataset": name}
    # zero_division=0 is passed to precision, recall and F1.
    record["precision"] = precision_score(y_true, y_pred, zero_division=0)
    record["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    record["recall"] = recall_score(y_true, y_pred, zero_division=0)
    record["f1_score"] = f1_score(y_true, y_pred, zero_division=0)
    return record

In [15]:
# Predict with the refitted search object on both feature matrices.
y_train_pred, y_test_pred = (
    grid_search.predict(features) for features in (x_train, x_test)
)

In [16]:
# Score both splits and start the list of records written to metrics.json.
train_metrics = calculate_metrics(y_true=y_train, y_pred=y_train_pred, name="train")
test_metrics = calculate_metrics(y_true=y_test, y_pred=y_test_pred, name="test")

metrics = [train_metrics, test_metrics]

## Paso 7: Calcular la matriz de confusión

In [17]:
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba.

def calc_confusion_matrices(y_train, y_test, y_train_pred, y_test_pred):
    """Build the confusion-matrix records for the train and test splits.

    Parameters
    ----------
    y_train, y_test : array-like
        Ground-truth labels of each split.
    y_train_pred, y_test_pred : array-like
        Predicted labels of each split.

    Returns
    -------
    list of dict
        Two records, train first and test second. Each has keys "type",
        "dataset", "true_0" and "true_1"; the last two map "predicted_0"
        and "predicted_1" to plain Python int counts.
    """

    def _as_record(dataset, y_true, y_pred):
        # Pinning labels keeps the matrix 2x2 even when only one class is
        # present in both y_true and y_pred; without it the matrix would be
        # 1x1 and the [0, 1] / [1, x] lookups below would raise IndexError.
        # Assumes the target is encoded as 0/1, as the record keys imply.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            'type': 'cm_matrix',
            'dataset': dataset,
            # int() converts the numpy counts so json.dumps can serialize them.
            'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
            'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])},
        }

    return [
        _as_record('train', y_train, y_train_pred),
        _as_record('test', y_test, y_test_pred),
    ]

In [18]:
# The helper returns [train_record, test_record]; unpack them by position.
cm_train, cm_test = calc_confusion_matrices(
    y_train=y_train,
    y_test=y_test,
    y_train_pred=y_train_pred,
    y_test_pred=y_test_pred,
)

In [19]:
# Add the confusion-matrix records after the metric records, train first.
metrics.extend([cm_train, cm_test])

In [20]:
# Crear carpeta destino si no existe
os.makedirs("../files/output", exist_ok=True)

# Guardar metricas en archivo JSON
with open("../files/output/metrics.json", "w") as file:
        for metric in metrics:
            file.write(json.dumps(metric, ensure_ascii=False))
            file.write('\n')