In [217]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
import gzip
import json
import os
import pickle
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier



In [218]:
# Step 1

# Load the zip-compressed training and test CSV files.
csv_read_options = {"index_col": False, "compression": "zip"}
df_train = pd.read_csv("../files/input/train_data.csv.zip", **csv_read_options)
df_test = pd.read_csv("../files/input/test_data.csv.zip", **csv_read_options)

In [219]:
# Rename the column "default payment next month" to "default".
target_rename = {'default payment next month': 'default'}
df_train = df_train.rename(columns=target_rename)
df_test = df_test.rename(columns=target_rename)


In [220]:
# Remove the "ID" column.
# errors="ignore" keeps this cell re-runnable: once "ID" has been dropped, a
# second execution would otherwise raise KeyError.
df_train = df_train.drop(columns=['ID'], errors="ignore")
df_test = df_test.drop(columns=['ID'], errors="ignore")

In [221]:
# Remove records with unavailable information: rows containing NaN and rows
# where EDUCATION or MARRIAGE is coded as 0.
df_train = df_train.dropna()
df_train = df_train[(df_train["EDUCATION"] != 0) & (df_train["MARRIAGE"] != 0)]

df_test = df_test.dropna()
df_test = df_test[(df_test["EDUCATION"] != 0) & (df_test["MARRIAGE"] != 0)]

In [222]:
# Group EDUCATION values greater than 4 into the "others" category (code 4).
for frame in (df_train, df_test):
    frame.loc[frame["EDUCATION"] > 4, "EDUCATION"] = 4

In [223]:
# Step 2
# Split each dataset into its feature matrix (x_*) and its target vector (y_*).
x_train, y_train = df_train.drop(columns=["default"]), df_train["default"]
x_test, y_test = df_test.drop(columns=["default"]), df_test["default"]

In [224]:
# Step 3.
# Build a pipeline for the classification model. The assignment asks for
# the following layers:
# - Transform the categorical variables with one-hot encoding.
# - Decompose the input matrix with principal components.
#   The PCA uses all the components.
# - Scale the input matrix to the [0, 1] interval.
# - Select the K most relevant columns of the input matrix.
# - Fit an MLP neural network.
#
# NOTE(review): the numeric columns are standardized with StandardScaler
# *before* PCA instead of being scaled to [0, 1] after it, so the code
# departs from the layer list above. Confirm this is intentional before
# changing it, because it alters the fitted model.

# Categorical columns; every other column of x_train is treated as numeric.
categorical_features=["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = num_columns = [col for col in x_train.columns if col not in categorical_features]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that is missing from the data
        # the encoder was fitted on (e.g. the training part of a CV fold) is
        # encoded as all zeros instead of raising at transform time. The
        # encoding of categories seen during fit is unchanged.
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ('scaler', StandardScaler(with_mean=True, with_std=True), numerical_features),
    ],
)

# Assemble the pipeline. It is only constructed here; the fit happens when
# the grid search is run.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('pca', PCA()),
        # k is left at its default here and set through the search grid.
        ('feature_selection', SelectKBest(score_func=f_classif)),
        ("classifier", MLPClassifier(max_iter=20000, random_state=17)),
    ],
)

print("Pipeline de clasificación ajustado exitosamente")

Pipeline de clasificación ajustado exitosamente


In [225]:
# Step 4.
# Optimize the pipeline hyperparameters using cross-validation.
# Use 10 splits for the cross-validation and balanced accuracy as the
# score that measures model quality.

param_grid = dict(
    pca__n_components=[None],
    feature_selection__k=[20],
    classifier__hidden_layer_sizes=[(50, 40, 60, 70)],
    classifier__alpha=[0.27],
    classifier__learning_rate_init=[0.002],
)

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    refit=True,
    n_jobs=-1,
)
model.fit(x_train, y_train)

# Pipeline refitted with the best parameter combination found.
best_pipeline = model.best_estimator_
print(best_pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE']),
                                                 ('scaler', StandardScaler(),
                                                  ['LIMIT_BAL', 'AGE', 'PAY_0',
                                                   'PAY_2', 'PAY_3', 'PAY_4',
                                                   'PAY_5', 'PAY_6',
                                                   'BILL_AMT1', 'BILL_AMT2',
                                                   'BILL_AMT3', 'BILL_AMT4',
                                                   'BILL_AMT5', 'BILL_AMT6',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
                                                   'PAY_AMT5', 'PAY_AMT

In [226]:
# Step 5.
# Save the model (gzip-compressed) as "files/models/model.pkl.gz".
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Destination file for the pickled grid-search object
compressed_model_path = "../files/models/model.pkl.gz"

with gzip.open(compressed_model_path, mode="wb") as model_file:
    pickle.dump(model, model_file)

In [227]:
# Step 6.
# Compute precision, balanced accuracy, recall and f1-score for the
# training and test sets and save them to files/output/metrics.json.
# Each line of the file is a dictionary with the metrics of one model;
# its 'dataset' field says whether it refers to the training or test set.

def calculate_and_save_metrics(model, X_train, X_test, y_train, y_test):
    """Score `model` on both splits and write the results to metrics.json.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Features of the training split.
        X_test: Features of the test split.
        y_train: True labels of the training split.
        y_test: True labels of the test split.

    Side effects:
        Creates ``../files/output`` if needed and overwrites
        ``../files/output/metrics.json`` with two JSON lines (train, then
        test). Both records are also printed.
    """
    def build_metrics(dataset, y_true, y_pred):
        # float(...) unwraps numpy scalars so the record holds plain Python
        # floats; the JSON text is the same, the printed dict is cleaner.
        return {
            'type': 'metrics',
            'dataset': dataset,
            'precision': float(precision_score(y_true, y_pred, zero_division=0)),
            'balanced_accuracy': float(balanced_accuracy_score(y_true, y_pred)),
            'recall': float(recall_score(y_true, y_pred, zero_division=0)),
            'f1_score': float(f1_score(y_true, y_pred, zero_division=0)),
        }

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics_train = build_metrics('train', y_train, y_train_pred)
    metrics_test = build_metrics('test', y_test, y_test_pred)

    # Create the output folder if it does not exist
    output_dir = '../files/output'
    os.makedirs(output_dir, exist_ok=True)

    # Save the metrics as JSON lines; 'w' starts from a clean file
    output_path = os.path.join(output_dir, 'metrics.json')
    with open(output_path, 'w') as f:
        for record in (metrics_train, metrics_test):
            f.write(json.dumps(record) + '\n')

    print(metrics_train)
    print(metrics_test)


In [228]:


def calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test):
    """Append the train and test confusion matrices to metrics.json.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Features of the training split.
        X_test: Features of the test split.
        y_train: True labels of the training split (classes 0 and 1).
        y_test: True labels of the test split (classes 0 and 1).

    Side effects:
        Appends two JSON lines (train, then test) to
        ``../files/output/metrics.json``, creating the folder if needed.
    """
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 even when a class is absent from
    # both the true labels and the predictions of a split.
    cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
    cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

    # Convert a confusion matrix into its JSON-serializable record.
    def format_confusion_matrix(cm, dataset_type):
        # Read from the `cm` argument: the previous version always indexed
        # cm_train, so the 'test' record repeated the training matrix.
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1])
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1])
            },
        }

    metrics = [
        format_confusion_matrix(cm_train, 'train'),
        format_confusion_matrix(cm_test, 'test')
    ]

    # Save the confusion matrices in the same JSON file; 'a' appends them
    # after the metric records already written there.
    output_path = '../files/output/metrics.json'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'a') as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')

# Entry point that runs the whole reporting step
def main(model, X_train, X_test, y_train, y_test):
    """Write the metrics and then the confusion matrices for `model`.

    Ensures ``../files/output`` exists, then runs the metrics report
    followed by the confusion-matrix report on the same arguments.
    """
    os.makedirs('../files/output', exist_ok=True)

    # Order matters: the metrics report rewrites the file and the
    # confusion-matrix report appends to it.
    reporting_steps = (
        calculate_and_save_metrics,
        calculate_and_save_confusion_matrices,
    )
    for report in reporting_steps:
        report(model, X_train, X_test, y_train, y_test)

# Example usage: report on the fitted grid search with the notebook's splits
main(
    model=model,
    X_train=x_train,
    X_test=x_test,
    y_train=y_train,
    y_test=y_test,
)



{'type': 'metrics', 'dataset': 'train', 'precision': 0.6921212121212121, 'balanced_accuracy': np.float64(0.6577919019057792), 'recall': 0.36253968253968255, 'f1_score': 0.47583333333333333}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.6842105263157895, 'balanced_accuracy': np.float64(0.664237433071303), 'recall': 0.3751311647429171, 'f1_score': 0.4845814977973568}
