In [1]:
# Carga de datos

import pandas as pd

# Both splits are stored as zip-compressed CSV files; index_col=False keeps
# pandas from using the first column as the index.
train_data = pd.read_csv(
    "../files/input/train_data.csv.zip", index_col=False, compression="zip"
)

test_data = pd.read_csv(
    "../files/input/test_data.csv.zip", index_col=False, compression="zip"
)



In [2]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
# - Renombre la columna "default payment next month" a "default"
# - Remueva la columna "ID".

# Give the target column the shorter name "default" in both splits.
train_data = train_data.rename({'default payment next month': 'default'}, axis='columns')
test_data = test_data.rename({'default payment next month': 'default'}, axis='columns')

# print(test_data.columns)
# print(train_data.columns)

In [3]:
# The "ID" column is removed from both splits before modelling.
train_data = train_data.drop(labels=['ID'], axis='columns')
test_data = test_data.drop(labels=['ID'], axis='columns')

# print(test_data.columns)
# print(train_data.columns)

In [4]:
import numpy as np

# Remove the records with unavailable information:
# - rows whose MARRIAGE or EDUCATION code is 0 are filtered out;
# - rows holding NaN in any column are dropped too (NaN != 0 evaluates to
#   True, so the code filter alone would let them through).
# .copy() detaches each cleaned frame from the frame it was sliced from, so the
# column assignments made in later cells do not emit SettingWithCopyWarning.
train_data = (
    train_data.loc[(train_data["MARRIAGE"] != 0) & (train_data["EDUCATION"] != 0)]
    .dropna()
    .copy()
)

test_data = (
    test_data.loc[(test_data["MARRIAGE"] != 0) & (test_data["EDUCATION"] != 0)]
    .dropna()
    .copy()
)

In [5]:
# Every EDUCATION code above 4 is collapsed into 4 (the "others" category);
# codes of 4 or less are left untouched.
train_data['EDUCATION'] = train_data['EDUCATION'].clip(upper=4)
test_data['EDUCATION'] = test_data['EDUCATION'].clip(upper=4)

In [6]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

# For each split, y is the "default" column and x is every other column.
y_train = train_data["default"]
x_train = train_data.drop(columns=["default"])

y_test = test_data["default"]
x_test = test_data.drop(columns=["default"])

In [9]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando componentes principales.
#   El pca usa todas las componentes.
# - Escala la matriz de entrada al intervalo [0, 1].
# - Selecciona las K columnas mas relevantes de la matriz de entrada.
# - Ajusta una red neuronal tipo MLP.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.neural_network import MLPClassifier

# Columns that are one-hot encoded; every other column of x_train goes
# through the scaler.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
non_categoriacal_features = x_train.columns.drop(categorical_features).tolist()

# NOTE(review): the step description lists a "[0, 1] scaling" stage placed
# after the PCA. This pipeline instead standardises the non-categorical columns
# with StandardScaler before the PCA and has no [0, 1] scaling stage.
# Confirm whether MinMaxScaler was intended; it is not changed here because
# doing so would change the fitted model.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising, so a
        # cross-validation fold or new data with an unseen code cannot make
        # transform/predict fail.
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ('scaler', StandardScaler(), non_categoriacal_features),
    ],
    remainder="passthrough",
)

# Stage names are referenced by the hyper-parameter grid
# ("feature_selection__k", "classifier__...") and must stay as they are.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        # n_components=None keeps all principal components.
        ('pca', PCA(n_components=None)),
        # k is left at its default here and set through the grid search.
        ('feature_selection', SelectKBest(score_func=f_classif)),
        # Fixed random_state so repeated fits give the same network.
        ('classifier', MLPClassifier(max_iter=1000, random_state=42)),
    ]
)

In [10]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.metrics import precision_score


# Hyper-parameter grid with a single candidate per entry. Keys follow the
# "<pipeline step>__<parameter>" convention.
param_grid = dict(
    feature_selection__k=[20],  # number of columns kept by SelectKBest
    classifier__hidden_layer_sizes=[(100, 50, 100, 50, 100, 100)],  # six hidden layers
    classifier__activation=['relu'],
    classifier__alpha=[0.5],  # L2 regularisation strength
    classifier__learning_rate_init=[0.0003],
)

# 10-fold cross-validation scored with balanced accuracy; refit=True retrains
# the best candidate on the whole training split so `model` can predict.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    refit=True,
)
model.fit(x_train, y_train)

# Predictions of the refitted best estimator on both splits.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# The two figures printed below are precision scores of the refitted model,
# not the cross-validated balanced accuracy used to select it.
print("Mejores parámetros encontrados:", model.best_params_)
print("Mejor puntuación precision train:", precision_score(y_train, y_train_pred, zero_division=0))
print("Mejor puntuación precision test:", precision_score(y_test, y_test_pred, zero_division=0))

#y_pred = model.best_estimator_.predict(x_train)
#test_score = balanced_accuracy_score(x_test, y_pred)
#print("Precisión balanceada en el conjunto de prueba:", test_score)

Mejores parámetros encontrados: {'classifier__activation': 'relu', 'classifier__alpha': 0.5, 'classifier__hidden_layer_sizes': (100, 50, 100, 50, 100, 100), 'classifier__learning_rate_init': 0.0003, 'feature_selection__k': 20}
Mejor puntuación precision train: 0.6964705882352941
Mejor puntuación precision test: 0.6742700729927007


In [11]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

import os
import gzip
import pickle

# Destination of the serialized model: <models_dir>/model.pkl.gz
models_dir = '../files/models'
gzip_path = os.path.join(models_dir, "model.pkl.gz")

# Make sure the target directory exists before writing into it.
os.makedirs(name=models_dir, exist_ok=True)

# Pickle the fitted search object straight into a gzip-compressed stream.
with gzip.open(gzip_path, mode="wb") as gz_file:
    pickle.dump(model, gz_file)

In [12]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}
#

import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

def calculate_save_metrics(model, x_train, x_test, y_train, y_test):
    """Score ``model`` on both splits and write the results to ``metrics.json``.

    For the train split and then the test split, the output of
    ``model.predict`` is compared with the true labels to obtain precision,
    balanced accuracy, recall and F1. Each split becomes one JSON object on
    its own line of ``../files/output/metrics.json``. The output directory is
    created when missing and an existing file is overwritten.

    Parameters
    ----------
    model : object exposing ``predict``
        Fitted estimator used to produce the predictions.
    x_train, x_test :
        Inputs handed to ``model.predict``.
    y_train, y_test :
        True labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    None
    """

    def score_split(split_name, features, targets):
        # Build the metrics record of a single split.
        predicted = model.predict(features)
        return {
            'type': 'metrics',
            'dataset': split_name,
            'precision': precision_score(targets, predicted, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(targets, predicted),
            'recall': recall_score(targets, predicted, zero_division=0),
            'f1_score': f1_score(targets, predicted, zero_division=0),
        }

    records = (
        score_split('train', x_train, y_train),
        score_split('test', x_test, y_test),
    )

    output_dir = '../files/output'
    os.makedirs(output_dir, exist_ok=True)

    # Mode 'w' starts the file from scratch: one JSON object per line.
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')


In [13]:
# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
#

from sklearn.metrics import confusion_matrix

def calculate_save_cm(model, x_train, x_test, y_train, y_test):
    """Append the train and test confusion matrices to ``metrics.json``.

    The confusion matrix of each split is computed from ``model.predict`` and
    written as one JSON object per line to ``../files/output/metrics.json``.
    The file is opened in append mode, so lines already present (for example
    the ones written by ``calculate_save_metrics``) are kept.

    Parameters
    ----------
    model : object exposing ``predict``
        Fitted estimator used to produce the predictions.
    x_train, x_test :
        Inputs handed to ``model.predict``.
    y_train, y_test :
        True labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    None
    """
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    def format_cm(cm, dataset_type):
        # Rows of `cm` are the true classes and columns the predicted ones.
        # Only the four cells of a 2x2 matrix are read (assumes two classes,
        # reported as 0 and 1). int() turns the numpy integers into plain
        # ints that json.dumps can serialize.
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1]),
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1]),
            },
        }

    metrics = [
        format_cm(cm_train, 'train'),
        format_cm(cm_test, 'test'),
    ]

    output_path = '../files/output/metrics.json'
    # Create the output directory here too: without it, open() fails with
    # FileNotFoundError whenever this function runs before anything else has
    # created '../files/output'.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, 'a') as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')






In [14]:
def main(model, x_train, x_test, y_train, y_test):
    """Write the evaluation report of ``model`` to ``metrics.json``.

    ``calculate_save_metrics`` runs first because it rewrites the file;
    ``calculate_save_cm`` runs second and appends to it. All arguments are
    forwarded unchanged to both functions.
    """
    report_steps = (calculate_save_metrics, calculate_save_cm)
    for write_step in report_steps:
        write_step(model, x_train, x_test, y_train, y_test)


main(model, x_train, x_test, y_train, y_test)