# Paso 1: Limpieza del Dataset

1. Renombre la columna "default payment next month" a "default".
2. Remueva la columna "ID".
3. Elimine los registros con información no disponible.
4. Para la columna EDUCATION, valores > 4 indican niveles superiores de educación, agrupe estos valores en la categoría "others". 

In [1339]:
# Leer Dataset

import pandas as pd

# Read the train and test splits with identical reader options.
# index_col=False tells pandas not to use the first CSV column as the index.
train, test = (
    pd.read_csv(path, index_col=False, compression="zip")
    for path in (
        "../files/input/train_data.csv.zip",
        "../files/input/test_data.csv.zip",
    )
)

In [1340]:
# Rename the target column "default payment next month" to "default".
# The same mapping is applied to both splits so their schemas stay aligned.
target_rename = {"default payment next month": "default"}
train = train.rename(columns=target_rename)
test = test.rename(columns=target_rename)

In [1341]:
# Remove the "ID" column from both splits.
# errors="ignore" plus rebinding (instead of inplace=True) keeps this cell
# idempotent: re-running it after "ID" is already gone no longer raises KeyError.
train = train.drop(columns=["ID"], errors="ignore")
test = test.drop(columns=["ID"], errors="ignore")

In [1342]:
# Discard every row that contains at least one missing value.
train, test = train.dropna(), test.dropna()

In [1343]:
# Rows whose EDUCATION or MARRIAGE code is 0 are treated as "information not
# available" and removed. Both conditions are combined into a single mask per
# split, which keeps exactly the rows the two sequential filters would keep.
train = train.loc[train['EDUCATION'].ne(0) & train['MARRIAGE'].ne(0)]
test = test.loc[test['EDUCATION'].ne(0) & test['MARRIAGE'].ne(0)]

In [1344]:
# For EDUCATION, codes above 4 are folded into category 4 ("others").
# clip(upper=4) maps every value > 4 to 4 and leaves the rest untouched.
# assign() returns a new frame, so nothing is written into a frame that came
# out of boolean filtering (the masked .loc assignment used before can emit
# SettingWithCopyWarning on pandas versions without copy-on-write), and the
# cell gives the same result no matter how often it is re-run.
train = train.assign(EDUCATION=train['EDUCATION'].clip(upper=4))
test = test.assign(EDUCATION=test['EDUCATION'].clip(upper=4))

# Paso 2: División del Dataset

1. Divida los datasets en x_train, y_train, x_test, y_test.

In [1345]:
# Separate predictors (every column except 'default') from the target column.
x_train, y_train = train.drop(columns=['default']), train['default']
x_test, y_test = test.drop(columns=['default']), test['default']

# Paso 3: Pipeline para el modelo de Clasificación

1. Transforma las variables categóricas usando el método one-hot-encoding.
2. Descompone la matriz de entrada usando componentes principales. El PCA usa todas las componentes.
3. Escala la matriz de entrada al intervalo [0, 1].
4. Selecciona las K columnas más relevantes de la matriz de entrada.
5. Ajusta una red neuronal tipo MLP.

In [1346]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

# Columns holding category codes; these are one-hot encoded.
categoricas = ['SEX', 'EDUCATION', 'MARRIAGE']

# Preprocessing: one-hot encode the categorical columns and pass every other
# column (the remainder) through StandardScaler.
# NOTE(review): the step list for this section asks for scaling to the
# interval [0, 1]; StandardScaler standardizes to zero mean / unit variance
# instead. Left unchanged because the recorded scores depend on it — confirm
# whether MinMaxScaler was intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categoricas)
    ],
    remainder=StandardScaler()
)

# Paso 4: Optimización

1. Optimice los hiperparámetros del pipeline usando validación cruzada.
2. Use 10 splits para la validación cruzada.

    Use la función de precisión balanceada para medir la precisión del modelo.

In [1347]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Cross-validation splitter used by the grid search: 10 folds, shuffled with a
# fixed seed so the fold assignment is the same on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [1348]:
# MLP classifier placed at the end of the pipeline. Its architecture and
# regularisation are supplied by the search grid below.
# NOTE(review): per the scikit-learn docs, validation_fraction is only used
# when early_stopping=True, which is not enabled here — confirm it is meant
# to be inert.
classifier = MLPClassifier(
    random_state=42,
    validation_fraction=0.1,
    n_iter_no_change=10,
)

# Step order: preprocessing -> univariate feature selection -> PCA -> MLP.
# NOTE(review): the step list for this section puts PCA (all components) and
# scaling before the K-best selection; here selection runs before PCA.
# Confirm the ordering is intentional.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", SelectKBest()),
        ("pca", PCA()),
        ("classifier", classifier),
    ]
)

# Every entry lists a single candidate, so the "search" evaluates exactly one
# configuration under cross-validation.
parameter_grid = {
    "pca__n_components": [14],
    "feature_selection__k": [20],
    "classifier__hidden_layer_sizes": [(70, 50, 40, 60)],
    "classifier__alpha": [0.24],
    "classifier__learning_rate_init": [0.001],
    "classifier__max_iter": [21000],
}

# Score candidates by balanced accuracy on the shared `cv` splitter, use all
# available cores, and refit the best configuration on the full training set.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    cv=cv,
    scoring="balanced_accuracy",
    n_jobs=-1,
    refit=True,
)
model.fit(x_train, y_train)

print(f"Mejores parámetros: {model.best_params_}")
print(f"Precisión: {model.best_score_}")

Mejores parámetros: {'classifier__alpha': 0.24, 'classifier__hidden_layer_sizes': (70, 50, 40, 60), 'classifier__learning_rate_init': 0.001, 'classifier__max_iter': 21000, 'feature_selection__k': 20, 'pca__n_components': 14}
Precisión: 0.6590221467902995


In [1349]:
# Score of the refitted search object on each split, using the scorer
# configured on the search ("balanced_accuracy").
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print(train_score, test_score)

0.6615585055811825 0.6698860659982859


# Paso 5: Guardar el Modelo

1. Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".

    Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

In [1350]:
import gzip
import pickle
import os

# Persist the fitted search object as a gzip-compressed pickle.
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Derive the file path from models_dir so the directory that gets created and
# the directory that is written to cannot drift apart.
output_path = os.path.join(models_dir, "model.pkl.gz")
with gzip.open(output_path, 'wb') as file:
    pickle.dump(model, file)

# Paso 6: Métricas

1. Calcule las métricas de precisión, precisión balanceada, recall y f1-score para los conjuntos de entrenamiento y prueba.
2. Guárdelas en el archivo files/output/metrics.json.

    Cada fila del archivo es un diccionario con las métricas de un modelo. Este diccionario tiene un campo para indicar si es el conjunto de entrenamiento o prueba. Por ejemplo:

{'type': 'metrics', 'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}

{'type': 'metrics', 'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}


In [1351]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

def _show_scores(title, precision, balanced_accuracy, recall, f1):
    """Print a heading followed by the four scores, each with four decimals."""
    print(title)
    print(f"Precisión: {precision:.4f}")
    print(f"Precisión balanceada: {balanced_accuracy:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")


# Predictions of the fitted search object on both splits.
y_train_pred, y_test_pred = model.predict(x_train), model.predict(x_test)

# Training-split scores, in the order: precision, balanced accuracy, recall, F1.
precision_train, balanced_accuracy_train, recall_train, f1_train = (
    scorer(y_train, y_train_pred)
    for scorer in (precision_score, balanced_accuracy_score, recall_score, f1_score)
)

# Test-split scores, same order.
precision_test, balanced_accuracy_test, recall_test, f1_test = (
    scorer(y_test, y_test_pred)
    for scorer in (precision_score, balanced_accuracy_score, recall_score, f1_score)
)

# Report both splits.
_show_scores(
    "Métricas para el conjunto de entrenamiento:",
    precision_train,
    balanced_accuracy_train,
    recall_train,
    f1_train,
)
_show_scores(
    "\nMétricas para el conjunto de prueba:",
    precision_test,
    balanced_accuracy_test,
    recall_test,
    f1_test,
)


Métricas para el conjunto de entrenamiento:
Precisión: 0.6912
Precisión balanceada: 0.6616
Recall: 0.3714
F1-score: 0.4832

Métricas para el conjunto de prueba:
Precisión: 0.6773
Precisión balanceada: 0.6699
Recall: 0.3898
F1-score: 0.4948


In [1352]:
# Guardar las métricas en un archivo json

import json

# One record per split, holding the scores computed in the previous cell.
# float() converts the scores to built-in floats, as the later metrics cell does.
metrics = [
    {
        "type": "metrics",
        'dataset': 'train',
        'precision': float(precision_train),
        'balanced_accuracy': float(balanced_accuracy_train),
        'recall': float(recall_train),
        'f1_score': float(f1_train),
    },
    {
        "type": "metrics",
        'dataset': 'test',
        'precision': float(precision_test),
        'balanced_accuracy': float(balanced_accuracy_test),
        'recall': float(recall_test),
        'f1_score': float(f1_test),
    }
]

output_dir = "../files/output"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir,"metrics.json")

# Write one JSON object per line. Dumping the whole list would produce a
# single line holding a JSON array, which is not the "one dictionary per row"
# layout this file is specified to have.
with open(output_path,"w") as file:
    for record in metrics:
        file.write(json.dumps(record) + '\n')


# Paso 7: Matrices de Confusión

1. Calcule las matrices de confusión para los conjuntos de entrenamiento y prueba.

2. Guárdelas en el archivo files/output/metrics.json.

    Cada fila del archivo es un diccionario con las métricas de un modelo de entrenamiento o prueba. Por ejemplo:

{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}

{'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}


In [1353]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
)
import json

def _metrics_record(dataset, y_true, y_pred):
    """Build the "metrics" row for one split.

    Each score is cast to a built-in float before being stored.
    """
    return {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision_score(y_true, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred)),
        "f1_score": float(f1_score(y_true, y_pred)),
    }


# Predictions of the fitted search object on both splits.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Train row first, then test row.
metrics = [
    _metrics_record("train", y_train, y_train_pred),
    _metrics_record("test", y_test, y_test_pred),
]

In [1354]:
from sklearn.metrics import confusion_matrix
import os

def _cm_record(dataset, cm):
    """Build the "cm_matrix" row for one split from a 2x2 confusion matrix.

    `cm[i, j]` is read as the count of samples with true label i predicted as
    label j; counts are cast to built-in ints so they serialize as JSON.
    """
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {
            "predicted_0": int(cm[0, 0]),
            "predicted_1": int(cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(cm[1, 0]),
            "predicted_1": int(cm[1, 1]),
        },
    }


# labels=[0, 1] pins the matrix to a 2x2 layout. Without it, a split in which
# only one class appears among the true and predicted labels yields a 1x1
# matrix and the [0, 1] / [1, *] lookups raise IndexError.
train_cm = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
test_cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

confusion_matrices = [
    _cm_record("train", train_cm),
    _cm_record("test", test_cm),
]

output_file = "../files/output/metrics.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Final file layout: the metrics rows followed by the confusion-matrix rows,
# one JSON object per line. Opening with "w" replaces any earlier content.
output_data = metrics + confusion_matrices

with open(output_file, "w") as f:
    for item in output_data:
        f.write(json.dumps(item) + "\n")