# LAB 4

## Configuración de variables

In [1]:
from sklearn.neural_network import MLPClassifier

test_path = '../files/input/test_data.csv.zip'
train_path = '../files/input/train_data.csv.zip'
categorical = ['SEX', 'EDUCATION', 'MARRIAGE']
estimator = MLPClassifier(max_iter=1000, random_state=420)
"""
param_grid = {
    'classifier__hidden_layer_sizes': [(30, 15), (100, 50)],
    'classifier__activation': ['relu'],
    'classifier__alpha': [0.001, 0.01, 0.1],
    'classifier__max_iter': [200],
    'classifier__solver': ['adam', 'lbfgs'],
    'classifier__learning_rate_init': [0.001, 0.01], 
    'classifier__early_stopping': [True, False],
    'selector__k': [15, 20],
}

param_grid = {
    'classifier__hidden_layer_sizes': [(100, 50)],
    'classifier__activation': ['relu'],
    'classifier__alpha': [0.1],
    'classifier__max_iter': [200],
    'classifier__solver': ['adam'],
    'classifier__learning_rate_init': [0.01], 
    'classifier__early_stopping': [False],
    'selector__k': [20],
}
"""

param_grid = {
    'pca__n_components': [20],
    'classifier__hidden_layer_sizes': [(50, 30, 40, 60)],
    'classifier__alpha': [0.256],
    'classifier__learning_rate': ['adaptive'],
    'classifier__activation': ['relu'],
    'classifier__solver': ['adam', 'lbfgs'],
    'classifier__learning_rate_init': [0.001],
    'selector__k': [15, 20],
}
filepath_model = '../files/models'
name_model = 'model.pkl.gz'

## Paso 1.
Realice la limpieza de los datasets:
- Renombre la columna "default payment next month" a "default".
- Remueva la columna "ID".
- Elimine los registros con informacion no disponible.
- Para la columna EDUCATION, valores > 4 indican niveles superiores de educación, agrupe estos valores en la categoría "others".
- Renombre la columna "default payment next month" a "default"
- Remueva la columna "ID".

In [2]:
def clean_data(df):

    df = df.rename(columns = {'default payment next month': 'default'})     # Renombrar columna target
    df = df.drop(columns = ['ID'])                                          # Eliminar columna 'ID'
    df = df.dropna()                                                        # Eliminar filas con valores nulos
    df['EDUCATION'] = df['EDUCATION'].astype(int)                           # Convertir 'EDUCATION' a entero
    df.loc[df['EDUCATION'] > 4, 'EDUCATION'] = 4                            # Limitar 'EDUCATION' a 4
    df = df.query('MARRIAGE != 0 and EDUCATION != 0')                       # Filtrar filas

    return df

## Paso 2.

Divida los datasets en x_train, y_train, x_test, y_test.

In [3]:
def load_clean_data(test_path, train_path):
    import pandas as pd

    # Cargar datos
    train = pd.read_csv(train_path, index_col=False, compression='zip')
    test = pd.read_csv(test_path, index_col=False, compression='zip')

    # Limpiar datos
    train = clean_data(train)
    test = clean_data(test)

    # Separar características y target
    target = 'default'
    data_test = test.drop(columns=[target])
    data_train = train.drop(columns=[target])
    target_test = test[target].astype(int)
    target_train = train[target].astype(int)

    return data_train, target_train, data_test, target_test

In [4]:
X_train, y_train, X_test, y_test = load_clean_data(test_path, train_path)

## Paso 3.
Cree un pipeline para el modelo de clasificación. Este pipeline debe contener las siguientes capas:
- Transforma las variables categoricas usando el método one-hot-encoding.
- Descompone la matriz de entrada usando componentes principales. El pca usa todas las componentes.
- Escala la matriz de entrada al intervalo [0, 1].
- Selecciona las K columnas mas relevantes de la matrix de entrada.
- Ajusta una red neuronal tipo MLP.

In [5]:
def make_pipeline():
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.neural_network import MLPClassifier

    numerical = [col for col in X_train.columns if col not in categorical]

    transformer = ColumnTransformer(transformers=[
        ('scaler', StandardScaler(), numerical),
        ('onehot', OneHotEncoder(), categorical),
    ],
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', transformer),
        ('selector', SelectKBest(score_func=f_classif)),
        ('pca', PCA()),
        ('classifier', MLPClassifier(max_iter=1000, random_state=420))
    ])

    return pipeline

## Paso 4.

- Optimice los hiperparametros del pipeline usando validación cruzada.
- Use 10 splits para la validación cruzada. 
- Use la función de precision balanceada para medir la precisión del modelo.

In [6]:
def make_grid_search(pipeline, param_grid, n_folds = 10, n_jobs = -1):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import make_scorer, balanced_accuracy_score, precision_score, recall_score, f1_score

    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               cv=n_folds,
                               scoring='balanced_accuracy',
                               n_jobs=n_jobs,
                               refit=True,)

    return grid_search

"""
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    scorers = {
        'accuracy': 'accuracy',
        'balanced_accuracy': make_scorer(balanced_accuracy_score),
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'f1': make_scorer(f1_score)
    }
"""


"\n    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)\n    scorers = {\n        'accuracy': 'accuracy',\n        'balanced_accuracy': make_scorer(balanced_accuracy_score),\n        'precision': make_scorer(precision_score),\n        'recall': make_scorer(recall_score),\n        'f1': make_scorer(f1_score)\n    }\n"

In [7]:
#pipeline = make_pipeline()
#model = make_grid_search(pipeline, param_grid)
#model.fit(X_train, y_train)

## Paso 5.

Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz". - Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

In [8]:
def save_model(model, filepath = '../files/models'):

    import gzip
    import pickle
    import os

    os.makedirs(filepath, exist_ok=True)
    model_path = os.path.join(filepath, 'model.pkl.gz')

    with gzip.open(model_path, 'wb') as f:
        pickle.dump(model, f)

# ----------------------------------------------------------------------------

def load_model(filepath = '../files/models', name = 'model.pkl.gz'):
    import os
    import gzip
    import pickle

    model_path = os.path.join(filepath, name)

    if not os.path.exists(model_path):
        return None
    with gzip.open(model_path, 'rb') as f:
        model = pickle.load(f)

    return model

## Paso 6.

Calcule las metricas de precision, precision balanceada, recall, y f1-score para los conjuntos de entrenamiento y prueba.
Guardelas en el archivo files/output/metrics.json. Cada fila del archivo es un diccionario con las metricas de un modelo.
Este diccionario tiene un campo para indicar si es el conjunto de entrenamiento o prueba. Por ejemplo:

- {'type': 'metrics', 'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}

- {'type': 'metrics', 'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

Entrenamiento

In [9]:
def train_model(model):
    
    from sklearn.metrics import balanced_accuracy_score
    
    X_train, y_train, X_test, y_test = load_clean_data(test_path, train_path)
    
    model.fit(X_train, y_train)
    best_model = load_model()

    if best_model is not None:

        saved_bas = balanced_accuracy_score(
        y_true = y_test, y_pred = best_model.predict(X_test)
        )
        
        current_bas = balanced_accuracy_score(
        y_true = y_test, y_pred = model.predict(X_test)
        )

        if saved_bas >= current_bas:
            model = best_model
    
    save_model(model)

In [10]:
def train_mlp_model(param_grid):

    pipeline = make_pipeline()
    
    model = make_grid_search(
        pipeline=pipeline, 
        param_grid=param_grid,
        )

    train_model(model)

Métricas

In [11]:
def eval_metrics(model, x_train, y_train, x_test, y_test):
        
    from sklearn.metrics import (
        precision_score,
        balanced_accuracy_score,
        recall_score,
        f1_score,
        confusion_matrix,
    )

    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    def metrics_dict(y_true, y_pred, dataset):
        return {
            "type": "metrics",
            "dataset": dataset,
            "precision": precision_score(y_true, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1_score": f1_score(y_true, y_pred),
        }

    def cm_dict(y_true, y_pred, dataset):
        cm = confusion_matrix(y_true.astype(int), y_pred.astype(int), labels=[0, 1])
        tn, fp, fn, tp = int(cm[0, 0]), int(cm[0, 1]), int(cm[1, 0]), int(cm[1, 1])
        return {
            "type": "cm_matrix",
            "dataset": dataset,
            "true_0": {"predicted_0": tn, "predicted_1": fp},
            "true_1": {"predicted_0": fn, "predicted_1": tp},
        }

    # Construye los 4 renglones en el orden exacto que pide el test
    metrics_train = metrics_dict(y_train, y_pred_train, "train")
    metrics_test  = metrics_dict(y_test,  y_pred_test,  "test")
    cm_train      = cm_dict(y_train, y_pred_train, "train")
    cm_test       = cm_dict(y_test,  y_pred_test,  "test")

    return metrics_train, metrics_test, cm_train, cm_test


In [12]:
def save_report(
   metrics_train,
   metrics_test,
   cm_train,
   cm_test,
):
    
    import json
    import os
    
    path = '../files/output/metrics.json'

    if not os.path.exists(os.path.dirname(path)):
       os.makedirs(os.path.dirname(path))

    with open(path, 'a', encoding='utf-8', newline='\n') as f:
       f.write(json.dumps(metrics_train) + '\n')
       f.write(json.dumps(metrics_test) + '\n')
       f.write(json.dumps(cm_train) + '\n')
       f.write(json.dumps(cm_test) + '\n')

In [13]:
def print_report(metrics_train, metrics_test, cm_train=None, cm_test=None):
    """Imprime un resumen compacto de métricas.
    Muestra valores de test con el valor de train entre paréntesis.
    """
    def fmt(name, test_val, train_val):
        return f"{name:>20}: {test_val:.4f} ({train_val:.4f})"

    print("-" * 80)
    print("Metrics summary (test (train))")
    print("-" * 80)
    print(fmt("Balanced Accuracy", metrics_test["balanced_accuracy"], metrics_train["balanced_accuracy"]))
    print(fmt("Precision",         metrics_test["precision"],         metrics_train["precision"]))
    print(fmt("Recall",            metrics_test["recall"],            metrics_train["recall"]))
    print(fmt("F1-score",          metrics_test["f1_score"],          metrics_train["f1_score"]))
    if cm_test and cm_train:
        print("-" * 80)
        print("Confusion matrix (test):")
        print(f" true_0 -> predicted_0: {cm_test['true_0']['predicted_0']}, predicted_1: {cm_test['true_0']['predicted_1']}")
        print(f" true_1 -> predicted_0: {cm_test['true_1']['predicted_0']}, predicted_1: {cm_test['true_1']['predicted_1']}")
    print("-" * 80)

In [14]:
def check_estimator():

    x_train, y_train, x_test, y_test = load_clean_data(test_path, train_path)

    model = load_model()

    metrics_train, metrics_test, cm_train, cm_test = eval_metrics(
    model,
    x_train,
    y_train,
    x_test,
    y_test,
    )
    save_report(
        metrics_train,
        metrics_test,
        cm_train,
        cm_test,
    )
    print_report(             
        metrics_train,
        metrics_test,
        cm_train,
        cm_test,
    )


Mejores parámetros

In [15]:
def print_get_params():
    model = load_model()
    print("Get model parameters:")
    for param, value in model.get_params().items():
        print(f"  {param}: {value}")


def print_best_model_params():
    model = load_model()
    print("Best model parameters:")
    for param, value in model.best_params_.items():
        print(f"  {param}: {value}")

## Ejecución

In [16]:
train_mlp_model(param_grid=param_grid)
check_estimator()
print_best_model_params()

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\analitica_predictiva\LAB-04-prediccion-del-default-usando-mlp-caego\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\analitica_predictiva\LAB-04-prediccion-del-default-usando-mlp-caego\.venv\lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\analitica_predictiva\LAB-04-prediccion-del-default-usando-mlp-caego\.venv\lib\site-packages\sklearn\pipeline.py", line 655, in fit
    Xt = self._fit(X, y, rout

--------------------------------------------------------------------------------
Metrics summary (test (train))
--------------------------------------------------------------------------------
   Balanced Accuracy: 0.6704 (0.6674)
           Precision: 0.6801 (0.7016)
              Recall: 0.3903 (0.3822)
            F1-score: 0.4960 (0.4949)
--------------------------------------------------------------------------------
Confusion matrix (test):
 true_0 -> predicted_0: 6723, predicted_1: 350
 true_1 -> predicted_0: 1162, predicted_1: 744
--------------------------------------------------------------------------------
Best model parameters:
  mlp__activation: relu
  mlp__alpha: 0.256
  mlp__hidden_layer_sizes: (50, 30, 40, 60)
  mlp__learning_rate: adaptive
  mlp__learning_rate_init: 0.001
  mlp__solver: adam
  pca__n_components: 20
  selectkbest__k: 20


Métricas mínimas requeridas:

Para TRAIN:

- precision > 0.691
- balanced_accuracy > 0.661
- recall > 0.370
- f1_score > 0.482

Para TEST:

- precision > 0.673
- balanced_accuracy > 0.661
- recall > 0.370
- f1_score > 0.482

Matriz de confusión mínima:

Train:

- true_0/predicted_0 > 15440
- true_1/predicted_1 > 1735

Test:

- true_0/predicted_0 > 6710
- true_1/predicted_1 > 730