In [1]:
# %%
# Paso 1.
# Limpieza de los datasets
import pandas as pd

# Importo test
test_data = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)
# Importo train
train_data = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)

# Renombrar columna "default payment next month" a "default"
test_data = test_data.rename(columns={'default payment next month': 'default'})
train_data = train_data.rename(columns={'default payment next month': 'default'})

# Remover columna "ID"
test_data = test_data.drop(columns=['ID'])
train_data = train_data.drop(columns=['ID'])

# Remover registros con información no disponible
test_data = test_data.loc[test_data["MARRIAGE"] != 0]
test_data = test_data.loc[test_data["EDUCATION"] != 0]

train_data = train_data.loc[train_data["MARRIAGE"] != 0]
train_data = train_data.loc[train_data["EDUCATION"] != 0]

# Agrupar valores > 4 en "EDUCATION" como "others" (4)
test_data['EDUCATION'] = test_data['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
train_data['EDUCATION'] = train_data['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

# Dividir datasets en x_train, y_train, x_test, y_test
x_train = train_data.drop(columns="default")
y_train = train_data["default"]

x_test = test_data.drop(columns="default")
y_test = test_data["default"]

# %%
# Paso 2. Crear pipeline para el modelo de clasificación
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

# Columnas categóricas y numéricas
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = [col for col in x_train.columns if col not in categorical_features]

# Preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('scaler', StandardScaler(), numerical_features),
    ]
)

# Pipeline
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("pca", PCA(n_components=25)),
        ("feature_selection", SelectKBest(score_func=f_classif, k=5)),
        ("classifier", MLPClassifier(max_iter=1000))
    ]
)

# %%
# Paso 3. Optimizar hiperparámetros con validación cruzada
from sklearn.model_selection import GridSearchCV

param_grid = {
    'feature_selection__k': [3, 5, 10],
    "classifier__hidden_layer_sizes": [(10,), (50,), (100,)],
    'classifier__solver': ['sgd'],
    "classifier__activation": ['logistic', 'relu'],
    "classifier__learning_rate": ['invscaling'],
    "classifier__learning_rate_init": [0.001, 0.01, 0.1],
}

model = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    refit=True
)

model.fit(x_train, y_train)

# %%
# Paso 4. Guardar el modelo como "files/models/model.pkl"
import pickle
import os
import gzip

models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

compressed_model_path = "../files/models/model.pkl.gz"

with gzip.open(compressed_model_path, "wb") as file:
    pickle.dump(model, file)

# %%
# Paso 5. Calcular y guardar métricas
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

def calculate_and_save_metrics(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = [
        {
            'type': 'metrics',
            'dataset': 'train',
            'precision': precision_score(y_train, y_train_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
            'recall': recall_score(y_train, y_train_pred, zero_division=0),
            'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
        },
        {
            'type': 'metrics',
            'dataset': 'test',
            'precision': precision_score(y_test, y_test_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
        }
    ]

    output_dir = '../files/output'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'metrics.json')

    with open(output_path, 'w') as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')

# %%
# Paso 6. Calcular y guardar matrices de confusión
from sklearn.metrics import confusion_matrix

def calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    def format_confusion_matrix(cm, dataset_type):
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1])
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1])
            }
        }

    metrics = [
        format_confusion_matrix(cm_train, 'train'),
        format_confusion_matrix(cm_test, 'test')
    ]

    output_path = '../files/output/metrics.json'
    with open(output_path, 'a') as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')

# Función principal
calculate_and_save_metrics(model, x_train, x_test, y_train, y_test)
calculate_and_save_confusion_matrices(model, x_train, x_test, y_train, y_test)


ValueError: 
All the 540 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py", line 468, in fit_transform
    U, S, _, X, x_is_centered, xp = self._fit(X)
                                    ^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py", line 542, in _fit
    return self._fit_full(X, n_components, xp, is_array_api_compliant)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maferojas/GitHub/Ana_predicitiva_RPOS/Laboratorios/2024-2-LAB-04-prediccion-del-default-usando-mlp-maferojas23/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py", line 556, in _fit_full
    raise ValueError(
ValueError: n_components=30 must be between 0 and min(n_samples, n_features)=29 with svd_solver='covariance_eigh'
