In [6]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler 
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, balanced_accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle
import os
import gzip
import json

In [16]:
def _clean_credit_data(raw_frame):
    """Return a cleaned copy of a raw credit-default dataframe.

    The same steps are applied to the train and the test split:

    * rename "default payment next month" to "default";
    * drop the "ID" column;
    * drop every row that contains a missing value;
    * cap EDUCATION at 4, so every code above 4 is folded into code 4;
    * discard rows whose MARRIAGE or EDUCATION code is 0.

    The input frame is left untouched and the original row index is kept.
    """
    cleaned = (
        raw_frame
        .rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
        # Vectorized equivalent of mapping each value x to (4 if x > 4 else x).
        .assign(EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4))
    )
    keep_rows = (cleaned["MARRIAGE"] != 0) & (cleaned["EDUCATION"] != 0)
    return cleaned.loc[keep_rows]


# Step 1. Read the zipped CSV of each split and clean both the same way.
test_data = _clean_credit_data(pd.read_csv("../files/input/test_data.csv.zip"))
train_data = _clean_credit_data(pd.read_csv("../files/input/train_data.csv.zip"))


# Step 2. Separate the predictors from the target column for each split.
target_column = "default"

# Predictors are every column except the target; the target is kept as a Series.
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

# Step 3. Column-wise preprocessing: one-hot encode the categorical columns
# and standardize the numeric ones.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

# Numeric columns, in this order: LIMIT_BAL, AGE, PAY_0, PAY_2..PAY_6,
# BILL_AMT1..BILL_AMT6, PAY_AMT1..PAY_AMT6 (the list has no PAY_1 entry).
numerical_features = (
    ["LIMIT_BAL", "AGE", "PAY_0"]
    + [f"PAY_{idx}" for idx in range(2, 7)]
    + [f"BILL_AMT{idx}" for idx in range(1, 7)]
    + [f"PAY_AMT{idx}" for idx in range(1, 7)]
)

column_steps = [
    ("cat", OneHotEncoder(), categorical_features),
    ("num", StandardScaler(), numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Modelling pipeline: preprocessing -> univariate feature selection (ANOVA F
# score) -> PCA -> multi-layer perceptron classifier with a fixed random seed.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_classif)),
    ("pca", PCA()),
    ("classifier", MLPClassifier(max_iter=15000, random_state=17)),
]
pipeline = Pipeline(steps=pipeline_steps)

# # Ajustar el pipeline a los datos de entrenamiento
# pipeline.fit(X_train, y_train)

# # Evaluar el modelo en el conjunto de prueba
# print("Modelo entrenado. Precisión en test:", pipeline.score(X_test, y_test))

# Step 4. Hyper-parameter search. Every entry holds a single value, so the
# grid contains exactly one candidate configuration.
param_grid = dict(
    pca__n_components=[None],
    feature_selection__k=[20],
    classifier__hidden_layer_sizes=[(50, 30, 40, 60)],
    classifier__alpha=[0.26],
    classifier__learning_rate_init=[0.001],
)

# 10-fold cross-validation scored with balanced accuracy, using all cores;
# refit=True retrains the best candidate so grid_search can predict directly.
search_settings = {
    "scoring": "balanced_accuracy",
    "cv": 10,
    "n_jobs": -1,
    "verbose": 1,
    "refit": True,
}
grid_search = GridSearchCV(pipeline, param_grid, **search_settings)
grid_search.fit(X_train, y_train)

# Step 5. Persist the fitted grid search as a gzip-compressed pickle.
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Derive the file path from models_dir so the directory that gets created and
# the directory that gets written to cannot drift apart.
model_path = os.path.join(models_dir, "model.pkl.gz")
with gzip.open(model_path, "wb") as model_file:
    pickle.dump(grid_search, model_file)


def _metrics_record(dataset_name, y_true, y_pred):
    """Build the metrics entry for one dataset split.

    Args:
        dataset_name: Label stored under the "dataset" key ("train" or "test").
        y_true: True labels of the split.
        y_pred: Labels predicted for the split.

    Returns:
        A dict with the keys "type", "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score"; the scores are converted
        to plain Python floats so they can be serialized to JSON.
    """
    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": float(precision_score(y_true, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred)),
        "f1_score": float(f1_score(y_true, y_pred)),
    }


# Step 6. Predict on both splits and store the classification metrics.
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

metrics = [
    _metrics_record("train", y_train, y_train_pred),
    _metrics_record("test", y_test, y_test_pred),
]

output_file = "../files/output/metrics.json"
os.makedirs("../files/output", exist_ok=True)

# Mode "w" starts the file from scratch; one JSON object is written per line.
# json.dumps guarantees valid JSON, unlike swapping quote characters in the
# dict repr.
with open(output_file, "w") as f:
    for item in metrics:
        f.write(json.dumps(item) + "\n")


def _confusion_record(dataset_name, matrix):
    """Build the confusion-matrix entry for one dataset split.

    Args:
        dataset_name: Label stored under the "dataset" key ("train" or "test").
        matrix: 2x2 confusion matrix indexed as matrix[true, predicted].

    Returns:
        A dict with the keys "type", "dataset", "true_0" and "true_1"; the
        counts are converted to plain Python ints so they can be serialized
        to JSON.
    """
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {
            "predicted_0": int(matrix[0, 0]),
            "predicted_1": int(matrix[0, 1]),
        },
        "true_1": {
            "predicted_0": int(matrix[1, 0]),
            "predicted_1": int(matrix[1, 1]),
        },
    }


# Step 7. Compute the confusion matrix of each split and append them to the
# metrics file.
train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

confusion_matrices = [
    _confusion_record("train", train_cm),
    _confusion_record("test", test_cm),
]

# Mode "a" appends after the lines already in the file; one JSON object is
# written per line. json.dumps guarantees valid JSON, unlike swapping quote
# characters in the dict repr.
with open(output_file, "a") as f:
    for item in confusion_matrices:
        f.write(json.dumps(item) + "\n")



Fitting 10 folds for each of 1 candidates, totalling 10 fits
