In [1]:
# Import libraries

import pandas as pd
import gzip
import json
import os
import pickle
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.svm import SVC as svm
from sklearn.neural_network import MLPClassifier
import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including convergence and deprecation warnings raised while fitting.
warnings.filterwarnings("ignore")






In [2]:
# Step 1: load both splits and apply the same cleaning to each.


def _load_and_clean(csv_path):
    """Read one split of the dataset from ``csv_path`` and return it cleaned.

    Cleaning steps, applied in order:
      * rename "default payment next month" to "default",
      * drop the "ID" column,
      * drop every row that contains a missing value,
      * replace every EDUCATION value greater than 4 with 4.

    A new DataFrame is built on every call (no in-place mutation), so
    re-running this cell always produces the same result.
    """
    return (
        pd.read_csv(csv_path)
        .rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
        .assign(
            EDUCATION=lambda frame: np.where(
                frame["EDUCATION"] > 4, 4, frame["EDUCATION"]
            )
        )
    )


# Bug fix: the two file names were swapped, so `train` held the test split
# and `test` held the train split.
train = _load_and_clean('/content/files/input/train_default_of_credit_card_clients.csv')
test = _load_and_clean('/content/files/input/test_default_of_credit_card_clients.csv')




In [3]:
# Step 2: separate the target column ("default") from the features of each split.
y_train, y_test = train["default"], test["default"]
x_train = train.drop(columns="default")
x_test = test.drop(columns="default")

In [5]:
# Step 3: build the preprocessing + classification pipeline.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
# Every non-target column that is not one-hot encoded. These columns are not
# listed explicitly in the ColumnTransformer; they reach the next step through
# remainder="passthrough".
numerical_features = [col for col in train.columns if col not in categorical_features + ['default']]

# handle_unknown='ignore' makes a category that was not seen during fit encode
# as all zeros at predict time instead of raising. Previously this encoder was
# built but never used, and the one actually placed in the pipeline lacked the
# option. dtype="int" is kept from the encoder that was in the pipeline.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', dtype="int")

preprocessor = ColumnTransformer(
    transformers=[
        ("ohe", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

# Pipeline order: one-hot encode -> PCA -> standardize -> select K best -> MLP.
pipeline = Pipeline(
    steps=[
        ("transformer", preprocessor),
        # NOTE(review): 22 components is a hard-coded value; confirm it is the
        # intended number for the encoded feature matrix.
        ("pca", PCA(n_components=22)),
        ("scaler", StandardScaler()),
        # k is left at its default here and set by the grid search.
        ("feature_selection", SelectKBest(score_func=f_classif)),
        # random_state fixes the weight initialisation and batch shuffling so
        # that repeated runs of the notebook give the same model.
        ("classifier", MLPClassifier(activation='relu', solver='adam', max_iter=1000, random_state=42)),
    ],
    verbose=False,
)




In [6]:
# Step 4: hyper-parameter search with 10-fold cross-validation, scored by
# balanced accuracy; the best configuration is refitted on the full train set.
param_grid = dict(
    feature_selection__k=[1],
    classifier__hidden_layer_sizes=[10],
)

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    refit=True,
    n_jobs=-1,
)
model.fit(x_train, y_train)

print(f"Mejores hiperparámetros: {model.best_params_}")



Mejores hiperparámetros: {'classifier__hidden_layer_sizes': 10, 'feature_selection__k': 1}


In [7]:
# Step 5: persist the fitted grid search as a gzip-compressed pickle.
models_dir = '/content/files/models'
model_path = f"{models_dir}/model.pkl.gz"

# Make sure the target directory exists before writing into it.
os.makedirs(models_dir, exist_ok=True)

with gzip.open(model_path, mode="wb") as compressed_out:
    pickle.dump(model, compressed_out)

print(f"Modelo guardado en '{model_path}'")



Modelo guardado en '/content/files/models/model.pkl.gz'


In [9]:
# Step 6: score the fitted model on both splits and write the results to
# metrics.json, one JSON object per line.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)


def _classification_metrics(dataset, y_true, y_pred):
    """Build the metrics record for one split.

    Parameters
    ----------
    dataset : str
        Label stored under the 'dataset' key ('train' or 'test').
    y_true, y_pred :
        True and predicted labels, passed straight to the scikit-learn
        metric functions.

    Returns
    -------
    dict
        Keys 'type', 'dataset', 'precision', 'balanced_accuracy', 'recall'
        and 'f1_score'.
    """
    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred)
    }


metrics_train = _classification_metrics('train', y_train, y_train_pred)
metrics_test = _classification_metrics('test', y_test, y_test_pred)

metrics_path = "/content/files/output"

# Bug fix: the directory was previously created from the string literal
# "metrics_path" (a stray folder in the working directory), leaving the real
# output directory uncreated.
os.makedirs(metrics_path, exist_ok=True)

output_path = os.path.join(metrics_path, 'metrics.json')

# Mode 'w' truncates the file, so this cell always starts metrics.json afresh.
with open(output_path, 'w') as f:
    for record in (metrics_train, metrics_test):
        f.write(json.dumps(record) + '\n')





In [10]:
# Step 7: compute the confusion matrix of each split and append both records
# to metrics.json.


def _confusion_record(dataset, cm):
    """Convert a 2x2 confusion matrix into the record written to metrics.json.

    Parameters
    ----------
    dataset : str
        Label stored under the 'dataset' key ('train' or 'test').
    cm :
        2x2 matrix indexed as cm[true_label, predicted_label].

    Returns
    -------
    dict
        Keys 'type', 'dataset', 'true_0' and 'true_1'; counts are cast to
        plain ``int`` so that ``json.dumps`` can serialise them.
    """
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }


# labels=[0, 1] forces a 2x2 matrix even when only one class appears in the
# true and predicted labels; without it the [0, 1] / [1, *] lookups above
# raise IndexError in that case.
cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
cm_matrix_train = _confusion_record('train', cm_train)

cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
cm_matrix_test = _confusion_record('test', cm_test)

metrics = [
    cm_matrix_train,
    cm_matrix_test
]


output_path = '/content/files/output/metrics.json'
# Ensure the output directory exists so this cell does not depend on another
# cell having created it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# NOTE: append mode means re-running this cell alone adds duplicate
# 'cm_matrix' lines; re-run the cell that (re)creates metrics.json first.
with open(output_path, 'a') as f:
    for metric in metrics:
        f.write(json.dumps(metric) + '\n')

