In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
import os
import json
import gzip
import pickle

In [2]:
# Step 1: load and clean the data
# Read the train and test splits (zipped CSV files) in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../files/input/train_data.csv.zip',
        '../files/input/test_data.csv.zip',
    )
)

def clean_data(df):
    """Return a cleaned copy of a raw dataset; the input frame is not modified.

    Steps, in order:
      1. Rename the column "default payment next month" to "default".
      2. Drop the "ID" column (raises KeyError if it is absent).
      3. Drop every row that contains a missing value.
      4. Cap EDUCATION at 4, i.e. every value greater than 4 becomes 4.
      5. Drop rows where MARRIAGE or EDUCATION equals 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the "ID", "EDUCATION" and "MARRIAGE" columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the original index labels of surviving rows are kept.
    """
    return (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
        # Vectorised cap: replaces the per-element apply/lambda and keeps the
        # column's dtype unchanged.
        .assign(EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4))
        .loc[lambda frame: (frame["MARRIAGE"] != 0) & (frame["EDUCATION"] != 0)]
    )

# Run both splits through the same cleaning routine (train first, then test).
train_data, test_data = map(clean_data, (train_data, test_data))

In [10]:
# Step 2: split each dataset into features (X) and target (y)
X_train, y_train = train_data.drop(columns=["default"]), train_data["default"]
X_test, y_test = test_data.drop(columns=["default"]), test_data["default"]

# Print summary statistics: feature frames first, then the target series.
for heading, split in (
    ('Train description', X_train),
    ('Test description', X_test),
    ('Train target description', y_train),
    ('Test target description', y_test),
):
    print(f'{heading}:\n{split.describe()}\n')

Train description:
           LIMIT_BAL           SEX     EDUCATION      MARRIAGE           AGE  \
count    20953.00000  20953.000000  20953.000000  20953.000000  20953.000000   
mean    167245.35866      1.604400      1.836300      1.557247     35.485515   
std     129698.73226      0.488991      0.744411      0.517892      9.253303   
min      10000.00000      1.000000      1.000000      1.000000     21.000000   
25%      50000.00000      1.000000      1.000000      1.000000     28.000000   
50%     140000.00000      2.000000      2.000000      2.000000     34.000000   
75%     240000.00000      2.000000      2.000000      2.000000     41.000000   
max    1000000.00000      2.000000      4.000000      3.000000     79.000000   

              PAY_0         PAY_2         PAY_3         PAY_4         PAY_5  \
count  20953.000000  20953.000000  20953.000000  20953.000000  20953.000000   
mean      -0.010261     -0.135494     -0.167518     -0.221353     -0.266692   
std        1.120141    

In [4]:
# Step 3: build the pipeline
# SEX, EDUCATION and MARRIAGE are one-hot encoded; every other column of
# X_train is standardised.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = [
    name for name in X_train.columns if name not in categorical_features
]

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ]
)

# Stage order: preprocessing -> univariate selection (f_classif) -> PCA -> MLP.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('selectkbest', SelectKBest(score_func=f_classif)),
        ('pca', PCA()),
        ('mlp', MLPClassifier(max_iter=1000, random_state=420)),
    ]
)


In [5]:
# Step 4: hyper-parameter search
# Every entry lists a single candidate, so the grid contains exactly one
# configuration; the search cross-validates that one setup and refits it.
param_grid = {
    # feature selection / projection
    'selectkbest__k': [20],
    'pca__n_components': [20],
    # MLP architecture and regularisation
    'mlp__hidden_layer_sizes': [(50,30,40,60)],
    'mlp__activation': ['relu'],
    'mlp__alpha': [0.256],
    # MLP optimiser settings
    'mlp__solver': ['adam'],
    'mlp__learning_rate': ['adaptive'],
    'mlp__learning_rate_init': [0.001],
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    refit=True,
    n_jobs=-1,
)

# Kept as the last bare expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step..._state=420))])
,param_grid,"{'mlp__activation': ['relu'], 'mlp__alpha': [0.256], 'mlp__hidden_layer_sizes': [(50, ...)], 'mlp__learning_rate': ['adaptive'], ...}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_c...00242C2BC45E0>
,k,20

0,1,2
,n_components,20
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,hidden_layer_sizes,"(50, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.256
,batch_size,'auto'
,learning_rate,'adaptive'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,1000
,shuffle,True


In [6]:
# Step 5: save the fitted search object as a gzip-compressed pickle
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Build the target path from models_dir so the directory that gets created and
# the directory that gets written to cannot drift apart.
model_path = os.path.join(models_dir, "model.pkl.gz")
with gzip.open(model_path, "wb") as file:
    pickle.dump(model, file)

In [7]:
# Step 6: compute metrics
def calculate_metrics(model, x, y, dataset_type):
    """Predict on ``x`` with ``model`` and score the predictions against ``y``.

    Parameters
    ----------
    model : object exposing ``predict(x)``
    x : features passed to ``model.predict``
    y : true labels compared with the predictions
    dataset_type : value stored under the ``'dataset'`` key (e.g. 'train')

    Returns
    -------
    dict
        Keys, in order: ``type`` (always "metrics"), ``dataset``,
        ``precision``, ``balanced_accuracy``, ``recall``, ``f1_score``.
    """
    predictions = model.predict(x)
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    report = {"type": "metrics", 'dataset': dataset_type}
    # Scorers are evaluated in the order listed above, which is also the key order.
    report.update((name, scorer(y, predictions)) for name, scorer in scorers)
    return report

# Score the fitted model on both splits.
train_metrics = calculate_metrics(model=model, x=X_train, y=y_train, dataset_type='train')
test_metrics = calculate_metrics(model=model, x=X_test, y=y_test, dataset_type='test')

In [8]:
# Step 7: compute confusion matrices
def calculate_confusion_matrix(model, x, y, dataset_type):
    """Predict on ``x`` with ``model`` and return the 2x2 confusion matrix as a dict.

    Parameters
    ----------
    model : object exposing ``predict(x)``
    x : features passed to ``model.predict``
    y : true labels; assumed to be encoded as 0/1, as the output keys imply
        (TODO confirm against the dataset)
    dataset_type : value stored under the ``'dataset'`` key (e.g. 'train')

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': ..., 'true_0': {...}, 'true_1': {...}}``
        where each ``true_*`` entry maps ``predicted_0`` / ``predicted_1`` to a
        plain ``int`` count.
    """
    y_pred = model.predict(x)
    # Pin the label order. Without ``labels`` the matrix shrinks to 1x1 when
    # only one class appears in ``y`` and ``y_pred``, and the fixed 2x2
    # unpacking below would fail.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    # Rows are true labels and columns are predicted labels.
    (true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = cm
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_type,
        # int() turns the array's integer scalars into plain Python ints.
        'true_0': {'predicted_0': int(true0_pred0), 'predicted_1': int(true0_pred1)},
        'true_1': {'predicted_0': int(true1_pred0), 'predicted_1': int(true1_pred1)}
    }
    return cm_dict

# Build the confusion-matrix records for both splits.
train_cm = calculate_confusion_matrix(model=model, x=X_train, y=y_train, dataset_type='train')
test_cm = calculate_confusion_matrix(model=model, x=X_test, y=y_test, dataset_type='test')

In [9]:
# Step 8: save metrics and confusion matrices
output_file = "../files/output/metrics.json"
# Derive the directory from output_file so the two paths cannot diverge.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write one JSON object per line in a single pass (instead of reopening the
# file in append mode): train metrics, test metrics, train confusion matrix,
# test confusion matrix.
with open(output_file, 'w', encoding='utf-8') as f:
    for record in (train_metrics, test_metrics, train_cm, test_cm):
        json.dump(record, f, ensure_ascii=False)
        f.write('\n')