In [10]:
import pandas as pd
import numpy as np
import gzip
import os
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
import zipfile
from sklearn.metrics import (
    make_scorer,
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix,
    accuracy_score
)
import json

In [11]:
ruta_test = "../files/input/test_data.csv.zip"
ruta_train = "../files/input/train_data.csv.zip"

with zipfile.ZipFile(ruta_train) as z:
    csv_name = [name for name in z.namelist() if name.endswith(".csv")][0]
    with z.open(csv_name) as f:
        df_train = pd.read_csv(f)


df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [12]:
with zipfile.ZipFile(ruta_test) as z:
    csv_name = [name for name in z.namelist() if name.endswith(".csv")][0]
    with z.open(csv_name) as f:
        df_test = pd.read_csv(f)


df_test.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [13]:
df_train = df_train.rename(columns={'default payment next month': 'default'})
df_test = df_test.rename(columns={'default payment next month': 'default'})
df_train.drop('ID', axis=1, inplace=True)
df_test.drop('ID', axis=1, inplace=True)

df_train['EDUCATION'] = df_train['EDUCATION'].replace({0: np.nan})
df_train.loc[df_train['EDUCATION'] > 4, 'EDUCATION'] = 4

df_test['EDUCATION'] = df_test['EDUCATION'].replace({0: np.nan})
df_test.loc[df_test['EDUCATION'] > 4, 'EDUCATION'] = 4

df_train['MARRIAGE'] = df_train['MARRIAGE'].replace({0: np.nan})
df_test['MARRIAGE'] = df_test['MARRIAGE'].replace({0: np.nan})

df_train = df_train.dropna()
df_test = df_test.dropna()

df_train['EDUCATION'] = df_train['EDUCATION'].astype(int)
df_test['EDUCATION'] = df_test['EDUCATION'].astype(int)

df_train['MARRIAGE'] = df_train['MARRIAGE'].astype(int)
df_test['MARRIAGE'] = df_test['MARRIAGE'].astype(int)

In [14]:
y_test = df_test['default']
x_test = df_test.drop('default', axis = 1)

y_train= df_train['default']
x_train = df_train.drop('default', axis = 1)

In [15]:
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
num_cols = [c for c in x_train.columns if c not in cat_cols]

preprocesamiento = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), []),
    ('num', StandardScaler(), num_cols)
])

pipeline_mlp = Pipeline([
    ('conversion', preprocesamiento),
    ('pca', PCA(n_components=None)),
    ('seleccion', SelectKBest(score_func=f_classif)),
    ('mlp', MLPClassifier(
        max_iter=300,
        early_stopping=True,
        random_state=42
    ))
])
  

In [16]:
param_grid = {
        "seleccion__k": [5,10,15,20],
        "mlp__hidden_layer_sizes": [(h,) for h in range(1, 100,10)],
        "mlp__learning_rate_init": [0.001,0.1],
    }

grid = GridSearchCV(
    estimator=pipeline_mlp,
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1
)

grid.fit(x_train,y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'mlp__hidden_layer_sizes': [(1,), (11,), ...], 'mlp__learning_rate_init': [0.001, 0.1], 'seleccion__k': [5, 10, ...]}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,score_func,<function f_c...001FD8D41C5E0>
,k,20

0,1,2
,hidden_layer_sizes,"(81,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.1
,power_t,0.5
,max_iter,300
,shuffle,True


In [17]:
os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(grid, f)

In [18]:
os.makedirs("../files/output", exist_ok=True)

metrics_output = []

y_train_pred = grid.predict(x_train)
y_test_pred = grid.predict(x_test)

def calcular_metricas(y_true, y_pred, dataset):
    return {
        'type': 'metrics',  
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0)
    }

def calcular_cm(y_true, y_pred, dataset):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }

metrics_output.append(calcular_metricas(y_train, y_train_pred, 'train'))
metrics_output.append(calcular_metricas(y_test, y_test_pred, 'test'))

metrics_output.append(calcular_cm(y_train, y_train_pred, 'train'))
metrics_output.append(calcular_cm(y_test, y_test_pred, 'test'))

with open("../files/output/metrics.json", "w") as f:
    for item in metrics_output:
        json.dump(item, f)
        f.write("\n")