In [1]:
# Paso 1

def limpieza(dataset):
    
    df = dataset.copy()

    df.rename(columns={'default payment next month':'default'}, inplace=True)

    df.drop(columns='ID', inplace=True)

    df = df[(df['EDUCATION'] != 0) & (df['MARRIAGE'] != 0)]

    df['EDUCATION'] = df['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

    df['SEX'] = df['SEX'].astype('category')
    df['EDUCATION'] = df['EDUCATION'].astype('category')
    df['MARRIAGE'] = df['MARRIAGE'].astype('category')
    df['PAY_0'] = df['PAY_0'].astype('category')
    df['PAY_2'] = df['PAY_2'].astype('category')
    df['PAY_3'] = df['PAY_3'].astype('category')
    df['PAY_4'] = df['PAY_4'].astype('category')
    df['PAY_5'] = df['PAY_5'].astype('category')
    df['PAY_6'] = df['PAY_6'].astype('category')
    df['default'] = df['default'].astype('category')

    return df

In [2]:
import glob
import pandas as pd

test_file, train_file = glob.glob(f'../files/input/*')

def load_data(directory):
    df = pd.read_csv(directory)
    df = limpieza(dataset=df)
    return df

test = load_data(directory=test_file)
train = load_data(directory=train_file)

In [3]:
# Paso 2
def division_dataset(dataset):

    df = dataset.copy()
    x  = df.drop(columns='default')
    y  = df['default']

    return x, y

x_train, y_train = division_dataset(train)
x_test, y_test = division_dataset(test)


In [4]:
# Paso 3
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

categorical_columns = x_train.select_dtypes(include='category').columns.to_list()

column_transformer = ColumnTransformer(
    [
        ('categories', OneHotEncoder(drop='if_binary', max_categories=6, handle_unknown='infrequent_if_exist', sparse_output=False), categorical_columns),
    ],
    remainder=StandardScaler()
)

pipeline = Pipeline(steps=
    [
        ('preprocessor', column_transformer),
        ('dimensionality_reducter', PCA()),
        ('k_best_selector', SelectKBest(score_func=f_classif, k='all')),
        ('classifier', MLPClassifier(solver='adam', early_stopping=True, random_state=17, learning_rate='adaptive', max_iter=7000))
    ]
)

pipeline.fit(x_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [5]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

y_pred_train = pipeline.predict(x_train)
y_pred_test = pipeline.predict(x_test)

print('balanced accuracy train:', balanced_accuracy_score(y_train, y_pred_train))
print('balanced accuracy test:', balanced_accuracy_score(y_test, y_pred_test))
print('precision score train:', precision_score(y_train, y_pred_train))
print('precision score test:', precision_score(y_test, y_pred_test))
print('recall score train:', recall_score(y_train, y_pred_train))
print('recall score test:', recall_score(y_test, y_pred_test))
print('f1 score train:', f1_score(y_train, y_pred_train))
print('f1 score test:', f1_score(y_test, y_pred_test))

balanced accuracy train: 0.6672419803514208
balanced accuracy test: 0.6666768042875906
precision score train: 0.6973785659213569
precision score test: 0.6672727272727272
recall score train: 0.38285714285714284
recall score test: 0.385099685204617
f1 score train: 0.4943298264790272
f1 score test: 0.48835662009314706


In [7]:
from sklearn.metrics import confusion_matrix

def matriz(yt, yp):
    matrix = confusion_matrix(y_true=yt, y_pred=yp)
    return matrix

matrix_train = confusion_matrix_df = pd.DataFrame(
    matriz(yt=y_train, yp=pipeline.predict(x_train)),
    index=["N (Clase Real Negativa)", "P (Clase Real Positiva)"],
    columns=["PN (Predicción Negativa)", "PP (Predicción Positiva)"]
)

matrix_test = confusion_matrix_df = pd.DataFrame(
    matriz(yt=y_test, yp=pipeline.predict(x_test)),
    index=["N (Clase Real Negativa)", "P (Clase Real Positiva)"],
    columns=["PN (Predicción Negativa)", "PP (Predicción Positiva)"]
)

display(
    matrix_train,
    matrix_test
)

Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),15443,785
P (Clase Real Positiva),2916,1809


Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),6707,366
P (Clase Real Positiva),1172,734


In [8]:
columns_transformer = pipeline.named_steps['preprocessor']

x_transformed = columns_transformer.transform(x_train)
x_transformed = pd.DataFrame(x_transformed, columns=columns_transformer.get_feature_names_out())

print('columnas dataset original:', len(train.columns))
print('columnas dataset transformado:', len(x_transformed.columns))

pca = pipeline.named_steps['dimensionality_reducter']
x_reduced = pca.transform(x_transformed)

varianza_explicada = pca.explained_variance_ratio_ * 100
varianza_explicada = pd.DataFrame(
    {
        'Componente': pca.get_feature_names_out(),
        'Varianza Explicada (%)': varianza_explicada
    }
)

varianza_explicada['Varianza Acumulada (%)'] = varianza_explicada['Varianza Explicada (%)'].cumsum()
varianza_explicada['Varianza Explicada (%)'] = varianza_explicada['Varianza Explicada (%)'].apply(lambda x:f'{x:.2f}')

varianza_explicada  

columnas dataset original: 24
columnas dataset transformado: 58




Unnamed: 0,Componente,Varianza Explicada (%),Varianza Acumulada (%)
0,pca0,32.31,32.314852
1,pca1,10.72,43.036821
2,pca2,6.63,49.666841
3,pca3,5.38,55.048897
4,pca4,4.75,59.797053
5,pca5,4.57,64.371524
6,pca6,4.38,68.755129
7,pca7,3.99,72.745435
8,pca8,3.87,76.613692
9,pca9,3.55,80.163613


In [9]:
kbest = pipeline.named_steps['k_best_selector'] 

feature_names = pca.get_feature_names_out()

anova = pd.DataFrame(
    {
        'feature':feature_names,
        'F-Score':kbest.scores_,
        'P-Value':[f'{p:.6%}' for p in kbest.pvalues_],
    }
).sort_values(by='F-Score', ascending=False).reset_index(drop=True)

In [10]:
from sklearn.inspection import permutation_importance

permutation = permutation_importance(
    estimator=pipe,
    X=x_test,
    y=y_test,
    scoring='balanced_accuracy',
    n_jobs=-1,
    random_state=17,
    n_repeats=10
)

importances = permutation.importances_mean

importance_features = list(zip(pipe[:-1].get_feature_names_out(), importances))
importance_features = sorted(importance_features, key=lambda x: x[1], reverse=True)

for feature, importance in importance_features:
    print(f'{feature}: {importance:.6}')

NameError: name 'pipe' is not defined

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

param_grid = {
    'classifier__hidden_layer_sizes': [(58, 29)],
    'classifier__batch_size': ['auto', 64, 32],
    'classifier__alpha': np.arange(0.000093, 0.0000937, 0.0000001),
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True,
    verbose=1,
)

model.fit(x_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

print('mejores parámetros encontrados:', model.best_params_)
print()
print('balanced accuracy train:', balanced_accuracy_score(y_train, y_pred_train))
print('balanced accuracy test:', balanced_accuracy_score(y_test, y_pred_test))
print('precision score train:', precision_score(y_train, y_pred_train))
print('precision score test:', precision_score(y_test, y_pred_test))
print('recall score train:', recall_score(y_train, y_pred_train))
print('recall score test:', recall_score(y_test, y_pred_test))
print('f1 score train:', f1_score(y_train, y_pred_train))
print('f1 score test:', f1_score(y_test, y_pred_test))
print()
print('score train:', model.score(x_train, y_train))
print('score test:', model.score(x_test, y_test))

mejores parámetros encontrados: {'classifier__alpha': np.float64(9.350000000000001e-05), 'classifier__batch_size': 64, 'classifier__hidden_layer_sizes': (58, 29)}

balanced accuracy train: 0.6639039715795939
balanced accuracy test: 0.6715102983145784
precision score train: 0.6948356807511737
precision score test: 0.6775067750677507
recall score train: 0.37587301587301586
recall score test: 0.3934942287513116
f1 score train: 0.48784507622579315
f1 score test: 0.49784268171257884

score train: 0.6639039715795939
score test: 0.6715102983145784


In [14]:
results = pd.DataFrame(model.cv_results_).sort_values('rank_test_score').reset_index(drop=True)

results = results[
    [
        'param_classifier__alpha',
        'param_classifier__hidden_layer_sizes',
        'param_classifier__batch_size',
        'mean_test_score',
        'std_test_score',
        'rank_test_score'
    ]
].sort_values('mean_test_score', ascending=False).reset_index(drop=True)

results

Unnamed: 0,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__batch_size,mean_test_score,std_test_score,rank_test_score
0,9.4e-05,"(58, 29)",64,0.664569,0.010934,1
1,9.3e-05,"(58, 29)",64,0.664432,0.011207,2
2,9.4e-05,"(58, 29)",64,0.663779,0.009585,3
3,9.3e-05,"(58, 29)",64,0.663597,0.011967,4
4,9.3e-05,"(58, 29)",64,0.663566,0.012016,5
5,9.3e-05,"(58, 29)",64,0.663093,0.010708,6
6,9.4e-05,"(58, 29)",64,0.663088,0.012447,7
7,9.3e-05,"(58, 29)",64,0.662318,0.011775,8
8,9.4e-05,"(58, 29)",32,0.661479,0.01173,9
9,9.3e-05,"(58, 29)",32,0.66048,0.01219,10


In [15]:
# Paso 5

import pickle
import gzip
import os

os.makedirs('../files/models', exist_ok=True)

with gzip.open('../files/models/model.pkl.gz', 'wb') as file:
    pickle.dump(model, file)

In [16]:
# Paso 6

from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

train_metrics = {
    "type": "metrics", 
    'dataset': 'train',
    'precision': precision_score(y_train, y_train_pred),
    'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
    'recall': recall_score(y_train, y_train_pred),
    'f1_score': f1_score(y_train, y_train_pred)
}


test_metrics = {
    "type": "metrics",
    'dataset': 'test',
    'precision': precision_score(y_test, y_test_pred),
    'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
    'recall': recall_score(y_test, y_test_pred),
    'f1_score': f1_score(y_test, y_test_pred)
}

output_path = '../files/output/metrics.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(train_metrics, f, ensure_ascii=False) 
    f.write('\n')
    json.dump(test_metrics, f, ensure_ascii=False) 
    f.write('\n')

In [17]:
# Paso 7

import json

def calculate_metrics(modelo, x, y, tipo):

    y_pred = modelo.predict(x)
    
    metrics = {
        'type': 'metrics',
        'dataset':tipo,
        'precision':precision_score(y_pred=y_pred, y_true=y),
        'balanced_accuracy':balanced_accuracy_score(y_pred=y_pred, y_true=y),
        'recall':recall_score(y_pred=y_pred, y_true=y),
        'f1_score':f1_score(y_pred=y_pred, y_true=y)
    }
    return metrics

train_metrics = calculate_metrics(modelo=model, x=x_train, y=y_train, tipo='train')
test_metrics = calculate_metrics(modelo=model, x=x_test, y=y_test, tipo='test')

metricas = [train_metrics, test_metrics]

os.makedirs('../files/output', exist_ok=True)

with open('../files/output/metrics.json', 'w') as file:
    for metrica in metricas:
        file.write(json.dumps(metrica)+'\n')

In [18]:
from sklearn.metrics import confusion_matrix

def create_mc(modelo, x, y, tipo):

    y_pred = modelo.predict(x)

    matrix = confusion_matrix(y_true=y, y_pred=y_pred)

    dictionary = {
        'type':'cm_matrix',
        'dataset':tipo,
        'true_0':{
            'predicted_0':int(matrix[0, 0]),
            'predicted_1':int(matrix[0, 1])
        },
        'true_1':{
            'predicted_0':int(matrix[0, 1]),
            'predicted_1':int(matrix[1, 1])
        }
    }
    return dictionary

train_matrix = create_mc(modelo=model, x=x_train, y=y_train, tipo='train')
test_matrix = create_mc(modelo=model, x=x_test, y=y_test, tipo='test')

metricas_2 = [train_matrix, test_matrix]

with open('../files/output/metrics.json', 'a') as file:
    for metrica in metricas_2:
        file.write(json.dumps(metrica)+'\n')