In [1]:
def limpieza(dataset):
    
    df = dataset.copy()

    df.rename(columns={'default payment next month':'default'}, inplace=True)

    df.drop(columns='ID', inplace=True)

    df = df[(df['EDUCATION'] != 0) & (df['MARRIAGE'] != 0)]

    df['EDUCATION'] = df['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

    df['SEX'] = df['SEX'].astype('category')
    df['EDUCATION'] = df['EDUCATION'].astype('category')
    df['MARRIAGE'] = df['MARRIAGE'].astype('category')
    df['PAY_0'] = df['PAY_0'].astype('category')
    df['PAY_2'] = df['PAY_2'].astype('category')
    df['PAY_3'] = df['PAY_3'].astype('category')
    df['PAY_4'] = df['PAY_4'].astype('category')
    df['PAY_5'] = df['PAY_5'].astype('category')
    df['PAY_6'] = df['PAY_6'].astype('category')
    df['default'] = df['default'].astype('category')

    return df

In [2]:
import glob
import pandas as pd

test_file, train_file = glob.glob(f'../files/input/*')

def load_data(directory):
    df = pd.read_csv(directory)
    df = limpieza(dataset=df)
    return df

test = load_data(directory=test_file)
train = load_data(directory=train_file)

In [3]:
def division_dataset(dataset):

    df = dataset.copy()
    x  = df.drop(columns='default')
    y  = df['default']

    return x, y

x_train, y_train = division_dataset(train)
x_test, y_test = division_dataset(test)

In [4]:
y_train.value_counts()

default
0    16228
1     4725
Name: count, dtype: int64

In [5]:
# from sklearn.utils import shuffle

# class_0 = x_train[y_train == 0]
# class_1 = x_train[y_train == 1]

# class_0_sample = class_0.sample(frac=0.3, random_state=666)

# x_train_balanced = pd.concat([class_0_sample, class_1])
# y_train_balanced = pd.concat([
#     pd.Series([0] * len(class_0_sample), index=class_0_sample.index),
#     pd.Series([1] * len(class_1), index=class_1.index)
# ])

# x_train_balanced, y_train_balanced = shuffle(x_train_balanced, y_train_balanced, random_state=666)

# y_train_balanced.value_counts()

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

In [7]:
categorical_columns = x_train.select_dtypes(include='category').columns.to_list()

column_transformer = ColumnTransformer(
    [
        ('categories', OneHotEncoder(drop='if_binary', max_categories=6, handle_unknown='infrequent_if_exist', sparse_output=False), categorical_columns),
    ],
    remainder=StandardScaler()
)

pipe = Pipeline(steps=
    [
        ('preprocessor', column_transformer),
        ('dimensionality_reducter', PCA()),
        ('k_best_selector', SelectKBest(score_func=f_classif, k='all')),
        ('classifier', MLPClassifier(solver='adam', early_stopping=True, random_state=17, learning_rate='adaptive', max_iter=7000))
    ]
)

pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [8]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

y_pred_train = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)

print('balanced accuracy train:', balanced_accuracy_score(y_train, y_pred_train))
print('balanced accuracy test:', balanced_accuracy_score(y_test, y_pred_test))
print('precision score train:', precision_score(y_train, y_pred_train))
print('precision score test:', precision_score(y_test, y_pred_test))
print('recall score train:', recall_score(y_train, y_pred_train))
print('recall score test:', recall_score(y_test, y_pred_test))
print('f1 score train:', f1_score(y_train, y_pred_train))
print('f1 score test:', f1_score(y_test, y_pred_test))

balanced accuracy train: 0.6672419803514208
balanced accuracy test: 0.6666768042875906
precision score train: 0.6973785659213569
precision score test: 0.6672727272727272
recall score train: 0.38285714285714284
recall score test: 0.385099685204617
f1 score train: 0.4943298264790272
f1 score test: 0.48835662009314706


In [9]:
from sklearn.metrics import confusion_matrix

def matriz(yt, yp):
    matrix = confusion_matrix(y_true=yt, y_pred=yp)
    return matrix

matrix_train = confusion_matrix_df = pd.DataFrame(
    matriz(yt=y_train, yp=pipe.predict(x_train)),
    index=["N (Clase Real Negativa)", "P (Clase Real Positiva)"],
    columns=["PN (Predicción Negativa)", "PP (Predicción Positiva)"]
)

matrix_test = confusion_matrix_df = pd.DataFrame(
    matriz(yt=y_test, yp=pipe.predict(x_test)),
    index=["N (Clase Real Negativa)", "P (Clase Real Positiva)"],
    columns=["PN (Predicción Negativa)", "PP (Predicción Positiva)"]
)

display(
    matrix_train,
    matrix_test
)

Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),15443,785
P (Clase Real Positiva),2916,1809


Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),6707,366
P (Clase Real Positiva),1172,734


In [10]:
columns_transformer = pipe.named_steps['preprocessor']

x_transformed = columns_transformer.transform(x_train)
x_transformed = pd.DataFrame(x_transformed, columns=columns_transformer.get_feature_names_out())

print('columnas dataset original:', len(train.columns))
print('columnas dataset transformado:', len(x_transformed.columns))

columnas dataset original: 24
columnas dataset transformado: 58


In [11]:
pca = pipe.named_steps['dimensionality_reducter']
x_reduced = pca.transform(x_transformed)

varianza_explicada = pca.explained_variance_ratio_ * 100
varianza_explicada = pd.DataFrame(
    {
        'Componente': pca.get_feature_names_out(),
        'Varianza Explicada (%)': varianza_explicada
    }
)

varianza_explicada['Varianza Acumulada (%)'] = varianza_explicada['Varianza Explicada (%)'].cumsum()
varianza_explicada['Varianza Explicada (%)'] = varianza_explicada['Varianza Explicada (%)'].apply(lambda x:f'{x:.2f}')

varianza_explicada  



Unnamed: 0,Componente,Varianza Explicada (%),Varianza Acumulada (%)
0,pca0,32.31,32.314852
1,pca1,10.72,43.036821
2,pca2,6.63,49.666841
3,pca3,5.38,55.048897
4,pca4,4.75,59.797053
5,pca5,4.57,64.371524
6,pca6,4.38,68.755129
7,pca7,3.99,72.745435
8,pca8,3.87,76.613692
9,pca9,3.55,80.163613


In [12]:
kbest = pipe.named_steps['k_best_selector'] 

feature_names = pca.get_feature_names_out()

anova = pd.DataFrame(
    {
        'feature':feature_names,
        'F-Score':kbest.scores_,
        'P-Value':[f'{p:.6%}' for p in kbest.pvalues_],
    }
).sort_values(by='F-Score', ascending=False).reset_index(drop=True)

anova

Unnamed: 0,feature,F-Score,P-Value
0,pca9,1560.098261,0.000000%
1,pca12,490.169788,0.000000%
2,pca14,414.142094,0.000000%
3,pca10,296.605277,0.000000%
4,pca1,185.635648,0.000000%
5,pca8,182.537675,0.000000%
6,pca27,152.299276,0.000000%
7,pca4,131.53248,0.000000%
8,pca19,124.172837,0.000000%
9,pca32,113.414809,0.000000%


In [13]:
from sklearn.inspection import permutation_importance

permutation = permutation_importance(
    estimator=pipe,
    X=x_test,
    y=y_test,
    scoring='balanced_accuracy',
    n_jobs=-1,
    random_state=17,
    n_repeats=10
)

importances = permutation.importances_mean

importance_features = list(zip(pipe[:-1].get_feature_names_out(), importances))
importance_features = sorted(importance_features, key=lambda x: x[1], reverse=True)

for feature, importance in importance_features:
    print(f'{feature}: {importance:.6}')

pca5: 0.119993
pca0: 0.00643468
pca7: 0.0062336
pca11: 0.00517219
pca17: 0.00288683
pca14: 0.00257546
pca9: 0.00170719
pca22: 0.00170525
pca6: 0.00135825
pca10: 0.00130116
pca1: 0.000554979
pca19: 0.00034351
pca20: 0.000342571
pca21: 0.000137733
pca18: 9.58191e-05
pca8: -0.000482745
pca3: -0.000546805
pca4: -0.00062517
pca16: -0.00178284
pca13: -0.00204379
pca15: -0.00269638
pca2: -0.00330154
pca12: -0.00717682


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

# def custom_metric(yt, yp):
#     precision = precision_score(y_true=yt, y_pred=yp)
#     balanced = balanced_accuracy_score(y_true=yt, y_pred=yp)
#     if precision >= 0.691 and balanced >= 0.661:
#         return precision + balanced
#     else:
#         return balanced

# scoring = make_scorer(custom_metric)


param_grid = {
    'classifier__hidden_layer_sizes': [(58, 29)],
    'classifier__batch_size': [64],
    #'classifier__alpha': np.arange(0.01, 0.02, 0.01),
}

model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True,
    verbose=1,
)

model.fit(x_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

print('mejores parámetros encontrados:', model.best_params_)
print()
print('balanced accuracy train:', balanced_accuracy_score(y_train, y_pred_train))
print('balanced accuracy test:', balanced_accuracy_score(y_test, y_pred_test))
print('precision score train:', precision_score(y_train, y_pred_train))
print('precision score test:', precision_score(y_test, y_pred_test))
print('recall score train:', recall_score(y_train, y_pred_train))
print('recall score test:', recall_score(y_test, y_pred_test))
print('f1 score train:', f1_score(y_train, y_pred_train))
print('f1 score test:', f1_score(y_test, y_pred_test))
print()
print('score train:', model.score(x_train, y_train))
print('score test:', model.score(x_test, y_test))

mejores parámetros encontrados: {'classifier__batch_size': 64, 'classifier__hidden_layer_sizes': (58, 29)}

balanced accuracy train: 0.6635784384687515
balanced accuracy test: 0.6725596162579153
precision score train: 0.691170750680669
precision score test: 0.6786678667866787
recall score train: 0.3760846560846561
recall score test: 0.3955928646379853
f1 score train: 0.48711622807017546
f1 score test: 0.4998342724560822

score train: 0.6635784384687515
score test: 0.6725596162579153


In [16]:
results = pd.DataFrame(model.cv_results_).sort_values('rank_test_score').reset_index(drop=True)

results = results[
    [
        'param_classifier__hidden_layer_sizes',
        'param_classifier__batch_size',
        'mean_test_score',
        'std_test_score',
        'rank_test_score'
    ]
].sort_values('mean_test_score', ascending=False).reset_index(drop=True)

results

Unnamed: 0,param_classifier__hidden_layer_sizes,param_classifier__batch_size,mean_test_score,std_test_score,rank_test_score
0,"(58, 29)",64,0.662335,0.011241,1


In [17]:
# from sklearn.model_selection import TunedThresholdClassifierCV

# turned_model = TunedThresholdClassifierCV(
#     estimator=model.best_estimator_,
#     scoring=scoring,
#     n_jobs=-1,
#     random_state=666,
#     cv=7,
#     refit=True
# )

# turned_model.fit(x_train, y_train)

# print('mejor umbral de decisión:', turned_model.best_threshold_)
# print('mejor score:', turned_model.best_score_)

In [18]:
# y_pred_train_turned = turned_model.predict(x_train)
# y_pred_test_turned = turned_model.predict(x_test)

# print('balanced accuracy train:', balanced_accuracy_score(y_train, y_pred_train_turned))
# print('balanced accuracy test:', balanced_accuracy_score(y_test, y_pred_test_turned))
# print('precision score train:', precision_score(y_train, y_pred_train_turned))
# print('precision score test:', precision_score(y_test, y_pred_test_turned))
# print('recall score train:', recall_score(y_train, y_pred_train_turned))
# print('recall score test:', recall_score(y_test, y_pred_test_turned))
# print('f1 score train:', f1_score(y_train, y_pred_train_turned))
# print('f1 score test:', f1_score(y_test, y_pred_test_turned))

In [19]:
# from tqdm import tqdm

# probabilities = model.best_estimator_.predict_proba(x_test)[:,1]

# umbrales_validos = []

# for threshold in tqdm(np.arange(turned_model.best_threshold_ - 0.1, turned_model.best_threshold_ + 0.1, 0.001), desc='Calculando umbrales'):
#     predicciones = (probabilities >= threshold).astype(int)
    
#     precision = precision_score(y_test, predicciones, zero_division=0)
#     balanced_accuracy = balanced_accuracy_score(y_test, predicciones)
#     combined_score = (precision * 0.4) + (balanced_accuracy * 0.6)
    
#     if precision >= 0.673 and balanced_accuracy >= 0.661:
#         umbrales_validos.append(
#             {
#                 'threshold': threshold,
#                 'precision': precision,
#                 'balanced_accuracy': balanced_accuracy,
#                 'combined_score': combined_score
#             }
#         )

# if umbrales_validos:
#     mejor_resultado = sorted(umbrales_validos, key=lambda x: x['combined_score'], reverse=True)[0]
#     print(mejor_resultado)
# else:
#     print('No se encontraron umbrales válidos que cumplan las condiciones.')

In [20]:
# if umbrales_validos:
#     best_threshold = mejor_resultado['threshold']
#     probabilities = model.predict_proba(x_test)[:,1]
#     threshold = (probabilities >= best_threshold).astype(int)

In [21]:
import pickle
import gzip
import os

os.makedirs('../files/models', exist_ok=True)

with gzip.open('../files/models/model.pkl.gz', 'wb') as file:
    pickle.dump(model, file)

In [22]:
import json

def calculate_metrics(modelo, x, y, tipo):

    y_pred = modelo.predict(x)
    
    metrics = {
        'type': 'metrics',
        'dataset':tipo,
        'precision':precision_score(y_pred=y_pred, y_true=y),
        'balanced_accuracy':balanced_accuracy_score(y_pred=y_pred, y_true=y),
        'recall':recall_score(y_pred=y_pred, y_true=y),
        'f1_score':f1_score(y_pred=y_pred, y_true=y)
    }
    return metrics

train_metrics = calculate_metrics(modelo=model, x=x_train, y=y_train, tipo='train')
test_metrics = calculate_metrics(modelo=model, x=x_test, y=y_test, tipo='test')

metricas = [train_metrics, test_metrics]

os.makedirs('../files/output', exist_ok=True)

with open('../files/output/metrics.json', 'w') as file:
    for metrica in metricas:
        file.write(json.dumps(metrica)+'\n')

In [23]:
from sklearn.metrics import confusion_matrix

def create_mc(modelo, x, y, tipo):

    y_pred = modelo.predict(x)

    matrix = confusion_matrix(y_true=y, y_pred=y_pred)

    dictionary = {
        'type':'cm_matrix',
        'dataset':tipo,
        'true_0':{
            'predicted_0':int(matrix[0, 0]),
            'predicted_1':int(matrix[0, 1])
        },
        'true_1':{
            'predicted_0':int(matrix[0, 1]),
            'predicted_1':int(matrix[1, 1])
        }
    }
    return dictionary

train_matrix = create_mc(modelo=model, x=x_train, y=y_train, tipo='train')
test_matrix = create_mc(modelo=model, x=x_test, y=y_test, tipo='test')

metricas_2 = [train_matrix, test_matrix]

with open('../files/output/metrics.json', 'a') as file:
    for metrica in metricas_2:
        file.write(json.dumps(metrica)+'\n')