In [1]:
def limpieza(dataset):
    """Return a cleaned copy of the raw dataset; the input frame is not modified.

    Steps, in order:
      1. Rename 'default payment next month' to 'default'.
      2. Drop the 'ID' column.
      3. Discard rows whose 'EDUCATION' or 'MARRIAGE' value is 0.
      4. Collapse every 'EDUCATION' value above 4 into 4.
      5. Cast the demographic, payment-status and target columns to 'category'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data containing at least the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame (original row index labels are preserved).

    Raises
    ------
    KeyError
        If 'ID' or any of the columns used for filtering/casting is missing.
    """
    categorical_columns = [
        'SEX', 'EDUCATION', 'MARRIAGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'default',
    ]

    # rename/drop return new frames, so the caller's data is never touched.
    df = (
        dataset
        .rename(columns={'default payment next month': 'default'})
        .drop(columns='ID')
    )

    # Take an explicit copy of the filtered rows: assigning a column on the bare
    # result of boolean indexing raises pandas' SettingWithCopyWarning.
    valid_rows = (df['EDUCATION'] != 0) & (df['MARRIAGE'] != 0)
    df = df.loc[valid_rows].copy()

    # Vectorized equivalent of "4 if x > 4 else x".
    df['EDUCATION'] = df['EDUCATION'].clip(upper=4)

    # One astype call with a mapping replaces ten repeated single-column casts.
    return df.astype({column: 'category' for column in categorical_columns})

In [2]:
import glob
import pandas as pd

# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# result is sorted to make the test/train unpacking deterministic.
# NOTE(review): assumes the folder holds exactly two files and that the test
# file sorts before the train file (e.g. "test_*" < "train_*") — confirm.
test_file, train_file = sorted(glob.glob('../files/input/*'))

def load_data(directory):
    """Read the CSV file at ``directory`` and return it cleaned by ``limpieza``."""
    return limpieza(dataset=pd.read_csv(directory))

# Load and clean both splits (test first, then train).
test, train = (load_data(directory=path) for path in (test_file, train_file))

In [3]:
def division_dataset(dataset):
    """Split ``dataset`` into features and target.

    Works on a copy, so the frame passed in keeps its 'default' column.

    Returns
    -------
    tuple
        ``(x, y)``: every column except 'default', and the 'default' column.
    """
    features = dataset.copy()
    # pop removes the target from the working copy and hands it back as a Series.
    target = features.pop('default')
    return features, target

# Separate features and target for each split (train first, then test).
(x_train, y_train), (x_test, y_test) = (
    division_dataset(split) for split in (train, test)
)

In [4]:
# Number of training rows per target class.
class_counts = train['default'].value_counts()
class_counts

default
0    16228
1     4725
Name: count, dtype: int64

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Every feature column with 'category' dtype is one-hot encoded.
categorical_features = list(x_train.select_dtypes(include='category').columns)

one_hot_encoder = OneHotEncoder(
    drop='if_binary',
    max_categories=6,
    handle_unknown='infrequent_if_exist',
    sparse_output=False,
)

# Columns not listed as categorical are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('categories', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

# Fixed random_state keeps the forest reproducible; n_jobs=-1 uses every core.
forest = RandomForestClassifier(random_state=666, n_jobs=-1)

pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ]
)

pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [6]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

y_pred_train = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)

# (train label, test label, scorer) — printed in this order, train before test.
baseline_report = (
    ('balanced accuracy train:', 'balanced accuracy test:', balanced_accuracy_score),
    ('precision score train:', 'precision score test:', precision_score),
    ('recall score train:', 'recall score test:', recall_score),
    ('f1 score train:', 'f1 score test:', f1_score),
)

for train_label, test_label, scorer in baseline_report:
    print(train_label, scorer(y_train, y_pred_train))
    print(test_label, scorer(y_test, y_pred_test))

balanced accuracy train: 0.9991802137007955
balanced accuracy test: 0.6701041855665301
precision score train: 0.9987301587301587
precision score test: 0.6473594548551959
recall score train: 0.9987301587301587
recall score test: 0.3987408184679958
f1 score train: 0.9987301587301587
f1 score test: 0.4935064935064935


In [7]:
from sklearn.metrics import confusion_matrix

def matriz(yt, yp):
    """Return the confusion matrix of true labels ``yt`` against predictions ``yp``."""
    return confusion_matrix(y_true=yt, y_pred=yp)

# Shared row/column captions for both confusion-matrix tables.
row_labels = ["N (Clase Real Negativa)", "P (Clase Real Positiva)"]
column_labels = ["PN (Predicción Negativa)", "PP (Predicción Positiva)"]

matrix_train = pd.DataFrame(
    matriz(yt=y_train, yp=pipe.predict(x_train)),
    index=row_labels,
    columns=column_labels,
)

matrix_test = pd.DataFrame(
    matriz(yt=y_test, yp=pipe.predict(x_test)),
    index=row_labels,
    columns=column_labels,
)

# The original cell leaves this alias pointing at the test matrix.
confusion_matrix_df = matrix_test

display(matrix_train, matrix_test)

Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),16222,6
P (Clase Real Positiva),6,4719


Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),6659,414
P (Clase Real Positiva),1146,760


In [8]:
columns_transformer = pipe.named_steps['preprocessor']

# Re-apply the fitted preprocessor to see how many columns the encoding creates.
encoded_names = columns_transformer.get_feature_names_out()
x_transformed = pd.DataFrame(
    columns_transformer.transform(x_train),
    columns=encoded_names,
)

# Note: `train` still contains the 'default' target column, x_transformed does not.
print('columnas dataset original:', train.shape[1])
print('columnas dataset transformado:', x_transformed.shape[1])

columnas dataset original: 24
columnas dataset transformado: 58


In [9]:
import numpy as np

rf_model = pipe.named_steps['classifier']

# Per-tree depth and node count of the baseline forest.
tree_depths = [estimator.tree_.max_depth for estimator in rf_model.estimators_]
tree_node_counts = [estimator.tree_.node_count for estimator in rf_model.estimators_]

profundidad_promedio = np.mean(tree_depths)
nodos_promedio = np.mean(tree_node_counts)

print(f'max_depth AVG: {profundidad_promedio}')
print(f'nodes number AVG: {nodos_promedio}')

max_depth AVG: 41.5
nodes number AVG: 6916.62


In [10]:
rf_model = pipe.named_steps['classifier']
features_importances = rf_model.feature_importances_

# Pair each encoded column name with its importance, most important first.
importance = pd.DataFrame(
    {
        'feature': columns_transformer.get_feature_names_out(),
        'importance': features_importances,
    }
)
importance = importance.sort_values(by='importance', ascending=False)

importance

Unnamed: 0,feature,importance
45,remainder__AGE,0.064713
44,remainder__LIMIT_BAL,0.060637
46,remainder__BILL_AMT1,0.057345
12,categories__PAY_0_2,0.054917
47,remainder__BILL_AMT2,0.053954
48,remainder__BILL_AMT3,0.050712
52,remainder__PAY_AMT1,0.049093
49,remainder__BILL_AMT4,0.049025
51,remainder__BILL_AMT6,0.04882
50,remainder__BILL_AMT5,0.048112


In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate fractions for min_samples_split: 20 evenly spaced values from
# 0.0003 to 0.00068. np.linspace fixes both endpoints and the number of
# candidates; np.arange with a float step accumulates rounding error (it
# produced values such as 0.00035999999999999997) and its length is not
# guaranteed. Rounding to 5 decimals keeps the reported best parameter readable.
# NOTE(review): scikit-learn turns a float min_samples_split into
# ceil(fraction * n_samples), so neighbouring fractions can resolve to the same
# integer threshold and score identically — consider an integer grid instead.
param_grid = {
    'classifier__min_samples_split': np.round(np.linspace(0.0003, 0.00068, 20), 5)
}

# 5-fold search scored on balanced accuracy; refit=True retrains the best
# candidate on the full training set so `model` can be used directly to predict.
model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True,
    verbose=1
)

model.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

print('mejores parámetros encontrados:', model.best_params_)
print()

# (train label, test label, scorer) — printed in this order, train before test.
tuned_report = (
    ('balanced accuracy train:', 'balanced accuracy test:', balanced_accuracy_score),
    ('precision score train:', 'precision score test:', precision_score),
    ('recall score train:', 'recall score test:', recall_score),
    ('f1 score train:', 'f1 score test:', f1_score),
)
for train_label, test_label, scorer in tuned_report:
    print(train_label, scorer(y_train, y_pred_train))
    print(test_label, scorer(y_test, y_pred_test))

print()
# model.score uses the search's own scoring ('balanced_accuracy').
print('score train:', model.score(x_train, y_train))
print('score test:', model.score(x_test, y_test))

mejores parámetros encontrados: {'classifier__min_samples_split': np.float64(0.00035999999999999997)}

balanced accuracy train: 0.8612696391239649
balanced accuracy test: 0.6746455306666248
precision score train: 0.9775312855517634
precision score test: 0.6601195559350982
recall score train: 0.7274074074074074
recall score test: 0.4055613850996852
f1 score train: 0.8341220725640093
f1 score test: 0.5024374390640234

score train: 0.8612696391239649
score test: 0.6746455306666248


In [13]:
# Columns kept for the cross-validation summary table.
summary_columns = [
    'param_classifier__min_samples_split',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

cv_table = (
    pd.DataFrame(model.cv_results_)
    .sort_values('rank_test_score')
    .reset_index(drop=True)
)

# Best mean score first; the earlier rank sort is kept so ties land in the same order.
results = (
    cv_table[summary_columns]
    .sort_values('mean_test_score', ascending=False)
    .reset_index(drop=True)
)

results

Unnamed: 0,param_classifier__min_samples_split,mean_test_score,std_test_score,rank_test_score
0,0.00036,0.657712,0.006959,1
1,0.00038,0.657712,0.006959,1
2,0.0004,0.657712,0.006959,1
3,0.00056,0.657548,0.006103,4
4,0.00058,0.657548,0.006103,4
5,0.00054,0.657548,0.006103,4
6,0.00066,0.65682,0.008114,7
7,0.00068,0.65682,0.008114,7
8,0.00062,0.65671,0.006739,9
9,0.00064,0.65671,0.006739,9


In [14]:
best_rf_model = model.best_estimator_.named_steps['classifier']

# Per-tree depth and node count of the forest selected by the grid search.
best_tree_depths = [estimator.tree_.max_depth for estimator in best_rf_model.estimators_]
best_tree_node_counts = [estimator.tree_.node_count for estimator in best_rf_model.estimators_]

profundidad_promedio = np.mean(best_tree_depths)
nodos_promedio = np.mean(best_tree_node_counts)

print(f'max_depth AVG: {profundidad_promedio}')
print(f'nodes number AVG: {nodos_promedio}')

max_depth AVG: 39.21
nodes number AVG: 4059.72


In [15]:
import pickle
import gzip
import os

# Make sure the destination folder exists before writing the artifact.
os.makedirs('../files/models', exist_ok=True)

# Persist the fitted search object as a gzip-compressed pickle.
with gzip.open("../files/models/model.pkl.gz", "wb") as compressed_file:
    pickle.dump(model, compressed_file)

In [16]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score

def calcular_metricas(modelo, x, y, tipo):
    """Score ``modelo`` on ``(x, y)`` and return the results as a flat dict.

    The dict holds, in order: 'type' (always "metrics"), 'dataset' (``tipo``),
    'precision', 'balanced_accuracy', 'recall' and 'f1_score'.
    """
    predicted = modelo.predict(x)
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )

    metrics = {"type": "metrics", 'dataset': tipo}
    for name, scorer in scorers:
        metrics[name] = scorer(y, predicted)
    return metrics

train_metrics = calcular_metricas(modelo=model, x=x_train, y=y_train, tipo='train')
test_metrics = calcular_metricas(modelo=model, x=x_test, y=y_test, tipo='test')

metricas = [train_metrics, test_metrics]

output_dir = '../files/output'
os.makedirs(output_dir, exist_ok=True)

# Build the file path from output_dir so the folder created above and the file
# written below cannot drift apart. Mode 'w' starts the file afresh; one JSON
# object is written per line.
with open(os.path.join(output_dir, 'metrics.json'), 'w') as file:
    for metrica in metricas:
        file.write(json.dumps(metrica) + '\n')

In [17]:
from sklearn.metrics import confusion_matrix

def matriz_confusion(modelo, x, y, tipo):
    """Return the confusion matrix of ``modelo`` on ``(x, y)`` as a JSON-ready dict.

    The dict holds 'type' (always 'cm_matrix'), 'dataset' (``tipo``) and one
    entry per true class ('true_0', 'true_1'), each mapping 'predicted_0' /
    'predicted_1' to a plain ``int`` count.
    """
    cm = confusion_matrix(y, modelo.predict(x))

    # One dict per true-class row; int() turns the numpy integers into
    # built-in ints so json.dumps can serialize them.
    rows = [
        {'predicted_0': int(cm[true_class, 0]), 'predicted_1': int(cm[true_class, 1])}
        for true_class in (0, 1)
    ]

    return {
        'type': 'cm_matrix',
        'dataset': tipo,
        'true_0': rows[0],
        'true_1': rows[1],
    }

train_cm = matriz_confusion(modelo=model, x=x_train, y=y_train, tipo='train')
test_cm = matriz_confusion(modelo=model, x=x_test, y=y_test, tipo='test')

metricas_extendidas = [train_cm, test_cm]

metrics_path = '../files/output/metrics.json'


def _is_cm_record(line):
    """Return True when ``line`` is a JSON object whose 'type' is 'cm_matrix'."""
    try:
        record = json.loads(line)
    except ValueError:
        return False
    return isinstance(record, dict) and record.get('type') == 'cm_matrix'


# Blindly appending would add a duplicate pair of confusion-matrix records every
# time this cell is re-run. Instead, keep the lines already in the file that are
# not confusion-matrix records (verbatim), then write the fresh records after them.
try:
    with open(metrics_path, 'r') as file:
        lineas_previas = [
            linea if linea.endswith('\n') else linea + '\n'
            for linea in file
            if not _is_cm_record(linea)
        ]
except FileNotFoundError:
    # No metrics file yet: behave like the first write.
    lineas_previas = []

with open(metrics_path, 'w') as file:
    file.writelines(lineas_previas)
    for metrica in metricas_extendidas:
        file.write(json.dumps(metrica) + '\n')