In [1]:
def limpieza(dataset):
    """Return a cleaned copy of the raw credit-default dataset.

    Steps, in order:
      1. Rename ``'default payment next month'`` to ``'default'``.
      2. Drop the ``'ID'`` column.
      3. Discard rows whose ``EDUCATION`` or ``MARRIAGE`` equals 0.
      4. Collapse every ``EDUCATION`` value above 4 into 4.
      5. Cast the discrete columns and the target to ``category``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied. Surviving rows keep their
        original index labels (the index is not reset).

    Raises
    ------
    KeyError
        If ``'ID'`` or any column used by the steps above is missing.
    """
    categorical_columns = [
        'SEX', 'EDUCATION', 'MARRIAGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'default',
    ]

    # A single chain of non-mutating calls: every step returns a new frame, so no
    # column is ever assigned on a filtered slice (which is what triggers pandas'
    # SettingWithCopyWarning) and the caller's frame is never touched.
    return (
        dataset
        .rename(columns={'default payment next month': 'default'})
        .drop(columns='ID')
        .loc[lambda frame: (frame['EDUCATION'] != 0) & (frame['MARRIAGE'] != 0)]
        # clip(upper=4) is the vectorized form of "4 if x > 4 else x"
        .assign(EDUCATION=lambda frame: frame['EDUCATION'].clip(upper=4))
        .astype(dict.fromkeys(categorical_columns, 'category'))
    )

In [2]:
import glob
import pandas as pd

# glob.glob() returns paths in an arbitrary, filesystem-dependent order, so sort them
# to make the (test, train) unpacking deterministic. This relies on the test file's
# name sorting before the train file's name — TODO confirm against ../files/input.
# The unpacking still raises ValueError unless the directory holds exactly two entries.
test_file, train_file = sorted(glob.glob('../files/input/*'))

def load_data(directory):
    """Read a CSV file and return it after cleaning.

    Parameters
    ----------
    directory : str or file-like
        Despite the name, this is whatever ``pd.read_csv`` accepts as the file
        to read (the callers pass a file path).

    Returns
    -------
    pandas.DataFrame
        The result of ``limpieza`` applied to the parsed file.
    """
    return limpieza(dataset=pd.read_csv(directory))

# Load and clean both splits, test first and then train.
test, train = (load_data(directory=path) for path in (test_file, train_file))

In [3]:
def division_dataset(dataset):
    """Split a dataset into its feature frame and its ``'default'`` target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``'default'`` column. It is not modified.

    Returns
    -------
    tuple
        ``(x, y)``: every column except ``'default'``, and the ``'default'``
        column as a Series.
    """
    features = dataset.copy()
    # pop() removes the column from the working copy and hands it back
    target = features.pop('default')
    return features, target

# Separate features from the 'default' target for each split.
x_train, y_train = division_dataset(dataset=train)
x_test, y_test = division_dataset(dataset=test)

In [4]:
# Number of training rows per target class (shows how imbalanced the classes are).
y_train.value_counts()

default
0    16228
1     4725
Name: count, dtype: int64

In [5]:
# from sklearn.utils import shuffle

# class_0 = x_train[y_train == 0]
# class_1 = x_train[y_train == 1]

# class_0_sample = class_0.sample(frac=0.8, random_state=666)
# #class_1_sample = class_1.sample(frac=0.5, random_state=666)

# x_train_reducted = pd.concat([class_0_sample, class_1])
# y_train_reducted = pd.concat([
#     pd.Series([0] * len(class_0_sample), index=class_0_sample.index),
#     pd.Series([1] * len(class_1), index=class_1.index)
# ])

# x_train_reducted, y_train_reducted = shuffle(x_train_reducted, y_train_reducted, random_state=666)

# y_train_reducted.value_counts()

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.svm import SVC

In [8]:
from functools import partial

# Columns of x_train whose dtype is 'category'; these get one-hot encoded.
categorical_columns = x_train.select_dtypes(include='category').columns.to_list()

# One-hot encode the categorical columns and standardize every remaining column.
column_transformer = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OneHotEncoder(
                drop='if_binary',
                max_categories=6,
                handle_unknown='infrequent_if_exist',
                sparse_output=False,
            ),
            categorical_columns,
        )
    ],
    remainder=StandardScaler(),
)

# mutual_info_classif adds random noise to continuous features, so its scores, and
# therefore the components SelectKBest keeps, change from run to run unless it is
# seeded. functools.partial (rather than a lambda) keeps the scorer picklable, which
# matters because the fitted search is pickled later in the notebook. 666 is the same
# seed value used by the notebook's commented-out resampling cell.
seeded_mutual_info = partial(mutual_info_classif, random_state=666)

pipe = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('dimensionality_reducter', PCA()),
        ('k_best_selector', SelectKBest(score_func=seeded_mutual_info, k='all')),
        ('classifier', SVC()),
    ]
)

# Baseline fit with default hyper-parameters.
pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [9]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

y_pred_train = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)

# (label, scorer) pairs in the order they are reported; each one is printed for the
# train split first and the test split second.
reported_metrics = [
    ('balanced accuracy', balanced_accuracy_score),
    ('precision score', precision_score),
    ('recall score', recall_score),
    ('f1 score', f1_score),
]
evaluated_splits = [
    ('train', y_train, y_pred_train),
    ('test', y_test, y_pred_test),
]

for label, scorer in reported_metrics:
    for split_name, expected, predicted in evaluated_splits:
        print(f'{label} {split_name}:', scorer(expected, predicted))

balanced accuracy train: 0.6497994908532252
balanced accuracy test: 0.6543931973695396
precision score train: 0.7193693693693693
precision score test: 0.7033898305084746
recall score train: 0.337989417989418
recall score test: 0.3483735571878279
f1 score train: 0.45989920806335494
f1 score test: 0.46596491228070175


In [10]:
from sklearn.metrics import confusion_matrix

def matriz(yt, yp):
    """Return the confusion matrix of true labels ``yt`` against predictions ``yp``."""
    return confusion_matrix(y_true=yt, y_pred=yp)

def _labeled_confusion_matrix(features, target):
    """Confusion matrix of ``pipe`` on (features, target) as a labelled DataFrame.

    Rows are labelled as the real classes and columns as the predicted classes.
    """
    return pd.DataFrame(
        matriz(yt=target, yp=pipe.predict(features)),
        index=["N (Clase Real Negativa)", "P (Clase Real Positiva)"],
        columns=["PN (Predicción Negativa)", "PP (Predicción Positiva)"],
    )


# `confusion_matrix_df` is rebound on both lines, so it ends up pointing at the
# test matrix.
matrix_train = confusion_matrix_df = _labeled_confusion_matrix(x_train, y_train)
matrix_test = confusion_matrix_df = _labeled_confusion_matrix(x_test, y_test)

display(matrix_train, matrix_test)

Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),15605,623
P (Clase Real Positiva),3128,1597


Unnamed: 0,PN (Predicción Negativa),PP (Predicción Positiva)
N (Clase Real Negativa),6793,280
P (Clase Real Positiva),1242,664


In [11]:
columns_transformer = pipe.named_steps['preprocessor']

# Re-apply the fitted preprocessor to the training features and label its output
# columns with the names the transformer generates.
x_transformed = pd.DataFrame(
    columns_transformer.transform(x_train),
    columns=columns_transformer.get_feature_names_out(),
)

# Note: `train` still contains the 'default' target column, so it is counted here.
print('columnas dataset original:', train.shape[1])
print('columnas dataset transformado:', x_transformed.shape[1])

columnas dataset original: 24
columnas dataset transformado: 58


In [12]:
pca = pipe.named_steps['dimensionality_reducter']

# Inside the pipeline, PCA was fitted on the preprocessor's output rather than on a
# labelled DataFrame. Passing the labelled frame makes scikit-learn warn that X has
# feature names the estimator was fitted without, so hand it the bare array instead.
# The projected values are the same either way.
x_reduced = pca.transform(x_transformed.to_numpy())

pca_dataset = pd.DataFrame(x_reduced, columns=pca.get_feature_names_out())

pca_dataset



Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca48,pca49,pca50,pca51,pca52,pca53,pca54,pca55,pca56,pca57
0,2.081071,-0.736325,0.357130,0.066989,0.106754,0.457366,0.021435,-0.073800,0.572787,1.047760,...,0.000641,-0.000434,-6.800116e-16,-1.498801e-15,1.665335e-15,6.661338e-16,-9.992007e-16,-2.942091e-15,1.665335e-15,-1.665335e-15
1,-2.152255,0.238527,1.355193,1.098400,-0.276086,-0.140106,-0.130959,-0.026608,-0.580099,-1.322077,...,-0.005559,0.004601,5.384582e-15,-5.384582e-15,-2.775558e-15,4.440892e-16,-2.220446e-16,-7.910339e-15,6.189493e-15,1.165734e-15
2,-0.822377,1.390894,-0.973927,-0.088268,1.478943,-1.791579,-1.736686,0.464691,-0.484886,-1.326177,...,-0.004268,-0.004448,6.689094e-15,-1.110223e-16,1.332268e-15,5.551115e-16,-1.554312e-15,-6.383782e-15,2.872702e-15,-4.440892e-16
3,-0.533175,-0.064933,1.734299,1.131794,-0.207810,-0.121706,-0.228828,0.121106,-0.427082,-1.340437,...,-0.119996,0.110110,8.881784e-15,2.253753e-14,4.218847e-15,-4.440892e-16,1.504352e-14,-6.106227e-16,4.149459e-15,-7.438494e-15
4,-0.866940,-0.070422,0.022019,-0.589915,-0.075421,0.088189,-0.177593,0.076538,0.001898,0.474395,...,-0.008187,0.014438,-3.094747e-15,-2.275957e-15,-1.998401e-15,-9.992007e-16,9.436896e-16,5.967449e-15,-1.734723e-15,4.940492e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20948,-1.846756,-0.027277,-0.300545,-0.255708,-0.265003,0.042805,-0.128468,-0.072706,-0.264773,-0.412476,...,0.001265,0.003134,-9.020562e-16,-2.053913e-15,-1.998401e-15,-1.332268e-15,1.998401e-15,-9.992007e-16,6.522560e-16,2.220446e-15
20949,1.892162,-1.209081,0.008488,0.570860,0.295253,0.078858,-0.129274,-0.131467,0.027955,0.266125,...,0.002031,-0.001459,1.290634e-15,-4.107825e-15,-1.221245e-15,-1.776357e-15,1.443290e-15,2.581269e-15,-1.387779e-16,3.441691e-15
20950,-0.354368,-1.527812,-1.520315,-0.098227,0.141456,-0.011980,0.108114,-0.055856,0.055736,0.321660,...,0.000125,-0.000170,-7.494005e-16,2.220446e-16,-1.110223e-16,1.110223e-16,6.106227e-16,-1.665335e-16,1.249001e-16,8.326673e-16
20951,-1.091839,-1.133297,-1.517749,0.077893,0.247249,0.075494,0.336162,0.045265,0.203375,0.606285,...,-0.000888,0.000134,6.245005e-16,2.775558e-16,9.992007e-16,4.440892e-16,1.665335e-16,-1.304512e-15,-2.775558e-16,5.551115e-16


In [13]:
# Share of the total variance carried by each principal component, in percent.
porcentajes = pd.DataFrame(
    {
        'Componente': pca.get_feature_names_out(),
        'Varianza Explicada (%)': pca.explained_variance_ratio_ * 100,
    }
)

# The running total is computed from the numeric column before that column is
# replaced by its two-decimal text rendering.
varianza_explicada = porcentajes.assign(
    **{
        'Varianza Acumulada (%)': porcentajes['Varianza Explicada (%)'].cumsum(),
        'Varianza Explicada (%)': porcentajes['Varianza Explicada (%)'].map('{:.2f}'.format),
    }
)
varianza_explicada

Unnamed: 0,Componente,Varianza Explicada (%),Varianza Acumulada (%)
0,pca0,32.31,32.314852
1,pca1,10.72,43.036821
2,pca2,6.63,49.666841
3,pca3,5.38,55.048897
4,pca4,4.75,59.797053
5,pca5,4.57,64.371524
6,pca6,4.38,68.755129
7,pca7,3.99,72.745435
8,pca8,3.87,76.613692
9,pca9,3.55,80.163613


In [14]:
kbest = pipe.named_steps['k_best_selector']
feature_names = pca.get_feature_names_out()

# Score assigned by the selector to each principal component, highest first.
mutual_test = (
    pd.DataFrame(
        {
            'feature': feature_names,
            'Score': kbest.scores_,
            #'P-Value':[f'{p:.6%}' for p in kbest.pvalues_],
        }
    )
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)

mutual_test

Unnamed: 0,feature,Score
0,pca42,0.046521
1,pca10,0.04498
2,pca39,0.044895
3,pca32,0.043722
4,pca41,0.043439
5,pca9,0.042563
6,pca43,0.040659
7,pca20,0.038381
8,pca27,0.036734
9,pca40,0.036637


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

# def custom_metric(yt, yp):
#     precision = precision_score(y_true=yt, y_pred=yp)
#     balanced = balanced_accuracy_score(y_true=yt, y_pred=yp)
#     if precision >= 0.691 and balanced >= 0.661:
#         return precision + balanced
#     else:
#         return balanced

# scoring = make_scorer(custom_metric)

# Candidates are listed explicitly instead of being generated with np.arange: with a
# float step, the number of values and the last value depend on floating-point
# rounding, and a one-element arange hides what is actually being searched.
# The commented-out lists are presumably the broader grids explored earlier.
param_grid = {
    'k_best_selector__k': [29, 30],
    #'classifier__C': [1, 5, 10, 20, 40, 70, 100],
    'classifier__C': [44],
    #'classifier__gamma': [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
    'classifier__gamma': [0.08, 0.14],
}

# 4-fold cross-validated search scored by balanced accuracy; refit=True retrains the
# best candidate on the full training set so `model` can be used to predict directly.
model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=4,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True,
    verbose=1
)

model.fit(x_train, y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

print('mejores parámetros encontrados:', model.best_params_)
print()

# Each metric is reported for the train split first and the test split second.
reported_metrics = [
    ('balanced accuracy', balanced_accuracy_score),
    ('precision score', precision_score),
    ('recall score', recall_score),
    ('f1 score', f1_score),
]
evaluated_splits = [
    ('train', y_train, y_pred_train),
    ('test', y_test, y_pred_test),
]
for label, scorer in reported_metrics:
    for split_name, expected, predicted in evaluated_splits:
        print(f'{label} {split_name}:', scorer(expected, predicted))

print()
# model.score() uses the scoring the search was configured with.
for split_name, features, target in [('train', x_train, y_train), ('test', x_test, y_test)]:
    print(f'score {split_name}:', model.score(features, target))

mejores parámetros encontrados: {'classifier__C': np.int64(44), 'classifier__gamma': np.float64(0.08), 'k_best_selector__k': 30}

balanced accuracy train: 0.6717000142154197
balanced accuracy test: 0.6668794948913066
precision score train: 0.7206461780929866
precision score test: 0.6749769159741459
recall score train: 0.3870899470899471
recall score test: 0.38352570828961174
f1 score train: 0.5036486300426821
f1 score test: 0.4891267982602877

score train: 0.6717000142154197
score test: 0.6668794948913066


In [24]:
# Columns of cv_results_ worth showing: the searched parameters and their scores.
summary_columns = [
    'param_classifier__C',
    'param_k_best_selector__k',
    'param_classifier__gamma',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

results = (
    pd.DataFrame(model.cv_results_)
    .sort_values('rank_test_score')
    .reset_index(drop=True)
    .loc[:, summary_columns]
    .sort_values('mean_test_score', ascending=False)
    .reset_index(drop=True)
)

results.head(10)

Unnamed: 0,param_classifier__C,param_k_best_selector__k,param_classifier__gamma,mean_test_score,std_test_score,rank_test_score
0,44,30,0.08,0.649238,0.002539,1
1,44,29,0.08,0.649013,0.00268,2
2,44,29,0.14,0.645217,0.00386,3
3,44,30,0.14,0.645036,0.005669,4


In [25]:
# Inspect the classifier of `pipe` itself, not the one inside the GridSearchCV `model`.
# `_gamma` is a private attribute of the fitted SVC — presumably the numeric gamma it
# resolved from its default gamma setting; confirm against the scikit-learn version used.
svm = pipe.named_steps['classifier']
print('Valor de gamma:', svm._gamma)

Valor de gamma: 0.05197503588555655


In [26]:
import pickle
import gzip
import os

# Persist the fitted search object as a gzip-compressed pickle.
model_path = '../files/models/model.pkl.gz'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with gzip.open(model_path, 'wb') as handle:
    pickle.dump(model, handle)

In [27]:
import json

def calculate_metrics(modelo, x, y, tipo):
    """Score ``modelo`` on ``(x, y)`` and return the results as a dict.

    Parameters
    ----------
    modelo : object
        Anything exposing ``predict(x)``.
    x : features passed to ``modelo.predict``.
    y : true labels the predictions are compared with.
    tipo : value stored under the ``'dataset'`` key (the callers pass ``'train'``
        or ``'test'``).

    Returns
    -------
    dict
        Keys, in order: ``'type'`` (always ``'metrics'``), ``'dataset'``,
        ``'precision'``, ``'balanced_accuracy'``, ``'recall'``, ``'f1_score'``.
    """
    predictions = modelo.predict(x)

    # Output key -> scoring function, in the order the keys must appear.
    scorers = {
        'precision': precision_score,
        'balanced_accuracy': balanced_accuracy_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }

    metrics = {'type': 'metrics', 'dataset': tipo}
    for key, scorer in scorers.items():
        metrics[key] = scorer(y_true=y, y_pred=predictions)
    return metrics

# Metrics of the tuned model on each split, train first.
metricas = [
    calculate_metrics(modelo=model, x=features, y=target, tipo=split_name)
    for split_name, features, target in (('train', x_train, y_train), ('test', x_test, y_test))
]
train_metrics, test_metrics = metricas

os.makedirs('../files/output', exist_ok=True)

# Mode 'w' starts the file afresh; each record is written as one JSON object per line.
with open('../files/output/metrics.json', 'w') as file:
    file.writelines(json.dumps(metrica) + '\n' for metrica in metricas)

In [28]:
from sklearn.metrics import confusion_matrix

def create_mc(modelo, x, y, tipo):
    """Build a JSON-serializable confusion-matrix record for ``modelo`` on ``(x, y)``.

    Parameters
    ----------
    modelo : object
        Anything exposing ``predict(x)``.
    x : features passed to ``modelo.predict``.
    y : true labels the predictions are compared with.
    tipo : value stored under the ``'dataset'`` key (the callers pass ``'train'``
        or ``'test'``).

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': tipo,
        'true_0': {'predicted_0': ..., 'predicted_1': ...},
        'true_1': {'predicted_0': ..., 'predicted_1': ...}}`` with plain ``int``
        counts.
    """
    y_pred = modelo.predict(x)

    # Rows index the true class and columns the predicted class.
    matrix = confusion_matrix(y_true=y, y_pred=y_pred)

    return {
        'type': 'cm_matrix',
        'dataset': tipo,
        'true_0': {
            'predicted_0': int(matrix[0, 0]),
            'predicted_1': int(matrix[0, 1]),
        },
        'true_1': {
            # True class 1 predicted as 0 lives at row 1, column 0. Reading [0, 1]
            # here would repeat the true_0/predicted_1 count instead.
            'predicted_0': int(matrix[1, 0]),
            'predicted_1': int(matrix[1, 1]),
        },
    }

# Confusion-matrix records of the tuned model on each split, train first.
metricas_2 = [
    create_mc(modelo=model, x=features, y=target, tipo=split_name)
    for split_name, features, target in (('train', x_train, y_train), ('test', x_test, y_test))
]
train_matrix, test_matrix = metricas_2

# Mode 'a' appends to the existing metrics file, one JSON object per line.
with open('../files/output/metrics.json', 'a') as file:
    file.writelines(json.dumps(metrica) + '\n' for metrica in metricas_2)