In [1]:
def limpieza(dataset):
    """Return a cleaned copy of ``dataset``; the input frame is not modified.

    Steps, in order:
      1. Rename ``'default payment next month'`` to ``'default'``.
      2. Drop the ``'ID'`` column.
      3. Remove rows where ``EDUCATION`` or ``MARRIAGE`` equals 0.
      4. Collapse every ``EDUCATION`` value greater than 4 into 4.
      5. Cast ``SEX``, ``EDUCATION``, ``MARRIAGE`` and ``default`` to
         the pandas ``category`` dtype.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations applied. The original row index
        of the surviving rows is preserved.

    Raises
    ------
    KeyError
        If ``'ID'`` or any of the columns used for filtering/casting is missing.
    """
    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE', 'default']

    # Every step returns a new frame, so nothing is ever assigned onto a
    # filtered slice; that assignment is what raised SettingWithCopyWarning
    # in the previous in-place implementation.
    return (
        dataset
        .rename(columns={'default payment next month': 'default'})
        .drop(columns='ID')
        .loc[lambda frame: (frame['EDUCATION'] != 0) & (frame['MARRIAGE'] != 0)]
        # Vectorised equivalent of `4 if x > 4 else x`.
        .assign(EDUCATION=lambda frame: frame['EDUCATION'].clip(upper=4))
        .astype({column: 'category' for column in categorical_columns})
    )

In [2]:
import glob
import pandas as pd

# glob.glob gives no ordering guarantee (it depends on the file system), so
# sort the matches before unpacking; otherwise the test and train paths can be
# silently swapped from one machine to another.
# NOTE(review): assumes the directory holds exactly two files and that the test
# file name sorts before the train file name — confirm against ../files/input.
test_file, train_file = sorted(glob.glob('../files/input/*'))

def load_data(directory):
    """Read the CSV at ``directory`` and return it after running ``limpieza``.

    Parameters
    ----------
    directory : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The value returned by ``limpieza`` for the loaded frame.
    """
    return limpieza(dataset=pd.read_csv(directory))

# Load and clean both splits with the same helper (test first, then train).
test, train = (load_data(directory=path) for path in (test_file, train_file))

In [3]:
def division_dataset(dataset):
    """Split ``dataset`` into a feature frame and the ``'default'`` target.

    The input frame is left untouched; the work is done on a copy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains a ``'default'`` column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds every column except ``'default'`` and
        ``y`` is the ``'default'`` column as a Series.
    """
    features = dataset.copy()
    # pop removes the column from the copy and hands it back as a Series.
    target = features.pop('default')
    return features, target

# Separate features and target for each split (train first, then test).
(x_train, y_train), (x_test, y_test) = division_dataset(train), division_dataset(test)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

In [5]:
# column_transformer = ColumnTransformer(
#     [
#         ('categories', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), ['SEX', 'EDUCATION', 'MARRIAGE']),
#     ],
#     remainder=StandardScaler()
# )
#
# pipe = Pipeline(steps=
#     [
#         ('preprocessor', column_transformer),
#         ('dimensionality_reducter', PCA()),
#         ('k_best_selector', SelectKBest(score_func=f_classif)),
#         ('classifier', SVC(kernel='rbf'))
#     ]
# )


# Column groups. `numericas` is listed for reference only: no transformer in
# this cell receives it, the non-categorical columns reach the pipeline through
# remainder='passthrough'.
Categoria = ['SEX', 'EDUCATION', 'MARRIAGE']
numericas = [
    'LIMIT_BAL', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

# One-hot encode the categorical columns; all remaining columns pass through.
column_transformer = ColumnTransformer(
    [('ohe', OneHotEncoder(handle_unknown='ignore', dtype=int), Categoria)],
    remainder='passthrough',
)

# Step order as written: encode -> PCA -> standardise -> univariate
# selection (ANOVA F-test) -> SVM classifier.
pipe = Pipeline(
    [
        ('preprocessor', column_transformer),
        ('dimensionality_reducter', PCA()),
        ('scaler', StandardScaler()),
        ('k_best_selector', SelectKBest(score_func=f_classif)),
        ('classifier', SVC(random_state=42)),
    ]
)

pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [6]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

# Predictions of the baseline pipeline on both splits.
y_pred_train = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)

# Print each metric for train and then test, in the same order as before:
# balanced accuracy, precision, recall, f1.
for nombre, scorer in (
    ('balanced accuracy', balanced_accuracy_score),
    ('precision score', precision_score),
    ('recall score', recall_score),
    ('f1 score', f1_score),
):
    for particion, y_real, y_estimado in (
        ('train', y_train, y_pred_train),
        ('test', y_test, y_pred_test),
    ):
        print(f'{nombre} {particion}:', scorer(y_real, y_estimado))

print()
print(pipe.get_params())

balanced accuracy train: 0.6569627647295875
balanced accuracy test: 0.6637221575804654
precision score train: 0.6921202274573518
precision score test: 0.6773888363292336
recall score train: 0.36063492063492064
recall score test: 0.3756558237145855
f1 score train: 0.47418950883539723
f1 score test: 0.4832939588255147

{'memory': None, 'steps': [('preprocessor', ColumnTransformer(remainder='passthrough',
                  transformers=[('ohe',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='ignore'),
                                 ['SEX', 'EDUCATION', 'MARRIAGE'])])), ('dimensionality_reducter', PCA()), ('scaler', StandardScaler()), ('k_best_selector', SelectKBest()), ('classifier', SVC(random_state=42))], 'transform_input': None, 'verbose': False, 'preprocessor': ColumnTransformer(remainder='passthrough',
                  transformers=[('ohe',
                                 OneHotEncoder(dtype=<class

In [7]:
# Fitted preprocessing step taken from the pipeline.
columns_transformer = pipe.named_steps['preprocessor']

# Training features after preprocessing, labelled with the generated names.
x_transformed = pd.DataFrame(
    columns_transformer.transform(x_train),
    columns=columns_transformer.get_feature_names_out(),
)

# Column counts before and after the transformation (`train` still includes
# the target column).
print('columnas dataset original:', train.shape[1])
print('columnas dataset transformado:', x_transformed.shape[1])

columnas dataset original: 24
columnas dataset transformado: 29


In [8]:
# Fitted PCA step and the projection of the preprocessed training matrix.
pca = pipe.named_steps['dimensionality_reducter']
x_reduced = pca.transform(x_transformed)

# One row per principal component with its share of explained variance (%).
varianza_explicada = pd.DataFrame(
    {
        'Componente': pca.get_feature_names_out(),
        'Varianza Explicada (%)': pca.explained_variance_ratio_ * 100,
    }
)

# The running total must be computed while the column is still numeric;
# afterwards the per-component values become 2-decimal strings for display.
porcentaje = varianza_explicada['Varianza Explicada (%)']
varianza_explicada['Varianza Acumulada (%)'] = porcentaje.cumsum()
varianza_explicada['Varianza Explicada (%)'] = porcentaje.map('{:.2f}'.format)
varianza_explicada



Unnamed: 0,Componente,Varianza Explicada (%),Varianza Acumulada (%)
0,pca0,61.33,61.329373
1,pca1,29.09,90.417646
2,pca2,3.08,93.500891
3,pca3,1.7,95.196887
4,pca4,1.0,96.198064
5,pca5,0.94,97.134065
6,pca6,0.78,97.911715
7,pca7,0.64,98.55175
8,pca8,0.6,99.154483
9,pca9,0.29,99.442403


In [9]:
# Fitted univariate selector and the names of the components it scored.
kbest = pipe.named_steps['k_best_selector']
feature_names = pca.get_feature_names_out()

# F-score and p-value per component; p-values are shown as percentages
# with six decimals.
anova = pd.DataFrame(
    {
        'feature': feature_names,
        'F-Score': kbest.scores_,
        'P-Value': list(map('{:.6%}'.format, kbest.pvalues_)),
    }
)
# Highest F-score first, with a fresh 0..n-1 index.
anova = anova.sort_values(by='F-Score', ascending=False).reset_index(drop=True)

anova

Unnamed: 0,feature,F-Score,P-Value
0,pca14,1303.67657,0.000000%
1,pca15,486.155109,0.000000%
2,pca1,428.877111,0.000000%
3,pca16,148.443945,0.000000%
4,pca0,97.158576,0.000000%
5,pca4,77.226874,0.000000%
6,pca13,27.298439,0.000018%
7,pca27,27.03107,0.000020%
8,pca24,22.286608,0.000236%
9,pca3,20.779427,0.000518%


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# param_grid = [
#     {
#         'k_best_selector__k': [1, 2, 5],
#         'classifier__C': [13.5],
#         'classifier__gamma': ['auto']
#     }
# ]

# Hyper-parameter grid; each entry holds a single value, so the search
# evaluates exactly one candidate.
# NOTE(review): scikit-learn documents SVC's `degree` as used only by the
# 'poly' kernel — confirm whether it is needed alongside kernel='rbf'.
param_grid = dict(
    k_best_selector__k=[4],
    classifier__C=[13.5],
    classifier__kernel=['rbf'],
    classifier__gamma=['auto'],
    classifier__degree=[2],
)

# 10-fold cross-validated search scored by balanced accuracy; the best
# configuration is refitted afterwards (refit=True).
model = GridSearchCV(
    pipe,
    param_grid,
    scoring='balanced_accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

model.fit(x_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [11]:
# Report the cross-validation score and the score on each split.
# NOTE(review): the labels say 'exactitud', but `model` was configured with
# scoring='balanced_accuracy' — confirm the wording is intended.
for etiqueta, valor in (
    ('mejor exactitud validación cruzada', model.best_score_),
    ('exactitud dataset de entrenamiento:', model.score(x_train, y_train)),
    ('exactitud dataset de prueba:', model.score(x_test, y_test)),
):
    print(etiqueta, valor)

print()
print('mejores parámetros:', model.best_params_)

mejor exactitud validación cruzada 0.6531354511934417
exactitud dataset de entrenamiento: 0.6612892798781387
exactitud dataset de prueba: 0.6667381121682754

mejores parámetros: {'classifier__C': 13.5, 'classifier__degree': 2, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf', 'k_best_selector__k': 4}


In [12]:
import pickle
import gzip
import os

# Persist the fitted search object as a gzip-compressed pickle, creating the
# destination folder first if it does not exist yet.
model_path = '../files/models/model.pkl.gz'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with gzip.open(model_path, 'wb') as file:
    pickle.dump(model, file)

In [13]:
import json

def calculate_metrics(modelo, x, y, tipo):
    """Score ``modelo`` on ``(x, y)`` and return the results as a dict.

    Parameters
    ----------
    modelo : object
        Fitted estimator exposing ``predict``.
    x : array-like
        Features passed to ``modelo.predict``.
    y : array-like
        True labels compared with the predictions.
    tipo : str
        Label stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns
    -------
    dict
        Keys, in order: ``'type'`` (always ``'metrics'``), ``'dataset'``,
        ``'precision'``, ``'balanced_accuracy'``, ``'recall'``, ``'f1_score'``.
    """
    y_pred = modelo.predict(x)

    # Output key -> scoring function, in the order the keys must appear.
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )

    metrics = {'type': 'metrics', 'dataset': tipo}
    for key, scorer in scorers:
        metrics[key] = scorer(y_pred=y_pred, y_true=y)
    return metrics

# Metrics of the tuned model on each split (train first, then test).
train_metrics, test_metrics = (
    calculate_metrics(modelo=model, x=features, y=target, tipo=split)
    for features, target, split in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
)

metricas = [train_metrics, test_metrics]

os.makedirs('../files/output', exist_ok=True)

# Mode 'w' starts the file from scratch; one JSON document is written per line.
with open('../files/output/metrics.json', 'w') as file:
    file.writelines(json.dumps(metrica) + '\n' for metrica in metricas)

In [14]:
from sklearn.metrics import confusion_matrix

def create_mc(modelo, x, y, tipo):
    """Build a JSON-serialisable confusion-matrix record for ``modelo``.

    Parameters
    ----------
    modelo : object
        Fitted estimator exposing ``predict``.
    x : array-like
        Features passed to ``modelo.predict``.
    y : array-like
        True labels.
    tipo : str
        Label stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': tipo,
        'true_0': {'predicted_0': ..., 'predicted_1': ...},
        'true_1': {'predicted_0': ..., 'predicted_1': ...}}``
        with plain ``int`` counts.
    """
    y_pred = modelo.predict(x)

    # scikit-learn convention: matrix[i, j] counts samples whose true label is
    # i and whose predicted label is j (rows = true, columns = predicted).
    matrix = confusion_matrix(y_true=y, y_pred=y_pred)

    return {
        'type': 'cm_matrix',
        'dataset': tipo,
        'true_0': {
            'predicted_0': int(matrix[0, 0]),
            'predicted_1': int(matrix[0, 1]),
        },
        'true_1': {
            # Row 1 / column 0 holds the false negatives. The previous code
            # read matrix[0, 1] here, duplicating the false-positive count.
            'predicted_0': int(matrix[1, 0]),
            'predicted_1': int(matrix[1, 1]),
        },
    }

# Confusion-matrix records of the tuned model (train first, then test).
train_matrix, test_matrix = (
    create_mc(modelo=model, x=features, y=target, tipo=split)
    for features, target, split in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
)

metricas_2 = [train_matrix, test_matrix]

# Mode 'a' appends after whatever metrics.json already contains; one JSON
# document is written per line.
with open('../files/output/metrics.json', 'a') as file:
    file.writelines(json.dumps(metrica) + '\n' for metrica in metricas_2)