In [1]:
import glob

test_file, train_file = glob.glob(f'../files/input/*')

def load_input(ruta):

    import pandas as pd

    df = pd.read_csv(ruta)

    return df 

train = load_input(ruta=train_file)
test = load_input(ruta=test_file)

In [2]:
def renombrar_columna(dataset):
    df = dataset.copy()
    df.rename(columns= {'default payment next month':'default'}, inplace=True)
    return df

train = renombrar_columna(dataset=train)
test = renombrar_columna(dataset=test)

In [3]:
def remover_columna(dataset):
    df = dataset.copy()
    df.drop(columns='ID', inplace=True)
    return df

train = remover_columna(dataset=train)
test = remover_columna(dataset=test)

In [4]:
def remover_nulos(dataset):
    df = dataset.copy()
    df = df[(df['EDUCATION'] != 0) & (df['MARRIAGE'] != 0)]
    return df

train = remover_nulos(dataset=train)
test = remover_nulos(dataset=test)

In [5]:
def homologar_educacion(dataset):
    df = dataset.copy()
    df['EDUCATION'] = df['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
    return df

train = homologar_educacion(dataset=train)
test = homologar_educacion(dataset=test)

In [6]:
def convertir_categoricas(dataset):
    df = dataset.copy()
    df['SEX'] = df['SEX'].astype('category')
    df['EDUCATION'] = df['EDUCATION'].astype('category')
    df['MARRIAGE'] = df['MARRIAGE'].astype('category')
    df['default'] = df['default'].astype('category')
    return df

train = convertir_categoricas(dataset=train)
test = convertir_categoricas(dataset=test)

In [7]:
x_train = train.drop(columns='default')
y_train = train['default']

x_test = test.drop(columns='default')
y_test = test['default']

In [8]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [9]:
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

columns_trans = ColumnTransformer(
    transformers = [
        ('categorias', OneHotEncoder(drop='if_binary'), categorical_features)
    ],
    remainder=MinMaxScaler()
)

k_best_selector = SelectKBest(score_func=f_classif)

pipeline = Pipeline(
    [
        ('preprocessor', columns_trans),
        ('feature_selection', k_best_selector),
        ('classifier', LogisticRegression(penalty='l2', n_jobs=-1, random_state=666, max_iter=7000))
    ]
)

In [10]:
import pandas as pd

x_transformed = columns_trans.fit_transform(x_train)

x_transformed = pd.DataFrame(x_transformed, columns=columns_trans.get_feature_names_out())
len(x_transformed.columns)

28

In [11]:
k_best_selector.fit(x_train, y_train)

feature_names = x_train.columns

scores = pd.DataFrame({
    'Feature': feature_names,
    'F-Score': k_best_selector.scores_,
    'P-Value': k_best_selector.pvalues_
})

scores = scores.sort_values(by='F-Score', ascending=False)
scores

Unnamed: 0,Feature,F-Score,P-Value
5,PAY_0,2422.194387,0.0
6,PAY_2,1505.460155,4.290767e-318
7,PAY_3,1227.709807,1.969449e-261
8,PAY_4,1019.048942,2.174002e-218
9,PAY_5,902.639845,3.403415e-194
10,PAY_6,743.220697,7.614765e-161
0,LIMIT_BAL,497.499225,6.164117e-109
17,PAY_AMT1,101.295942,8.971592e-24
20,PAY_AMT4,80.341831,3.4072739999999998e-19
21,PAY_AMT5,67.66876,2.045689e-16


In [12]:
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

param_grid = {
    'feature_selection__k': [1, 2, 4, 7],
    'classifier__C':[1, 100],
    'classifier__solver':['lbfgs', 'sag'],
    #'classifier__class_weight': [None, 'balanced']
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True
)

model.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
print('mejores parámetros:', model.best_params_)
print('mejor exactitud validación cruzada', model.best_score_)
print('exactitud dataset de entrenamiento:', model.score(x_train, y_train))
print('exactitud dataset de prueba:', model.score(x_test, y_test))

mejores parámetros: {'classifier__C': 1, 'classifier__solver': 'lbfgs', 'feature_selection__k': 1}
mejor exactitud validación cruzada 0.6392688664250823
exactitud dataset de entrenamiento: 0.6392682710528409
exactitud dataset de prueba: 0.6547057822566611


In [14]:
import pickle
import gzip
import os

os.makedirs('../files/models', exist_ok=True)

with gzip.open("../files/models/model.pkl.gz","wb") as file:
    pickle.dump(model, file)

In [15]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

def calcular_metricas(modelo, x, y, tipo):
    y_pred = modelo.predict(x)
    metrics = {
    "type": "metrics",
    'dataset': tipo,
    'precision': precision_score(y, y_pred),
    'balanced_accuracy': balanced_accuracy_score(y, y_pred),
    'recall': recall_score(y, y_pred),
    'f1_score': f1_score(y, y_pred)
    }
    return metrics

train_metrics = calcular_metricas(modelo=model , x=x_train, y=y_train, tipo='train')
test_metrics = calcular_metricas(modelo=model , x=x_test, y=y_test, tipo='test')

metricas = [train_metrics, test_metrics]

output_dir = '../files/output'
os.makedirs(output_dir, exist_ok=True)

with open('../files/output/metrics.json', 'w') as file:
    for metrica in metricas:
        file.write(json.dumps(metrica)+ '\n')

In [16]:
from sklearn.metrics import confusion_matrix

def matriz_confusion(modelo, x, y, tipo):
    y_pred = modelo.predict(x)
    cm = confusion_matrix(y, y_pred)
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': tipo,
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])}
    }
    return cm_dict

train_cm = matriz_confusion(modelo=model, x=x_train, y=y_train, tipo='train')
test_cm = matriz_confusion(modelo=model, x=x_test, y=y_test, tipo='test')

metricas_extendidas = [train_cm, test_cm]

with open('../files/output/metrics.json', 'a') as file:
    for metrica in metricas_extendidas:
        file.write(json.dumps(metrica) + '\n')