In [28]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import gzip
import pickle
import json
import os
import zipfile

In [3]:
# Load the train/test splits and apply the same clean-up steps to each one.
def _prepare(frame):
    """Return a cleaned copy of a raw credit-card default frame.

    Steps, in order: rename "default payment next month" to "default",
    drop the "ID" column, drop rows containing missing values, and collapse
    EDUCATION codes greater than 4 into 4 (the "others" category).
    """
    return (
        frame
        .rename(columns={'default payment next month': 'default'})
        .drop(columns=['ID'])
        .dropna()
        # Capping at 4 is equivalent to mapping every code above 4 to 4.
        .assign(EDUCATION=lambda df: df['EDUCATION'].clip(upper=4))
    )


train_df = _prepare(
    pd.read_csv('../files/input/train_data.csv/train_default_of_credit_card_clients.csv')
)
test_df = _prepare(
    pd.read_csv('../files/input/test_data.csv/test_default_of_credit_card_clients.csv')
)

In [4]:
# Separate the predictors (X) from the target label (y) for each split.
target_col = 'default'
x_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
x_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [38]:
# Columns holding category codes; every other column of x_train is treated as numeric.
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
numeric_cols = [col for col in x_train.columns if col not in categorical_cols]

# One-hot encode the categorical codes and rescale the numeric features.
# handle_unknown='ignore' makes the encoder output all zeros for a category
# code it did not see while fitting (e.g. a rare code absent from one
# cross-validation training fold, or present only in the test split)
# instead of raising an error during transform/predict.
# numeric_cols already lists every non-categorical column, so
# remainder='passthrough' has nothing left to pass through; it is kept as a
# safeguard should the column lists change.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', MinMaxScaler(), numeric_cols),
    ],
    remainder='passthrough',
)

# Univariate feature selection scored with f_classif; k is left at its
# default here and is expected to be set by whoever tunes the pipeline.
k_best = SelectKBest(f_classif)

model = LogisticRegression(random_state=42)

# Full estimator: preprocessing -> feature selection -> classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("k_best", k_best),
        ("model", model),
    ]
)

In [40]:
# Print the pipeline's text representation to review its configured steps.
print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE']),
                                                 ('num', MinMaxScaler(),
                                                  ['LIMIT_BAL', 'AGE', 'PAY_0',
                                                   'PAY_2', 'PAY_3', 'PAY_4',
                                                   'PAY_5', 'PAY_6',
                                                   'BILL_AMT1', 'BILL_AMT2',
                                                   'BILL_AMT3', 'BILL_AMT4',
                                                   'BILL_AMT5', 'BILL_AMT6',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
                

In [None]:
# Hyper-parameter search space:
#   - k_best__k: how many features SelectKBest keeps, from 1 up to the number
#     of columns in x_train.
#   - model__C: seven log-spaced values between 1e-3 and 1e3 for the model's C.
n_input_columns = x_train.shape[1]
param_grid = dict(
    k_best__k=range(1, n_input_columns + 1),
    model__C=np.logspace(-3, 3, 7),
)

# 10-fold cross-validated grid search scored by balanced accuracy, run on all
# available cores; refit=True refits the best configuration after the search.
search_options = dict(
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    refit=True,
    verbose=2,
)
grid_search = GridSearchCV(pipeline, param_grid=param_grid, **search_options)
grid_search.fit(x_train, y_train)

# The fitted search object itself is what gets stored under this name.
best_model = grid_search

Fitting 10 folds for each of 161 candidates, totalling 1610 fits


In [43]:
# Print the text representation of best_model (the GridSearchCV object).
print(best_model)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(),
                                                                         ['SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE']),
                                                                        ('num',
                                                                         MinMaxScaler(),
                                                                         ['LIMIT_BAL',
                                                                          'AGE',
                                                                          'PAY

In [44]:
# Save the fitted search object as a gzip-compressed pickle.
# The target directory is created first so that a checkout without
# ../files/models does not fail with FileNotFoundError.
model_path = '../files/models/model.pkl.gz'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with gzip.open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

In [45]:
def _metrics_record(dataset, y_true, y_pred):
    """Build the metrics dictionary for one dataset split.

    Parameters
    ----------
    dataset : str
        Label stored under the 'dataset' key (e.g. 'train' or 'test').
    y_true, y_pred : array-like
        True labels and predicted labels for the split.

    Returns
    -------
    dict
        Record with 'type', 'dataset', 'precision', 'balanced_accuracy',
        'recall' and 'f1_score' keys.
    """
    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred)
    }


def _confusion_record(dataset, cm):
    """Convert a 2x2 confusion matrix into a JSON-serialisable dictionary.

    Rows of ``cm`` are true classes and columns are predicted classes; each
    count is cast to a plain int so that json.dump can serialise it.
    """
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])}
    }


# Predict on the training and test sets.
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

# Metrics for each split.
train_metrics = _metrics_record('train', y_train, y_train_pred)
test_metrics = _metrics_record('test', y_test, y_test_pred)

# Confusion matrices for each split. labels=[0, 1] pins the matrix to 2x2 even
# when one class is missing from both the true and predicted labels, so the
# [1, 1] indexing in _confusion_record cannot go out of bounds.
train_cm = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
train_cm_dict = _confusion_record('train', train_cm)

test_cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
test_cm_dict = _confusion_record('test', test_cm)

# Write the four records to a JSON-lines file (one JSON object per line).
# The output directory is created first so a missing ../files/output does not
# raise FileNotFoundError.
metrics_path = "../files/output/metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as f:
    for record in (train_metrics, test_metrics, train_cm_dict, test_cm_dict):
        json.dump(record, f)
        f.write('\n')