In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay
)

In [2]:
import pandas as pd

data_train = pd.read_csv('../files/input/train_data.csv.zip', index_col= False, compression="zip")
data_test = pd.read_csv('../files/input/test_data.csv.zip', index_col= False, compression="zip")
columnas_diferentes = ['PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']

print(data_train.isnull().sum())

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64


In [3]:
import numpy as np

data_train.rename(columns={"default payment next month" : "default"}, inplace=True)
data_test.rename(columns={"default payment next month" : "default"}, inplace=True)
data_train.drop(columns=["ID"], inplace=True)
data_test.drop(columns=["ID"], inplace=True)

data_train['EDUCATION'] = data_train['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
data_test['EDUCATION'] = data_test['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

data_train['EDUCATION'] = data_train['EDUCATION'].apply(lambda x: x if x > 0 else np.nan)
data_test['EDUCATION'] = data_test['EDUCATION'].apply(lambda x: x if x > 0 else np.nan)

data_train['MARRIAGE'] = data_train['MARRIAGE'].apply(lambda x: x if x > 0 else np.nan)
data_test['MARRIAGE'] = data_test['MARRIAGE'].apply(lambda x: x if x > 0 else np.nan)

pay_columns = ['PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']

data_train[pay_columns] = data_train[pay_columns].applymap(lambda x: x if x >= 0 else np.nan)
data_test[pay_columns] = data_test[pay_columns].applymap(lambda x: x if x >= 0 else np.nan)

data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

data_train = data_train.astype(int)
data_test = data_test.astype(int)

print(data_test.isnull().sum())

LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
default      0
dtype: int64


  data_train[pay_columns] = data_train[pay_columns].applymap(lambda x: x if x >= 0 else np.nan)
  data_test[pay_columns] = data_test[pay_columns].applymap(lambda x: x if x >= 0 else np.nan)


In [4]:
import pickle

# Dividir en características (X) y etiquetas (y)
X_train = data_train.drop(columns="default")
y_train = data_train["default"]

X_test = data_test.drop(columns="default")
y_test = data_test["default"]

In [5]:
print(X_test.shape)

(8979, 23)


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Identificar las variables categóricas y numéricas
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
numerical_features = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']



# Preprocesamiento para las variables categóricas
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Preprocesamiento para las variables numéricas
numerical_transformer = MinMaxScaler()



In [8]:
# Combinación de preprocesadores

preprocessor = ColumnTransformer(
    transformers= [
        ("encoder", categorical_transformer, categorical_features),
        ("numerica", numerical_transformer, numerical_features)
    ]
)


In [9]:
# Creación del Pipeline
#pipeline = Pipeline([
#    ("preprocessor", preprocessor),
#    ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
#    ("classifier", LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced"))
#])


# Creación del Pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),  # Tu preprocesador (por ejemplo, ColumnTransformer)
    ("feature_selection", SelectKBest(score_func=f_classif)),  # Sin especificar k aún
    ("classifier", LogisticRegression(max_iter=10000, random_state=42, C=0.01, solver='liblinear', class_weight='balanced'))##LogisticRegression(max_iter=1000, random_state=42))  # Clasificador
])

In [10]:
# Definición del grid de hiperparámetros
#param_grid = {
#    "feature_selection__k": [5, 7, 10],
#    "classifier__C": [0.1, 1, 10],
#    "classifier__solver": ["liblinear", "lbfgs"]
#}

param_grid = {
    "feature_selection__k": [10, 15],  # Puedes probar con menos valores para 'k'
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["liblinear", "lbfgs", "saga"],
    "classifier__class_weight": ["balanced", None]
}




# Validación cruzada con 10 splits
model = GridSearchCV(pipeline, param_grid, cv=10, scoring="balanced_accuracy", n_jobs=-1, refit=True)
model.fit(X_train, y_train)



In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay
)

In [12]:
# Mejor modelo
print("Mejores parámetros: ", model.best_params_)
best_model = model.best_estimator_

Mejores parámetros:  {'classifier__C': 10, 'classifier__class_weight': 'balanced', 'classifier__solver': 'saga', 'feature_selection__k': 10}


In [13]:
# Evaluar el score en el conjunto de entrenamiento
train_score = model.score(X_train, y_train)
print(f"Score en el conjunto de entrenamiento: {train_score}")

# Evaluar el score en el conjunto de test
test_score = model.score(X_test, y_test)
print(f"Score en el conjunto de prueba: {test_score}")


Score en el conjunto de entrenamiento: 0.6659523418273726
Score en el conjunto de prueba: 0.6641183036624949


In [14]:
# Suponiendo que 'model' es un GridSearchCV
best_score = model.best_score_
print(f"Mejor puntuación obtenida: {best_score}")


Mejor puntuación obtenida: 0.6664238359391096


In [15]:
from sklearn.metrics import classification_report

# Hacer predicciones en el conjunto de test
y_pred = model.predict(X_test)

# Generar el reporte de clasificación
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.88      0.67      0.76      7073
           1       0.35      0.66      0.46      1906

    accuracy                           0.67      8979
   macro avg       0.61      0.66      0.61      8979
weighted avg       0.77      0.67      0.70      8979



In [16]:
# Evaluar el score con la mejor combinación de parámetros
best_model = model.best_estimator_
best_train_score = best_model.score(X_train, y_train)
best_test_score = best_model.score(X_test, y_test)

print(f"Mejor score en entrenamiento: {best_train_score}")
print(f"Mejor score en test: {best_test_score}")


Mejor score en entrenamiento: 0.6673030115019329
Mejor score en test: 0.6667780376433902


In [17]:
#Guardar el modelo

import os
import pickle
import gzip

# Ruta del directorio donde se guardará el archivo
dir_path = '../files/models'

# Verificar si el directorio existe, si no, crearlo
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Ruta del archivo GZIP
gzip_file_path = os.path.join(dir_path, 'model.pkl.gz')

# Guardar el modelo comprimido como un archivo GZIP
with gzip.open(gzip_file_path, 'wb') as f:
    pickle.dump(model, f)

print(f"Modelo guardado correctamente en {gzip_file_path}")


Modelo guardado correctamente en ../files/models\model.pkl.gz


In [18]:
# Predicciones
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)



# Métricas

metrics = {
    "Train": {
        "Accuracy": accuracy_score(y_train, y_train_pred),
        "Balanced accuracy": balanced_accuracy_score(y_train, y_train_pred),
        "Precision": precision_score(y_train, y_train_pred),
        "Recall": recall_score(y_train, y_train_pred),
        "F1-Score": f1_score(y_train, y_train_pred)
    },
    "Test":{
        "Accuracy": accuracy_score(y_test, y_test_pred),
        "Balanced accuracy": balanced_accuracy_score(y_test, y_test_pred),
        "Precision": precision_score(y_test, y_test_pred),
        "Recall": recall_score(y_test, y_test_pred),
        "F1-Score": f1_score(y_test, y_test_pred)
    }
}

print(metrics)

{'Train': {'Accuracy': 0.6673030115019329, 'Balanced accuracy': np.float64(0.6659523418273726), 'Precision': 0.3681305777360263, 'Recall': 0.6634920634920635, 'F1-Score': 0.47352918963824486}, 'Test': {'Accuracy': 0.6667780376433902, 'Balanced accuracy': np.float64(0.6641183036624949), 'Precision': 0.3491666666666667, 'Recall': 0.6594963273871983, 'F1-Score': 0.4565928078459862}}


In [19]:
# Matriz de Confusión
for dataset, y_true, y_pred in [("Train", y_train, y_train_pred), (" Test", y_test, y_test_pred)]:
  cm = confusion_matrix(y_true, y_pred)
  print(f"Matriz de confusión ({dataset}):\n", cm)


Matriz de confusión (Train):
 [[10847  5381]
 [ 1590  3135]]
Matriz de confusión ( Test):
 [[4730 2343]
 [ 649 1257]]


In [20]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Reporte de clasificación
print(classification_report(y_test, y_pred))

# Matriz de confusión
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.67      0.76      7073
           1       0.35      0.66      0.46      1906

    accuracy                           0.67      8979
   macro avg       0.61      0.66      0.61      8979
weighted avg       0.77      0.67      0.70      8979

[[4730 2343]
 [ 649 1257]]


In [21]:
#crear json

import json

# Lista para almacenar las líneas del archivo JSON
results = []

# Agregar información de metrics para train y test
for dataset in metrics:
    results.append({
        'type': 'metrics',
        'dataset': dataset.lower(),
        'precision': float(metrics[dataset]['Precision']), 
        'balanced_accuracy': float(metrics[dataset]['Balanced accuracy']),       
        'recall': float(metrics[dataset]['Recall']),
        'f1_score': float(metrics[dataset]['F1-Score'])
    })

# Generar las matrices de confusión para train y test
for dataset, y_true, y_pred in [("Train", y_train, y_train_pred), ("Test", y_test, y_test_pred)]:
    # Calculamos la matriz de confusión
    cm = confusion_matrix(y_true, y_pred)

    # Organizar la matriz de confusión en un diccionario
    cm_dict = {
        "type": "cm_matrix",
        "dataset": dataset.lower(),  # 'train' o 'test'
        "true_0": {"predicted_0": cm[0, 0], "predicted_1": cm[0, 1]},
        "true_1": {"predicted_0": cm[1, 0], "predicted_1": cm[1, 1]}
    }
    
    # Agregar la matriz de confusión a la lista de resultados
    results.append(cm_dict)


print(results)


[{'type': 'metrics', 'dataset': 'train', 'precision': 0.3681305777360263, 'balanced_accuracy': 0.6659523418273726, 'recall': 0.6634920634920635, 'f1_score': 0.47352918963824486}, {'type': 'metrics', 'dataset': 'test', 'precision': 0.3491666666666667, 'balanced_accuracy': 0.6641183036624949, 'recall': 0.6594963273871983, 'f1_score': 0.4565928078459862}, {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': np.int64(10847), 'predicted_1': np.int64(5381)}, 'true_1': {'predicted_0': np.int64(1590), 'predicted_1': np.int64(3135)}}, {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {'predicted_0': np.int64(4730), 'predicted_1': np.int64(2343)}, 'true_1': {'predicted_0': np.int64(649), 'predicted_1': np.int64(1257)}}]


In [22]:
#guardar json

import os
import json
import numpy as np

# Ruta donde se guardará el archivo JSON
output_path = "../files/output"

# Crear la carpeta de salida si no existe
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Función para convertir tipos de datos de numpy (int64, float64) a tipos estándar de Python
def convert_numpy_types(obj):
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    elif isinstance(obj, np.int64):  # Si el valor es un int64 de numpy
        return int(obj)  # Convertir a tipo int de Python
    elif isinstance(obj, np.float64):  # Si el valor es un float64 de numpy
        return float(obj)  # Convertir a tipo float de Python
    else:
        return obj

# Guardar cada elemento en una línea separada del archivo JSON
with open('../files/output/metrics.json', 'w', encoding='utf-8') as f:  # Abrir en modo texto con codificación UTF-8
    for result in results:
        result = convert_numpy_types(result)  # Convertir los valores de int64 y float64 a tipos estándar
        json.dump(result, f, ensure_ascii=False)  # Escribir el objeto en formato JSON
        f.write('\n')  # Escribir un salto de línea después de cada línea

print(f"Archivo guardado correctamente en {output_path}")


Archivo guardado correctamente en ../files/output
