In [1]:
#  Load the training and test splits from their zip-compressed CSV files.
import pandas as pd  #  type: ignore


def load_zipped_csv(path):
    """Read one zip-compressed CSV file and return it as a DataFrame.

    Parameters
    ----------
    path : str
        Location of the ``.csv.zip`` archive, relative to the notebook.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using pandas' default index handling.
    """
    return pd.read_csv(path, compression="zip")


train_data = load_zipped_csv("../files/input/train_data.csv.zip")
test_data = load_zipped_csv("../files/input/test_data.csv.zip")

In [2]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [3]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [4]:
# Step 1.
# Clean both datasets:
# - Rename the column "default payment next month" to "default".
# - Remove the column "ID".
# - Drop the records with unavailable information.
# - For the EDUCATION column, values > 4 indicate higher education levels;
#   group those values into the "others" category (encoded here as 4).


def clean_dataset(raw):
    """Return a cleaned copy of a credit-default frame; ``raw`` is left untouched.

    The steps are applied without in-place mutation so that this cell can be
    re-run safely: once "ID" has been dropped and the target renamed, a second
    pass is a no-op instead of raising ``KeyError``.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as loaded from disk (or already cleaned by a previous run).

    Returns
    -------
    pandas.DataFrame
        New frame with the target renamed to "default", "ID" removed, rows
        containing NaN dropped and EDUCATION capped at 4.
    """
    return (
        raw.rename(columns={"default payment next month": "default"})
        # errors="ignore" keeps the cell idempotent when "ID" is already gone.
        .drop(columns=["ID"], errors="ignore")
        .dropna()
        # Vectorised equivalent of "x if x <= 4 else 4".
        .assign(EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4))
    )


# NOTE(review): only NaN rows are removed. If the dataset encodes "not
# available" as 0 in EDUCATION / MARRIAGE, those rows are still kept here --
# TODO confirm against the data dictionary.
train_data = clean_dataset(train_data)
test_data = clean_dataset(test_data)

print(train_data.head())

   LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0     310000    1          3         1   32      0      0      0      0   
1      10000    2          3         1   49     -1     -1     -2     -1   
2      50000    1          2         1   28     -1     -1     -1      0   
3      80000    2          3         1   52      2      2      3      3   
4     270000    1          1         2   34      1      2      0      0   

   PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0      0  ...      84373      57779      14163      8295      6000      4000   
1      2  ...       1690       1138        930         0         0      2828   
2     -1  ...      45975       1300      43987         0     46257      2200   
3      3  ...      40748      39816      40607      3700      1600      1600   
4      2  ...      22448      15490      17343         0      4000      2000   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default  
0      3000      1000    

In [5]:
# Step 2: separate the feature matrix from the "default" target for each split.
x_train, y_train = train_data.drop(columns=["default"]), train_data["default"]
x_test, y_test = test_data.drop(columns=["default"]), test_data["default"]


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Step 3: build the baseline pipeline, fit it and report hold-out metrics.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Columns that are one-hot encoded; every other feature is passed through as-is.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = [col for col in x_train.columns if col not in categorical_features]

# Categories unseen during fit are encoded as all-zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", "passthrough", numerical_features),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),   # one-hot encode the categorical columns
        ("scaler", StandardScaler()),     # standardise the whole input matrix
        # Seeded like the SVM below: PCA may pick a randomized solver once
        # n_components is set by the grid search, which would otherwise make
        # results vary between runs.
        ("pca", PCA(random_state=42)),
        ("select_k", SelectKBest(score_func=f_classif, k=10)),  # keep the 10 best-scoring components
        ("svm", SVC(kernel="linear", random_state=42)),         # final classifier
    ]
)

# Fit on the training split and predict the test split.
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

# Per-class precision / recall / F1 on the test split.
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nExactitud del modelo: {accuracy:.2f}")


Matriz de Confusión:
[[7072   19]
 [1883   26]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      7091
           1       0.58      0.01      0.03      1909

    accuracy                           0.79      9000
   macro avg       0.68      0.51      0.45      9000
weighted avg       0.74      0.79      0.70      9000


Exactitud del modelo: 0.79


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score, accuracy_score



# Hyper-parameter search space, keyed by "<pipeline step>__<parameter>".
param_grid = dict(
    # Second candidate is the raw feature count minus two.
    # NOTE(review): the reason for the "- 2" offset is not visible here -- confirm.
    pca__n_components=[20, x_train.shape[1] - 2],
    select_k__k=[12, 20],            # number of components kept by SelectKBest
    svm__C=[0.1],                    # SVM regularisation strength
    svm__kernel=["linear", "rbf"],   # SVM kernel
)

# Exhaustive search with 10-fold cross-validation, scored by balanced accuracy
# and run on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Keep the refitted best pipeline and report the winning combination.
best_model = grid_search.best_estimator_
print("Mejores parámetros: ", grid_search.best_params_)


Mejores parámetros:  {'pca__n_components': 21, 'select_k__k': 12, 'svm__C': 0.1, 'svm__kernel': 'rbf'}


In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score


# Predictions of the tuned model on both splits.
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

# Same five scores for each split, converted to plain Python floats.
metrics = {
    split: {
        label: float(score(y_true, y_hat))
        for label, score in (
            ("Accuracy", accuracy_score),
            ("Balanced Accuracy", balanced_accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1_Score", f1_score),
        )
    }
    for split, y_true, y_hat in (
        ("Train", y_train, y_train_pred),
        ("Test", y_test, y_test_pred),
    )
}
print(metrics)

{'Train': {'Accuracy': 0.8123333333333334, 'Balanced Accuracy': 0.6347735306111006, 'Precision': 0.6817761332099908, 'Recall': 0.31182568225089907, 'F1_Score': 0.4279285817970678}, 'Test': {'Accuracy': 0.8257777777777778, 'Balanced Accuracy': 0.6457783455503509, 'Precision': 0.6831364124597207, 'Recall': 0.33315872184389733, 'F1_Score': 0.447887323943662}}


In [None]:
# Imported in this cell because no earlier cell brings ConfusionMatrixDisplay
# into scope; without it the loop below fails with NameError.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Print and plot the confusion matrix of the tuned model for each split.
# Loop variables are named so they do not overwrite the baseline `y_pred`.
for split_name, split_true, split_pred in [("Train", y_train, y_train_pred), ("Test", y_test, y_test_pred)]:
    cm = confusion_matrix(split_true, split_pred)
    print(f"Matriz de Confusión ({split_name}):\n", cm)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.best_estimator_.classes_)
    disp.plot()
    # Title each figure so the two plots can be told apart.
    disp.ax_.set_title(f"Matriz de Confusión ({split_name})")

In [13]:
# Save the model (gzip-compressed) as "files/models/model.pkl.gz".
# The gzip library makes it possible to write the pickle already compressed.

import pickle
import gzip
import os

model_path = '../files/models/model.pkl.gz'

# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Pickle the fitted grid search object straight into a gzip stream.
with gzip.open(model_path, 'wb') as compressed_file:
    pickle.dump(grid_search, compressed_file)

In [21]:
import json
import os
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix

# Build the metrics record (precision, recall, etc.) for one dataset.
def compute_metrics(model, X, y, dataset):
    """Score ``model`` on ``(X, y)`` and return a JSON-ready metrics record.

    Precision, recall and F1 are taken from the classification report entry
    stored under the key '1'; balanced accuracy is computed separately.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features passed to ``model.predict``.
    y : true labels matching ``X``.
    dataset : label stored in the record's 'dataset' field.

    Returns
    -------
    dict
        Keys: 'type', 'dataset', 'precision', 'balanced_accuracy',
        'recall', 'f1_score'.
    """
    predictions = model.predict(X)
    report = classification_report(y, predictions, output_dict=True)
    positive = report['1']

    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': positive['precision'],
        'balanced_accuracy': balanced_accuracy_score(y, predictions),
        'recall': positive['recall'],
        'f1_score': positive['f1-score'],
    }

# Build the confusion-matrix record for one dataset.
def compute_confusion_matrix(model, X, y, dataset):
    """Return the 2x2 confusion matrix of ``model`` on ``(X, y)`` as a JSON-ready dict.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features passed to ``model.predict``.
    y : true labels matching ``X``.
    dataset : label stored in the record's 'dataset' field.

    Returns
    -------
    dict
        'type' and 'dataset' fields followed by 'true_0' and 'true_1', each
        mapping 'predicted_0' / 'predicted_1' to a plain ``int`` count.
    """
    cm = confusion_matrix(y, model.predict(X))

    # Rows index the true class, columns the predicted class.
    counts = {
        f'true_{row}': {f'predicted_{col}': int(cm[row, col]) for col in (0, 1)}
        for row in (0, 1)
    }
    return {'type': 'cm_matrix', 'dataset': dataset, **counts}

# Tuned pipeline evaluated on both splits.
final_model = grid_search.best_estimator_

# Metrics records for the training and test splits.
metrics_list = [
    compute_metrics(final_model, features, target, split)
    for features, target, split in ((x_train, y_train, 'train'), (x_test, y_test, 'test'))
]

# Confusion-matrix records for the training and test splits.
cm_train = compute_confusion_matrix(final_model, x_train, y_train, 'train')
cm_test = compute_confusion_matrix(final_model, x_test, y_test, 'test')

# Metrics first, then the confusion matrices.
all_results = [*metrics_list, cm_train, cm_test]

# Make sure the output directory exists.
os.makedirs('../files/output/', exist_ok=True)

# Write one JSON document per line to 'metrics.json'.
with open('../files/output/metrics.json', 'w') as file:
    file.writelines(json.dumps(record) + '\n' for record in all_results)

print("Métricas guardadas en 'files/output/metrics.json'")


Métricas guardadas en 'files/output/metrics.json'
