In [63]:
import pandas as pd
import numpy as np
import pickle

df1 = pd.read_csv("../files/input/train_data.csv.zip", index_col=False, compression="zip")
df1_ = df1.copy()
df1_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          21000 non-null  int64
 1   LIMIT_BAL                   21000 non-null  int64
 2   SEX                         21000 non-null  int64
 3   EDUCATION                   21000 non-null  int64
 4   MARRIAGE                    21000 non-null  int64
 5   AGE                         21000 non-null  int64
 6   PAY_0                       21000 non-null  int64
 7   PAY_2                       21000 non-null  int64
 8   PAY_3                       21000 non-null  int64
 9   PAY_4                       21000 non-null  int64
 10  PAY_5                       21000 non-null  int64
 11  PAY_6                       21000 non-null  int64
 12  BILL_AMT1                   21000 non-null  int64
 13  BILL_AMT2                   21000 non-null  int64
 14  BILL_A

In [64]:
df2 = pd.read_csv("../files/input/test_data.csv.zip", index_col=False, compression="zip")
df2_=df2.copy()
df2_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          9000 non-null   int64
 1   LIMIT_BAL                   9000 non-null   int64
 2   SEX                         9000 non-null   int64
 3   EDUCATION                   9000 non-null   int64
 4   MARRIAGE                    9000 non-null   int64
 5   AGE                         9000 non-null   int64
 6   PAY_0                       9000 non-null   int64
 7   PAY_2                       9000 non-null   int64
 8   PAY_3                       9000 non-null   int64
 9   PAY_4                       9000 non-null   int64
 10  PAY_5                       9000 non-null   int64
 11  PAY_6                       9000 non-null   int64
 12  BILL_AMT1                   9000 non-null   int64
 13  BILL_AMT2                   9000 non-null   int64
 14  BILL_AMT

In [65]:
def preprocess_data(df):
    df.rename(columns={"default payment next month": "default"}, inplace=True)
    df.drop(columns=["ID"], inplace=True)
    df.dropna(inplace=True)

    df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4
    df = df.loc[df["EDUCATION"] !=0]
    df = df.loc[df["MARRIAGE"] !=0]
    df.dropna(inplace=True)
    
    return df
df1_ = preprocess_data(df1)
df2_ = preprocess_data(df2)
# ver valores unicos de "SEX"
df1_["EDUCATION"].unique()

array([3, 2, 1, 4])

In [66]:
X_train = df1_.drop(columns=["default"])
y_train = df1_["default"]
X_test = df2_.drop(columns=["default"])
y_test = df2_["default"]

In [67]:
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando PCA. El PCA usa todas las componentes.
# - Estandariza la matriz de entrada.
# - Selecciona las K columnas mas relevantes de la matrix de entrada.
# - Ajusta una maquina de vectores de soporte (svm).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import f_classif

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = ["LIMIT_BAL", "AGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", Pipeline([
            #("scaler", StandardScaler()),
            ("selectkbest", SelectKBest(score_func=f_classif)),
            ("pca", PCA()),  
        ]), numerical_features),
    ]
)


pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("svm", SVC(random_state=42)),
])

pipeline.fit(X_train, y_train)

accuracy = pipeline.score(X_test, y_test)
print("Accuracy del modelo:", accuracy)


Accuracy del modelo: 0.787726918365074


In [68]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, balanced_accuracy_score

param_grid = {
    "preprocessor__num__selectkbest__k": [13], 
    "preprocessor__num__pca__n_components": [10],
    "svm__C": [0.9],
    "svm__gamma": [ 0.11],
    #"svm__C": [0.9, 1, 1.1],
    #"svm__gamma": [0.01, 0.1, 0.11, 0.12],
    
}

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scorer = make_scorer(balanced_accuracy_score)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scorer, verbose=3)
grid_search.fit(X_train, y_train)

# Obtener el mejor modelo y calcular su accuracy en el conjunto de prueba
best_pipeline = grid_search.best_estimator_
best_accuracy = best_pipeline.score(X_test, y_test)
print("Accuracy del mejor modelo:", best_accuracy)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END preprocessor__num__pca__n_components=10, preprocessor__num__selectkbest__k=13, svm__C=0.9, svm__gamma=0.11;, score=0.504 total time=  48.4s
[CV 2/5] END preprocessor__num__pca__n_components=10, preprocessor__num__selectkbest__k=13, svm__C=0.9, svm__gamma=0.11;, score=0.503 total time=  44.4s
[CV 3/5] END preprocessor__num__pca__n_components=10, preprocessor__num__selectkbest__k=13, svm__C=0.9, svm__gamma=0.11;, score=0.504 total time=  42.7s
[CV 4/5] END preprocessor__num__pca__n_components=10, preprocessor__num__selectkbest__k=13, svm__C=0.9, svm__gamma=0.11;, score=0.504 total time=  44.3s
[CV 5/5] END preprocessor__num__pca__n_components=10, preprocessor__num__selectkbest__k=13, svm__C=0.9, svm__gamma=0.11;, score=0.505 total time=  41.1s
Accuracy del mejor modelo: 0.79273861231763


In [69]:
# Imprimir los mejores hiperparámetros
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

# Imprimir la mejor puntuación obtenida durante la búsqueda
print("\nMejor puntuación (balanced accuracy) obtenida en validación cruzada:")
print(grid_search.best_score_)

Mejores hiperparámetros encontrados:
{'preprocessor__num__pca__n_components': 10, 'preprocessor__num__selectkbest__k': 13, 'svm__C': 0.9, 'svm__gamma': 0.11}

Mejor puntuación (balanced accuracy) obtenida en validación cruzada:
0.5037985692530007


In [70]:
import joblib
import os
import pickle
import gzip

os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(grid_search, f)

In [71]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
)
import json

y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

metrics = [
    {
        "type": "metrics",
        "dataset": "train",
        "precision": float(precision_score(y_train, y_train_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_train, y_train_pred)),
        "recall": float(recall_score(y_train, y_train_pred)),
        "f1_score": float(f1_score(y_train, y_train_pred)),
    },
    {
        "type": "metrics",
        "dataset": "test",
        "precision": float(precision_score(y_test, y_test_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_test, y_test_pred)),
        "recall": float(recall_score(y_test, y_test_pred)),
        "f1_score": float(f1_score(y_test, y_test_pred)),
    },
]

In [72]:
from sklearn.metrics import confusion_matrix
import os

train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

confusion_matrices = [
    {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {
            "predicted_0": int(train_cm[0, 0]),
            "predicted_1": int(train_cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(train_cm[1, 0]),
            "predicted_1": int(train_cm[1, 1]),
        },
    },
    {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {
            "predicted_0": int(test_cm[0, 0]),
            "predicted_1": int(test_cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(test_cm[1, 0]),
            "predicted_1": int(test_cm[1, 1]),
        },
    },
]

output_file = "../files/output/metrics.json"
os.makedirs("../files/output", exist_ok=True)

output_data = metrics + confusion_matrices

with open(output_file, "w") as f:
    for item in output_data:
        f.write(str(item).replace("'", '"') + "\n")