In [2]:
import os
import json
import pickle
import pandas as pd
import gzip
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)


warnings.filterwarnings("ignore")


def clean_dataset(df):
    """Return a cleaned copy of the raw dataset.

    The target column "default payment next month" is renamed to "default",
    the "ID" column is removed, rows containing any missing value are
    dropped, and every EDUCATION value above 4 is replaced by 4.

    Args:
        df: Raw DataFrame containing at least the "ID", "EDUCATION" and
            "default payment next month" columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
    )
    # Collapse all EDUCATION codes greater than 4 into the single code 4.
    above_cap = cleaned["EDUCATION"] > 4
    cleaned.loc[above_cap, "EDUCATION"] = 4
    return cleaned


def split_dataset(df):
    """Separate the feature columns from the "default" target column.

    Args:
        df: DataFrame that contains a "default" column.

    Returns:
        A tuple ``(X, y)`` where ``y`` is the "default" column and ``X`` is
        the frame with that column removed.
    """
    target_name = "default"
    target = df[target_name]
    features = df.drop(columns=[target_name])
    return features, target


def create_pipeline():
    """Build the (unfitted) preprocessing and SVM classification pipeline.

    The pipeline chains four steps:

    1. "preprocessor": one-hot encodes SEX, EDUCATION and MARRIAGE and
       standardizes every other input column.
    2. "pca": PCA with default settings.
    3. "selector": SelectKBest using the ``f_classif`` score.
    4. "classifier": an RBF-kernel SVC with a fixed random state.

    The columns to standardize are not listed explicitly; they are picked up
    through the ColumnTransformer ``remainder`` so that this function does
    not depend on any module-level training frame and the column order is
    the order of the data passed to ``fit``.

    Returns:
        sklearn.pipeline.Pipeline: the assembled, unfitted pipeline.
    """
    categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ],
        # Every column that is not categorical is scaled; using the remainder
        # avoids reading a global DataFrame to enumerate those columns.
        remainder=StandardScaler(with_mean=True, with_std=True),
    )
    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("pca", PCA()),
            ("selector", SelectKBest(f_classif)),
            ("classifier", SVC(kernel="rbf", random_state=12345, max_iter=-1)),
        ]
    )
    return pipeline


def optimize_hyperparameters(pipeline, x_train, y_train):
    """Run a 10-fold grid search over the pipeline and return the fitted search.

    The grid tries two PCA sizes (20 and the number of raw input columns
    minus 2) with a fixed ``k`` for the selector and a fixed RBF kernel and
    gamma for the classifier. Candidates are ranked by balanced accuracy and
    the best one is refitted on the full training data.

    Args:
        pipeline: Estimator exposing "pca", "selector" and "classifier" steps.
        x_train: Training features; ``x_train.shape[1]`` is used to derive
            one of the PCA candidates.
        y_train: Training labels.

    Returns:
        The fitted GridSearchCV instance.
    """
    raw_column_count = x_train.shape[1]
    search_space = {
        "pca__n_components": [20, raw_column_count - 2],
        "selector__k": [12],
        "classifier__kernel": ["rbf"],
        "classifier__gamma": [0.1],
    }
    search = GridSearchCV(
        pipeline,
        search_space,
        cv=10,
        scoring="balanced_accuracy",
        refit=True,
    )
    search.fit(x_train, y_train)
    return search


def save_model(model):
    """Pickle ``model`` into a gzip-compressed file.

    The file is written to "../files/models/model.pkl.gz" (relative to the
    current working directory); the directory is created when missing and an
    existing file is overwritten.

    Args:
        model: Any picklable object.
    """
    os.makedirs("../files/models", exist_ok=True)
    with gzip.open("../files/models/model.pkl.gz", "wb") as compressed:
        pickle.dump(model, compressed)


def calculate_metrics(model, x_train, y_train, x_test, y_test):
    """Compute precision, balanced accuracy, recall and F1 for both splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train, y_train: Training features and labels.
        x_test, y_test: Test features and labels.

    Returns:
        A two-element list (train first, then test) of dictionaries with the
        keys "type", "dataset", "precision", "balanced_accuracy", "recall"
        and "f1_score"; every score is a plain float rounded to 3 decimals.
    """

    def _score_split(split_name, features, target):
        # One prediction pass per split, reused by all four scores.
        predicted = model.predict(features)
        return {
            "type": "metrics",
            "dataset": split_name,
            "precision": float(round(precision_score(target, predicted), 3)),
            "balanced_accuracy": float(
                round(balanced_accuracy_score(target, predicted), 3)
            ),
            "recall": float(round(recall_score(target, predicted), 3)),
            "f1_score": float(round(f1_score(target, predicted), 3)),
        }

    return [
        _score_split("train", x_train, y_train),
        _score_split("test", x_test, y_test),
    ]


def calculate_confusion_matrix(model, x_train, y_train, x_test, y_test):
    """Compute the binary confusion matrix for the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train, y_train: Training features and labels.
        x_test, y_test: Test features and labels.

    Returns:
        A two-element list (train first, then test) of dictionaries with the
        keys "type", "dataset", "true_0" and "true_1"; the last two map
        "predicted_0" / "predicted_1" to integer counts.
    """
    cm_matrices = []
    for dataset, x, y in [("train", x_train, y_train), ("test", x_test, y_test)]:
        y_pred = model.predict(x)
        # Pin the label set so the matrix is always 2x2; without it a split
        # where only one class occurs yields a 1x1 matrix and the indexing
        # below fails with IndexError.
        cm = confusion_matrix(y, y_pred, labels=[0, 1])
        cm_matrices.append(
            {
                "type": "cm_matrix",
                "dataset": dataset,
                "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
                "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
            }
        )
    return cm_matrices


if __name__ == "__main__":
    # Script entry point: load both splits, train via grid search, persist
    # the fitted search object, then write metrics and confusion matrices.

    # Load the zipped CSV inputs.
    df_train = pd.read_csv(
        "../files/input/train_data.csv.zip", compression="zip", index_col=None
    )
    df_test = pd.read_csv(
        "../files/input/test_data.csv.zip", compression="zip", index_col=None
    )

    # Clean each split, then separate features from the target.
    df_train, df_test = clean_dataset(df_train), clean_dataset(df_test)
    x_train, y_train = split_dataset(df_train)
    x_test, y_test = split_dataset(df_test)

    # Build the pipeline, search hyperparameters and store the result.
    pipeline = create_pipeline()
    model = optimize_hyperparameters(pipeline, x_train, y_train)
    save_model(model)

    # Evaluate on both splits.
    metrics = calculate_metrics(model, x_train, y_train, x_test, y_test)
    cm_matrices = calculate_confusion_matrix(model, x_train, y_train, x_test, y_test)

    # Write metric records followed by confusion-matrix records as one list.
    os.makedirs("../files/output", exist_ok=True)
    with open("../files/output/metrics.json", "w") as file:
        json.dump(metrics + cm_matrices, file, indent=4)
    print("Métricas guardadas en 'files/output/metrics.json'")


Métricas guardadas en 'files/output/metrics.json'
