In [1]:
import gzip
import json
import os
import pickle
from typing import Tuple

import numpy as np  
import pandas as pd  
from sklearn.compose import ColumnTransformer  
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif  
from sklearn.metrics import (  
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV  
from sklearn.pipeline import Pipeline  
from sklearn.preprocessing import OneHotEncoder, StandardScaler  
from sklearn.svm import SVC  

In [2]:
# Step 1
def load_and_clean_data(
    train_path: str = "../files/input/train_data.csv.zip",
    test_path: str = "../files/input/test_data.csv.zip",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the zipped train/test CSV files and apply the same cleaning to both.

    Cleaning steps, applied identically to each file:

    * rename ``"default payment next month"`` to ``"default"``;
    * drop the ``"ID"`` column;
    * discard rows where ``EDUCATION`` or ``MARRIAGE`` equals 0;
    * cap ``EDUCATION`` at 4 (every value greater than 4 becomes 4).

    The original row index of the surviving rows is preserved.

    Args:
        train_path: Location of the zipped training CSV. Defaults to the
            path the notebook has always used.
        test_path: Location of the zipped test CSV. Defaults to the path
            the notebook has always used.

    Returns:
        A ``(train_df, test_df)`` tuple of cleaned DataFrames.

    Raises:
        KeyError: If ``"ID"``, ``"EDUCATION"`` or ``"MARRIAGE"`` is missing
            from a file.
    """

    def _read_and_clean(path: str) -> pd.DataFrame:
        # Single cleaning routine so train and test can never drift apart.
        frame = pd.read_csv(path, index_col=False, compression="zip")
        frame = frame.rename(
            columns={"default payment next month": "default"}
        ).drop(columns=["ID"])

        keep = (frame["EDUCATION"] != 0) & (frame["MARRIAGE"] != 0)
        # Explicit copy: assigning through .loc on a boolean-filtered frame
        # emits SettingWithCopyWarning on pandas versions without
        # copy-on-write. The copy makes the frame independent.
        frame = frame.loc[keep].copy()

        frame.loc[frame["EDUCATION"] > 4, "EDUCATION"] = 4
        return frame

    return _read_and_clean(train_path), _read_and_clean(test_path)

In [3]:
# Step 2
def split_data(train_df, test_df):
    """Separate the ``default`` target column from the features.

    Args:
        train_df: Training DataFrame containing a ``"default"`` column.
        test_df: Test DataFrame containing a ``"default"`` column.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``, where each ``x_*``
        is the input frame without ``"default"`` and each ``y_*`` is the
        ``"default"`` column of that frame.
    """
    target = "default"

    def _features_and_target(frame):
        # Features first, then target, for one partition.
        return frame.drop(columns=[target]), frame[target]

    x_train, y_train = _features_and_target(train_df)
    x_test, y_test = _features_and_target(test_df)
    return x_train, y_train, x_test, y_test

In [4]:
# Step 3
def create_pipeline(x_train: pd.DataFrame) -> Pipeline:
    """Build the (unfitted) preprocessing + classification pipeline.

    Stages, in order:

    1. ``preprocesamiento``: one-hot encodes ``SEX``, ``EDUCATION`` and
       ``MARRIAGE`` (unknown categories are ignored at transform time) and
       standardizes every other column of ``x_train``.
    2. ``pca``: principal component analysis.
    3. ``selector_kbest``: ``SelectKBest`` scored with ``f_classif``.
    4. ``clasificador``: RBF-kernel ``SVC``.

    Args:
        x_train: Feature frame. Only its column names are read here, to
            decide which columns are treated as numerical.

    Returns:
        The assembled, unfitted ``Pipeline``.
    """
    categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]
    numerical_cols = [col for col in x_train.columns if col not in categorical_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore"),
                categorical_cols,
            ),
            ("num", StandardScaler(), numerical_cols),
        ],
        # Every column of x_train is already in one of the two lists above,
        # so this only matters if a frame with extra columns shows up later.
        remainder="passthrough",
    )

    pipeline = Pipeline(
        [
            ("preprocesamiento", preprocessor),
            # Seeded so that, when PCA's automatic solver choice falls on the
            # randomized SVD, repeated runs yield the same components. The
            # classifier was already seeded; PCA was not.
            ("pca", PCA(random_state=42)),
            ("selector_kbest", SelectKBest(score_func=f_classif)),
            ("clasificador", SVC(kernel="rbf", random_state=42)),
        ]
    )

    return pipeline

In [5]:
def optimize_hyperparameters(pipeline, x_train, y_train):
    """Step 4: tune the pipeline with a cross-validated grid search.

    Runs a 10-fold ``GridSearchCV`` scored by balanced accuracy over a
    small grid (PCA components 20 or 21, ``k=12`` selected features, RBF
    kernel with ``gamma=0.099``) and refits on the best combination.

    Args:
        pipeline: Estimator to tune; its step names must match the grid keys.
        x_train: Training features.
        y_train: Training target.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_space = {
        "pca__n_components": [20, 21],
        "selector_kbest__k": [12],
        "clasificador__kernel": ["rbf"],
        "clasificador__gamma": [0.099],
    }
    search_settings = {
        "cv": 10,
        "refit": True,
        "verbose": 1,
        "scoring": 'balanced_accuracy',
    }

    searcher = GridSearchCV(pipeline, search_space, **search_settings)
    searcher.fit(x_train, y_train)
    return searcher


In [6]:
# Step 5
def save_model(model):
    """Pickle ``model`` into ``../files/models/model.pkl.gz`` (gzip-compressed).

    The ``../files/models`` directory is created if it does not exist, and
    an existing archive at that path is overwritten.

    Args:
        model: Any picklable object.
    """
    models_dir = "../files/models"
    os.makedirs(models_dir, exist_ok=True)

    archive = gzip.open("../files/models/model.pkl.gz", "wb")
    with archive:
        pickle.Pickler(archive).dump(model)

In [7]:
# Steps 6-7
def calculate_and_save_metrics(model, x_train, y_train, x_test, y_test):
    """Score ``model`` on both partitions and write the results as JSON lines.

    Four records are written to ``../files/output/metrics.json``, one JSON
    object per line, in this order: train metrics, test metrics, train
    confusion matrix, test confusion matrix. The output directory is created
    if needed and an existing file is overwritten.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train: Training features.
        y_train: Training labels (binary, encoded as 0/1).
        x_test: Test features.
        y_test: Test labels (binary, encoded as 0/1).
    """

    def _metrics_record(dataset, y_true, y_pred):
        # One "metrics" line. Scores are cast to plain floats so JSON
        # encoding never depends on the numeric type scikit-learn returns.
        return {
            "type": "metrics",
            "dataset": dataset,
            "precision": float(precision_score(y_true, y_pred, zero_division=0)),
            "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
            "recall": float(recall_score(y_true, y_pred, zero_division=0)),
            "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
        }

    def _confusion_record(dataset, y_true, y_pred):
        # One "cm_matrix" line. Fixing labels to [0, 1] guarantees a 2x2
        # matrix even when a class is absent from both y_true and y_pred.
        # Without it the matrix collapses to 1x1 and the indexing below
        # raises IndexError.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            "type": "cm_matrix",
            "dataset": dataset,
            "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
            "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
        }

    os.makedirs("../files/output", exist_ok=True)

    partitions = [
        ("train", y_train, model.predict(x_train)),
        ("test", y_test, model.predict(x_test)),
    ]

    # Metrics lines first, then confusion-matrix lines.
    records = [_metrics_record(*partition) for partition in partitions]
    records += [_confusion_record(*partition) for partition in partitions]

    with open("../files/output/metrics.json", "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

In [None]:
# End-to-end run: load -> split -> build pipeline -> tune -> persist -> report.
# Each call consumes the result of the previous one, so the order matters.
train_df, test_df = load_and_clean_data()
x_train, y_train, x_test, y_test = split_data(train_df, test_df)
pipeline = create_pipeline(x_train)
# `model` is the fitted GridSearchCV object returned by the search.
model = optimize_hyperparameters(pipeline, x_train, y_train)
save_model(model)
calculate_and_save_metrics(model, x_train, y_train, x_test, y_test)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
