In [1]:
import os
import gzip
import json
import pickle

import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [None]:
# Locations of the zipped CSV splits, relative to the notebook directory.
train_csv_path = "../files/input/train_data.csv.zip"
test_csv_path = "../files/input/test_data.csv.zip"

# Read both splits into DataFrames.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw credit-default table.

    The cleaning steps are, in order:

    1. Rename ``default payment next month`` to ``default``.
    2. Drop the ``ID`` column when it is present.
    3. Discard rows whose ``EDUCATION`` or ``MARRIAGE`` value is 0.
    4. Cap ``EDUCATION`` at 4 (every value above 4 becomes 4).
    5. Drop rows that contain any missing value.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table; must contain ``EDUCATION`` and ``MARRIAGE`` columns.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` itself is left unmodified. The surviving rows keep
        their original index labels.
    """
    cleaned = df.rename(columns={"default payment next month": "default"})
    # errors="ignore" makes the drop a no-op when there is no ID column.
    cleaned = cleaned.drop(columns=["ID"], errors="ignore")

    valid_rows = (cleaned["EDUCATION"] != 0) & (cleaned["MARRIAGE"] != 0)

    # Build the capped column through .assign on the filtered frame instead of
    # assigning into a filtered slice, which pandas flags with
    # SettingWithCopyWarning. clip() is the vectorised form of "4 if v > 4".
    cleaned = cleaned.loc[valid_rows].assign(
        EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4)
    )
    return cleaned.dropna()


# Apply the same cleaning rules to both splits.
train_df, test_df = map(clean_data, (train_df, test_df))

In [None]:
# Name of the label column produced by clean_data.
target_column = "default"

# Separate the predictors from the label for each split.
X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

In [None]:
# Columns that get one-hot encoded; every other predictor is passed through.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = [
    column for column in X_train.columns if column not in categorical_features
]

# Dense one-hot encoding; categories unseen during fit are ignored at
# transform time instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    [
        ("cat", one_hot_encoder, categorical_features),
        ("num", "passthrough", numerical_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Step names are referenced by the hyper-parameter grid ("select__k",
# "mlp__..."), so they must stay as they are.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("pca", PCA()),
    ("scaler", MinMaxScaler()),
    ("select", SelectKBest(score_func=f_classif)),
    ("mlp", MLPClassifier(max_iter=300, random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Number of raw predictor columns, counted before one-hot expansion. It is
# kept as one candidate value for k.
total_base_features = len(categorical_features) + len(numerical_features)

param_grid = {
    # A fixed upper candidate can exceed the number of columns that actually
    # reach SelectKBest (one-hot categories + numeric columns, which depends
    # on the data). Depending on the scikit-learn version that either fails
    # the fit or is silently treated as "keep everything". "all" is always
    # valid and states the intent explicitly.
    "select__k": [10, 20, total_base_features, "all"],
    "mlp__hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "mlp__alpha": [0.0001, 0.001, 0.01],
    "mlp__learning_rate_init": [0.001, 0.01],
}

# Exhaustive search over the grid with 10-fold cross-validation, scored by
# balanced accuracy. refit=True retrains the best configuration on the whole
# training set so grid_search can be used directly for prediction.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=2,
    refit=True,
)

grid_search.fit(X_train, y_train)

In [None]:
# Persist the fitted search object as a gzip-compressed pickle.
model_path = "../files/models/model.pkl.gz"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with gzip.open(model_path, "wb") as model_file:
    pickle.dump(grid_search, model_file)

In [None]:
def calculate_metrics(model, x, y, dataset_name: str) -> dict:
    """Score ``model`` on ``(x, y)`` and return the results as a dict.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x
        Predictors passed to ``model.predict``.
    y
        True labels compared against the predictions.
    dataset_name : str
        Label stored under the ``"dataset"`` key (e.g. ``"train"``).

    Returns
    -------
    dict
        Keys, in order: ``type`` (always ``"metrics"``), ``dataset``,
        ``precision``, ``balanced_accuracy``, ``recall``, ``f1_score``.
        Precision, recall and F1 are computed with ``zero_division=0``.
    """
    predictions = model.predict(x)

    # Insert keys one at a time; the insertion order fixes the field order
    # of the serialised record.
    report = {"type": "metrics", "dataset": dataset_name}
    report["precision"] = precision_score(y, predictions, zero_division=0)
    report["balanced_accuracy"] = balanced_accuracy_score(y, predictions)
    report["recall"] = recall_score(y, predictions, zero_division=0)
    report["f1_score"] = f1_score(y, predictions, zero_division=0)
    return report


# One metrics record per split, train first and then test.
metrics = [
    calculate_metrics(grid_search, features, labels, split_name)
    for features, labels, split_name in (
        (X_train, y_train, "train"),
        (X_test, y_test, "test"),
    )
]

In [None]:
def calculate_confusion_matrix(model, x, y, dataset_name: str) -> dict:
    """Build a JSON-serialisable confusion-matrix record for one dataset.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x
        Predictors passed to ``model.predict``.
    y
        True labels. Assumed to be encoded as 0/1, as the output keys imply.
    dataset_name : str
        Label stored under the ``"dataset"`` key (e.g. ``"train"``).

    Returns
    -------
    dict
        ``{"type": "cm_matrix", "dataset": ..., "true_0": {...},
        "true_1": {...}}`` where each inner dict holds the ``predicted_0`` and
        ``predicted_1`` counts as plain ``int`` values.
    """
    y_pred = model.predict(x)

    # Pin the label order so the matrix is always 2x2. Without it, a slice in
    # which only one class appears yields a 1x1 matrix and the indexing below
    # would raise IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    # tolist() converts the NumPy integers to built-in ints for json.dumps.
    (true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = cm.tolist()
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {
            "predicted_0": true0_pred0,
            "predicted_1": true0_pred1,
        },
        "true_1": {
            "predicted_0": true1_pred0,
            "predicted_1": true1_pred1,
        },
    }


# Add one confusion-matrix record per split after the metrics records.
metrics.extend(
    calculate_confusion_matrix(grid_search, features, labels, split_name)
    for features, labels, split_name in (
        (X_train, y_train, "train"),
        (X_test, y_test, "test"),
    )
)

# Write the records as JSON Lines: one JSON object per line.
metrics_path = "../files/output/metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w", encoding="utf-8") as metrics_file:
    metrics_file.writelines(json.dumps(record) + "\n" for record in metrics)