In [None]:
import pandas as pd
import os

In [None]:
# Data directories, expressed relative to where the notebook is executed.
INPUT_PATH = "../files/input/"
OUTPUT_PATH = "../files/output/"

# Zipped CSV archives for the two data splits, both located under INPUT_PATH.
TRAIN_FILENAME, TEST_FILENAME = (
    os.path.join(INPUT_PATH, archive)
    for archive in ("train_data.csv.zip", "test_data.csv.zip")
)

In [None]:
# Read the training split straight from its zip archive, then preview five
# randomly chosen rows.
train = pd.read_csv(filepath_or_buffer=TRAIN_FILENAME, compression="zip")
train.sample(n=5)

In [None]:
# Read the test split straight from its zip archive, then preview five
# randomly chosen rows.
test = pd.read_csv(filepath_or_buffer=TEST_FILENAME, compression="zip")
test.sample(n=5)

In [None]:
def clean_dataframe(df):
    """Return a cleaned copy of a raw data split; the input frame is not modified.

    Steps, in order:
      1. Copy "default payment next month" into a new "default" column
         (appended as the last column).
      2. Drop the "ID" and "default payment next month" columns.
      3. Keep only rows where neither EDUCATION nor MARRIAGE equals 0.
      4. Collapse every EDUCATION value that is not below 4 into the single
         code 4.

    Args:
        df: DataFrame containing at least the columns "ID", "EDUCATION",
            "MARRIAGE" and "default payment next month".

    Returns:
        A new DataFrame with the surviving rows; the original index labels of
        those rows are kept (the index is not reset).

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # assign/drop both return new frames, so the caller's frame stays untouched.
    cleaned = df.assign(default=df["default payment next month"]).drop(
        columns=["ID", "default payment next month"]
    )

    # A code of 0 in either column marks the record as invalid.
    has_known_category = cleaned["EDUCATION"].ne(0) & cleaned["MARRIAGE"].ne(0)
    cleaned = cleaned.loc[has_known_category]

    # Vectorized grouping: keep values below 4, replace everything else
    # (including missing values, which fail the comparison) with 4.
    # Using assign avoids writing into a filtered slice.
    education_grouped = cleaned["EDUCATION"].where(cleaned["EDUCATION"] < 4, 4)
    return cleaned.assign(EDUCATION=education_grouped)

In [None]:
# Run the cleaning routine on the raw training split and preview five random rows.
df_train = train.pipe(clean_dataframe)
df_train.sample(n=5)

In [None]:
# Run the cleaning routine on the raw test split and preview five random rows.
df_test = test.pipe(clean_dataframe)
df_test.sample(n=5)

In [None]:
# Separate the predictors from the "default" target column for each split.
target_column = "default"

x_train, y_train = df_train.drop(columns=[target_column]), df_train[target_column]
x_test, y_test = df_test.drop(columns=[target_column]), df_test[target_column]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


# Columns that get one-hot encoded; every other column is forwarded to the
# classifier unchanged (remainder="passthrough").
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]

# Categories unseen during fit are ignored at transform time instead of raising.
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[("cat", one_hot, categorical_cols)],
    remainder="passthrough",
)

# Fixed random_state keeps the forest reproducible between runs.
forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", forest)])

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates; the "classifier__" prefix routes each entry to
# the pipeline step named "classifier".
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 5, 10, 20],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid with 10-fold cross-validation, scored by
# balanced accuracy and parallelized over all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

In [None]:
import gzip
import pickle
        
# gzip.open does not create missing parent folders, so make sure the models
# directory exists first; otherwise a fresh checkout fails with FileNotFoundError.
model_path = "../files/models/model.pkl.gz"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the fitted grid-search object as a gzip-compressed pickle.
with gzip.open(model_path, "wb") as model_file:
    pickle.dump(grid_search, model_file)

In [None]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

def evaluate_model(model, X, y, dataset_type):
    """Compute classification metrics and a confusion matrix for one dataset.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Features passed to ``model.predict``.
        y: True labels. The confusion matrix is built for the labels 0 and 1,
            matching the ``true_0`` / ``true_1`` keys of the result.
        dataset_type: Value stored under the ``"dataset"`` key of both
            returned records (e.g. ``"train"`` or ``"test"``).

    Returns:
        A ``(metrics, cm_dict)`` tuple of JSON-serializable dicts:
        ``metrics`` holds precision, balanced accuracy, recall and F1, each
        rounded to 4 decimals; ``cm_dict`` holds the four confusion-matrix
        counts as plain ints.
    """
    y_pred = model.predict(X)

    # Cast to built-in float so the values are plain Python numbers before
    # they are rounded and later serialized.
    precision = float(precision_score(y, y_pred))
    balanced_acc = float(balanced_accuracy_score(y, y_pred))
    recall = float(recall_score(y, y_pred))
    f1 = float(f1_score(y, y_pred))

    metrics = {
        'type': 'metrics',
        'dataset': dataset_type,
        'precision': round(precision, 4),
        'balanced_accuracy': round(balanced_acc, 4),
        'recall': round(recall, 4),
        'f1_score': round(f1, 4)
    }

    # Pin the label order: without it the matrix shrinks to 1x1 when only one
    # class occurs in y and y_pred, and the [0, 1] / [1, x] lookups below
    # raise IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_type,
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])}
    }

    return metrics, cm_dict


# Score the tuned model on both splits (train first, then test).
(metrics_train, cm_train), (metrics_test, cm_test) = (
    evaluate_model(grid_search, features, target, split_name)
    for features, target, split_name in (
        (x_train, y_train, "train"),
        (x_test, y_test, "test"),
    )
)

# Output order: both metric records first, then both confusion matrices.
metrics_list = [metrics_train, metrics_test, cm_train, cm_test]


In [None]:
import json

# open() does not create missing folders, so make sure the output directory
# exists before writing (a fresh checkout would otherwise raise
# FileNotFoundError).
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Write one JSON object per line, in the order held by metrics_list.
with open(os.path.join(OUTPUT_PATH, "metrics.json"), "w", encoding="utf-8") as f:
    f.writelines(json.dumps(record) + "\n" for record in metrics_list)