In [1]:
import pickle, gzip, json, os
import zipfile

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted

In [2]:
def clean_data(df):
    """Prepare a raw credit-default frame for modelling.

    Steps, in order:

    1. rename ``"default payment next month"`` to ``"default"``;
    2. drop the ``ID`` column;
    3. drop every row that contains a missing value;
    4. replace ``EDUCATION`` values that are not ``<= 4`` with 4,
       ``MARRIAGE`` values outside {1, 2, 3} with 3 and ``SEX`` values
       outside {1, 2} with 1;
    5. convert ``SEX``, ``MARRIAGE`` and ``EDUCATION`` to strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; expected to contain ``ID``, ``SEX``, ``MARRIAGE`` and
        ``EDUCATION`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
    )

    education = cleaned["EDUCATION"]
    marriage = cleaned["MARRIAGE"]
    sex = cleaned["SEX"]

    # Out-of-range codes are folded into a fallback category per column.
    cleaned = cleaned.assign(
        EDUCATION=education.where(education <= 4, 4),
        MARRIAGE=marriage.where(marriage.isin([1, 2, 3]), 3),
        SEX=sex.where(sex.isin([1, 2]), 1),
    )

    categorical = ["SEX", "MARRIAGE", "EDUCATION"]
    return cleaned.astype({name: str for name in categorical})

def load_zipped_csv(path):
    """Read the first file stored in a zip archive as a CSV table.

    Directory entries are skipped, so an archive built from a folder
    (which lists ``"folder/"`` before ``"folder/data.csv"``) is handled
    the same way as an archive holding the CSV at its root.

    Parameters
    ----------
    path : str or path-like
        Location of the zip archive (anything ``zipfile.ZipFile`` accepts).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the first file entry in the archive.

    Raises
    ------
    IndexError
        If the archive contains no file entries.
    """
    with zipfile.ZipFile(path) as archive:
        # Opening a directory entry yields an empty stream that read_csv
        # cannot parse, so only real files are candidates.
        members = [info for info in archive.infolist() if not info.is_dir()]
        if not members:
            # Same exception type an empty namelist()[0] lookup would raise,
            # but with a message that names the offending archive.
            raise IndexError(f"zip archive {path!r} contains no files")
        with archive.open(members[0]) as handle:
            return pd.read_csv(handle)

# Both splits go through the same load -> clean steps.
train_df = load_zipped_csv("../files/input/train_data.csv.zip").pipe(clean_data)
test_df = load_zipped_csv("../files/input/test_data.csv.zip").pipe(clean_data)

# Separate the "default" target column from the remaining feature columns.
x_train, y_train = train_df.drop(columns="default"), train_df["default"]
x_test, y_test = test_df.drop(columns="default"), test_df["default"]


In [None]:

# One-hot encode the three categorical columns; every other column is
# forwarded to the classifier unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore", dtype=np.float64),
            ["SEX", "EDUCATION", "MARRIAGE"],
        ),
    ],
    remainder="passthrough",
)

# Preprocessing and the random forest are fitted together as one estimator.
pipeline = Pipeline(
    steps=[
        ("preprocesador", preprocessor),
        ("clasificador", RandomForestClassifier(random_state=42)),
    ]
)

# 3 * 3 * 2 * 2 = 36 candidate settings for the forest.
param_grid = {
    "clasificador__n_estimators": [300, 400, 500],
    "clasificador__max_depth": [25, 30, 35],
    "clasificador__min_samples_split": [2, 5],
    "clasificador__min_samples_leaf": [1, 2],
}

# Exhaustive search, 10-fold cross-validation, scored by balanced accuracy,
# using every available core.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

model.fit(x_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


In [None]:
os.makedirs("../files/grading", exist_ok=True)

# Make sure the categorical columns are stored as strings in both feature
# frames before they are serialized.
categorical = ["SEX", "EDUCATION", "MARRIAGE"]
for features in (x_train, x_test):
    for column in categorical:
        features[column] = features[column].astype(str)

# Destination path -> object to pickle, written in this order.
grading_files = {
    "../files/grading/x_train.pkl": x_train,
    "../files/grading/y_train.pkl": y_train,
    "../files/grading/x_test.pkl": x_test,
    "../files/grading/y_test.pkl": y_test,
}
for target_path, payload in grading_files.items():
    with open(target_path, "wb") as stream:
        pickle.dump(payload, stream)
print(".pkl guardados correctamente.")


In [None]:

# Refuse to persist a search that was never fitted: the archive would be
# written without complaint, but every later predict() call on the loaded
# object would raise NotFittedError.
check_is_fitted(model)

os.makedirs("../files/models", exist_ok=True)

# The whole fitted search object is pickled into a gzip-compressed file.
with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(model, f)

In [None]:
def generate_metrics(model, x, y, dataset):
    """Score a fitted binary classifier on one dataset split.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    x : array-like or DataFrame
        Features passed to ``model.predict``.
    y : array-like
        True labels. They are expected to be coded as 0 / 1, which is what
        the ``true_0`` / ``predicted_1`` keys of the output describe.
    dataset : str
        Name stored in the ``"dataset"`` field of both records
        (for example ``"train"`` or ``"test"``).

    Returns
    -------
    list
        ``[metrics, cm_matrix]``: a dict with precision, balanced accuracy,
        recall and F1, followed by a dict with the four confusion-matrix
        counts as plain ``int`` values.
    """
    y_pred = model.predict(x)

    # Pin the label order so the matrix is always 2x2. Without it, a split
    # (or a set of predictions) containing a single class produces a 1x1
    # matrix and the cm[0][1] / cm[1][...] lookups below raise IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    metrics = {
        "type": "metrics",
        "dataset": dataset,
        "precision": precision_score(y, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y, y_pred),
        "recall": recall_score(y, y_pred),
        "f1_score": f1_score(y, y_pred),
    }
    # Rows of cm are true classes, columns are predicted classes; counts are
    # cast to int so the record is JSON-serializable.
    cm_matrix = {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(cm[0][0]), "predicted_1": int(cm[0][1])},
        "true_1": {"predicted_0": int(cm[1][0]), "predicted_1": int(cm[1][1])},
    }
    return [metrics, cm_matrix]

# Each call returns a [metrics, confusion-matrix] pair for one split.
train_metrics, train_cm = generate_metrics(model, x_train, y_train, "train")
test_metrics, test_cm = generate_metrics(model, x_test, y_test, "test")

# Output order: both metric records first, then both confusion matrices.
results = [train_metrics, test_metrics, train_cm, test_cm]

os.makedirs("../files/output", exist_ok=True)

# One JSON document per line.
with open("../files/output/metrics.json", "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in results)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.