In [1]:
import sys; sys.path.append("../../automl/")

In [2]:
from pathlib import Path
import yaml

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from src.automl.model.catboost import CatBoostClassification
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [3]:
# One-time notebook setup; both helpers come from the project's src.automl package.
# Prepare the ML data directory (presumably creates it if missing — confirm in src.automl.constants).
create_ml_data_dir()
# Configure the root logger (presumably the source of the timestamped
# [START]/[OPTUNA]/[FIT] lines shown under the model cells — TODO confirm).
configure_root_logger()

## Constants

In [4]:
# --- Reproducibility / compute -------------------------------------------
# Seed passed to the model as `random_state`.
RANDOM_SEED = 77
# Value passed to the model as `n_jobs`.
N_JOBS = 8

# --- Locations (relative to the notebook's working directory) ------------
# Folder holding the preprocessed parquet files.
DATA_PATH = Path("..", "..", "data")
# YAML config; read below for its "selected_features" entry.
CONFIG_PATH = Path("../../configs") / "config.yaml"

In [5]:
# Parse the YAML config with the safe loader (no arbitrary object construction).
# `cfg` is indexed by "selected_features" when the feature matrices are built.
with CONFIG_PATH.open() as config_file:
    cfg = yaml.safe_load(config_file)

## Data

In [6]:
# Load the preprocessed training data and carve out a hold-out set by `id` order:
# the first TRAIN_ROWS rows (smallest ids) are used for training, the rest for evaluation.
TRAIN_ROWS = 300_000

# Sort once and slice the same sorted frame twice (the full frame used to be sorted
# separately for each half of the split).
df_sorted = pd.read_parquet(DATA_PATH / "train_preproc.parquet").sort_values(by="id")
df_train, df_test = df_sorted.iloc[:TRAIN_ROWS], df_sorted.iloc[TRAIN_ROWS:]

# Guard the split: every row lands in exactly one part and the hold-out is not empty
# (it would be empty if the file had TRAIN_ROWS rows or fewer).
assert len(df_train) + len(df_test) == len(df_sorted)
assert len(df_test) > 0, "hold-out split is empty: the data has no rows beyond TRAIN_ROWS"

In [7]:
# Group the preprocessed columns by the prefix of their name.
# Each result is a plain Python list of column names, in the frame's column order.
ohe_cols = [name for name in df_train.columns if name.startswith("OneHotEncoder")]
oe_cols = [name for name in df_train.columns if name.startswith("OrdinalEncoder")]
te_cols = [name for name in df_train.columns if name.startswith("MeanTargetEncoder")]

In [8]:
# Feature set A: selected features + mean-target-encoded columns + one-hot columns.
# NOTE(review): the following cell assigns the same four names from the ordinal-encoded
# columns, so on a top-to-bottom run this cell's result is overwritten.
X_train = df_train[cfg["selected_features"] + te_cols + ohe_cols]
X_test = df_test[cfg["selected_features"] + te_cols + ohe_cols]

# Labels come from the "target" column of the corresponding split.
y_train = df_train["target"]
y_test = df_test["target"]

In [8]:
# Feature set B: selected features + ordinal-encoded columns + one-hot columns.
# NOTE(review): this reuses the names X_train / X_test / y_train / y_test from the
# target-encoded cell above; whichever of the two cells ran last decides what the model sees.
X_train = df_train[cfg["selected_features"] + oe_cols + ohe_cols]
X_test = df_test[cfg["selected_features"] + oe_cols + ohe_cols]

# Labels come from the "target" column of the corresponding split.
y_train = df_train["target"]
y_test = df_test["target"]

In [9]:
# Class balance (as proportions) of the training labels, then of the hold-out labels.
display(
    y_train.value_counts(normalize=True),
    y_test.value_counts(normalize=True),
)

target
0    0.940853
1    0.059147
Name: proportion, dtype: float64

target
0    0.941322
1    0.058678
Name: proportion, dtype: float64

In [9]:
# Columns passed to the model as `categorical_features`: only the one-hot columns.
# This is the same list object as `ohe_cols`, not a copy.
# Disabled alternative kept for reference: ohe_cols + oe_cols
categorical_features = ohe_cols

## Model

In [13]:
# Hyper-parameter search scored by ROC AUC, with a 10-minute budget (timeout is in seconds).
# Runs on whatever X_train / y_train currently hold.
model = CatBoostClassification(
    n_jobs=N_JOBS,
    random_state=RANDOM_SEED,
)
model.tune(
    X_train,
    y_train,
    metric=RocAuc(),
    timeout=60 * 10,
    categorical_features=categorical_features,
)

[2024-11-05 17:47:30,201] - [   START    ] - Tuning CatBoostClassification
[2024-11-05 17:51:14,854] - [   OPTUNA   ] - Trial 0. New best score 0.7993246319642113 with parameters {'boosting_type': 'Plain', 'depth': 15, 'l2_leaf_reg': 128.43911998477108, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 139, 'rsm': 0.5441411055208036, 'subsample': 0.7272537553455429, 'model_size_reg': 80.11090073597784, 'auto_class_weights': 'Balanced', 'iterations': 96}
[2024-11-05 17:53:29,712] - [   OPTUNA   ] - Trial 1. New best score 0.8013202125215256 with parameters {'boosting_type': 'Plain', 'depth': 5, 'l2_leaf_reg': 56.20353813079284, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 45, 'rsm': 0.42962619958869225, 'subsample': 0.5754852032169728, 'model_size_reg': 13.359826218292259, 'auto_class_weights': None, 'iterations': 917}
[2024-11-05 17:56:04,365] - [   OPTUNA   ] - Trial 2. New best score 0.8025709024597971 with parameters {'b

In [10]:
# with target encoding
# NOTE(review): this label only holds if X_train was last built from te_cols; the
# ordinal-encoded cell reassigns X_train, so confirm which feature set is live.
# Hyper-parameter search scored by ROC AUC, with a 20-minute budget (timeout is in seconds).
model = CatBoostClassification(
    n_jobs=N_JOBS,
    random_state=RANDOM_SEED,
)
model.tune(
    X_train,
    y_train,
    metric=RocAuc(),
    timeout=60 * 20,
    categorical_features=categorical_features,
)

[2024-11-05 18:40:32,259] - [   START    ] - Tuning CatBoostClassification
[2024-11-05 18:42:14,339] - [   OPTUNA   ] - Trial 0. New best score 0.7991621905716942 with parameters {'boosting_type': 'Plain', 'depth': 10, 'l2_leaf_reg': 6.421955999238555, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 139, 'rsm': 0.5441411055208036, 'subsample': 0.7272537553455429, 'model_size_reg': 4.0055450367988925, 'auto_class_weights': 'Balanced', 'iterations': 128}
[2024-11-05 18:44:28,587] - [   OPTUNA   ] - Trial 1. New best score 0.8004784676977812 with parameters {'boosting_type': 'Plain', 'depth': 3, 'l2_leaf_reg': 2.810176906539642, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 45, 'rsm': 0.42962619958869225, 'subsample': 0.5754852032169728, 'model_size_reg': 0.667991310914613, 'auto_class_weights': None, 'iterations': 1493}
[2024-11-05 18:47:14,040] - [   OPTUNA   ] - Trial 2. New best score 0.8026399838861289 with parameters {'

In [12]:
# with target encoding
# NOTE(review): this label only holds if X_train was last built from te_cols; confirm.
# Same 20-minute search as the previous cell, but the model is created with
# time_series=True (presumably switches to an order-aware validation scheme — TODO confirm).
model = CatBoostClassification(
    n_jobs=N_JOBS,
    random_state=RANDOM_SEED,
    time_series=True,
)
model.tune(
    X_train,
    y_train,
    metric=RocAuc(),
    timeout=60 * 20,
    categorical_features=categorical_features,
)

[2024-11-05 19:02:31,327] - [   START    ] - Tuning CatBoostClassification
[2024-11-05 19:03:37,881] - [   OPTUNA   ] - Trial 0. New best score 0.7965052264260117 with parameters {'boosting_type': 'Plain', 'depth': 10, 'l2_leaf_reg': 6.421955999238555, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 139, 'rsm': 0.5441411055208036, 'subsample': 0.7272537553455429, 'model_size_reg': 4.0055450367988925, 'auto_class_weights': 'Balanced', 'iterations': 97}
[2024-11-05 19:05:14,032] - [   OPTUNA   ] - Trial 1. New best score 0.798767675441276 with parameters {'boosting_type': 'Plain', 'depth': 3, 'l2_leaf_reg': 2.810176906539642, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 45, 'rsm': 0.42962619958869225, 'subsample': 0.5754852032169728, 'model_size_reg': 0.667991310914613, 'auto_class_weights': None, 'iterations': 1276}
[2024-11-05 19:06:24,075] - [   OPTUNA   ] - Trial 2. New best score 0.8001091854791653 with parameters {'bo

In [16]:
# Fit the current `model` on the training split.
model.fit(
    X_train,
    y_train,
    categorical_features=categorical_features,
)

# Hold-out ROC AUC. Column 1 of the prediction array is used as the score
# (presumably the positive-class probability — TODO confirm).
print(
    RocAuc()(
        y_test.values,
        model.predict(X_test)[:, 1],
    )
)

[2024-11-05 19:25:14,272] - [   START    ] - Fitting CatBoostClassification
[2024-11-05 19:25:14,283] - [    FIT     ] - CatBoostClassification fold 0
[2024-11-05 19:25:17,587] - [    FIT     ] - CatBoostClassification fold 1
[2024-11-05 19:25:23,161] - [    FIT     ] - CatBoostClassification fold 2
[2024-11-05 19:25:31,733] - [    FIT     ] - CatBoostClassification fold 3
[2024-11-05 19:25:42,386] - [    FIT     ] - CatBoostClassification fold 4
[2024-11-05 19:25:55,206] - [    END     ] - Fitting CatBoostClassification
0.7934884944291627


In [20]:
# Score the preprocessed test set with the fitted model and write a submission file.
test = pd.read_parquet(DATA_PATH / "test_preproc.parquet")

# Feed the model exactly the columns, in the same order, that it was fitted on.
# The feature list used to be hard-coded to the target-encoded set, which does not match
# the model whenever X_train was built from the ordinal-encoded columns instead.
feature_cols = X_train.columns.tolist()
missing_cols = [col for col in feature_cols if col not in test.columns]
assert not missing_cols, f"test_preproc.parquet lacks training features: {missing_cols}"

# Column 1 of the prediction array is used as the score
# (presumably the positive-class probability — TODO confirm).
test["target"] = model.predict(test[feature_cols])[:, 1]
test[['id', 'target']].to_csv('catboost_ts.csv', index=False)

In [25]:
# Load the three prediction files that get blended below
# (the third one is the CatBoost file written by this notebook).
pred_1, pred_2, pred_3 = (
    pd.read_csv(csv_name)
    for csv_name in ("lama_utilized.csv", "lgmb_oe_ohe_cols_0805.csv", "catboost_ts.csv")
)

In [26]:
# Weighted blend of the three prediction files (0.6 / 0.2 / 0.2), stored in pred_1["target"].
#
# Rows are matched on "id" instead of on row position, so the blend stays correct even if
# the files were written in a different row order or have different lengths.
# Assumes every file has an "id" column with unique values, like the catboost_ts.csv
# written by this notebook — verify for the two external files.
#
# NOTE: pred_1 is overwritten in place; re-running this cell without re-running the cell
# that loads the CSVs would blend an already blended column.
pred_2_by_id = pred_2.set_index("id")["target"]
pred_3_by_id = pred_3.set_index("id")["target"]

blended_target = (
    0.6 * pred_1["target"]
    + 0.2 * pred_1["id"].map(pred_2_by_id)
    + 0.2 * pred_1["id"].map(pred_3_by_id)
)

# Validate before overwriting so a failed check leaves pred_1 untouched.
assert blended_target.notna().all(), (
    "blend contains NaN: an id from pred_1 is missing in pred_2/pred_3, or a target is NaN"
)
pred_1["target"] = blended_target

In [29]:
# Persist the blended predictions held in pred_1, without the DataFrame index column.
pred_1.to_csv(
    "blend.csv",
    index=False,
)