In [1]:
# Extend the module search path with "../automl/" (resolved relative to the
# working directory) -- presumably the checkout that provides the `src.automl`
# package imported in the next cell.
import sys

sys.path.append("../automl/")

In [3]:
from pathlib import Path
import yaml
import joblib

import pandas as pd

from src.automl.model.catboost import CatBoostClassification
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [4]:
# One-time session setup using helpers imported from src.automl.
# NOTE(review): judging by their names, the first call creates the ML data
# directory and the second configures the root logger -- confirm in src.automl.
create_ml_data_dir()
configure_root_logger()

# Tune and fit CatBoost 
Find the best parameters for CatBoost and then fit the model with these parameters. Hyperparameter optimization is performed using 5-fold stratified cross-validation, and the final fit is performed on the same folds. Out-of-fold predictions are saved for further stacking/blending.
[CatBoostClassification](https://github.com/dertty/automl/blob/1023885f9ec99edfbcb23223ccd8dfce6224bb61/src/automl/model/catboost/catboost.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

## Constants

In [None]:
# Locations of the input data and of the project configuration file
# (both relative to the notebook's working directory).
DATA_PATH = Path("../data/")
CONFIG_FILE = Path("../configs/config.yaml")

# Seed and worker-count settings for the experiment.
RANDOM_SEED = 77
N_JOBS = 16

# Parse the YAML configuration with the safe loader; later cells read the
# "selected_features" and "catboost" entries from `cfg`.
with CONFIG_FILE.open("r") as config_stream:
    cfg = yaml.safe_load(config_stream)

## Data

In [6]:
# Load the preprocessed training set.
df_train = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet")

# Every integer-typed column other than the target and the id is treated as a
# categorical feature.
cat_columns = (
    df_train
    .drop(columns=["target", "id"])
    .select_dtypes(int)
    .columns
    .tolist()
)

# Model inputs: the features selected in the config followed by the
# categorical columns; the label is the "target" column.
X_train = df_train[cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

## Model

In [7]:
# Metric object from src.automl; it is called further down as
# metric(y_train, oof) to score the out-of-fold predictions.
metric = RocAuc()

### Tune

In [8]:
# Hyper-parameter search, left disabled: the fit cell below reads its
# parameters from cfg["catboost"] instead. Uncomment to re-run the search.
# NOTE(review): timeout=60 * 60 is presumably in seconds (one hour) -- confirm.
# model = CatBoostClassification(n_jobs=N_JOBS)
# model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)

### Fit on the best parameters

In [15]:
# Build the classifier from the parameters stored under the "catboost" key of
# the config and fit it. The value returned by `fit` is kept as the
# out-of-fold predictions and scored against the training target.
catboost_params = cfg["catboost"]
model = CatBoostClassification(**catboost_params)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

oof_score = metric(y_train, oof)
print(oof_score)

[2024-11-12 11:42:00,739] - [   START    ] - Fitting CatBoostClassification
[2024-11-12 11:42:00,819] - [    FIT     ] - CatBoostClassification fold 0
[2024-11-12 11:42:51,191] - [    FIT     ] - CatBoostClassification fold 1
[2024-11-12 11:43:40,657] - [    FIT     ] - CatBoostClassification fold 2
[2024-11-12 11:44:32,411] - [    FIT     ] - CatBoostClassification fold 3
[2024-11-12 11:45:23,371] - [    FIT     ] - CatBoostClassification fold 4
[2024-11-12 11:46:13,782] - [    END     ] - Fitting CatBoostClassification
0.8114705063606893


### Save model file, parameters, test and oof predictions.

In [None]:
# Name of this run; it is used as the output directory name and as the stem /
# column name of the artifacts written by the save cell.
MODEL_NAME = "cb_8114_full_dataset"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME

# parents=True: on a fresh checkout the intermediate "models" directory may not
# exist yet, in which case a plain mkdir() raises FileNotFoundError.
# exist_ok=True keeps the cell re-runnable.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [17]:
# --- Out-of-fold predictions: column 1 of `oof`, stored under the model name.
res = pd.DataFrame({MODEL_NAME: oof[:, 1]})
res.to_csv(MODEL_DIR / "oof.csv", index=False)

# --- Fitted model object.
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# --- Parameters the model was fitted with.
with (MODEL_DIR / "params.yaml").open("w") as params_file:
    yaml.dump(model.params, params_file)

# --- OOF score, written as a single "OOF: <value>" line.
oof_score = metric(y_train, oof)
with (MODEL_DIR / "score.txt").open("w") as score_file:
    print("OOF:", oof_score, file=score_file)

# --- Test-set predictions: same feature columns as used for training; column 1
# of the prediction array is saved as "target" next to the row id.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test_features = test[cfg["selected_features"] + cat_columns]
test["target"] = model.predict(test_features)[:, 1]
test[["id", "target"]].to_csv(MODEL_DIR / f"{MODEL_NAME}.csv", index=False)