In [2]:
import sys

# Make the local automl checkout importable so the `src.automl` imports in the
# next cell can be resolved. The path is relative, so it depends on the
# kernel's working directory.
sys.path.append("../automl/")

In [4]:
from pathlib import Path
import yaml
import joblib

import pandas as pd

from src.automl.model.xgboost import XGBClassification
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [5]:
# One-time setup using helpers imported from src.automl (implementations are
# not visible in this notebook).
# NOTE(review): presumably this creates the directory automl uses for its data — confirm.
create_ml_data_dir()
# NOTE(review): presumably this sets up the root logger that produces the
# timestamped fit log shown further down — confirm.
configure_root_logger()

# Tune and fit XGBoost 
Find the best parameters for XGBoost and then fit the model with these parameters. Parameter optimization is performed using 5-fold stratified cross-validation, and the final fit is performed on the same folds. Out-of-fold predictions are saved for later stacking/blending. 
The [XGBClassification](https://github.com/dertty/automl/blob/1023885f9ec99edfbcb23223ccd8dfce6224bb61/src/automl/model/xgboost/xgboost.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

## Constants

In [None]:
# Locations (relative to the notebook's working directory).
DATA_PATH = Path("../data/")
CONFIG_FILE = Path("../configs/config.yaml")

# Run settings.
RANDOM_SEED = 77
N_JOBS = 16

# Project configuration; later cells read cfg["selected_features"] and
# cfg["xgboost"] from it.
with CONFIG_FILE.open("r") as config_stream:
    cfg = yaml.safe_load(config_stream)

## Data

In [7]:
# Preprocessed training table.
df_train = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet")

# Every integer-typed column other than "target" and "id" is passed to the
# model as a categorical feature.
cat_columns = (
    df_train
    .drop(columns=["target", "id"])
    .select_dtypes(include=int)
    .columns
    .tolist()
)

# Model inputs: the configured feature selection followed by the categorical columns.
X_train = df_train[cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

## Model

In [8]:
# Scoring object from src.automl; it is called later as metric(y_train, oof)
# to evaluate the out-of-fold predictions.
metric = RocAuc()

### Tune

In [None]:
# Tuning is disabled in this run: the "Fit" cell below builds the model from
# the parameters stored under cfg["xgboost"] instead. Uncomment the two lines
# below to re-run the parameter search.
# model = XGBClassification(n_jobs=N_JOBS)
# model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)

### Fit on the best parameters

In [9]:
# Build the model from the parameters stored under cfg["xgboost"].
model = XGBClassification(**cfg["xgboost"])

# fit() returns the out-of-fold predictions; they are scored here and written
# to disk in the save cell.
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

oof_score = metric(y_train, oof)
print(oof_score)

[2024-11-12 11:49:15,108] - [   START    ] - Fitting XGBClassification
[2024-11-12 11:49:15,334] - [    FIT     ] - XGBClassification fold 0
[2024-11-12 11:49:26,022] - [    FIT     ] - XGBClassification fold 1
[2024-11-12 11:49:36,573] - [    FIT     ] - XGBClassification fold 2
[2024-11-12 11:49:47,674] - [    FIT     ] - XGBClassification fold 3
[2024-11-12 11:49:58,392] - [    FIT     ] - XGBClassification fold 4
[2024-11-12 11:50:08,369] - [    END     ] - Fitting XGBClassification
0.8132138583629158


### Save the model file, parameters, and test and OOF predictions

In [None]:
# Name of this run; used for the output directory and the artifact file names.
MODEL_NAME = "xgb_81325_full_dataset"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME

# parents=True also creates DATA_PATH / "models" when it is missing, so the
# cell does not raise FileNotFoundError on a fresh checkout; exist_ok=True
# keeps re-runs idempotent.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [17]:
# Out-of-fold predictions: keep column 1 only (presumably the positive-class
# score — confirm), stored under the model's name for later stacking/blending.
oof_frame = pd.DataFrame({MODEL_NAME: oof[:, 1]})
oof_frame.to_csv(MODEL_DIR / "oof.csv", index=False)

# Fitted model object.
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# Parameters exposed by the model.
with (MODEL_DIR / "params.yaml").open("w") as params_file:
    yaml.dump(model.params, params_file)

# Out-of-fold score, written as a single "OOF: <value>" line.
with (MODEL_DIR / "score.txt").open("w") as score_file:
    print("OOF:", metric(y_train, oof), file=score_file)

# Test-set predictions, using the same feature columns as for training.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test_predictions = model.predict(test[cfg["selected_features"] + cat_columns])
test["target"] = test_predictions[:, 1]
test[["id", "target"]].to_csv(MODEL_DIR / f"{MODEL_NAME}.csv", index=False)