In [1]:
# Put the sibling automl checkout on the import path.
# NOTE(review): presumably required for the `src.automl.*` imports in the next cell — confirm.
import sys

sys.path.append("../automl/")

In [None]:
from pathlib import Path
import yaml
import joblib

import pandas as pd

from src.automl.model.lightgbm import LightGBMClassification
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [4]:
# One-time notebook setup using helpers imported from the automl package.
# NOTE(review): presumably creates the data directory the automl package works in — confirm in src.automl.constants.
create_ml_data_dir()
# NOTE(review): presumably configures the root logger that emits the timestamped
# [START]/[OPTUNA]/[FIT]/[END] lines seen in the cell outputs — confirm in src.automl.loggers.
configure_root_logger()

# Tune and fit LightGBM 
Find the best hyperparameters for LightGBM and then fit the model with them. Hyperparameter optimization uses 5-fold stratified cross-validation, and the final fit is performed on the same folds. Out-of-fold (OOF) predictions are saved for later stacking/blending.
[LightGBMClassification](https://github.com/dertty/automl/blob/hack/src/automl/model/lightgbm/lightgbm.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

## Constants

In [None]:
# Locations are relative to the notebook's working directory.
DATA_PATH = Path("..") / "data"
CONFIG_FILE = Path("..") / "configs" / "config.yaml"

# Run-wide knobs. NOTE(review): neither is referenced by the cells visible in this
# notebook — confirm whether they should be passed to the model.
RANDOM_SEED = 77
N_JOBS = 16

# Project configuration; later cells read the "selected_features" and "lightgbm" keys.
with open(CONFIG_FILE, "r") as config_stream:
    cfg = yaml.safe_load(config_stream)

## Data

In [8]:
# Preprocessed training set.
train_file = DATA_PATH / "train_preproc_2.parquet"
df_train = pd.read_parquet(train_file)

# Every integer-typed column except the label and the row id is passed to the
# model as a categorical feature.
int_feature_columns = df_train.drop(columns=["target", "id"]).select_dtypes(int).columns
cat_columns = int_feature_columns.tolist()

# Model inputs: the features selected in the config followed by the categorical columns.
model_features = cfg["selected_features"] + cat_columns
X_train = df_train[model_features]
y_train = df_train["target"]

## Model

In [9]:
# Metric object used both to drive tuning and to score the out-of-fold predictions;
# it is called as metric(y_true, predictions).
metric = RocAuc()

### Tune

In [13]:
# Hyperparameter search is expensive: it runs with timeout=60 * 60 and took about
# an hour in the recorded run below. It is therefore opt-in: set RUN_TUNING to True
# to repeat the search. With the flag off this cell does nothing, so a
# "Restart & Run All" goes straight to the fit on the stored parameters.
RUN_TUNING = False

if RUN_TUNING:
    # N_JOBS comes from the constants cell (same value as the previously hard-coded 16).
    model = LightGBMClassification(n_jobs=N_JOBS)
    model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)

    # Refit with the tuned parameters and score the out-of-fold predictions.
    oof = model.fit(X_train, y_train, categorical_features=cat_columns)
    print(metric(y_train, oof))

[2024-11-07 07:20:16,269] - [   START    ] - Tuning LightGBMClassification
[2024-11-07 07:20:26,195] - [   OPTUNA   ] - Trial 0. New best score 0.7926583155247084 with parameters {'max_depth': 6, 'num_leaves': 488, 'min_data_in_leaf': 188, 'bagging_fraction': 0.7993292420985183, 'bagging_freq': 0, 'feature_fraction': 0.49359671220172163, 'lambda_l1': 0.5808361216819946, 'lambda_l2': 8.661761457749352, 'min_gain_to_split': 12.022300234864176, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 07:21:10,804] - [   OPTUNA   ] - Trial 3. New best score 0.8075983858254252 with parameters {'max_depth': 16, 'num_leaves': 495, 'min_data_in_leaf': 207, 'bagging_fraction': 0.6523068845866853, 'bagging_freq': 0, 'feature_fraction': 0.8105398159072941, 'lambda_l1': 4.4015249373960135, 'lambda_l2': 1.2203823484477883, 'min_gain_to_split': 9.903538202225404, 'is_unbalance': False, 'num_iterations': 177}
[2024-11-07 07:21:33,390] - [   OPTUNA   ] - Trial 5. New best score 0.8084036366845453 with p

[2024-11-07 08:20:54,395] - [    FIT     ] - LightGBMClassification fold 4
[2024-11-07 08:21:00,928] - [    END     ] - Fitting LightGBMClassification
0.8122242363811351


### Fit on the best parameters

In [12]:
# Build the model from the hyperparameters stored under the "lightgbm" key of the config.
lgb_params = cfg["lightgbm"]
model = LightGBMClassification(**lgb_params)

# fit() runs the fold-by-fold training shown in the log and returns the
# out-of-fold predictions for the training rows.
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

# Report the out-of-fold score.
oof_score = metric(y_train, oof)
print(oof_score)

[2024-11-12 11:33:14,529] - [   START    ] - Fitting LightGBMClassification
[2024-11-12 11:33:14,614] - [    FIT     ] - LightGBMClassification fold 0
[2024-11-12 11:33:21,549] - [    FIT     ] - LightGBMClassification fold 1
[2024-11-12 11:33:29,192] - [    FIT     ] - LightGBMClassification fold 2
[2024-11-12 11:33:34,833] - [    FIT     ] - LightGBMClassification fold 3
[2024-11-12 11:33:41,863] - [    FIT     ] - LightGBMClassification fold 4
[2024-11-12 11:33:47,487] - [    END     ] - Fitting LightGBMClassification
0.8122242363811351


### Save the model file, parameters, OOF score, and test and OOF predictions

In [16]:
# All artifacts of this run are written to <DATA_PATH>/models/<MODEL_NAME>/.
MODEL_NAME = "lgb_8122_full_dataset"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True also creates the intermediate "models" directory, so the cell works on
# a fresh checkout instead of raising FileNotFoundError; exist_ok=True keeps re-runs safe.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [17]:
# --- Out-of-fold predictions: column 1 of the OOF array, stored under the model's name.
res = pd.DataFrame({MODEL_NAME: oof[:, 1]})
res.to_csv(MODEL_DIR / "oof.csv", index=False)

# --- Fitted model object.
model_file = MODEL_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(model, model_file)

# --- Parameters the model was fitted with.
with open(MODEL_DIR / "params.yaml", "w") as params_file:
    yaml.dump(model.params, params_file)

# --- Out-of-fold score, as a one-line text file.
with open(MODEL_DIR / "score.txt", "w") as score_file:
    print("OOF:", metric(y_train, oof), file=score_file)

# --- Test-set predictions: same feature columns as used for training, column 1 of the output.
feature_columns = cfg["selected_features"] + cat_columns
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[feature_columns])[:, 1]

submission = test[["id", "target"]]
submission.to_csv(MODEL_DIR / f"{MODEL_NAME}.csv", index=False)