In [None]:
import sys

# Put the sibling `automl` checkout on the import path so that the
# `src.automl.*` imports in the next cell can be resolved.
sys.path.append("../automl/")

In [None]:
from pathlib import Path
import yaml
import joblib

import pandas as pd

from src.automl.model.lama import TabularLamaUtilized
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [4]:
# One-time project setup using helpers imported from `src.automl`.
# NOTE(review): presumably creates the directory layout used for ML artifacts —
# confirm in src.automl.constants.
create_ml_data_dir()
# NOTE(review): presumably attaches handlers/formatters to the root logger
# (the fit cell's output starts with a "[START] - Fitting ..." log record) —
# confirm in src.automl.loggers.
configure_root_logger()

# Tune and fit [LightAutoMLUtilized](https://github.com/sb-ai-lab/LightAutoML) 
Find the best parameters for LightAutoML and then fit the model with these parameters. Parameter optimization is performed using 5-fold stratified cross-validation, and the final fit is performed on the same folds. Out-of-fold (OOF) predictions are saved for further stacking/blending. 
[TabularLamaUtilized](https://github.com/dertty/automl/blob/hack/src/automl/model/lama/default_lama.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

**Unfortunately**, in LightAutoML training and tuning are performed simultaneously, hence it is impossible to save the best LightAutoML parameters and then initialize a model with these parameters for inference. The solution is to save the fitted model file (*joblib* format) and then use this model for inference. If necessary, we can provide this file together with the OOF predictions.

## Constants

In [5]:
# Input locations (relative paths).
DATA_PATH = Path("../data/")
CONFIG_FILE = Path("../configs/config.yaml")

# Runtime settings.
# NOTE(review): RANDOM_SEED is not referenced by any cell visible in this notebook.
RANDOM_SEED = 77
N_JOBS = 16

# Parse the YAML config with the safe loader; `cfg["selected_features"]`
# is read later when building the feature matrices.
with CONFIG_FILE.open("r") as config_stream:
    cfg = yaml.safe_load(config_stream)

## Data

In [6]:
df_train = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet")

# Every integer-typed column other than the target and the id is treated
# as a categorical feature.
cat_columns = (
    df_train.drop(columns=["target", "id"])
    .select_dtypes(int)
    .columns.values.tolist()
)

# Model inputs: the features selected in the config followed by the
# categorical columns.
# NOTE(review): if a categorical column is also listed in
# cfg["selected_features"], it will appear twice in X_train — verify.
X_train = df_train[cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

## Model

In [7]:
# Metric object (from src.automl.model.metrics) passed to `model.tune(...)`
# and called as `metric(y_true, predictions)` to score the OOF predictions.
metric = RocAuc()

### Fit + Tune

**Important**: It is nearly impossible to fully reproduce LightAutoML training, because it strongly depends on the hardware, resource utilization and timeout. To reproduce the results, we can provide the saved file of a fitted model.

In [14]:
model = TabularLamaUtilized(n_jobs=N_JOBS, task="classification")

# Search for the best configuration within the time budget.
# NOTE(review): timeout is presumably in seconds (60 * 60 = one hour) —
# confirm against TabularLamaUtilized.tune.
model.tune(
    X_train,
    y_train,
    metric,
    timeout=60 * 60,
    categorical_features=cat_columns,
)

# `fit` returns the predictions that are scored against y_train below and
# saved later as OOF predictions (column 1 is the one that gets exported).
oof = model.fit(
    X_train,
    y_train,
    categorical_features=cat_columns,
)

print(metric(y_train, oof))

[2024-11-08 13:27:24,414] - [   START    ] - Fitting TabularLamaUtilized
[13:27:24] Start automl [1mutilizator[0m with listed constraints:
[13:27:24] - time: 7200.00 seconds
[13:27:24] - CPU: 16 cores
[13:27:24] - memory: 16 GB

[13:27:24] [1mIf one preset completes earlier, next preset configuration will be started[0m

[13:27:24] Start 0 automl preset configuration:
[13:27:24] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[13:27:24] Stdout logging level is INFO.
[13:27:24] Task: binary

[13:27:24] Start automl preset with listed constraints:
[13:27:24] - time: 7200.00 seconds
[13:27:24] - CPU: 16 cores
[13:27:24] - memory: 16 GB

[13:27:24] [1mTrain data shape: (413194, 63)[0m

[13:27:35] Layer [1m1[0m train process start. Time left 7188.70 secs
[13:27:48] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[13:28:33] Fitting [1mLvl_0_Pipe_0_Mod_0_L

[14:00:53] Task: binary

[14:00:53] Start automl preset with listed constraints:
[14:00:53] - time: 5190.98 seconds
[14:00:53] - CPU: 16 cores
[14:00:53] - memory: 16 GB

[14:00:53] [1mTrain data shape: (413194, 63)[0m

[14:00:54] Layer [1m1[0m train process start. Time left 5190.17 secs
[14:01:07] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[14:01:56] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7878750314025083[0m
[14:01:56] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[14:01:56] Time left 5128.30 secs

[14:02:07] [1mSelector_LightGBM[0m fitting and predicting completed
[14:02:19] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[14:03:08] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8076809265966576[0m
[14:03:08] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[14:03:08] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 3

[14:42:01] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.8121930536698967[0m
[14:42:01] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[14:42:02] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[14:42:54] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8067263183241322[0m
[14:42:54] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[14:42:54] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs
[14:48:05] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[14:48:05] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[14:49:53] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8075557040558934[0m
[14:49:53] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[14:49:53] Time left 2250.78 secs

[14:49:53] [1mLayer 1 training completed.[0m

[15:07:10] Blending: no score update. Terminated

[15:07:10] [1mAutoml preset training completed in 1019.65 seconds[0m

[15:07:10] Model description:
Final prediction for new objects (level 0) = 
	 0.23267 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.65069 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) +
	 0.11664 * (5 averaged models Lvl_0_Pipe_1_Mod_3_Tuned_CatBoost) 

[15:07:10] Start 6 automl preset configuration:
[15:07:10] [1mconf_6_sel_type_1_tuning_full_no_int_lgbm.yml[0m, random state: {'reader_params': {'random_state': 48}, 'nn_params': {'random_state': 48}, 'general_params': {'return_all_predictions': False}}
[15:07:10] Stdout logging level is INFO.
[15:07:10] Task: binary

[15:07:10] Start automl preset with listed constraints:
[15:07:10] - time: 1214.01 seconds
[15:07:10] - CPU: 16 cores
[15:07:10] - memory: 16 GB

[15:07:10] [1mTrain data shape: (413194, 63)[0m

[15:07:21] Layer [1m1[0m train process start. Time left 1203.45 secs
[15:07:33] Sta

### Alternatively, load the fitted model

In [None]:
# Load a previously fitted model instead of re-running the fit cell.
# joblib.load unpickles the file, so only load artifacts you trust.
# NOTE: this cell rebinds `model` but does not produce `oof`; the save cell
# below still needs `oof` from the fit cell.
model = joblib.load(
    DATA_PATH.joinpath(
        "models",
        "lamau_81425_full_dataset",
        "lamau_81425_full_dataset.joblib",
    )
)

### Save model file, parameters, test and oof predictions.

In [None]:
# Name under which all artifacts of this run are stored.
MODEL_NAME = "lamau_81425_full_dataset"

# Every artifact (model file, params, scores, predictions) goes into
# DATA_PATH / "models" / MODEL_NAME.
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME

# parents=True: also create DATA_PATH / "models" when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# --- Out-of-fold predictions: keep column 1 of `oof`, one CSV column named after the model.
res = pd.DataFrame({MODEL_NAME: oof[:, 1]})
res.to_csv(MODEL_DIR / "oof.csv", index=False)

# --- Fitted model file.
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# --- Model parameters.
with open(MODEL_DIR / "params.yaml", "w") as f:
    yaml.dump(model.params, f)

# --- OOF score.
with open(MODEL_DIR / "score.txt", "w") as f:
    print("OOF:", metric(y_train, oof), file=f)

# --- Test predictions: same feature layout as X_train, column 1 of the output.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[["id", "target"]].to_csv(MODEL_DIR / f"{MODEL_NAME}.csv", index=False)