In [1]:
import sys

# Extend the module search path with the relative "../automl/" directory;
# presumably this is what lets the `src.automl` imports below resolve — confirm.
sys.path.append("../automl/")

In [8]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np

from src.automl.model.lama import TabularLama
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [3]:
# One-time environment setup using helpers imported from `src.automl`;
# their implementations are not visible in this notebook.
# presumably creates the directory the library uses for ML data — TODO confirm
create_ml_data_dir()
# presumably configures the root logger used for the training log output — TODO confirm
configure_root_logger()

# Tune and fit [LightAutoML](https://github.com/sb-ai-lab/LightAutoML) 
Find the best parameters for LightAutoML and then fit the model with these parameters. Parameter optimization is based on 5-fold stratified cross-validation, and the final fit is performed on the same folds. Out-of-fold (OOF) predictions are saved for further stacking/blending.
[TabularLama](https://github.com/dertty/automl/blob/hack/src/automl/model/lama/default_lama.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

**Unfortunately**, in LightAutoML training and tuning are performed simultaneously, so it is impossible to save the best LightAutoML parameters and then initialize a model with these parameters for inference. The solution is to save the fitted model to a file (*joblib* format) and then use this file for inference. If necessary, we can provide this file together with the OOF predictions.

## Constants

In [4]:
# Location of the input parquet files and of the saved model artifacts.
DATA_PATH = Path("../data/")
# NOTE(review): RANDOM_SEED is not referenced by any cell visible in this notebook — confirm it is still needed.
RANDOM_SEED = 77
# Passed to TabularLama as `n_jobs`.
N_JOBS = 16
# YAML config; its "selected_features" entry is used to build the feature list.
CONFIG_FILE = Path("../configs/config.yaml")

# Parse the config with the safe loader (no arbitrary Python object construction).
with CONFIG_FILE.open("r") as config_stream:
    cfg = yaml.safe_load(config_stream)

## Data

In [5]:
df_train = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet")

# Every integer-typed column other than the target and the row id is passed
# to the model as a categorical feature.
candidate_features = df_train.drop(columns=["target", "id"])
cat_columns = candidate_features.select_dtypes(int).columns.tolist()

# Model inputs: the configured feature selection followed by the categorical columns.
# NOTE(review): if a categorical column is also listed in cfg["selected_features"],
# it will appear twice in X_train — confirm the two lists are disjoint.
feature_columns = cfg["selected_features"] + cat_columns
X_train = df_train[feature_columns]
y_train = df_train["target"]

## Model

In [6]:
# Scorer passed to `model.tune` and called as `metric(y_train, oof)` to report the OOF score.
metric = RocAuc()

### Fit + Tune

**Important**: It is nearly impossible to fully reproduce LightAutoML training, because it strongly depends on the hardware, resource utilization, and timeout. To reproduce the results, we can provide the saved file of a fitted model.

In [None]:
# Budget handed to `tune`; presumably seconds (i.e. 30 minutes) — confirm against TabularLama.
tune_timeout = 60 * 30

model = TabularLama(n_jobs=N_JOBS, task="classification")
model.tune(
    X_train,
    y_train,
    metric,
    timeout=tune_timeout,
    categorical_features=cat_columns,
)
# `fit` returns the out-of-fold predictions that are scored and saved later.
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

# fix blender weights for reproducibility
# NOTE(review): `oof` was produced by `fit` before this override, so the score
# printed below presumably reflects the fitted weights, not these — confirm.
# NOTE(review): assumes the fitted blender combines exactly four models — confirm.
fixed_blender_weights = [0.13602127, 0.74620605, 0.06681882, 0.05095384]
model.model.blender.wts = np.array(fixed_blender_weights, dtype=np.float32)

oof_score = metric(y_train, oof)
print(oof_score)

[2024-11-07 10:48:16,663] - [   START    ] - Fitting TabularLama
[10:48:16] Stdout logging level is INFO.
[10:48:16] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[10:48:16] Task: binary

[10:48:16] Start automl preset with listed constraints:
[10:48:16] - time: 3600.00 seconds
[10:48:16] - CPU: 16 cores
[10:48:16] - memory: 16 GB

[10:48:16] [1mTrain data shape: (413194, 63)[0m

[10:48:29] Layer [1m1[0m train process start. Time left 3587.79 secs
[10:48:42] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[10:49:30] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7877901305548859[0m
[10:49:30] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[10:49:30] Time left 3526.10 secs

[10:49:43] [1mSelector_LightGBM[0m fitting and predicting completed
[10:49:55] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[10:50:52] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1

### Alternatively, load the fitted model

In [13]:
# Location of the previously saved fitted model.
fitted_model_path = (
    DATA_PATH
    / "models"
    / "lama_81298_full_dataset"
    / "lama_81298_full_dataset.joblib"
)
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
# NOTE(review): this restores `model` only; `oof` is assigned solely by the fit cell,
# so cells that read `oof` will fail if the fit cell was skipped.
model = joblib.load(fitted_model_path)

### Save model file, parameters, test and oof predictions.

In [None]:
# Name used for the artifact directory and for the saved file names.
MODEL_NAME = "lama_81298_full_dataset"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True: also create the intermediate "models" directory, so a fresh
# checkout without it does not fail with FileNotFoundError.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Column 1 of the OOF array (presumably the positive-class probability — confirm),
# stored under the model name for later stacking/blending.
res = pd.DataFrame({MODEL_NAME: oof[:, 1]})
res.to_csv(MODEL_DIR / "oof.csv", index=False)

# Persist the fitted model object itself.
model_file = MODEL_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(model, model_file)

# Persist the model parameters as YAML.
with (MODEL_DIR / "params.yaml").open("w") as params_file:
    yaml.dump(model.params, params_file)

# Persist the OOF score as a one-line text file.
oof_score = metric(y_train, oof)
with (MODEL_DIR / "score.txt").open("w") as score_file:
    print("OOF:", oof_score, file=score_file)

# Score the test set with the same feature layout as training and write the submission file.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test_features = test[cfg["selected_features"] + cat_columns]
test["target"] = model.predict(test_features)[:, 1]
submission = test[['id', 'target']]
submission.to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)