In [3]:
import sys

# Extend the import search path with the sibling automl checkout
# (presumably so the `src.automl` imports in the next cell resolve).
sys.path.append("../automl/")

In [11]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np

from src.automl.model.lama import TabularLama
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [4]:
# Project setup helpers imported from src.automl. Judging by their names they
# create the ML data directory and configure the root logger — TODO confirm
# against src/automl/constants.py and src/automl/loggers.py.
create_ml_data_dir()
configure_root_logger()

# Tune and fit [LightAutoML](https://github.com/sb-ai-lab/LightAutoML) on the dataset with oof predictions.
Find the best parameters for LightAutoML and then fit the model with these parameters. Parameter optimization is performed using 5-fold time-series cross-validation, and the final fit is performed on the same folds.
[TabularLama](https://github.com/dertty/automl/blob/hack/src/automl/model/lama/default_lama.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

**Unfortunately**, in LightAutoML training and tuning are performed simultaneously, so it is impossible to save the best LightAutoML parameters and then initialize a model with these parameters for inference. The solution is to save the fitted model to a file (*joblib* format) and then use this file for inference. If necessary, we can provide this file together with the oof predictions.

## Constants

In [7]:
# Locations, given relative to the notebook's working directory.
DATA_PATH = Path("../data/")
CONFIG_FILE = Path("../configs/config.yaml")

# Run settings.
RANDOM_SEED = 77  # NOTE(review): not referenced by any cell visible in this notebook
N_JOBS = 16  # forwarded to TabularLama(n_jobs=...)

# The parsed config supplies cfg["stack_features"] and cfg["selected_features"].
with CONFIG_FILE.open() as f:
    cfg = yaml.safe_load(f)

## Data

In [None]:
# Read the training frame and order rows by "id" with a fresh 0..n-1 index.
# Done in a single chain so that re-running the cell always starts from the file.
df_train = (
    pd.read_parquet(DATA_PATH / "train_preproc_oof.parquet")
    .sort_values(by="id")
    .reset_index(drop=True)
)

# Every integer-typed column other than "target" and "id" is treated as categorical.
cat_columns = (
    df_train.drop(columns=["target", "id"])
    .select_dtypes(int)
    .columns.tolist()
)

# Feature order used for fitting: stack features, selected features, categoricals.
X_train = df_train[cfg["stack_features"] + cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

## Model

In [7]:
# Evaluation metric object; it is handed to model.tune() in the "Fit + Tune" cell.
metric = RocAuc()

### Fit + Tune

In [None]:
# Tune the model under a time budget, then fit it on the same training data.
# NOTE(review): timeout=60 * 30 is presumably expressed in seconds (30 minutes) —
# confirm against TabularLama.tune.
model = TabularLama(n_jobs=N_JOBS, task="classification", time_series=True)
model.tune(X_train, y_train, metric, timeout=60 * 30, categorical_features=cat_columns)
model.fit(X_train, y_train, categorical_features=cat_columns)

# correct blender weights for reproducibility
# NOTE(review): these four hard-coded values overwrite whatever weights the fit
# produced; this presumably assumes the blender combines exactly four models —
# verify after any re-tuning.
model.model.blender.wts = np.array([0.1449654 , 0.40106198, 0.08289293, 0.37107965], dtype=np.float32)

### Alternatively, load the fitted model

In [8]:
# Rebinds `model` to a previously saved artifact, read from the same path the
# save cell at the end of this notebook writes to.
# NOTE(security): joblib.load unpickles arbitrary Python objects — only load
# model files that come from a trusted source.
model = joblib.load(DATA_PATH / "models" / "lama_stack_time_series" / "lama_stack_time_series.joblib")

### Save model file, parameters, test and oof predictions.

In [None]:
# Name of this model; also used for the output directory and artifact file names.
MODEL_NAME = "lama_stack_time_series"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True also creates DATA_PATH / "models" when it is missing; without it
# mkdir raises FileNotFoundError on a fresh checkout.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the fitted model object and its parameters side by side in MODEL_DIR.
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

with (MODEL_DIR / "params.yaml").open("w") as f:
    yaml.dump(model.params, f)

# Score the test set with the same column order that X_train was built with
# (stack features, selected features, categoricals), so training and inference
# inputs cannot drift apart.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
feature_columns = cfg["stack_features"] + cfg["selected_features"] + cat_columns
# Column 1 of the prediction matrix is written out as the target; presumably
# this is the positive-class probability — TODO confirm against TabularLama.predict.
test["target"] = model.predict(test[feature_columns])[:, 1]
test[["id", "target"]].to_csv(MODEL_DIR / f"{MODEL_NAME}.csv", index=False)