In [None]:
import sys

# Make the local `automl` checkout importable (path is relative to the notebook's working directory).
sys.path.append("../automl/")

In [None]:
from pathlib import Path
import yaml
import joblib

import pandas as pd

from src.automl.model.lama import TabularLamaNN
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [4]:
# One-time session setup using helpers imported from the local `automl` package.
# NOTE(review): presumably creates the working/data directory used by the package — confirm in src.automl.constants.
create_ml_data_dir()
# NOTE(review): presumably configures handlers/format of the root logger — confirm in src.automl.loggers.
configure_root_logger()

# Tune and fit [LightAutoML AutoInt](https://github.com/sb-ai-lab/LightAutoML) 
Fit LightAutoML AutoInt with 5-fold stratified cross-validation. Out-of-fold (OOF) predictions are saved for later stacking/blending. 
[TabularLamaNN](https://github.com/dertty/automl/blob/hack/src/automl/model/lama/nn_lama.py) implementation from [automl](https://github.com/dertty/automl/tree/hack) is used.

**Unfortunately**, in LightAutoML training and tuning are performed simultaneously, so it is impossible to save the best LightAutoML parameters and then initialize a model with these parameters for inference. The solution is to save the fitted model file (*joblib* format) and then use this model for inference. If necessary, we can provide this file together with the OOF predictions.

**Note:** GPU is required to fit tabular NNs.

## Constants

In [5]:
# Locations, relative to the notebook's working directory.
DATA_PATH = Path("../data/")
CONFIG_FILE = Path("../configs/config.yaml")

# Run settings.
RANDOM_SEED = 77
N_JOBS = 16

# Parse the project configuration with PyYAML's safe loader.
with CONFIG_FILE.open("r") as f:
    cfg = yaml.safe_load(f)

## Data

In [6]:
df_train = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet")

# Every integer-typed column except the label and the row id is passed to the model as categorical.
cat_columns = (
    df_train
    .drop(columns=["target", "id"])
    .select_dtypes(int)
    .columns
    .tolist()
)

# Model inputs: the features selected in the config plus the categorical columns.
X_train = df_train[cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

## Model

In [7]:
# Metric object from the local automl package; it is called below as metric(y_train, oof) to score the OOF predictions.
metric = RocAuc()

### Fit + Tune

**Important**: It is nearly impossible to fully reproduce LightAutoML training, because it strongly depends on the hardware, resource utilization, and timeout. To reproduce the results, we can provide the saved file of a fitted model.

In [15]:
# Training is disabled by default: uncomment this cell to tune and fit the model from scratch (GPU required).
# NOTE: `oof` (the out-of-fold predictions) is only defined by this cell; the save cell at the end reads it.
# model = TabularLamaNN(n_jobs=N_JOBS, task="classification", nn_name="autoint")
# model.tune(X_train, y_train, metric, timeout=60 * 60 * 2, categorical_features=cat_columns)
# model.verbose = 4
# oof = model.fit(X_train, y_train, categorical_features=cat_columns)

# print(metric(y_train, oof))

[2024-11-07 12:08:58,759] - [   START    ] - Fitting TabularLamaNN_autoint
[12:08:58] Stdout logging level is DEBUG.
[12:08:58] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[12:08:58] Task: binary

[12:08:58] Start automl preset with listed constraints:
[12:08:58] - time: 14400.00 seconds
[12:08:58] - CPU: 16 cores
[12:08:58] - memory: 16 GB

[12:08:58] [1mTrain data shape: (413194, 63)[0m

[12:09:14] Feats was rejected during automatic roles guess: []
[12:09:14] Layer [1m1[0m train process start. Time left 14384.51 secs
[12:09:23] number of text features: 0 
[12:09:23] number of categorical features: 4 
[12:09:23] number of continuous features: 58 
[12:09:23] Start fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m ...
[12:09:23] Training params: {'num_workers': 0, 'pin_memory': False, 'max_length': 256, 'is_snap': False, 'input_bn': False, 'max_emb_size': 256, 'bert_name': None, 'pooling': 'cls', 'device': device(type='cuda'

[12:40:28] Epoch: 23, train loss: 0.183428555727005, val loss: 0.19311760365962982, val metric: 0.7957049808879094
[12:41:06] Epoch: 24, train loss: 0.18255187571048737, val loss: 0.19092701375484467, val metric: 0.7976241610330523
[12:41:43] Epoch: 25, train loss: 0.1813998967409134, val loss: 0.1934359222650528, val metric: 0.7935573166826069
[12:42:21] Epoch: 26, train loss: 0.17998187243938446, val loss: 0.19299423694610596, val metric: 0.7897826605480652
[12:42:59] Epoch: 27, train loss: 0.1777971237897873, val loss: 0.19352443516254425, val metric: 0.791000470054638
[12:43:36] Epoch: 28, train loss: 0.17629578709602356, val loss: 0.19533847272396088, val metric: 0.7896064223597945
[12:44:14] Epoch: 29, train loss: 0.17584431171417236, val loss: 0.19515475630760193, val metric: 0.7886150705202986
[12:44:52] Epoch: 30, train loss: 0.17505429685115814, val loss: 0.19657976925373077, val metric: 0.7848700337396953
[12:45:29] Epoch: 31, train loss: 0.1741061806678772, val loss: 0.1970

[13:23:32] Epoch: 30, train loss: 0.17395038902759552, val loss: 0.19658687710762024, val metric: 0.7908451313844037
[13:24:10] Epoch: 31, train loss: 0.17376229166984558, val loss: 0.20260027050971985, val metric: 0.7837468566427026
[13:24:16] Early stopping: val loss: 0.18829074501991272, val metric: 0.8059301342489384
[13:24:16] ===== Start working with [1mfold 4[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m =====
[13:24:54] Epoch: 0, train loss: 2.0756165981292725, val loss: 0.29272064566612244, val metric: 0.6760281684864917
[13:25:32] Epoch: 1, train loss: 1.0410487651824951, val loss: 0.2044340819120407, val metric: 0.761761622140692
[13:26:09] Epoch: 2, train loss: 0.1988583207130432, val loss: 0.19150947034358978, val metric: 0.7986280810163797
[13:26:47] Epoch: 3, train loss: 0.410273015499115, val loss: 0.19233688712120056, val metric: 0.7967691374397543
[13:27:25] Epoch: 4, train loss: 0.4932568371295929, val loss: 0.18968094885349274, val metric: 0.8010486340673248

### Alternatively, load the fitted model

**GPU is required.**

In [None]:
# Restore the previously fitted model from its joblib dump.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
model = joblib.load(
    DATA_PATH.joinpath(
        "models",
        "lamann_autoint_8053_full_dataset",
        "lamann_autoint_8053_full_dataset.joblib",
    )
)

### Save model file, parameters, test and oof predictions.

In [None]:
# Name under which the model artifacts are stored, and the directory that holds them.
MODEL_NAME = "lamann_autoint_8053_full_dataset"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True: also create the intermediate "models" directory, so this works on a fresh checkout
# where DATA_PATH / "models" does not exist yet (plain mkdir would raise FileNotFoundError).
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the artifacts needed for later stacking/blending and inference:
# OOF predictions + OOF score, the fitted model, its parameters and the test predictions.

# `oof` is only assigned by the "Fit + Tune" cell. When the fitted model was loaded from disk
# instead, no OOF predictions exist in this session; the OOF artifacts are then skipped
# (any previously saved files are left untouched) instead of failing with a NameError.
if "oof" in globals():
    res = pd.DataFrame()
    # Second column of the OOF array (presumably the positive-class probability — confirm).
    res[MODEL_NAME] = oof[:, 1]
    res.to_csv(MODEL_DIR / "oof.csv", index=False)

    with (MODEL_DIR / "score.txt").open("w") as f:
        print("OOF:", metric(y_train, oof), file=f)
else:
    print("`oof` is not defined (model was loaded, not fitted): skipping oof.csv and score.txt")

joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

with (MODEL_DIR / "params.yaml").open("w") as f:
    yaml.dump(model.params, f)

# Test-set predictions, using the same feature columns as for training.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)