In [1]:
import sys; sys.path.append("../../../automl/")

In [2]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from src.automl.model.lama import TabularLamaUtilized
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [3]:
# One-time notebook setup through two project helpers imported from src.automl
# (their implementations are not visible in this notebook).
# NOTE(review): judging by the names, these create the ML data directory and
# configure the root logger — confirm in src.automl.constants / src.automl.loggers.
create_ml_data_dir()
configure_root_logger()

## Constants

In [4]:
# Notebook-wide configuration. All paths are relative to the notebook's working directory.
RANDOM_SEED = 77  # seed for any stochastic step (e.g. sampling)
DATA_PATH = Path("../../../data/")  # directory holding the parquet inputs and model artifacts
CONFIG_PATH = Path("../../../configs/config.yaml")  # YAML config; provides the "selected_features" list
N_JOBS = 16  # intended degree of parallelism (n_jobs) for model training

In [5]:
# Parse the project config with PyYAML's safe loader; the file is read in
# full and closed before parsing.
cfg = yaml.safe_load(CONFIG_PATH.read_text())

## Data

In [10]:
# Rows (ordered by "id") that go into the training split; everything after
# them becomes the hold-out split.
N_TRAIN_ROWS = 300_000

# Sort once and slice the same sorted frame twice, instead of sorting the
# full frame separately for each split.
df_sorted = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet").sort_values(by="id")
df_train, df_test = df_sorted.iloc[:N_TRAIN_ROWS], df_sorted.iloc[N_TRAIN_ROWS:]

In [11]:
# Absolute class counts of the target in the training split.
df_train["target"].value_counts()

0    282256
1     17744
Name: target, dtype: int64

In [9]:
# Optional step (currently disabled): undersample the majority class (target == 0) to 200,000 rows.
#df_train = pd.concat([df_train.loc[df_train.target == 1], df_train.loc[df_train.target == 0].sample(200_000, random_state=RANDOM_SEED)], ignore_index=True)


In [12]:
# Every integer-typed column other than the target and the id is treated as
# a categorical feature.
cat_columns = (
    df_train
    .drop(columns=["target", "id"])
    .select_dtypes(include=int)
    .columns
    .tolist()
)

In [13]:
# Model inputs: the configured feature list followed by the categorical columns.
# NOTE(review): a column present in both lists would be selected twice — confirm they are disjoint.
feature_columns = cfg["selected_features"] + cat_columns

X_train = df_train[feature_columns]
y_train = df_train["target"]

X_test = df_test[feature_columns]
y_test = df_test["target"]

In [14]:
# Show the class proportions of the training split, then of the hold-out
# split, to check that both have a similar balance.
for labels in (y_train, y_test):
    display(labels.value_counts(normalize=True))

0    0.940853
1    0.059147
Name: target, dtype: float64

0    0.941322
1    0.058678
Name: target, dtype: float64

In [16]:
#categorical_features = ohe_cols# + oe_cols

## Model

In [16]:
# Project metric object: passed to model.tune() and called as
# metric(y_true, predictions) to score OOF and hold-out predictions.
metric = RocAuc()

In [17]:
# Build the model with the worker count from the constants cell rather than
# a repeated literal, so parallelism is configured in one place.
model = TabularLamaUtilized(n_jobs=N_JOBS, task="classification")

# Tuning pass, limited by `timeout`.
# NOTE(review): 60 * 60 is presumably one hour expressed in seconds — confirm
# the unit against TabularLamaUtilized.tune.
model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)

# The value returned by fit() is used downstream as the out-of-fold
# predictions for the training rows.
oof = model.fit(X_train, y_train, categorical_features=cat_columns)
y_pred = model.predict(X_test)

# Metric on the out-of-fold predictions, then on the hold-out split.
print(metric(y_train, oof))
print(metric(y_test, y_pred))

[2024-11-06 15:11:26,708] - [   START    ] - Fitting TabularLamaUtilized
[15:11:26] Start automl [1mutilizator[0m with listed constraints:
[15:11:26] - time: 7200.00 seconds
[15:11:26] - CPU: 16 cores
[15:11:26] - memory: 16 GB

[15:11:26] [1mIf one preset completes earlier, next preset configuration will be started[0m

[15:11:26] Start 0 automl preset configuration:
[15:11:26] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[15:11:26] Stdout logging level is INFO.
[15:11:26] Task: binary

[15:11:26] Start automl preset with listed constraints:
[15:11:26] - time: 7200.00 seconds
[15:11:26] - CPU: 16 cores
[15:11:26] - memory: 16 GB

[15:11:26] [1mTrain data shape: (300000, 63)[0m

[15:11:37] Layer [1m1[0m train process start. Time left 7189.72 secs
[15:11:45] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[15:12:22] Fitting [1mLvl_0_Pipe_0_Mod_0_L

[15:40:31] Layer [1m1[0m train process start. Time left 5455.51 secs
[15:40:40] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[15:41:17] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7888030003884329[0m
[15:41:17] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[15:41:17] Time left 5408.93 secs

[15:41:25] [1mSelector_LightGBM[0m fitting and predicting completed
[15:41:33] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[15:42:11] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8102081895361546[0m
[15:42:11] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[15:42:11] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 300.00 secs
[15:47:14] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[15:47:14] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[15:47:53] Fitting [1mLvl_0_Pipe_1_Mod_1_Tu

[16:16:03] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[16:16:03] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[16:16:39] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.812073111942671[0m
[16:16:39] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[16:16:39] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[16:17:13] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8072995972551813[0m
[16:17:13] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[16:17:13] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs
[16:22:14] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[16:22:14] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[16:23:20] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8078172918571538[0m
[

[16:36:16] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[16:36:48] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8065595727647544[0m
[16:36:48] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[16:36:48] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[16:37:40] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.807202505108077[0m
[16:37:40] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[16:37:40] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[16:37:40] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[16:38:53] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8082548081643194[0m
[16:38:53] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[16:38:53] Time left 1953.47 secs

[16:38:53] [1mLayer 1 training completed.[0m

[16:38:53] Blending: optimization

[17:04:39] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[17:04:39] Time left 406.87 secs

[17:04:39] [1mLayer 1 training completed.[0m

[17:04:40] Blending: optimization starts with equal weights and score [1m0.8105832568389526[0m
[17:04:43] Blending: iteration [1m0[0m: score = [1m0.8129764974051146[0m, weights = [1m[0.         0.2100713  0.5476953  0.05638938 0.18584405][0m
[17:04:47] Blending: iteration [1m1[0m: score = [1m0.8130014665044016[0m, weights = [1m[0.         0.25869945 0.5184821  0.         0.22281848][0m
[17:04:51] Blending: iteration [1m2[0m: score = [1m0.8130018483666561[0m, weights = [1m[0.        0.2589566 0.5200349 0.        0.2210085][0m
[17:04:55] Blending: iteration [1m3[0m: score = [1m0.8130018483666561[0m, weights = [1m[0.        0.2589566 0.5200349 0.        0.2210085][0m
[17:04:55] Blending: no score update. Terminated

[17:04:55] [1mAutoml preset training completed in 690.41 seconds[0m

[17:04:55] M

In [19]:
# Name shared by the artifact directory and the files written into it.
MODEL_NAME = "lamau_8147"

# Resolves to ../../../data/models/<MODEL_NAME>; built from DATA_PATH so the
# data root is defined in one place.
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME

# parents=True also creates the intermediate "models" directory, so the cell
# does not raise FileNotFoundError on a fresh checkout; exist_ok=True keeps
# re-runs harmless.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [20]:
# Persist everything produced by this run under MODEL_DIR.

# Out-of-fold predictions: a single column named after the model, taken from
# column 1 of `oof`.
# NOTE(review): column 1 is presumably the positive-class score — confirm.
res = pd.DataFrame({MODEL_NAME: oof[:, 1]})
res.to_csv(MODEL_DIR / "oof.csv", index=False)

# Serialized fitted model.
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# Model parameters as YAML.
with open(MODEL_DIR / "params.yaml", "w") as params_file:
    yaml.dump(model.params, params_file)

# Metric values for the out-of-fold and hold-out predictions, one per line.
with open(MODEL_DIR / "score.txt", "w") as score_file:
    print("OOF:", metric(y_train, oof), file=score_file)
    print("Test:", metric(y_test, y_pred), file=score_file)

# Predict on test_preproc_2.parquet and store an id/target CSV next to the
# other artifacts.
feature_columns = cfg["selected_features"] + cat_columns
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[feature_columns])[:, 1]
test[["id", "target"]].to_csv(MODEL_DIR / f"{MODEL_NAME}.csv", index=False)

## Inference

In [27]:
# Predict on test_preproc_2.parquet and write an id/target CSV into the
# current working directory.
# NOTE(review): this repeats the prediction step of the artifact-saving cell,
# only with a different output file — confirm whether both are needed.
feature_columns = cfg["selected_features"] + cat_columns
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test = test.assign(target=model.predict(test[feature_columns])[:, 1])
test.loc[:, ["id", "target"]].to_csv('lgb_813.csv', index=False)

In [25]:
# Load the three prediction files that are blended below, in this order.
pred_1, pred_2, pred_3 = (
    pd.read_csv(csv_name)
    for csv_name in ("lama_utilized.csv", "lgmb_oe_ohe_cols_0805.csv", "catboost_ts.csv")
)

In [26]:
# Weighted blend of the three "target" columns (weights 0.6 / 0.2 / 0.2).
# The result overwrites pred_1["target"] in place, so re-running this cell
# without re-reading the CSVs applies the weights to already-blended values.
# NOTE(review): rows are matched by index label, not by an id column —
# assumes all three files list their rows in the same order; confirm.
weighted_1 = pred_1["target"].mul(0.6)
weighted_2 = pred_2["target"].mul(0.2)
weighted_3 = pred_3["target"].mul(0.2)
pred_1["target"] = weighted_1.add(weighted_2).add(weighted_3)

In [29]:
# Write pred_1 (whose "target" column now holds the blended values) to
# blend.csv in the current working directory, without the index column.
pred_1.to_csv("blend.csv", index=False)

In [166]:
# Leftover interactive help lookup (IPython `?` syntax) for Path.open; it has
# no effect on the results and can be removed from the final notebook.
MODEL_DIR.open?

Signature:
MODEL_DIR.open(
    mode='r',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
)
Docstring:
Open the file pointed by this path and return a file object, as
the built-in open() function does.
File:      /usr/lib/python3.10/pathlib.py
Type:      method