In [1]:
import sys; sys.path.append("../../../../automl/")

In [2]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from src.automl.model.lama import TabularLamaUtilized
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [3]:
# One-time notebook setup through project helpers imported from src.automl.
# NOTE(review): presumably creates the ML data directory and configures the
# root logger (judging by the helper names only) — confirm against src.automl.
create_ml_data_dir()
configure_root_logger()

## Constants

In [4]:
# Notebook-wide settings; every path is relative to the notebook's directory.
CONFIG_PATH = Path("../../../../configs/config.yaml")  # YAML config with the feature lists
DATA_PATH = Path("../../../../data/")                  # root folder of the parquet inputs
N_JOBS = 16                                            # parallelism setting for the models
RANDOM_SEED = 77                                       # seed for any sampling done here

In [5]:
# Parse the project config; the safe loader only builds plain Python objects.
with CONFIG_PATH.open() as config_file:
    cfg = yaml.safe_load(config_file)

## Data

In [6]:
# Load the training frame (train_preproc_oof.parquet) from DATA_PATH.
df_train = pd.read_parquet(DATA_PATH / "train_preproc_oof.parquet")
# Disabled experiment: id-ordered hold-out split (first 300k rows train, rest test).
#df_train, df_test = df_train.sort_values(by="id").iloc[:300_000], df_train.sort_values(by="id").iloc[300_000:]

In [7]:
# Order rows by id and renumber the index 0..n-1 in a single call.
df_train = df_train.sort_values("id", ignore_index=True)

In [8]:
# Class balance of the target as shares (normalize=True), not raw counts.
df_train["target"].value_counts(normalize=True)

0    0.940982
1    0.059018
Name: target, dtype: float64

In [9]:
# undersample the 0 class
#df_train = pd.concat([df_train.loc[df_train.target == 1], df_train.loc[df_train.target == 0].sample(200_000, random_state=RANDOM_SEED)], ignore_index=True)


In [10]:
# Every integer-typed column except the label and the row id is treated as
# categorical by the model cells below.
cat_columns = list(
    df_train.drop(columns=["target", "id"]).select_dtypes(int).columns
)

In [11]:
# Feature matrix: stacked-model prediction columns, then the selected raw
# features, then the categorical columns. The label is kept separately.
X_train = df_train[cfg["stack_features"] + cfg["selected_features"] + cat_columns]
y_train = df_train["target"]
#X_test, y_test = df_test[cfg["selected_features"] + cat_columns], df_test["target"]

In [12]:
# Sanity check: class shares of the label actually passed to the models.
display(y_train.value_counts(normalize=True))
#display(y_test.value_counts(normalize=True))

0    0.940982
1    0.059018
Name: target, dtype: float64

In [13]:
#categorical_features = ohe_cols# + oe_cols

In [14]:
# Preview the training frame (the rendered repr is truncated to the first and last rows/columns).
df_train

Unnamed: 0,target,smpl,id,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,...,feature_154,feature_155,feature_156,feature_157,feature_158,feature_159,feature_160,feature_161,feature_162,feature_163,feature_164,feature_165,feature_166,feature_167,feature_168,feature_169,feature_170,feature_171,feature_172,feature_173,feature_174,feature_175,feature_176,feature_177,feature_178,feature_179,feature_180,feature_181,feature_182,feature_183,feature_184,feature_185,feature_186,lamau_81425_full_dataset,lgb_8122_full_dataset,cb_8114_full_dataset,xgb_81325_full_dataset,lama_81298_full_dataset,lamann_autoint_8053_full_dataset,lamann_fttransformer_8050_full_dataset
0,0,train,0,0.131533,0.953282,-0.753298,0.321620,0.753557,-1.050702,49,0.279169,0.722191,-0.730610,-0.760519,0.386393,1.281216,0.782489,0.311110,-0.522017,1.260190,-1.227740,-1.049272,-0.769352,-0.604599,0.192394,-0.993057,-1.323136,-1.288055,0.527455,-0.022240,0.973419,-0.739394,0.404531,0,-0.818497,-0.156309,-0.261131,-1.341291,0.894532,0.779233,...,-0.123593,0.277190,0,-0.633261,1.048579,-0.534421,-0.313637,-1.031983,-1.021433,21,-0.493691,-1.986774,0.370009,0,1.376156,-1.340452,0.192977,-1.551252,0.269545,-0.794526,-0.234620,-0.055814,0.277029,-1.244719,-0.919482,8,1.466316,0.089004,1.837843,0.725687,-0.754875,98,-0.021047,0.023596,0.013489,0.079370,0.023600,0.024494,0.017239,0.023806
1,0,train,1,1.178071,0.398071,-0.505135,1.095571,0.714723,1.120692,0,-0.333070,0.818182,0.947475,0.375749,-0.026053,-0.132769,-0.676351,0.462917,1.407544,1.188619,0.945775,-0.564650,0.607051,-0.236859,-0.684764,-0.831154,0.598141,0.658130,0.728178,0.530996,1.393327,0.382338,-0.578627,1,0.241344,1.037941,-2.053609,0.897298,-0.163447,-0.135790,...,1.013775,-0.492519,0,1.204630,0.484895,-3.228340,0.275244,0.292676,0.811725,0,0.610282,1.103147,-0.927065,1,-0.558339,0.744327,1.681108,-0.015643,-1.240324,-0.599710,0.016882,-0.222279,-0.102038,-0.109116,-0.712783,8,0.168960,0.506208,0.103012,-1.227253,1.307303,0,-0.367065,0.021009,0.021553,0.050553,0.025267,0.019179,0.010484,0.013266
2,1,train,2,-0.645169,-0.227738,-0.978297,-1.213392,-0.806471,-0.377594,1,0.096883,0.728835,0.745346,-0.407901,0.571526,-0.726239,0.391985,1.572188,0.468137,-0.610774,0.219000,-1.025929,-0.445001,-0.012261,0.107585,1.632128,0.535899,-0.171477,-1.133167,0.385826,-1.621111,-0.467349,-0.921219,2,0.758828,-0.598302,-1.328814,0.227190,-0.769274,-0.834413,...,-0.969851,-0.165852,1,0.153001,-1.700664,1.601700,-0.838072,-0.454915,-0.053284,1,0.313894,0.141476,-0.552134,2,-1.052573,-0.007031,-0.267300,3.064286,-1.409659,0.944090,-0.950352,-0.139095,0.020943,-0.434769,-0.597094,8,0.095252,-0.601783,-0.535835,-0.687882,0.270950,1,-0.565696,0.067791,0.070798,0.215963,0.065025,0.069205,0.086875,0.189735
3,0,train,3,-1.955243,-0.229058,1.158362,1.446567,1.133692,-0.917416,2,-1.165934,1.577401,-0.464564,-0.190382,0.941792,0.690178,-0.429863,0.024189,0.100419,1.977306,-0.264587,-0.248304,0.879089,-0.284751,-0.023610,-0.237903,1.089011,-1.188013,0.090692,1.621569,1.978759,1.004087,-0.385069,1,0.622043,0.442276,-0.607518,-1.553079,-0.877991,0.479469,...,0.512077,0.280452,0,-0.105890,0.781871,1.143187,2.059018,-0.078121,0.361635,2,1.124844,-0.423303,1.030308,3,1.077594,-0.554907,-1.583742,-0.258054,2.287308,-0.510872,0.482168,-0.299237,1.335138,-0.983365,-1.504039,8,0.547632,0.296662,0.043079,0.980345,1.723855,2,0.929542,0.020837,0.021344,0.070169,0.021469,0.021638,0.028460,0.033119
4,0,train,4,0.442111,0.195168,-0.386245,0.407316,-0.436025,1.134586,1,1.872517,-1.484988,-0.690340,1.783132,1.428712,-0.713936,0.702171,1.203967,1.378309,-0.325925,1.064929,0.248372,2.037269,1.180247,-0.500312,2.248632,0.384196,0.046286,-1.001776,-0.543578,-1.023683,1.339766,-1.531914,2,0.344246,1.989566,-0.651738,-0.191820,-0.145032,0.460657,...,-0.663086,0.140905,1,0.496315,1.188229,0.519970,0.054354,1.777253,-0.884452,1,1.446233,-1.720222,0.978612,2,0.664399,-0.021149,0.374739,1.549963,1.701781,0.053870,1.049770,2.181977,1.437121,2.145431,0.872518,8,0.113541,0.521924,0.094967,1.438343,-0.146671,1,2.055875,0.003268,0.002256,0.016474,0.002406,0.002835,0.002161,0.003179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413189,0,train,413189,-1.154941,-0.829717,-1.268453,-0.356428,-0.822933,0.964195,1,-0.484778,0.029571,0.774053,1.201873,0.094709,-0.198506,-1.110228,-0.234575,0.281889,0.299335,0.875548,0.216249,0.530497,0.316565,0.623539,0.253259,-0.969736,-0.678112,0.753615,-1.149565,-0.587701,0.257227,0.736858,2,-0.327038,-0.867046,-1.405382,-1.158684,0.101827,-0.021304,...,0.112123,-0.127416,1,-2.511600,-0.615228,-0.425402,-0.867366,1.252485,-1.447330,1,-0.557916,-0.355228,0.671082,2,-0.674425,-0.660521,0.971358,-0.806504,-2.315549,-0.526383,-1.600757,0.214823,0.388441,0.125703,-0.108432,8,0.173829,-0.666265,-0.153305,0.352068,-0.991257,1,0.017719,0.027886,0.025632,0.107231,0.024223,0.028619,0.035453,0.026665
413190,0,train,413190,-0.557809,-0.370656,-1.117603,0.434343,-0.801104,-0.515650,1,-1.241646,1.029897,1.892813,-0.495689,-0.707535,-0.143122,-0.399202,1.437433,-0.421678,-0.421050,-1.810876,-0.252017,-1.753064,-0.296607,1.127620,-0.412457,-1.674431,-2.120332,-0.541750,0.449971,0.134161,-0.938971,1.402169,2,-2.260702,0.154833,0.181406,-0.527623,-0.447073,0.873399,...,-0.920233,-1.761184,1,-0.555512,1.130002,-0.077044,-0.564217,-0.653959,2.095708,1,0.349993,1.529211,0.035925,2,-0.860280,-0.533620,0.433028,-1.186149,-0.276115,-0.802790,-2.231033,-1.427847,-0.518919,-0.708357,1.255197,8,0.593378,-1.332561,1.600622,-0.952750,-0.881013,1,-0.057485,0.042629,0.044959,0.133694,0.042827,0.036438,0.048080,0.044908
413191,0,train,413191,-0.579914,-1.080645,-0.855200,0.820872,-1.433521,-0.497681,1,-1.444906,-1.897505,1.095496,-2.173016,-0.889065,0.038481,-1.750720,1.560384,-1.174797,-0.898809,-0.975876,-1.009634,0.088143,-0.911838,-2.756771,-1.254225,0.017606,-0.476039,-0.160221,0.565799,-0.767802,-2.505023,0.335085,2,0.374183,0.036883,-0.113893,-1.035395,0.138019,-2.071544,...,0.421071,0.149998,1,-1.672017,0.165187,0.476620,-0.050669,-1.714348,-0.252365,1,0.071071,-0.636588,-1.427133,2,-0.879168,-0.720226,-1.092033,-1.191655,0.731434,1.042668,-1.906482,-1.037265,1.796735,-0.259775,0.814502,8,0.494727,0.229257,1.521202,-0.779365,-1.379297,1,-1.051693,0.234578,0.229939,0.525223,0.208905,0.234172,0.269151,0.244836
413192,0,train,413192,-1.298108,0.703867,-1.172353,0.319265,-0.308155,0.765472,1,-0.342075,0.217699,-0.780007,-0.147333,1.165468,1.155035,-0.185783,-0.308199,0.499064,-0.153823,0.288863,0.445216,-0.685717,-0.279427,-0.147268,-0.835158,0.120869,-0.905613,1.904665,-0.133373,0.321738,-0.284887,-1.298973,2,1.877799,-1.874387,1.422751,-0.928803,0.804263,0.717711,...,0.207983,-0.461711,1,-0.283647,0.254410,-1.051102,-1.533752,-0.250063,-1.746837,1,-2.152660,0.043048,-0.951610,2,-0.695175,-1.320789,0.198907,1.454876,-1.634491,-0.356433,0.228286,-0.980427,-0.020701,-1.730386,-0.034418,8,0.144395,0.628633,1.031002,0.618204,0.370275,1,1.183939,0.043712,0.039810,0.136429,0.044222,0.040249,0.031114,0.019104


## Model

In [16]:
# Metric object passed to model.tune() and used to score the OOF predictions.
metric = RocAuc()

In [17]:
# Tune, then fit, the stacked TabularLamaUtilized model and report its OOF score.
# The worker count comes from the N_JOBS constant instead of a repeated literal.
model = TabularLamaUtilized(n_jobs=N_JOBS, task="classification", time_series=True)
# timeout is given in seconds: 60 * 60 = one hour.
model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

# NOTE(review): with time_series=True the OOF array may contain NaN rows (the
# LightGBM section further down strips them before scoring) — confirm that the
# metric handles them here.
print(metric(y_train, oof))

[2024-11-10 16:44:46,576] - [   START    ] - Fitting TabularLamaUtilized
[16:44:46] Start automl [1mutilizator[0m with listed constraints:
[16:44:46] - time: 7200.00 seconds
[16:44:46] - CPU: 16 cores
[16:44:46] - memory: 16 GB

[16:44:46] [1mIf one preset completes earlier, next preset configuration will be started[0m

[16:44:46] Start 0 automl preset configuration:
[16:44:46] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[16:44:46] Stdout logging level is INFO.
[16:44:46] Task: binary

[16:44:46] Start automl preset with listed constraints:
[16:44:46] - time: 7200.00 seconds
[16:44:46] - CPU: 16 cores
[16:44:46] - memory: 16 GB

[16:44:46] [1mTrain data shape: (413194, 70)[0m

[16:45:00] Layer [1m1[0m train process start. Time left 7186.26 secs
[16:45:14] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[16:45:39] Fitting [1mLvl_0_Pipe_0_Mod_0_L

[17:04:51] Task: binary

[17:04:51] Start automl preset with listed constraints:
[17:04:51] - time: 5994.79 seconds
[17:04:51] - CPU: 16 cores
[17:04:51] - memory: 16 GB

[17:04:51] [1mTrain data shape: (413194, 70)[0m

[17:04:52] Layer [1m1[0m train process start. Time left 5993.82 secs
[17:05:06] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[17:05:28] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8083145050433943[0m
[17:05:28] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[17:05:28] Time left 5958.07 secs

[17:05:36] [1mSelector_LightGBM[0m fitting and predicting completed
[17:05:49] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[17:06:11] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8001843686375795[0m
[17:06:11] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[17:06:11] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 3

[17:25:30] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8024688673049791[0m
[17:25:30] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[17:25:30] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 300.00 secs
[17:30:32] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[17:30:32] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[17:30:51] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.8141230085156994[0m
[17:30:51] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[17:30:51] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[17:31:03] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8129466418487088[0m
[17:31:03] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[17:31:03] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m .

[17:41:04] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[17:41:14] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8127052237777145[0m
[17:41:14] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[17:41:14] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs
[17:41:15] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[17:41:26] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8132172374781069[0m
[17:41:26] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[17:41:26] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[17:41:37] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8128069262900133[0m
[17:41:37] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[17:41:37] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[17:41:50] Fitting [1mLvl_0_Pipe_1

[17:46:42] Stdout logging level is INFO.
[17:46:42] Task: binary

[17:46:42] Start automl preset with listed constraints:
[17:46:42] - time: 3484.49 seconds
[17:46:42] - CPU: 16 cores
[17:46:42] - memory: 16 GB

[17:46:42] [1mTrain data shape: (413194, 70)[0m

[17:46:55] Layer [1m1[0m train process start. Time left 3471.52 secs
[17:47:09] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[17:47:30] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.808231610514445[0m
[17:47:30] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[17:47:30] Time left 3435.96 secs

[17:47:38] [1mSelector_LightGBM[0m fitting and predicting completed
[17:47:39] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[17:48:01] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8043815766067242[0m
[17:48:01] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[17:48:01] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1

[18:13:07] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.8137078614901281[0m
[18:13:07] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[18:13:07] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[18:13:19] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8129637020509768[0m
[18:13:19] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[18:13:19] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs
[18:15:32] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[18:15:32] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
[18:15:53] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8134898959337038[0m
[18:15:53] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[18:15:53] Time left 1732.84 secs

[18:15:53] [1mLayer 1 training completed.[0m

[18:32:13] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8134819285574613[0m
[18:32:13] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[18:32:13] Time left 753.03 secs

[18:32:13] [1mLayer 1 training completed.[0m

[18:32:13] Blending: optimization starts with equal weights and score [1m0.8136570756999985[0m
[18:32:18] Blending: iteration [1m0[0m: score = [1m0.8139929531552004[0m, weights = [1m[0.         0.13541941 0.30341718 0.13679096 0.42437243][0m
[18:32:23] Blending: iteration [1m1[0m: score = [1m0.8139988096160076[0m, weights = [1m[0.         0.14959873 0.2974844  0.09410293 0.45881397][0m
[18:32:27] Blending: iteration [1m2[0m: score = [1m0.8139998229653347[0m, weights = [1m[0.         0.15242389 0.30310234 0.07054314 0.47393057][0m
[18:32:32] Blending: iteration [1m3[0m: score = [1m0.8139999717590907[0m, weights = [1m[0.         0.15308547 0.3028329  0.06702238 0.47705925][0m
[18:32:37] Blendi

In [19]:
# List the stacked-model prediction columns configured as model inputs.
cfg["stack_features"]

['lamau_81425_full_dataset',
 'lgb_8122_full_dataset',
 'cb_8114_full_dataset',
 'xgb_81325_full_dataset',
 'lama_81298_full_dataset',
 'lamann_autoint_8053_full_dataset',
 'lamann_fttransformer_8050_full_dataset']

In [18]:
# Score the test set with the fitted model and write a submission file.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
# Select exactly the columns the model was fitted on, in the same order. The
# previous hand-built list used a different order than X_train
# (selected + stack + cat instead of stack + selected + cat).
test["target"] = model.predict(test[X_train.columns])[:, 1]
# Column 1 of predict() is written as the target score.
test[["id", "target"]].to_csv("lamau_stack_time_series.csv", index=False)

In [22]:
# First 15 rows of the feature-score table exposed by the wrapped model.
model.model.get_feature_scores()[:15]

Unnamed: 0,Feature,Importance
0,xgb_81325_full_dataset,20775.868872
1,lama_81298_full_dataset,17319.485724
2,lamau_81425_full_dataset,16520.024417
3,lgb_8122_full_dataset,4307.927217
4,cb_8114_full_dataset,2223.026608
5,lamann_fttransformer_8050_full_dataset,1503.685369
6,feature_14,1115.47876
7,feature_18,1104.18329
8,feature_20,1060.814309
9,feature_96,1060.33595


In [14]:
# Tune, then fit, a plain TabularLama model on the same matrices and print its OOF score.
# NOTE(review): TabularLama is not imported in the notebook's import cell
# (only TabularLamaUtilized is) — add the import before a fresh Run All.
model = TabularLama(n_jobs=N_JOBS, task="classification")  # worker count from the shared constant
# timeout is given in seconds: 60 * 30 = thirty minutes.
model.tune(X_train, y_train, metric, timeout=60 * 30, categorical_features=cat_columns)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

print(metric(y_train, oof))

[2024-11-08 09:07:53,431] - [   START    ] - Fitting TabularLama
[09:07:53] Stdout logging level is INFO.
[09:07:53] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[09:07:53] Task: binary

[09:07:53] Start automl preset with listed constraints:
[09:07:53] - time: 3600.00 seconds
[09:07:53] - CPU: 16 cores
[09:07:53] - memory: 16 GB

[09:07:53] [1mTrain data shape: (413194, 21)[0m

[09:07:57] Layer [1m1[0m train process start. Time left 3596.00 secs
[09:07:57] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[09:08:00] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8082552247714897[0m
[09:08:00] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[09:08:00] Time left 3592.98 secs

[09:08:05] [1mSelector_LightGBM[0m fitting and predicting completed
[09:08:05] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[09:08:31] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1

In [21]:
# Tune, then fit, a plain TabularLama model and print its OOF score.
# NOTE(review): TabularLama is not imported in the notebook's import cell
# (only TabularLamaUtilized is) — add the import before a fresh Run All.
model = TabularLama(n_jobs=N_JOBS, task="classification")  # worker count from the shared constant
# timeout is given in seconds: 60 * 30 = thirty minutes.
model.tune(X_train, y_train, metric, timeout=60 * 30, categorical_features=cat_columns)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

print(metric(y_train, oof))

[2024-11-07 13:50:21,142] - [   START    ] - Fitting TabularLama
[13:50:21] Stdout logging level is INFO.
[13:50:21] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[13:50:21] Task: binary

[13:50:21] Start automl preset with listed constraints:
[13:50:21] - time: 3600.00 seconds
[13:50:21] - CPU: 16 cores
[13:50:21] - memory: 16 GB

[13:50:21] [1mTrain data shape: (413194, 68)[0m

[13:50:34] Layer [1m1[0m train process start. Time left 3586.87 secs
[13:50:48] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[13:51:42] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8078772570316353[0m
[13:51:42] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[13:51:42] Time left 3518.73 secs

[13:51:49] [1mSelector_LightGBM[0m fitting and predicting completed
[13:52:01] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[13:52:35] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1

In [23]:
# Output folder for this model's artifacts, derived from the shared DATA_PATH
# root rather than a second hard-coded relative path.
MODEL_NAME = "lama_stack_8136_full_dataset"
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True: also create the intermediate "models" folder on a fresh checkout.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [16]:
# res = pd.DataFrame()
# res[MODEL_NAME] = oof[:, 1]
# res.to_csv(MODEL_DIR / "oof.csv", index=False)
# #joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# with (MODEL_DIR / "params.yaml").open("w") as f:
#     yaml.dump(model.params, f)

# with (MODEL_DIR / "score.txt").open("w") as f:
#     print("OOF:", metric(y_train, oof), file=f)
    
# test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
# test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
# test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)

In [24]:
# Score the test set with the fitted model and write a submission named after the model.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
# Select exactly the columns the model was fitted on, in the same order. The
# previous hand-built list used a different order than X_train
# (selected + stack + cat instead of stack + selected + cat).
test["target"] = model.predict(test[X_train.columns])[:, 1]
# Column 1 of predict() is written as the target score.
test[["id", "target"]].to_csv(f'{MODEL_NAME}.csv', index=False)

In [26]:
# First 50 rows of the feature-score table exposed by the wrapped model.
model.model.get_feature_scores()[:50]

Unnamed: 0,Feature,Importance
0,lamau_814_full_dataset,78048.337071
1,xgb_81325_full_dataset,29297.001751
2,lama_81298_full_dataset,24098.444188
3,feature_162,1398.695796
4,feature_24,1321.674997
5,feature_18,1298.269506
6,feature_26,1290.120701
7,feature_145,1287.683197
8,feature_78,1279.916003
9,feature_36,1248.758602


In [24]:
# Two-column table (feature name, importance) taken from the first fitted sub-model.
imp = pd.DataFrame(
    {
        "names": model.models[0].feature_names_,
        "imp": model.models[0].feature_importances_,
    }
)

In [41]:
# Rank features by importance (descending); after the index reset, the index
# of the returned row is the feature's zero-based rank.
(
    imp.sort_values(by="imp", ascending=False)
    .reset_index(drop=True)
    .query("names == 'feature_185'")
)

Unnamed: 0,names,imp
61,feature_185,0.0


In [29]:
# Show which columns were picked up as categorical (the integer-typed ones).
cat_columns

['feature_7',
 'feature_31',
 'feature_60',
 'feature_61',
 'feature_71',
 'feature_109',
 'feature_122',
 'feature_156',
 'feature_163',
 'feature_167',
 'feature_179',
 'feature_185']

## With time-series cross-validation

In [13]:
# Metric object for the time-series section (same RocAuc class as in the Model section).
metric = RocAuc()

In [14]:
# Re-establish id order, then rebuild the matrices WITHOUT the stacked-model
# columns: only the selected raw features and the categoricals are used here.
df_train = df_train.sort_values("id", ignore_index=True)
X_train = df_train[cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

In [15]:
# Tune, then fit, the LightGBM wrapper with time-series validation and print its OOF score.
# NOTE(review): LightGBMClassification is not imported in the notebook's import
# cell — add the import before a fresh Run All.
model = LightGBMClassification(n_jobs=N_JOBS, time_series=True)  # worker count from the shared constant
# timeout is given in seconds: 60 * 60 = one hour.
model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

# NOTE(review): the following cells count NaN rows in `oof` and score only the
# remainder — confirm what this full-array score means when NaNs are present.
print(metric(y_train, oof))

[2024-11-07 08:27:33,388] - [   START    ] - Tuning LightGBMClassification
[2024-11-07 08:27:40,118] - [   OPTUNA   ] - Trial 0. New best score 0.7903405446081995 with parameters {'max_depth': 6, 'num_leaves': 488, 'min_data_in_leaf': 188, 'bagging_fraction': 0.7993292420985183, 'bagging_freq': 0, 'feature_fraction': 0.49359671220172163, 'lambda_l1': 0.5808361216819946, 'lambda_l2': 8.661761457749352, 'min_gain_to_split': 12.022300234864176, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 08:28:03,043] - [   OPTUNA   ] - Trial 2. New best score 0.7913515589848906 with parameters {'max_depth': 5, 'num_leaves': 194, 'min_data_in_leaf': 117, 'bagging_fraction': 0.8925879806965068, 'bagging_freq': 0, 'feature_fraction': 0.708540663048167, 'lambda_l1': 5.924145688620425, 'lambda_l2': 0.46450412719997725, 'min_gain_to_split': 12.150897038028766, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 08:28:12,225] - [   OPTUNA   ] - Trial 3. New best score 0.8048686053278628 with param

In [27]:
# Number of OOF rows that contain at least one NaN.
# NOTE(review): later cells use this count as a slice start, which assumes the
# NaN rows form a contiguous prefix of `oof` — confirm.
none_oofs_idx = int(np.isnan(oof).any(axis=1).sum())

In [32]:
# Score only the rows after the first `none_oofs_idx` positions, i.e. skipping
# the leading block counted as NaN rows in the previous cell.
metric(y_train[none_oofs_idx:], oof[none_oofs_idx:])

0.8095227594190041

In [34]:
# Output folder for the time-series LightGBM artifacts.
MODEL_NAME = "lgb_8095_full_dataset_time_series"
# Built from DATA_PATH: the previous literal climbed three directory levels
# while DATA_PATH (and the other MODEL_DIR in this notebook) climb four.
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True: also create the intermediate "models" folder on a fresh checkout.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [35]:
# Persist everything produced by the time-series LightGBM run:
# OOF predictions, the fitted model, its params, the OOF score and test predictions.

# OOF scores (column 1 of `oof`) for the rows past the leading NaN block.
# NOTE(review): the saved file has no id column and is shorter than the
# training frame by `none_oofs_idx` rows — confirm consumers re-align it.
res = pd.DataFrame()
res[MODEL_NAME] = oof[none_oofs_idx:, 1]
res.to_csv(MODEL_DIR / "oof.csv", index=False)
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

with (MODEL_DIR / "params.yaml").open("w") as f:
    yaml.dump(model.params, f)

# Score the same NaN-free slice that is saved above and that the score cell
# uses; the full `oof` array still contains the NaN rows counted earlier.
with (MODEL_DIR / "score.txt").open("w") as f:
    print("OOF:", metric(y_train[none_oofs_idx:], oof[none_oofs_idx:]), file=f)

# Test-set predictions, stored next to the other artifacts.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)

## TEST 
**81.22112399468679**

## Inference

In [27]:
# Predict on the test parquet with the current `model` and write an id/target CSV
# into the working directory.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test.loc[:, ["id", "target"]].to_csv("lgb_813.csv", index=False)

In [25]:
# Load the three prediction files that are blended in the next cell.
pred_1, pred_2, pred_3 = (
    pd.read_csv(csv_name)
    for csv_name in ("lama_utilized.csv", "lgmb_oe_ohe_cols_0805.csv", "catboost_ts.csv")
)

In [26]:
# Weighted blend (0.6 / 0.2 / 0.2) written back into pred_1's target column.
# NOTE(review): rows are combined by index position, not by id, and re-running
# this cell blends an already-blended pred_1 — reload the CSVs before a re-run.
pred_1["target"] = (
    0.6 * pred_1["target"]
    + 0.2 * pred_2["target"]
    + 0.2 * pred_3["target"]
)

In [29]:
# Write the blended predictions (held in pred_1) to blend.csv without the index column.
pred_1.to_csv("blend.csv", index=False)

In [166]:
# Leftover interactive help lookup (IPython `?`); it has no effect on the
# analysis and can be removed before the notebook is shared.
MODEL_DIR.open?

[0;31mSignature:[0m
[0mMODEL_DIR[0m[0;34m.[0m[0mopen[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'r'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbuffering[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merrors[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnewline[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Open the file pointed by this path and return a file object, as
the built-in open() function does.
[0;31mFile:[0m      /usr/lib/python3.10/pathlib.py
[0;31mType:[0m      method