In [1]:
# Extend the module search path with the repository's `automl` directory
# (the path is relative to the notebook's working directory).
import sys

sys.path.append("../../../../automl/")

In [26]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score

# from src.automl.model.lama import TabularLama
# from src.automl.loggers import configure_root_logger
# from src.automl.constants import create_ml_data_dir
# from src.automl.model.metrics import RocAuc

In [3]:
# Emit Optuna's INFO-level messages (one log line per finished trial).
optuna.logging.set_verbosity(optuna.logging.INFO)

In [4]:
# create_ml_data_dir()
# configure_root_logger()

## Constants

In [5]:
# --- Filesystem locations (relative to the notebook's working directory) ---
DATA_PATH = Path("../../../../data/")
CONFIG_PATH = Path("../../../../configs/config.yaml")

# --- Run settings ---
RANDOM_SEED = 77  # seed handed to the shuffled StratifiedKFold
N_JOBS = 16  # number of parallel workers passed to study.optimize

In [14]:
# Parse the YAML config with the safe loader. `cfg` provides the
# "selected_features" and "stack_features" column lists used further down.
with CONFIG_PATH.open() as config_file:
    cfg = yaml.safe_load(config_file)

## Data

In [7]:
# Load the training table. Later cells read its "target" and "id" columns,
# the feature columns and the stacked-model prediction columns from it.
df_train = pd.read_parquet(DATA_PATH / "train_preproc_oof.parquet")
#df_train, df_test = df_train.sort_values(by="id").iloc[:300_000], df_train.sort_values(by="id").iloc[300_000:]

In [8]:
# Relative frequency of each target class (shows the class imbalance).
df_train["target"].value_counts(normalize=True)

0    0.940982
1    0.059018
Name: target, dtype: float64

In [9]:
# undersample the 0 class
#df_train = pd.concat([df_train.loc[df_train.target == 1], df_train.loc[df_train.target == 0].sample(200_000, random_state=RANDOM_SEED)], ignore_index=True)


In [10]:
# Every integer-typed column other than "target" and "id" is treated as categorical.
cat_columns = (
    df_train
    .drop(columns=["target", "id"])
    .select_dtypes(int)
    .columns
    .tolist()
)

In [11]:
# Feature matrix = selected features + stacked-model predictions + categorical columns.
X_train = df_train[cfg["selected_features"] + cfg["stack_features"] + cat_columns]
# Binary label to predict.
y_train = df_train["target"]
#X_test, y_test = df_test[cfg["selected_features"] + cat_columns], df_test["target"]

In [12]:
# Sanity check: class proportions of the label actually used for training.
display(y_train.value_counts(normalize=True))
#display(y_test.value_counts(normalize=True))

0    0.940982
1    0.059018
Name: target, dtype: float64

In [13]:
#categorical_features = ohe_cols# + oe_cols

## Blend

In [15]:
# Inspect the stacked-model prediction columns listed in the config.
cfg["stack_features"]

['lamau_814_full_dataset',
 'lgb_8122_full_dataset',
 'cb_8114_full_dataset',
 'xgb_81325_full_dataset',
 'lama_81298_full_dataset',
 'lamann_autoint_8053_full_dataset',
 'lamann_fttransformer_8050_full_dataset']

In [16]:
# Restrict the blend to three of the configured stacked models.
# This overrides the in-memory config only; the YAML file is not touched.
cfg["stack_features"] = [
    "lamau_814_full_dataset",
    "xgb_81325_full_dataset",
    "lamann_autoint_8053_full_dataset",
]

In [17]:
def objective(trial):
    """Optuna objective: mean cross-validated score of a convex blend.

    One weight is tuned per column in ``cfg["stack_features"]`` except the
    last, which receives whatever is left so that the weights sum to 1. Each
    tuned weight is drawn from ``[0, 1 - sum(previous weights)]``, so no
    weight (including the remainder) can become negative.

    Relies on the notebook globals ``cfg``, ``X_train``, ``y_train``, ``kf``
    and ``metric``.

    Args:
        trial: Optuna trial used to sample ``w0 .. w{n-2}`` for ``n`` columns.

    Returns:
        Mean of ``metric(y_true, blended_prediction)`` over the test part of
        every split produced by ``kf``.
    """
    stack_features = cfg["stack_features"]

    weights = []
    for i in range(len(stack_features) - 1):
        # Only the mass not yet handed out is available to this weight.
        # Bounding by the sum of the earlier weights instead would let the
        # total exceed 1 and push the remainder weight below zero.
        remaining = max(0.0, 1.0 - float(np.sum(weights)))
        weights.append(trial.suggest_float(f"w{i}", 0, remaining))

    # The final weight is the remainder, not a sampled parameter; the clamp
    # only guards against a tiny negative value from floating-point rounding.
    last_weight = max(0.0, 1.0 - float(np.sum(weights)))
    weights.append(last_weight)
    weights = np.array(weights)

    # Column selection does not depend on the fold, so do it once.
    stacked_preds = X_train[stack_features].to_numpy()

    scores = []
    for _, test_idx in kf.split(X_train, y_train):
        # Blending has nothing to fit, so only the held-out rows are scored.
        blended = np.sum(stacked_preds[test_idx] * weights, axis=1)
        scores.append(metric(y_train.iloc[test_idx], blended))

    return np.mean(scores)
    
    

In [15]:
def objective(trial):
    """Optuna objective for blending exactly two stacked predictions.

    ``w0`` is sampled from [0, 1] and the second weight is ``1 - w0``. The
    blend is scored with ``metric`` on the test rows of every ``kf`` split and
    the mean score is returned. Uses the notebook globals ``cfg``,
    ``X_train``, ``y_train``, ``kf`` and ``metric``.
    """
    first_weight = trial.suggest_float("w0", 0, 1)
    # The two weights always sum to one.
    weights = np.array([first_weight, 1 - first_weight])

    fold_scores = []
    for _, holdout_idx in kf.split(X_train, y_train):
        # Only the held-out rows are needed: there is nothing to fit.
        holdout = X_train.iloc[holdout_idx][cfg["stack_features"]].to_numpy()
        blended = (holdout * weights).sum(axis=1)
        fold_scores.append(metric(y_train.iloc[holdout_idx], blended))

    return np.mean(fold_scores)
    

In [17]:
def objective(trial):
    """Optuna objective for blending exactly three stacked predictions.

    ``w0`` is sampled from [0, 1], ``w1`` from [0, 1 - w0], and the third
    weight is the remainder, so the three weights sum to one. The blend is
    scored with ``metric`` on the test rows of every ``kf`` split and the mean
    score is returned. Uses the notebook globals ``cfg``, ``X_train``,
    ``y_train``, ``kf`` and ``metric``.
    """
    first_weight = trial.suggest_float("w0", 0, 1)
    second_weight = trial.suggest_float("w1", 0, 1 - first_weight)
    # Whatever mass is left goes to the third model.
    third_weight = 1 - np.sum([first_weight, second_weight])
    weights = np.array([first_weight, second_weight, third_weight])

    fold_scores = []
    for _, holdout_idx in kf.split(X_train, y_train):
        # Only the held-out rows are needed: there is nothing to fit.
        holdout = X_train.iloc[holdout_idx][cfg["stack_features"]].to_numpy()
        blended = (holdout * weights).sum(axis=1)
        fold_scores.append(metric(y_train.iloc[holdout_idx], blended))

    return np.mean(fold_scores)
    

In [27]:
# Cross-validation scheme and score used by `objective`.
#
# Previously a StratifiedKFold was built and immediately overwritten by a
# TimeSeriesSplit, so only the latter ever took effect. The choice is now an
# explicit switch; the default keeps the behaviour that was actually in force.
USE_TIME_SERIES_CV = True

if USE_TIME_SERIES_CV:
    # Ordered splits: rows are taken in the order they currently have in X_train.
    kf = TimeSeriesSplit(n_splits=5)
else:
    # Shuffled, seeded folds that keep the class ratio in every fold.
    kf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)

metric = roc_auc_score

In [19]:
# Search the blend weights for 60 seconds, maximising the value returned by
# `objective`. The TPE sampler (Optuna's default algorithm) is seeded with
# RANDOM_SEED so runs are comparable. With n_jobs > 1 trials finish in a
# non-deterministic order, so results may still differ slightly between runs.
study = optuna.create_study(
    study_name="blending",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(objective, timeout=60, n_jobs=N_JOBS)

[I 2024-11-08 13:15:35,475] A new study created in memory with name: blending
[I 2024-11-08 13:15:39,312] Trial 3 finished with value: 0.8142235761580594 and parameters: {'w0': 0.6836461794948678, 'w1': 0.22694190493184788}. Best is trial 3 with value: 0.8142235761580594.
[I 2024-11-08 13:15:39,342] Trial 2 finished with value: 0.8140760282878556 and parameters: {'w0': 0.79411928227457, 'w1': 0.057835541233459006}. Best is trial 3 with value: 0.8142235761580594.
[I 2024-11-08 13:15:39,347] Trial 0 finished with value: 0.8141522688043716 and parameters: {'w0': 0.4960675331774107, 'w1': 0.4072791871481723}. Best is trial 3 with value: 0.8142235761580594.
[I 2024-11-08 13:15:39,361] Trial 6 finished with value: 0.813156152389827 and parameters: {'w0': 0.5582417720861867, 'w1': 0.06827015379040256}. Best is trial 3 with value: 0.8142235761580594.
[I 2024-11-08 13:15:39,369] Trial 4 finished with value: 0.8140552167531506 and parameters: {'w0': 0.4005405415075317, 'w1': 0.5777418837906075}.

[I 2024-11-08 13:15:47,912] Trial 39 finished with value: 0.8142231824092463 and parameters: {'w0': 0.7379187001877576, 'w1': 0.17980725716149454}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:15:47,929] Trial 46 finished with value: 0.8141658919299634 and parameters: {'w0': 0.723171389899619, 'w1': 0.1529190281589999}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:15:47,967] Trial 38 finished with value: 0.8142295233305703 and parameters: {'w0': 0.7229005880374305, 'w1': 0.19787940802868953}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:15:48,005] Trial 44 finished with value: 0.8142112409016127 and parameters: {'w0': 0.7091174989754764, 'w1': 0.19323464972342028}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:15:48,094] Trial 43 finished with value: 0.8141954491295109 and parameters: {'w0': 0.6905715516232102, 'w1': 0.19878952685459114}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:

[I 2024-11-08 13:16:00,062] Trial 88 finished with value: 0.8137655943011068 and parameters: {'w0': 0.45857449452671223, 'w1': 0.2891825812614933}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:00,063] Trial 87 finished with value: 0.8138996026539521 and parameters: {'w0': 0.46378708133823643, 'w1': 0.32153849014903996}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:00,140] Trial 90 finished with value: 0.8142356479923635 and parameters: {'w0': 0.7908543120890943, 'w1': 0.17110978664240256}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:00,181] Trial 86 finished with value: 0.8141867098116762 and parameters: {'w0': 0.881268335209719, 'w1': 0.10617813037649343}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:00,427] Trial 82 finished with value: 0.8141006206049317 and parameters: {'w0': 0.5788873736994952, 'w1': 0.2641533188362362}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13

[I 2024-11-08 13:16:09,360] Trial 125 finished with value: 0.8141383574010742 and parameters: {'w0': 0.6731041967480682, 'w1': 0.18521912357422787}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:10,051] Trial 127 finished with value: 0.8142391528865615 and parameters: {'w0': 0.7650433920189735, 'w1': 0.18098201231505642}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:12,059] Trial 128 finished with value: 0.8142417943716094 and parameters: {'w0': 0.7610366576585104, 'w1': 0.18662057389816264}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:12,083] Trial 129 finished with value: 0.8142415222332646 and parameters: {'w0': 0.7662890755280237, 'w1': 0.18532766653653668}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:12,398] Trial 131 finished with value: 0.8142404701923169 and parameters: {'w0': 0.7659074773610048, 'w1': 0.18224136592266774}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11

[I 2024-11-08 13:16:21,558] Trial 167 finished with value: 0.8142493640016928 and parameters: {'w0': 0.7441745929282696, 'w1': 0.2137995526117936}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:21,612] Trial 168 finished with value: 0.814249287520496 and parameters: {'w0': 0.7470696534242539, 'w1': 0.21333660277051986}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:21,897] Trial 171 finished with value: 0.8142486109564583 and parameters: {'w0': 0.7417236966022323, 'w1': 0.21250732538792935}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:21,925] Trial 170 finished with value: 0.8142492342515624 and parameters: {'w0': 0.745677452914676, 'w1': 0.21351076616446246}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 13:16:21,961] Trial 172 finished with value: 0.8142489869617103 and parameters: {'w0': 0.7430380810966128, 'w1': 0.2127044459276646}. Best is trial 13 with value: 0.8142524546763088.
[I 2024-11-08 

[I 2024-11-08 13:16:33,190] Trial 211 finished with value: 0.814252566110885 and parameters: {'w0': 0.7336633065589887, 'w1': 0.23277198296521806}. Best is trial 210 with value: 0.8142526810652276.
[I 2024-11-08 13:16:33,320] Trial 209 finished with value: 0.8142526288666392 and parameters: {'w0': 0.7335058647836878, 'w1': 0.2328391940792986}. Best is trial 210 with value: 0.8142526810652276.
[I 2024-11-08 13:16:33,705] Trial 212 finished with value: 0.8142243205331244 and parameters: {'w0': 0.8120840745536959, 'w1': 0.1866388794304145}. Best is trial 210 with value: 0.8142526810652276.
[I 2024-11-08 13:16:33,883] Trial 216 finished with value: 0.8142457506958893 and parameters: {'w0': 0.7329436305223928, 'w1': 0.21059436750731503}. Best is trial 210 with value: 0.8142526810652276.
[I 2024-11-08 13:16:34,040] Trial 213 finished with value: 0.8142459569063611 and parameters: {'w0': 0.7315252313107391, 'w1': 0.21139999533631504}. Best is trial 210 with value: 0.8142526810652276.
[I 2024-

In [20]:
# Sampled weights of the best trial; the remainder weight is not stored here.
study.best_params

{'w0': 0.7316298019106628, 'w1': 0.233397153122253}

In [28]:
# Re-run the 60-second weight search with whatever `kf` / `objective` are
# currently defined. The TPE sampler (Optuna's default algorithm) is seeded
# with RANDOM_SEED so runs are comparable. With n_jobs > 1 trials finish in a
# non-deterministic order, so results may still differ slightly between runs.
study = optuna.create_study(
    study_name="blending",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(objective, timeout=60, n_jobs=N_JOBS)

[I 2024-11-08 13:20:07,140] A new study created in memory with name: blending
[I 2024-11-08 13:20:08,833] Trial 2 finished with value: 0.8112373002112833 and parameters: {'w0': 0.08448200397403005, 'w1': 0.41760988949228933}. Best is trial 2 with value: 0.8112373002112833.
[I 2024-11-08 13:20:08,859] Trial 0 finished with value: 0.8125778288421502 and parameters: {'w0': 0.6314920738557078, 'w1': 0.020453663804189136}. Best is trial 0 with value: 0.8125778288421502.
[I 2024-11-08 13:20:09,048] Trial 7 finished with value: 0.8130995621017719 and parameters: {'w0': 0.2516215917907254, 'w1': 0.6176909345774071}. Best is trial 7 with value: 0.8130995621017719.
[I 2024-11-08 13:20:09,081] Trial 15 finished with value: 0.8135008166306259 and parameters: {'w0': 0.6950843238874794, 'w1': 0.27354797998202074}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:09,105] Trial 12 finished with value: 0.8132119815965808 and parameters: {'w0': 0.797431968107258, 'w1': 0.012706352652

[I 2024-11-08 13:20:13,201] Trial 44 finished with value: 0.8134034787118777 and parameters: {'w0': 0.6844580422352826, 'w1': 0.17708159583292157}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:13,271] Trial 45 finished with value: 0.8134359907576642 and parameters: {'w0': 0.7110283027174257, 'w1': 0.17101554355712328}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:13,306] Trial 43 finished with value: 0.8132647241136768 and parameters: {'w0': 0.7300854587750084, 'w1': 0.08321044957724923}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:13,415] Trial 42 finished with value: 0.8133085529203804 and parameters: {'w0': 0.7050955949950181, 'w1': 0.11935299366107399}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:13,649] Trial 46 finished with value: 0.8134171115679093 and parameters: {'w0': 0.7077537259852094, 'w1': 0.1628383562026361}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13

[I 2024-11-08 13:20:19,253] Trial 84 finished with value: 0.813399431158903 and parameters: {'w0': 0.8940642185050593, 'w1': 0.04066490221917765}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:19,409] Trial 86 finished with value: 0.81348078373424 and parameters: {'w0': 0.7732167467149802, 'w1': 0.16525998401545203}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:19,441] Trial 92 finished with value: 0.8134873721717104 and parameters: {'w0': 0.6456222992008052, 'w1': 0.27861145456752595}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:19,486] Trial 87 finished with value: 0.8134582132379415 and parameters: {'w0': 0.6260154425379412, 'w1': 0.2692033146976949}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:19,551] Trial 89 finished with value: 0.8134415400416461 and parameters: {'w0': 0.617894060154245, 'w1': 0.26474016440505377}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:

[I 2024-11-08 13:20:24,057] Trial 125 finished with value: 0.8134895070701553 and parameters: {'w0': 0.7502638624067333, 'w1': 0.23671598007194838}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:24,135] Trial 127 finished with value: 0.8134865885510848 and parameters: {'w0': 0.7549596349068594, 'w1': 0.2374980380596443}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:25,067] Trial 128 finished with value: 0.813487306603751 and parameters: {'w0': 0.7556451013085231, 'w1': 0.23534537458418783}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:25,177] Trial 129 finished with value: 0.8134855724843923 and parameters: {'w0': 0.7625912383047874, 'w1': 0.2285610992031917}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:25,256] Trial 131 finished with value: 0.8134897380831936 and parameters: {'w0': 0.7527568715955927, 'w1': 0.23329627635348232}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08

[I 2024-11-08 13:20:29,922] Trial 167 finished with value: 0.8134936587802791 and parameters: {'w0': 0.7101986924774705, 'w1': 0.2748596132238206}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:29,966] Trial 170 finished with value: 0.8116476163955569 and parameters: {'w0': 0.1915046555770622, 'w1': 0.34571383601722977}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:29,973] Trial 169 finished with value: 0.8134895461768835 and parameters: {'w0': 0.7164150149962708, 'w1': 0.2761107536743384}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:30,021] Trial 171 finished with value: 0.8134922922994532 and parameters: {'w0': 0.7133189826306692, 'w1': 0.2746173286043732}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:30,157] Trial 172 finished with value: 0.813487268579953 and parameters: {'w0': 0.7220354218831566, 'w1': 0.2732691483248022}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 1

[I 2024-11-08 13:20:35,831] Trial 210 finished with value: 0.8134919622605657 and parameters: {'w0': 0.6281993322077044, 'w1': 0.33012283721261554}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:36,048] Trial 211 finished with value: 0.8134909274839732 and parameters: {'w0': 0.6230085944413369, 'w1': 0.3251887817987717}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:36,142] Trial 212 finished with value: 0.8134860336430723 and parameters: {'w0': 0.6211256397651902, 'w1': 0.31241414100420456}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:36,149] Trial 213 finished with value: 0.8134896752398444 and parameters: {'w0': 0.6184290270004754, 'w1': 0.328886523775638}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:36,285] Trial 214 finished with value: 0.813491394017918 and parameters: {'w0': 0.6241705868595818, 'w1': 0.32926069224562216}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 

[I 2024-11-08 13:20:41,093] Trial 252 finished with value: 0.8134972456561161 and parameters: {'w0': 0.6628163617317463, 'w1': 0.30495705178938043}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:41,160] Trial 254 finished with value: 0.8134358825519428 and parameters: {'w0': 0.583625931551891, 'w1': 0.30190840411913233}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:41,174] Trial 253 finished with value: 0.8134989402008106 and parameters: {'w0': 0.6610817913772536, 'w1': 0.3008249348057387}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:41,647] Trial 255 finished with value: 0.8134506527899028 and parameters: {'w0': 0.5935031813774324, 'w1': 0.3014883231005645}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:42,218] Trial 256 finished with value: 0.8134953432440323 and parameters: {'w0': 0.6424710105282344, 'w1': 0.3022406592775281}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 

[I 2024-11-08 13:20:47,029] Trial 294 finished with value: 0.8134936411907289 and parameters: {'w0': 0.6870468767285306, 'w1': 0.29719568896330617}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:47,307] Trial 295 finished with value: 0.8134921717552526 and parameters: {'w0': 0.6885508981951138, 'w1': 0.29840664069662104}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:47,335] Trial 297 finished with value: 0.8134971564317052 and parameters: {'w0': 0.6781308738397022, 'w1': 0.2973280896927223}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:47,353] Trial 296 finished with value: 0.8134954797133649 and parameters: {'w0': 0.6841373707893277, 'w1': 0.2957709749129303}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:47,506] Trial 301 finished with value: 0.8134934261833024 and parameters: {'w0': 0.6860566286926989, 'w1': 0.2981621762687086}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08

[I 2024-11-08 13:20:52,819] Trial 335 finished with value: 0.813476087116601 and parameters: {'w0': 0.6096756145374345, 'w1': 0.3088047148960973}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:53,207] Trial 336 finished with value: 0.8134811674022309 and parameters: {'w0': 0.6152064053725133, 'w1': 0.31034170654635973}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:53,439] Trial 340 finished with value: 0.8134781475334142 and parameters: {'w0': 0.6114283452028176, 'w1': 0.30916938286854595}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:53,497] Trial 339 finished with value: 0.813492893805394 and parameters: {'w0': 0.6342184687163291, 'w1': 0.3090980769859125}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:53,563] Trial 337 finished with value: 0.813480229483097 and parameters: {'w0': 0.6143617848285178, 'w1': 0.3097268451141266}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13

[I 2024-11-08 13:20:58,460] Trial 376 finished with value: 0.8132079057194874 and parameters: {'w0': 0.45439285165004756, 'w1': 0.3482239218783799}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:58,605] Trial 375 finished with value: 0.8111794707441013 and parameters: {'w0': 0.11399517222848943, 'w1': 0.3750475719826668}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:58,728] Trial 379 finished with value: 0.8134992202916361 and parameters: {'w0': 0.6731355832502478, 'w1': 0.292405272950351}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:58,935] Trial 380 finished with value: 0.8134968203346272 and parameters: {'w0': 0.6832494348591248, 'w1': 0.2934667674133281}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:20:58,963] Trial 381 finished with value: 0.8134874265862274 and parameters: {'w0': 0.7170059115961961, 'w1': 0.278027630351077}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 1

[I 2024-11-08 13:21:04,785] Trial 419 finished with value: 0.8134991116429591 and parameters: {'w0': 0.6602786597413661, 'w1': 0.29132783399465617}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:21:04,905] Trial 420 finished with value: 0.8134420015575424 and parameters: {'w0': 0.5699848221413844, 'w1': 0.32477029118959}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:21:04,963] Trial 421 finished with value: 0.8134983240391331 and parameters: {'w0': 0.6563936478859811, 'w1': 0.29168023041898244}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:21:05,188] Trial 422 finished with value: 0.8134983506523374 and parameters: {'w0': 0.6565487756470766, 'w1': 0.29197293137749786}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 13:21:05,366] Trial 423 finished with value: 0.8118322294007794 and parameters: {'w0': 0.2933961805914182, 'w1': 0.2548731476119429}. Best is trial 15 with value: 0.8135008166306259.
[I 2024-11-08 

In [32]:
# The last blend weight is not an Optuna parameter: it is what remains after
# the sampled weights. Label it by its real position (w2 when two weights are
# sampled) instead of a hard-coded "w6" left over from a seven-model blend.
print(f"w{len(study.best_params)}: ", 1 - np.sum(list(study.best_params.values())))

w6:  0.011533735545351753


In [29]:
# Blend the stacked predictions over the whole training table using fixed,
# hand-rounded weights (one per column in cfg["stack_features"]).
pred = (
    df_train[cfg["stack_features"]].to_numpy()
    * np.array([0.73, 0.23, 0.04])
).sum(axis=1)

In [30]:
# Score of the fixed-weight blend on the full training table (no CV split here).
metric(df_train["target"], pred)

0.8142472264006779

In [31]:
# Apply the same fixed-weight blend to the test table and write the submission.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
test["target"] = (
    test[cfg["stack_features"]].to_numpy()
    * np.array([0.73, 0.23, 0.04])
).sum(axis=1)
test[["id", "target"]].to_csv("blend3.csv", index=False)


In [16]:
# res = pd.DataFrame()
# res[MODEL_NAME] = oof[:, 1]
# res.to_csv(MODEL_DIR / "oof.csv", index=False)
# #joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# with (MODEL_DIR / "params.yaml").open("w") as f:
#     yaml.dump(model.params, f)

# with (MODEL_DIR / "score.txt").open("w") as f:
#     print("OOF:", metric(y_train, oof), file=f)
    
# test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
# test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
# test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)

In [24]:
# Predict on the test table with a fitted `model` and write a submission named
# after MODEL_NAME; column 1 of the prediction output is used as the score.
# NOTE(review): in this notebook `model` and `MODEL_NAME` are first assigned in
# cells further down, so this cell fails on a fresh top-to-bottom run.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cfg["stack_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv(f'{MODEL_NAME}.csv', index=False)

In [26]:
# Show the first 50 rows of the wrapped model's feature-score table.
model.model.get_feature_scores()[:50]

Unnamed: 0,Feature,Importance
0,lamau_814_full_dataset,78048.337071
1,xgb_81325_full_dataset,29297.001751
2,lama_81298_full_dataset,24098.444188
3,feature_162,1398.695796
4,feature_24,1321.674997
5,feature_18,1298.269506
6,feature_26,1290.120701
7,feature_145,1287.683197
8,feature_78,1279.916003
9,feature_36,1248.758602


In [24]:
# Two-column table (names, imp) built from the feature names and importances
# exposed by the first entry of `model.models`.
imp = pd.DataFrame().assign(names=model.models[0].feature_names_, imp=model.models[0].feature_importances_)

In [41]:
# Rank features by importance (descending) and look up 'feature_185';
# after reset_index the row label is the feature's zero-based rank.
imp.sort_values(by="imp", ascending=False).reset_index(drop=True).query("names == 'feature_185'")

Unnamed: 0,names,imp
61,feature_185,0.0


In [29]:
# List the columns that were detected as categorical (integer dtype).
cat_columns

['feature_7',
 'feature_31',
 'feature_60',
 'feature_61',
 'feature_71',
 'feature_109',
 'feature_122',
 'feature_156',
 'feature_163',
 'feature_167',
 'feature_179',
 'feature_185']

## With time-series cross-validation

In [13]:
# Replace the sklearn scorer with the project's RocAuc metric object.
# NOTE(review): the `RocAuc` import near the top of this notebook is commented
# out, so this cell needs that import restored to run on a fresh kernel.
metric = RocAuc()

In [14]:
# Order the rows by "id" and renumber them before the time-series run
# (presumably "id" reflects chronological order -- TODO confirm).
df_train = df_train.sort_values(by="id").reset_index(drop=True)
# This experiment uses the selected and categorical features only (no stacked predictions).
X_train = df_train[cfg["selected_features"] + cat_columns]
y_train = df_train["target"]

In [15]:
# Build the project's LightGBM wrapper in time-series mode, tune it for up to
# one hour (60 * 60 seconds), then fit; the value returned by `fit` is kept as
# `oof` (presumably out-of-fold predictions -- confirm against the wrapper).
# NOTE(review): `LightGBMClassification` is not imported in any cell visible here.
model = LightGBMClassification(n_jobs=16, time_series=True)
model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

# Score over all rows of `oof`, including any rows that hold NaN.
print(metric(y_train, oof))

[2024-11-07 08:27:33,388] - [   START    ] - Tuning LightGBMClassification
[2024-11-07 08:27:40,118] - [   OPTUNA   ] - Trial 0. New best score 0.7903405446081995 with parameters {'max_depth': 6, 'num_leaves': 488, 'min_data_in_leaf': 188, 'bagging_fraction': 0.7993292420985183, 'bagging_freq': 0, 'feature_fraction': 0.49359671220172163, 'lambda_l1': 0.5808361216819946, 'lambda_l2': 8.661761457749352, 'min_gain_to_split': 12.022300234864176, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 08:28:03,043] - [   OPTUNA   ] - Trial 2. New best score 0.7913515589848906 with parameters {'max_depth': 5, 'num_leaves': 194, 'min_data_in_leaf': 117, 'bagging_fraction': 0.8925879806965068, 'bagging_freq': 0, 'feature_fraction': 0.708540663048167, 'lambda_l1': 5.924145688620425, 'lambda_l2': 0.46450412719997725, 'min_gain_to_split': 12.150897038028766, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 08:28:12,225] - [   OPTUNA   ] - Trial 3. New best score 0.8048686053278628 with param

In [27]:
# Number of rows in `oof` that contain at least one NaN. Later cells use this
# count as a slice start, i.e. they assume the NaN rows are the leading ones.
none_oofs_idx = np.count_nonzero(np.any(np.isnan(oof), axis=1))

In [32]:
# Score only the rows from `none_oofs_idx` onward, skipping the leading rows
# that have no out-of-fold prediction.
metric(y_train[none_oofs_idx:], oof[none_oofs_idx:])

0.8095227594190041

In [34]:
# Name of this run and the directory that receives its artefacts.
MODEL_NAME = "lgb_8095_full_dataset_time_series"
# NOTE(review): this path climbs three levels ("../../../") while DATA_PATH
# climbs four ("../../../../data/") -- confirm which depth is correct.
MODEL_DIR = Path(f"../../../data/models/{MODEL_NAME}")
# Creates only the leaf directory; the parent "models" directory must already exist.
MODEL_DIR.mkdir(exist_ok=True)

In [35]:
# Persist the artefacts of the time-series run: out-of-fold predictions, the
# fitted model, its parameters, the OOF score and the test-set submission.
res = pd.DataFrame()
# Keep only rows from `none_oofs_idx` onward (the leading rows hold NaN
# predictions); column 1 of `oof` is stored as the model's score.
res[MODEL_NAME] = oof[none_oofs_idx:, 1]
res.to_csv(MODEL_DIR / "oof.csv", index=False)
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

with (MODEL_DIR / "params.yaml").open("w") as f:
    yaml.dump(model.params, f)

with (MODEL_DIR / "score.txt").open("w") as f:
    # Score the same NaN-free slice that is saved to oof.csv and evaluated
    # interactively earlier; scoring the full `oof` would include NaN rows.
    print("OOF:", metric(y_train[none_oofs_idx:], oof[none_oofs_idx:]), file=f)

# Predict on the test table and write the submission next to the other artefacts.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)

## TEST 
**81.22112399468679**

## Inference

In [27]:
# Stand-alone inference: predict on the test table with the current `model`
# (column 1 of its output) and write the submission to the working directory.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv('lgb_813.csv', index=False)

In [25]:
# Load three previously written submission files from the working directory.
# None of these CSVs is produced by a cell visible in this notebook.
pred_1 = pd.read_csv("lama_utilized.csv")
pred_2 = pd.read_csv("lgmb_oe_ohe_cols_0805.csv")
pred_3 = pd.read_csv("catboost_ts.csv")

In [26]:
# Fixed 0.6 / 0.2 / 0.2 weighted average of the three submissions, written back
# into pred_1. The assignment overwrites pred_1["target"], so re-running this
# cell without reloading the CSVs compounds the blend.
# NOTE(review): rows are combined by index label; this assumes all three files
# list the same ids in the same order -- confirm.
pred_1["target"] = 0.6 * pred_1["target"] + 0.2 * pred_2["target"] + 0.2 * pred_3["target"]

In [29]:
# Write the blended submission (pred_1 now carries the blended "target").
pred_1.to_csv("blend.csv", index=False)

In [166]:
# IPython help lookup (`?`) for Path.open, left over from interactive
# exploration; it has no effect on the analysis and can be removed.
MODEL_DIR.open?

[0;31mSignature:[0m
[0mMODEL_DIR[0m[0;34m.[0m[0mopen[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'r'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbuffering[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merrors[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnewline[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Open the file pointed by this path and return a file object, as
the built-in open() function does.
[0;31mFile:[0m      /usr/lib/python3.10/pathlib.py
[0;31mType:[0m      method