In [1]:
import sys; sys.path.append("../../../../automl/")

In [2]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score

# from src.automl.model.lama import TabularLama
# from src.automl.loggers import configure_root_logger
# from src.automl.constants import create_ml_data_dir
# from src.automl.model.metrics import RocAuc

In [3]:
optuna.logging.set_verbosity(optuna.logging.INFO)

In [4]:
# create_ml_data_dir()
# configure_root_logger()

## Constants

In [5]:
# Seed value; in the visible cells it is only referenced from commented-out code.
RANDOM_SEED = 77
# Directory containing the preprocessed parquet datasets read below.
DATA_PATH = Path("../../../../data/")
# YAML config supplying the "selected_features" and "stack_features" lists.
CONFIG_PATH = Path("../../../../configs/config.yaml")
# Number of parallel jobs handed to study.optimize.
N_JOBS = 16

In [6]:
# Parse the YAML configuration into `cfg` using PyYAML's safe loader.
with CONFIG_PATH.open() as config_file:
    cfg = yaml.safe_load(config_file)

## Data

In [7]:
# Load the training frame; later cells read its "id" and "target" columns plus the
# feature / stack-prediction columns named in the config.
df_train = pd.read_parquet(DATA_PATH / "train_preproc_oof.parquet")
# Disabled: hold-out split of the id-sorted frame at row 300_000.
#df_train, df_test = df_train.sort_values(by="id").iloc[:300_000], df_train.sort_values(by="id").iloc[300_000:]

In [8]:
df_train["target"].value_counts(normalize=True)

0    0.940982
1    0.059018
Name: target, dtype: float64

In [9]:
# undersample the 0 class
#df_train = pd.concat([df_train.loc[df_train.target == 1], df_train.loc[df_train.target == 0].sample(200_000, random_state=RANDOM_SEED)], ignore_index=True)


In [10]:
# Every integer-typed column other than the target and the id is treated as categorical.
cat_columns = (
    df_train
    .drop(columns=["target", "id"])
    .select_dtypes(include=int)
    .columns
    .tolist()
)

In [11]:
# Model inputs are the selected features, the stacked model predictions and the
# categorical columns; the label is the "target" column.
X_train, y_train = df_train[cfg["selected_features"] + cfg["stack_features"] + cat_columns], df_train["target"]
# Disabled: matching hold-out split (only meaningful if df_test is re-enabled above).
#X_test, y_test = df_test[cfg["selected_features"] + cat_columns], df_test["target"]

In [12]:
display(y_train.value_counts(normalize=True))
#display(y_test.value_counts(normalize=True))

0    0.940982
1    0.059018
Name: target, dtype: float64

In [12]:
#categorical_features = ohe_cols# + oe_cols

## Blend

In [13]:
cfg["stack_features"]

['lamau_81425_full_dataset',
 'lgb_8122_full_dataset',
 'cb_8114_full_dataset',
 'xgb_81325_full_dataset',
 'lama_81298_full_dataset',
 'lamann_autoint_8053_full_dataset',
 'lamann_fttransformer_8050_full_dataset']

In [110]:
# Override the config value: blend only these three of the stack columns listed above.
cfg["stack_features"] = ["lamau_81425_full_dataset", "xgb_81325_full_dataset", "lamann_autoint_8053_full_dataset"]

In [124]:
# Round the blended stack columns to 10 decimals, overwriting them in df_train.
# NOTE(review): X_train was sliced out of df_train in an earlier cell, so it presumably
# still holds the unrounded values — confirm whether that is intended.
df_train[cfg["stack_features"]] = np.round(df_train[cfg["stack_features"]], 10)

In [126]:
def objective(trial):
    """Optuna objective: mean per-fold metric of a convex blend of the stack columns.

    One weight is sampled per column in ``cfg["stack_features"]`` except the
    last, which takes whatever is left so that the weights sum to 1. Each
    sampled weight is bounded by the mass still unassigned, which keeps every
    weight (including the remainder) non-negative.

    Relies on notebook globals: ``cfg``, ``X_train``, ``y_train``, ``metric``
    and a fitted-less splitter ``kf`` exposing ``split(X, y)``. ``kf`` is only
    present in commented-out code further down, so it must be defined before
    this objective is used. Later cells redefine ``objective``.

    Args:
        trial: Optuna trial used to sample ``w0 .. w{n-2}``.

    Returns:
        Mean of ``metric(y_fold, blended_predictions)`` over the folds of ``kf``.
    """
    n_models = len(cfg["stack_features"])

    weights = [trial.suggest_float("w0", 0, 1)]
    for i in range(1, n_models - 1):
        # Upper bound is the mass not yet assigned (the original used the mass
        # already assigned, which let the remainder weight go negative).
        # Clamp at 0 so float round-off can never produce low > high.
        remaining = max(0.0, 1 - np.sum(weights))
        weights.append(trial.suggest_float(f"w{i}", 0, remaining))

    # The last model receives the leftover mass.
    weights.append(1 - np.sum(weights))
    weights = np.array(weights)

    metrics = []
    # Nothing is fitted here, so only the evaluation indices of each fold are used.
    for _, test_idx in kf.split(X_train, y_train):
        fold_inputs = X_train.iloc[test_idx][cfg["stack_features"]].to_numpy()
        preds = np.sum(fold_inputs * weights, axis=1)
        metrics.append(metric(y_train.iloc[test_idx], preds))

    return np.mean(metrics)
    
    
    
    

In [15]:
def objective(trial):
    """Optuna objective for a two-model blend, scored fold by fold.

    Samples ``w0`` in [0, 1] and gives the second column ``1 - w0``, so
    ``cfg["stack_features"]`` is expected to list exactly two columns (one per
    weight). Relies on the notebook globals ``cfg``, ``X_train``, ``y_train``,
    ``metric`` and a splitter ``kf`` exposing ``split(X, y)``.

    Args:
        trial: Optuna trial used to sample ``w0``.

    Returns:
        Mean of ``metric`` over the evaluation part of every fold.
    """
    first = trial.suggest_float("w0", 0, 1)
    blend_weights = np.array([first, 1 - first])

    fold_scores = []
    for _, holdout_idx in kf.split(X_train, y_train):
        holdout = X_train.iloc[holdout_idx][cfg["stack_features"]].to_numpy()
        fold_preds = np.sum(holdout * blend_weights, axis=1)
        fold_scores.append(metric(y_train.iloc[holdout_idx], fold_preds))

    return np.mean(fold_scores)
    
    

In [131]:
def objective(trial):
    """Optuna objective for a three-model blend, scored on the full training set.

    Samples ``w0`` in [0, 1] and ``w1`` in [0, 1 - w0]; the third weight is the
    remainder, so the weights sum to 1. The blended prediction is rounded to 10
    decimals before scoring. Relies on the notebook globals ``cfg``,
    ``X_train``, ``y_train`` and ``metric``.

    Args:
        trial: Optuna trial used to sample ``w0`` and ``w1``.

    Returns:
        ``metric(y_train, blended_predictions)``.
    """
    w0 = trial.suggest_float("w0", 0, 1)
    w1 = trial.suggest_float("w1", 0, 1 - w0)
    # Third weight takes whatever mass the first two left over.
    blend_weights = np.array([w0, w1, 1 - np.sum([w0, w1])])

    stacked = X_train[cfg["stack_features"]].to_numpy()
    blended = np.round(np.sum(stacked * blend_weights, axis=1), 10)

    return metric(y_train, blended)
    
    

In [112]:
# Cross-validation splitters, currently disabled.
# NOTE: the fold-based `objective` variants call `kf.split(...)`, so one of these
# must be re-enabled before those variants can run.
# kf = StratifiedKFold(
#                 n_splits=5, random_state=RANDOM_SEED, shuffle=True)

# kf = TimeSeriesSplit(
#                 n_splits=5)
# Score used by the objective and the checks below: ROC AUC.
metric = roc_auc_score

In [132]:
# Search for the blend weights that maximise the objective, for up to 60 seconds,
# running N_JOBS trials concurrently.
# NOTE(review): no sampler seed is passed, so the best weights may differ between runs.
study = optuna.create_study(
            study_name="blending",
            direction="maximize")
study.optimize(objective, timeout=60, n_jobs=N_JOBS)

[I 2024-11-10 15:52:50,469] A new study created in memory with name: blending
[I 2024-11-10 15:52:50,997] Trial 0 finished with value: 0.8141864519231701 and parameters: {'w0': 0.7011518554479736, 'w1': 0.1430507966057112}. Best is trial 0 with value: 0.8141864519231701.
[I 2024-11-10 15:52:51,116] Trial 3 finished with value: 0.8143275781655728 and parameters: {'w0': 0.658533453533373, 'w1': 0.3143729202635799}. Best is trial 3 with value: 0.8143275781655728.
[I 2024-11-10 15:52:51,117] Trial 5 finished with value: 0.8134965212270425 and parameters: {'w0': 0.3187691553573996, 'w1': 0.3893997870690148}. Best is trial 3 with value: 0.8143275781655728.
[I 2024-11-10 15:52:51,144] Trial 4 finished with value: 0.8142632599872135 and parameters: {'w0': 0.617153084578135, 'w1': 0.2748973705057389}. Best is trial 3 with value: 0.8143275781655728.
[I 2024-11-10 15:52:51,145] Trial 1 finished with value: 0.8128616392096611 and parameters: {'w0': 0.26225853421257506, 'w1': 0.33769660436789384}. 

[I 2024-11-10 15:52:53,118] Trial 40 finished with value: 0.8143258879744093 and parameters: {'w0': 0.8246276272493711, 'w1': 0.1077941106117776}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:53,197] Trial 47 finished with value: 0.8143237415777065 and parameters: {'w0': 0.7906257468928604, 'w1': 0.1318198422434818}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:53,280] Trial 38 finished with value: 0.8142655893723829 and parameters: {'w0': 0.916123583548078, 'w1': 0.0048531519409197865}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:53,303] Trial 45 finished with value: 0.8143268899812826 and parameters: {'w0': 0.7944533130657545, 'w1': 0.13158437759169422}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:53,388] Trial 37 finished with value: 0.8142864821728193 and parameters: {'w0': 0.9591164616689037, 'w1': 0.04083412896305744}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:5

[I 2024-11-10 15:52:55,763] Trial 90 finished with value: 0.8142420080653199 and parameters: {'w0': 0.7521831924383513, 'w1': 0.11727324445551594}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:55,776] Trial 85 finished with value: 0.814323472052043 and parameters: {'w0': 0.8813958528430661, 'w1': 0.07570823001117008}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:55,820] Trial 88 finished with value: 0.814322214599599 and parameters: {'w0': 0.8867303357651832, 'w1': 0.0727639258951373}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:55,827] Trial 89 finished with value: 0.8142714705267711 and parameters: {'w0': 0.7630753368766641, 'w1': 0.12164164713392928}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:55,878] Trial 82 finished with value: 0.8143413276130783 and parameters: {'w0': 0.8529774701332773, 'w1': 0.12355847067315552}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:55,

[I 2024-11-10 15:52:58,147] Trial 125 finished with value: 0.8142908716811671 and parameters: {'w0': 0.9077765254944582, 'w1': 0.028320036089175882}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:58,153] Trial 128 finished with value: 0.8143040551827874 and parameters: {'w0': 0.91481601612353, 'w1': 0.04039304585401704}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:58,181] Trial 127 finished with value: 0.814285446415912 and parameters: {'w0': 0.918220792571101, 'w1': 0.019094439939983082}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:58,414] Trial 124 finished with value: 0.8142988752380932 and parameters: {'w0': 0.9070898158149608, 'w1': 0.036356846374290624}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:52:58,445] Trial 129 finished with value: 0.8142867703664705 and parameters: {'w0': 0.913862003240696, 'w1': 0.021994577994459086}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15

[I 2024-11-10 15:53:00,963] Trial 164 finished with value: 0.8143346296551294 and parameters: {'w0': 0.8719943657451448, 'w1': 0.11654789002944702}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:00,964] Trial 169 finished with value: 0.8143319210566856 and parameters: {'w0': 0.8765242774893699, 'w1': 0.11807619806399068}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:00,986] Trial 171 finished with value: 0.814332383379419 and parameters: {'w0': 0.8759878045174424, 'w1': 0.11739225997442919}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:01,067] Trial 167 finished with value: 0.8143377613418918 and parameters: {'w0': 0.8647009012811004, 'w1': 0.11745891696718115}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:01,083] Trial 173 finished with value: 0.814333848500042 and parameters: {'w0': 0.8735586516046092, 'w1': 0.11724779334811716}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:

[I 2024-11-10 15:53:03,761] Trial 210 finished with value: 0.8143485170558995 and parameters: {'w0': 0.8261024868533536, 'w1': 0.14672577018565222}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:03,808] Trial 209 finished with value: 0.8143474023028185 and parameters: {'w0': 0.8317788001951958, 'w1': 0.1459963473286784}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:03,951] Trial 214 finished with value: 0.8143489659313538 and parameters: {'w0': 0.8215469408215178, 'w1': 0.14831441343587953}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:03,952] Trial 212 finished with value: 0.8143473629629349 and parameters: {'w0': 0.8318307454764432, 'w1': 0.1458337805948191}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:03,963] Trial 213 finished with value: 0.8143485040304957 and parameters: {'w0': 0.8263239466729047, 'w1': 0.1463142077693064}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:5

[I 2024-11-10 15:53:05,960] Trial 249 finished with value: 0.8143470675446673 and parameters: {'w0': 0.7994970339005814, 'w1': 0.15328971725916}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:06,139] Trial 252 finished with value: 0.814326808401122 and parameters: {'w0': 0.7659122935867959, 'w1': 0.15643191799928322}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:06,212] Trial 253 finished with value: 0.8143308791298501 and parameters: {'w0': 0.7720142730987735, 'w1': 0.15471775719425823}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:06,245] Trial 254 finished with value: 0.8143227122016649 and parameters: {'w0': 0.7623503765831036, 'w1': 0.15596562635993982}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:06,326] Trial 255 finished with value: 0.8143278199001923 and parameters: {'w0': 0.768418478555021, 'w1': 0.15526423032855025}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:

[I 2024-11-10 15:53:09,094] Trial 298 finished with value: 0.8134327057659889 and parameters: {'w0': 0.49180316600284285, 'w1': 0.17678269276155179}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:09,152] Trial 291 finished with value: 0.8143504919602416 and parameters: {'w0': 0.78690472197158, 'w1': 0.17068535970293008}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:09,180] Trial 296 finished with value: 0.814305954571428 and parameters: {'w0': 0.7308078204605581, 'w1': 0.17332327260958322}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:09,313] Trial 292 finished with value: 0.8142344780108259 and parameters: {'w0': 0.6905581887399848, 'w1': 0.17421384904022313}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:53:09,330] Trial 297 finished with value: 0.8143205370119639 and parameters: {'w0': 0.7432544000573655, 'w1': 0.17265629356698742}. Best is trial 8 with value: 0.8143539106277631.
[I 2024-11-10 15:

[I 2024-11-10 15:53:12,058] Trial 338 finished with value: 0.8143363173677745 and parameters: {'w0': 0.751181989985199, 'w1': 0.18014773924433408}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:12,077] Trial 334 finished with value: 0.8143413372634787 and parameters: {'w0': 0.7581213779150286, 'w1': 0.1799549502931611}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:12,095] Trial 335 finished with value: 0.8143478114165138 and parameters: {'w0': 0.7575815688264278, 'w1': 0.19063636437578887}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:12,154] Trial 339 finished with value: 0.8143338975430605 and parameters: {'w0': 0.7486102395390667, 'w1': 0.1799811064268955}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:12,213] Trial 336 finished with value: 0.814349321625073 and parameters: {'w0': 0.7580503058770425, 'w1': 0.19286888540169683}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-1

[I 2024-11-10 15:53:15,067] Trial 375 finished with value: 0.8143528089000858 and parameters: {'w0': 0.8008163401275279, 'w1': 0.17551508879819427}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:15,068] Trial 378 finished with value: 0.8143515803988427 and parameters: {'w0': 0.806405283649938, 'w1': 0.17523581068272975}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:15,218] Trial 380 finished with value: 0.8143465613468895 and parameters: {'w0': 0.7726112837756306, 'w1': 0.17376383638703055}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:15,352] Trial 379 finished with value: 0.8143497811001473 and parameters: {'w0': 0.7758721520230744, 'w1': 0.17707106451088095}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:15,391] Trial 381 finished with value: 0.814352230192469 and parameters: {'w0': 0.8032193840215371, 'w1': 0.17657092925955167}. Best is trial 326 with value: 0.8143541120205451.
[I 2024

[I 2024-11-10 15:53:17,798] Trial 422 finished with value: 0.8143525363685602 and parameters: {'w0': 0.7976563570763886, 'w1': 0.18260690098834215}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:17,803] Trial 419 finished with value: 0.8143528350036278 and parameters: {'w0': 0.7961257757177985, 'w1': 0.18300112143128736}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:17,832] Trial 421 finished with value: 0.8143529964764475 and parameters: {'w0': 0.7954440560022693, 'w1': 0.18210692894455754}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:17,874] Trial 420 finished with value: 0.813515038763357 and parameters: {'w0': 0.47414828900295725, 'w1': 0.211237759030462}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:18,061] Trial 424 finished with value: 0.8143535704907002 and parameters: {'w0': 0.7918843549004655, 'w1': 0.1831031401778381}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-

[I 2024-11-10 15:53:20,738] Trial 461 finished with value: 0.8137257635351647 and parameters: {'w0': 0.524401444997328, 'w1': 0.2000663779426577}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:20,787] Trial 462 finished with value: 0.8143530468905611 and parameters: {'w0': 0.7914571104396825, 'w1': 0.18927502468236782}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:20,805] Trial 455 finished with value: 0.8138974153650943 and parameters: {'w0': 0.5611563484758674, 'w1': 0.20094438699183925}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:20,911] Trial 463 finished with value: 0.814353576871566 and parameters: {'w0': 0.7874836687753372, 'w1': 0.18930833141939438}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:20,949] Trial 460 finished with value: 0.8143530817480181 and parameters: {'w0': 0.7920872450371516, 'w1': 0.1876190625809295}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-1

[I 2024-11-10 15:53:23,298] Trial 498 finished with value: 0.814353266529455 and parameters: {'w0': 0.7867470931482671, 'w1': 0.1941653815025659}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:23,322] Trial 496 finished with value: 0.8116014755303148 and parameters: {'w0': 0.1958190795085707, 'w1': 0.249096566848022}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:23,419] Trial 502 finished with value: 0.8143540280673349 and parameters: {'w0': 0.7826706308804574, 'w1': 0.18930683265542536}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:23,471] Trial 501 finished with value: 0.8141334260843616 and parameters: {'w0': 0.6185409871099166, 'w1': 0.20972968447440773}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-10 15:53:23,658] Trial 503 finished with value: 0.8143536443716344 and parameters: {'w0': 0.7867855702451514, 'w1': 0.189344765746238}. Best is trial 326 with value: 0.8143541120205451.
[I 2024-11-

[I 2024-11-10 15:53:26,282] Trial 542 finished with value: 0.814341113616768 and parameters: {'w0': 0.7372654699847713, 'w1': 0.2008354161749351}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:26,374] Trial 543 finished with value: 0.8143461371511478 and parameters: {'w0': 0.7440394293220096, 'w1': 0.20156453387323026}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:26,439] Trial 545 finished with value: 0.814342476063459 and parameters: {'w0': 0.7432006623999783, 'w1': 0.1966674996119966}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:26,614] Trial 548 finished with value: 0.8143449516811982 and parameters: {'w0': 0.7452577728480545, 'w1': 0.19820736998830243}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:26,652] Trial 540 finished with value: 0.8143045606422833 and parameters: {'w0': 0.7014844729264227, 'w1': 0.2033541376735063}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11

[I 2024-11-10 15:53:29,064] Trial 583 finished with value: 0.8143536427896014 and parameters: {'w0': 0.7796391328537857, 'w1': 0.18727327075837563}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:29,097] Trial 585 finished with value: 0.8143417121525298 and parameters: {'w0': 0.8365894130375295, 'w1': 0.16221801508433775}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:29,106] Trial 584 finished with value: 0.8143410990620659 and parameters: {'w0': 0.8402729260928645, 'w1': 0.15818064923353692}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:29,151] Trial 580 finished with value: 0.8139843195936499 and parameters: {'w0': 0.7780843205484941, 'w1': 0.010272745162426572}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:29,401] Trial 587 finished with value: 0.8143421356099896 and parameters: {'w0': 0.8355870299094004, 'w1': 0.16281097661061533}. Best is trial 531 with value: 0.8143545444428575.
[I 2

[I 2024-11-10 15:53:32,318] Trial 625 finished with value: 0.8143438740004204 and parameters: {'w0': 0.745339821176327, 'w1': 0.1963302291053508}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:32,534] Trial 626 finished with value: 0.8142661586932713 and parameters: {'w0': 0.6678517879793018, 'w1': 0.2151801736733451}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:32,669] Trial 627 finished with value: 0.8143441464792118 and parameters: {'w0': 0.7455870345675458, 'w1': 0.19649064993828294}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:32,692] Trial 629 finished with value: 0.8143445054424655 and parameters: {'w0': 0.7462046912321829, 'w1': 0.19637085958933098}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:32,878] Trial 631 finished with value: 0.8143428136692694 and parameters: {'w0': 0.7448877188240322, 'w1': 0.1953390219411071}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-1

[I 2024-11-10 15:53:36,009] Trial 669 finished with value: 0.8143471505486575 and parameters: {'w0': 0.8102829298015642, 'w1': 0.1853453023025004}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:36,123] Trial 670 finished with value: 0.8143478754361098 and parameters: {'w0': 0.8099814856604388, 'w1': 0.18368951809543463}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:36,184] Trial 668 finished with value: 0.8143347561122884 and parameters: {'w0': 0.8633293506985055, 'w1': 0.13596063685699936}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:36,311] Trial 664 finished with value: 0.811225216966018 and parameters: {'w0': 0.17231964396829103, 'w1': 0.23533083302569952}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:36,360] Trial 671 finished with value: 0.8143471972186266 and parameters: {'w0': 0.812270216916804, 'w1': 0.18288528244768784}. Best is trial 531 with value: 0.8143545444428575.
[I 2024

[I 2024-11-10 15:53:39,707] Trial 712 finished with value: 0.8143433414881629 and parameters: {'w0': 0.7356109144005276, 'w1': 0.20602775238795926}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:39,838] Trial 711 finished with value: 0.8143517178774975 and parameters: {'w0': 0.7962577671465007, 'w1': 0.1894683615766674}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:39,840] Trial 713 finished with value: 0.8143374130837269 and parameters: {'w0': 0.7305580363756923, 'w1': 0.2030909333043988}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:39,899] Trial 708 finished with value: 0.8143377844395714 and parameters: {'w0': 0.7307983303101341, 'w1': 0.20326450065148105}. Best is trial 531 with value: 0.8143545444428575.
[I 2024-11-10 15:53:39,916] Trial 710 finished with value: 0.8143482418349179 and parameters: {'w0': 0.7980504839219311, 'w1': 0.19734141445337236}. Best is trial 531 with value: 0.8143545444428575.
[I 2024

[I 2024-11-10 15:53:42,966] Trial 744 finished with value: 0.8143545472905166 and parameters: {'w0': 0.7713255047579537, 'w1': 0.20056971078865193}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:42,974] Trial 747 finished with value: 0.8143540795888715 and parameters: {'w0': 0.7674595718980435, 'w1': 0.2086280405753126}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:43,133] Trial 751 finished with value: 0.8143530086581006 and parameters: {'w0': 0.7536348110957993, 'w1': 0.2088692220424462}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:43,150] Trial 753 finished with value: 0.8143092416138165 and parameters: {'w0': 0.6965738048936895, 'w1': 0.21262414520200343}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:43,364] Trial 757 finished with value: 0.8143060350441658 and parameters: {'w0': 0.6902535710971022, 'w1': 0.21708222757842083}. Best is trial 744 with value: 0.8143545472905166.
[I 2024

[I 2024-11-10 15:53:46,791] Trial 793 finished with value: 0.8143199022476498 and parameters: {'w0': 0.7146533008354055, 'w1': 0.20174286749537457}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:47,051] Trial 796 finished with value: 0.8143544635482445 and parameters: {'w0': 0.770743643879021, 'w1': 0.20038878904754223}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:47,180] Trial 798 finished with value: 0.8143542507120916 and parameters: {'w0': 0.7674584972854733, 'w1': 0.20074735455155066}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:47,185] Trial 794 finished with value: 0.8143544438783026 and parameters: {'w0': 0.7702416360625999, 'w1': 0.2002572913227776}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:47,194] Trial 795 finished with value: 0.8143543320285801 and parameters: {'w0': 0.7680803818526498, 'w1': 0.20085870420282606}. Best is trial 744 with value: 0.8143545472905166.
[I 2024

[I 2024-11-10 15:53:49,383] Trial 837 finished with value: 0.8143356313455961 and parameters: {'w0': 0.7258534548331113, 'w1': 0.2060972807358816}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:49,493] Trial 830 finished with value: 0.8129017580861913 and parameters: {'w0': 0.3532983143785933, 'w1': 0.24017895062825936}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:49,524] Trial 838 finished with value: 0.8143394462596122 and parameters: {'w0': 0.7287995262543109, 'w1': 0.2079554992423275}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:49,586] Trial 835 finished with value: 0.8143395800468571 and parameters: {'w0': 0.7300770387643438, 'w1': 0.20669377022178761}. Best is trial 744 with value: 0.8143545472905166.
[I 2024-11-10 15:53:49,610] Trial 839 finished with value: 0.8143413466502069 and parameters: {'w0': 0.7324026707235155, 'w1': 0.20650925380860716}. Best is trial 744 with value: 0.8143545472905166.
[I 2024

In [133]:
study.best_params

{'w0': 0.7713255047579537, 'w1': 0.20056971078865193}

In [134]:
study.best_value

0.8143545472905166

In [135]:
# The first two blend weights are hard-coded; the third is the remainder so that
# the three weights sum to 1.
# NOTE(review): these values differ slightly from the study.best_params displayed
# above (w0=0.7713..., w1=0.2005...) — presumably copied from an earlier run; confirm.
weights = np.array([0.7726348139013947, 0.19979192880550867])
weights = np.append(weights, 1 - np.sum(weights))
weights

array([0.77263481, 0.19979193, 0.02757326])

In [136]:
# study = optuna.create_study(
#             study_name="blending",
#             direction="maximize")
# study.optimize(objective, timeout=60, n_jobs=N_JOBS)

In [38]:
# print("w6: ", 1 - np.sum(list(study.best_params.values())))

In [138]:
# Weighted sum of the stack columns of df_train, rounded to 10 decimals.
pred = np.round(np.sum((df_train[cfg["stack_features"]].to_numpy() * weights), axis=1), 10)

In [139]:
metric(df_train["target"], pred)

0.8143545481342676

In [123]:
# Apply the blend weights to the test-set stack columns and write the submission file.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
# Disabled: cast of the stack columns to float32 before blending.
#test[cfg["stack_features"]] = test[cfg["stack_features"]].astype(np.float32)
test_stack = test[cfg["stack_features"]].to_numpy()
test["target"] = np.round((test_stack * weights).sum(axis=1), 10)
test[["id", "target"]].to_csv("blend3_2.csv", index=False)


In [13]:
cfg["stack_features"]

['lamau_81425_full_dataset',
 'lgb_8122_full_dataset',
 'cb_8114_full_dataset',
 'xgb_81325_full_dataset',
 'lama_81298_full_dataset',
 'lamann_autoint_8053_full_dataset',
 'lamann_fttransformer_8050_full_dataset']

In [14]:
# Columns to blend and their weights for the stacked submission.
# NOTE(review): three columns are listed but only two weights are given; the
# element-wise product in the following cell needs one weight per column.
# Confirm which column should be dropped (or which weight is missing).
cols_to_consider = ["lamau_81425_full_dataset", "lama_stack_time_series", "xgb_81325_full_dataset"]
weights = np.array([0.15, 0.85])

In [15]:
# Blend the columns in cols_to_consider with `weights` and write the submission file.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
# Predictions of the stacked model are read from a CSV in the working directory.
lama_stack_pred = pd.read_csv("lama_stack_time_series.csv")
# Column assignment aligns on the index — assumes both frames share the same
# row order / default index. TODO confirm.
test["lama_stack_time_series"] = lama_stack_pred["target"]
test["target"] = np.round(np.sum((test[cols_to_consider].to_numpy() * weights), axis=1), 10)
test[['id', 'target']].to_csv(f'blend_stack_3.csv', index=False)

In [23]:
test[cols_to_consider].to_numpy()

array([[0.01422167, 0.01676077, 0.01310246, 0.01688302],
       [0.26035836, 0.26014048, 0.22095208, 0.2367294 ],
       [0.03843449, 0.04023904, 0.03368598, 0.03804932],
       ...,
       [0.00844403, 0.01244384, 0.00624904, 0.00835771],
       [0.02531996, 0.03010638, 0.02247832, 0.02501185],
       [0.01512659, 0.01911527, 0.01602159, 0.01462073]])

In [16]:
# res = pd.DataFrame()
# res[MODEL_NAME] = oof[:, 1]
# res.to_csv(MODEL_DIR / "oof.csv", index=False)
# #joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# with (MODEL_DIR / "params.yaml").open("w") as f:
#     yaml.dump(model.params, f)

# with (MODEL_DIR / "score.txt").open("w") as f:
#     print("OOF:", metric(y_train, oof), file=f)
    
# test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
# test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
# test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)

In [24]:
# Score the test set with a fitted model and write `<MODEL_NAME>.csv`.
# NOTE(review): neither `model` nor `MODEL_NAME` is defined above this cell in the
# notebook, so it only works with leftover kernel state.
test = pd.read_parquet(DATA_PATH / "test_preproc_oof.parquet")
# Stack columns are cast to float32 before prediction.
test[cfg["stack_features"]] = test[cfg["stack_features"]].astype(np.float32)
# Column 1 of the prediction is kept — presumably the positive-class probability; confirm.
test["target"] = model.predict(test[cfg["selected_features"] + cfg["stack_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv(f'{MODEL_NAME}.csv', index=False)

In [26]:
model.model.get_feature_scores()[:50]

Unnamed: 0,Feature,Importance
0,lamau_814_full_dataset,78048.337071
1,xgb_81325_full_dataset,29297.001751
2,lama_81298_full_dataset,24098.444188
3,feature_162,1398.695796
4,feature_24,1321.674997
5,feature_18,1298.269506
6,feature_26,1290.120701
7,feature_145,1287.683197
8,feature_78,1279.916003
9,feature_36,1248.758602


In [24]:
# Two-column table (names, imp) built from the first entry of model.models.
# NOTE(review): this reads `model.models[0]`, while the cell above reads
# `model.model` — presumably a different model object was in the kernel; confirm.
imp = pd.DataFrame().assign(names=model.models[0].feature_names_, imp=model.models[0].feature_importances_)

In [41]:
imp.sort_values(by="imp", ascending=False).reset_index(drop=True).query("names == 'feature_185'")

Unnamed: 0,names,imp
61,feature_185,0.0


In [29]:
cat_columns

['feature_7',
 'feature_31',
 'feature_60',
 'feature_61',
 'feature_71',
 'feature_109',
 'feature_122',
 'feature_156',
 'feature_163',
 'feature_167',
 'feature_179',
 'feature_185']

## With time-series cross-validation

In [13]:
# Rebind `metric` (previously roc_auc_score) to the project's RocAuc metric object.
# NOTE(review): the RocAuc import is commented out in the imports cell.
metric = RocAuc()

In [14]:
# Order rows by id before the time-series split — assumes id order is chronological; TODO confirm.
df_train = df_train.sort_values(by="id").reset_index(drop=True)
# Rebuild the inputs from the sorted frame; the stack features are not included here.
X_train, y_train = df_train[cfg["selected_features"] + cat_columns], df_train["target"]

In [15]:
# Tune (timeout of 60 * 60) and then fit a LightGBM classifier with time_series=True.
# The value returned by `fit` is used below as the out-of-fold predictions.
# NOTE(review): LightGBMClassification is not imported in the visible imports cell.
model = LightGBMClassification(n_jobs=16, time_series=True)
model.tune(X_train, y_train, metric, timeout=60 * 60, categorical_features=cat_columns)
oof = model.fit(X_train, y_train, categorical_features=cat_columns)

print(metric(y_train, oof))

[2024-11-07 08:27:33,388] - [   START    ] - Tuning LightGBMClassification
[2024-11-07 08:27:40,118] - [   OPTUNA   ] - Trial 0. New best score 0.7903405446081995 with parameters {'max_depth': 6, 'num_leaves': 488, 'min_data_in_leaf': 188, 'bagging_fraction': 0.7993292420985183, 'bagging_freq': 0, 'feature_fraction': 0.49359671220172163, 'lambda_l1': 0.5808361216819946, 'lambda_l2': 8.661761457749352, 'min_gain_to_split': 12.022300234864176, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 08:28:03,043] - [   OPTUNA   ] - Trial 2. New best score 0.7913515589848906 with parameters {'max_depth': 5, 'num_leaves': 194, 'min_data_in_leaf': 117, 'bagging_fraction': 0.8925879806965068, 'bagging_freq': 0, 'feature_fraction': 0.708540663048167, 'lambda_l1': 5.924145688620425, 'lambda_l2': 0.46450412719997725, 'min_gain_to_split': 12.150897038028766, 'is_unbalance': True, 'num_iterations': 2}
[2024-11-07 08:28:12,225] - [   OPTUNA   ] - Trial 3. New best score 0.8048686053278628 with param

In [27]:
# Count the rows whose out-of-fold prediction has a NaN in any column.
# The count is used below as a slice start, which assumes those rows form a
# contiguous prefix of `oof` — TODO confirm.
nan_row_mask = np.isnan(oof).any(axis=1)
none_oofs_idx = oof[nan_row_mask].shape[0]

In [32]:
metric(y_train[none_oofs_idx:], oof[none_oofs_idx:])

0.8095227594190041

In [34]:
# Name used for the model directory and for the files written into it.
MODEL_NAME = "lgb_8095_full_dataset_time_series"
# NOTE(review): this path climbs three levels ("../../../data") while DATA_PATH
# climbs four ("../../../../data/") — confirm which depth is intended.
MODEL_DIR = Path(f"../../../data/models/{MODEL_NAME}")
# Creates only the leaf directory; the parent "models" directory must already exist.
MODEL_DIR.mkdir(exist_ok=True)

In [35]:
# Persist the out-of-fold predictions, the fitted model, its parameters, the
# score and the test-set predictions under MODEL_DIR.
res = pd.DataFrame()
# Only rows from none_oofs_idx onward are saved, taking column 1 of `oof`.
res[MODEL_NAME] = oof[none_oofs_idx:, 1]
res.to_csv(MODEL_DIR / "oof.csv", index=False)
joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")

with (MODEL_DIR / "params.yaml").open("w") as f:
    yaml.dump(model.params, f)

# NOTE(review): the score written here uses the full y_train / oof, whereas the
# score displayed above was computed on the slices starting at none_oofs_idx —
# confirm the two are meant to agree.
with (MODEL_DIR / "score.txt").open("w") as f:
    print("OOF:", metric(y_train, oof), file=f)
    
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv(MODEL_DIR / f'{MODEL_NAME}.csv', index=False)

## TEST 
**81.22112399468679**

## Inference

In [27]:
# Score the test set with the fitted model and write the submission to the working directory.
test = pd.read_parquet(DATA_PATH / "test_preproc_2.parquet")
# Column 1 of the prediction is kept — presumably the positive-class probability; confirm.
test["target"] = model.predict(test[cfg["selected_features"] + cat_columns])[:, 1]
test[['id', 'target']].to_csv('lgb_813.csv', index=False)

In [25]:
# Three previously written prediction files, read from the working directory.
pred_1 = pd.read_csv("lama_utilized.csv")
pred_2 = pd.read_csv("lgmb_oe_ohe_cols_0805.csv")
pred_3 = pd.read_csv("catboost_ts.csv")

In [26]:
# Fixed 0.6 / 0.2 / 0.2 blend of the three prediction files, combined by index
# (assumes all three share the same row order — TODO confirm).
# NOTE: this overwrites pred_1["target"] in place, so re-running the cell without
# re-reading the CSVs blends an already blended column.
pred_1["target"] = 0.6 * pred_1["target"] + 0.2 * pred_2["target"] + 0.2 * pred_3["target"]

In [29]:
pred_1.to_csv("blend.csv", index=False)

In [166]:
# Leftover interactive help lookup (IPython `?` syntax); has no effect on the analysis.
MODEL_DIR.open?

Signature:
MODEL_DIR.open(
    mode='r',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
)
Docstring:
Open the file pointed by this path and return a file object, as
the built-in open() function does.
File:      /usr/lib/python3.10/pathlib.py
Type:      method