In [54]:
# Pipeline stage switches.
# DO_EMB_BASE gates the ConvNeXt-Base embedding extraction in the [STEP V1] cell.
# NOTE(review): DO_T1_TAB, DO_T2_FUS and DO_SEG_OOF are not read by any cell visible here — confirm they are still needed.
DO_EMB_BASE = True
DO_T1_TAB   = True
DO_T2_FUS   = True
DO_SEG_OOF  = True


In [1]:
# [STEP 0] IMPORTS AND PATHS
import os, io
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

import torch
import torchvision as tv
from catboost import CatBoostRegressor, Pool

# Project root (the parquet files, sample_submission_f.csv and the image folders must live here).
# Resolved from the current working directory, so the notebook has to be started from the project folder.
PROJ = Path(".").resolve()

TRAIN_PARQUET = PROJ / "train_dataset.parquet"
TEST_PARQUET  = PROJ / "test_dataset.parquet"
SAMPLE_CSV    = PROJ / "sample_submission_f.csv"

TRAIN_IMG_DIR = PROJ / "train_images"
TEST_IMG_DIR  = PROJ / "test_images"

# Folders for the image-embedding parquet chunks; created here if they do not exist yet.
CHUNK_DIR_TRAIN = PROJ / "emb_chunks_train"
CHUNK_DIR_TEST  = PROJ / "emb_chunks_test"
CHUNK_DIR_TRAIN.mkdir(exist_ok=True)
CHUNK_DIR_TEST.mkdir(exist_ok=True)

# Checkpoint list files.
# NOTE(review): presumably passed as ckpt_path to extract_embeddings_from_dir; that call is not in the visible cells — confirm.
CKPT_LIST_TRAIN = PROJ / "_processed_train.txt"
CKPT_LIST_TEST  = PROJ / "_processed_test.txt"

# Sanity check: fail early if any required input is missing.
assert TRAIN_PARQUET.exists()
assert TEST_PARQUET.exists()
assert SAMPLE_CSV.exists()
assert TRAIN_IMG_DIR.exists()
assert TEST_IMG_DIR.exists()

# Device selection: prefer MPS (Apple), then CUDA, otherwise fall back to CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)


Device: mps


In [2]:
# [STEP 1] LOAD TABLES
train = pd.read_parquet(TRAIN_PARQUET, engine="pyarrow")
test  = pd.read_parquet(TEST_PARQUET, engine="pyarrow")

# Column names used throughout the notebook: row identifier and regression target.
id_col = "ID"
target_col = "price_TARGET"

print("Train:", train.shape, "Test:", test.shape)


Train: (70000, 35) Test: (25000, 34)


In [6]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,ID,equipment,body_type,drive_type,engine_type,doors_number,color,pts,audiosistema,diski,...,fary_mult,multimedia_navigacia_mult,obogrev_mult,pamyat_nastroek_mult,podushki_bezopasnosti_mult,pomosh_pri_vozhdenii_mult,protivoygonnaya_sistema_mult,salon_mult,upravlenie_klimatom_mult,price_TARGET
0,58146,–ë–∞–∑–æ–≤–∞—è,–°–µ–¥–∞–Ω,–ü–µ—Ä–µ–¥–Ω–∏–π,–ë–µ–Ω–∑–∏–Ω,4,–°–∏–Ω–∏–π,–î—É–±–ª–∏–∫–∞—Ç,,,...,[None],[None],[None],[None],[None],[None],[None],[None],[None],51000
1,112144,–ë–∞–∑–æ–≤–∞—è,–£–Ω–∏–≤–µ—Ä—Å–∞–ª,–ó–∞–¥–Ω–∏–π,–ë–µ–Ω–∑–∏–Ω,5,–ë–µ–∂–µ–≤—ã–π,–û—Ä–∏–≥–∏–Ω–∞–ª,,"14""",...,[None],[None],[None],[None],[None],[None],[–°–∏–≥–Ω–∞–ª–∏–∑–∞—Ü–∏—è],[None],[None],195000
2,120705,,–í–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫,–ü–æ–ª–Ω—ã–π,–ì–∏–±—Ä–∏–¥,5,–ß—ë—Ä–Ω—ã–π,–≠–ª–µ–∫—Ç—Ä–æ–Ω–Ω—ã–π,,,...,"[–ü—Ä–æ—Ç–∏–≤–æ—Ç—É–º–∞–Ω–Ω—ã–µ, –û–º—ã–≤–∞—Ç–µ–ª–∏ —Ñ–∞—Ä, –ê–¥–∞–ø—Ç–∏–≤–Ω–æ–µ –æ—Å...","[CD –ø—Ä–∏–≤–æ–¥, MP3, –†–∞–¥–∏–æ, TV, –≠–∫—Ä–∞–Ω, –£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ ...","[–ü–µ—Ä–µ–¥–Ω–∏—Ö —Å–∏–¥–µ–Ω–∏–π, –ó–∞–¥–Ω–∏—Ö —Å–∏–¥–µ–Ω–∏–π, –ó–µ—Ä–∫–∞–ª, –ó–∞–¥...","[–°–∏–¥–µ–Ω—å—è –≤–æ–¥–∏—Ç–µ–ª—è, –ó–∞–¥–Ω–∏—Ö —Å–∏–¥–µ–Ω–∏–π, –ó–µ—Ä–∫–∞–ª, –†—É–ª...","[–§—Ä–æ–Ω—Ç–∞–ª—å–Ω–∞—è –¥–ª—è –≤–æ–¥–∏—Ç–µ–ª—è, –ö–æ–ª–µ–Ω–Ω—ã–µ, –®—Ç–æ—Ä–∫–∏, –ë...","[–ê–≤—Ç–æ–ø–∞—Ä–∫–æ–≤—â–∏–∫, –î–∞—Ç—á–∏–∫ –¥–æ–∂–¥—è, –î–∞—Ç—á–∏–∫ —Å–≤–µ—Ç–∞, –ü–∞...","[–°–∏–≥–Ω–∞–ª–∏–∑–∞—Ü–∏—è, –¶–µ–Ω—Ç—Ä–∞–ª—å–Ω—ã–π –∑–∞–º–æ–∫, –ò–º–º–æ–±–∏–ª–∞–π–∑–µ—Ä...","[–ö–æ–∂–∞–Ω—ã–π —Ä—É–ª—å, –õ—é–∫]","[–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –Ω–∞ —Ä—É–ª–µ, –ê—Ç–µ—Ä–º–∞–ª—å–Ω–æ–µ –æ—Å—Ç–µ–∫–ª–µ–Ω–∏–µ]",7251000
3,291392,Titanium,–°–µ–¥–∞–Ω,–ü–µ—Ä–µ–¥–Ω–∏–π,–ë–µ–Ω–∑–∏–Ω,4,–°–µ—Ä–µ–±—Ä—è–Ω—ã–π,–û—Ä–∏–≥–∏–Ω–∞–ª,6 –∫–æ–ª–æ–Ω–æ–∫,"16""",...,[None],"[CD –ø—Ä–∏–≤–æ–¥, MP3, –†–∞–¥–∏–æ, TV, –≠–∫—Ä–∞–Ω, –£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ ...","[–ü–µ—Ä–µ–¥–Ω–∏—Ö —Å–∏–¥–µ–Ω–∏–π, –ó–∞–¥–Ω–µ–≥–æ —Å—Ç–µ–∫–ª–∞]",[None],"[–§—Ä–æ–Ω—Ç–∞–ª—å–Ω–∞—è –¥–ª—è –≤–æ–¥–∏—Ç–µ–ª—è, –ö–æ–ª–µ–Ω–Ω—ã–µ, –®—Ç–æ—Ä–∫–∏, –ë...","[–î–∞—Ç—á–∏–∫ –¥–æ–∂–¥—è, –î–∞—Ç—á–∏–∫ —Å–≤–µ—Ç–∞, –ü–∞—Ä–∫—Ç—Ä–æ–Ω–∏–∫ –∑–∞–¥–Ω–∏–π...","[–°–∏–≥–Ω–∞–ª–∏–∑–∞—Ü–∏—è, –¶–µ–Ω—Ç—Ä–∞–ª—å–Ω—ã–π –∑–∞–º–æ–∫]",[–ö–æ–∂–∞–Ω—ã–π —Ä—É–ª—å],[–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –Ω–∞ —Ä—É–ª–µ],1067000
4,35742,–ë–∞–∑–æ–≤–∞—è,–°–µ–¥–∞–Ω,–ü–µ—Ä–µ–¥–Ω–∏–π,–ë–µ–Ω–∑–∏–Ω,4,–ß—ë—Ä–Ω—ã–π,–û—Ä–∏–≥–∏–Ω–∞–ª,,,...,[None],[None],[None],[None],[None],[None],[None],[None],[None],54000


In [57]:
# [STEP 2] FEATURES (tabular)
# Three groups of columns: numeric, single-valued categorical, and multi-valued ("*_mult") option lists.
num_cols  = ["doors_number","crashes_count","owners_count","mileage","latitude","longitude"]
cat_cols  = ["equipment","body_type","drive_type","engine_type","color","pts",
             "steering_wheel","audiosistema","diski","electropodemniki","fary",
             "salon","upravlenie_klimatom","usilitel_rul"]
multi_cols = ["aktivnaya_bezopasnost_mult","audiosistema_mult","shini_i_diski_mult",
              "electroprivod_mult","fary_mult","multimedia_navigacia_mult","obogrev_mult",
              "pamyat_nastroek_mult","podushki_bezopasnosti_mult","pomosh_pri_vozhdenii_mult",
              "protivoygonnaya_sistema_mult","salon_mult","upravlenie_klimatom_mult"]

# Numeric columns: coerce to numbers; unparseable or missing values become the sentinel -1.
# NOTE: train/test are overwritten in place, so re-running this cell works on already-cleaned data.
for c in num_cols:
    train[c] = pd.to_numeric(train[c], errors="coerce").fillna(-1)
    test[c]  = pd.to_numeric(test[c], errors="coerce").fillna(-1)

# Categorical columns: missing values become the literal category "Unknown"; everything is cast to str.
for c in cat_cols:
    train[c] = train[c].fillna("Unknown").astype(str)
    test[c]  = test[c].fillna("Unknown").astype(str)


In [58]:
# [STEP 3] MULTI-COLS -> ONE-HOT (top-N)
from collections import Counter

def split_listlike(s):
    """Normalise one multi-value cell into a list of clean option strings.

    Accepted inputs:
      * ``None`` or a float NaN         -> ``[]``
      * list / tuple / set / np.ndarray -> each element is stringified and stripped
      * anything else                   -> ``str(s)`` is parsed as a bracketed,
        comma-separated list (brackets and quotes are removed first)

    Empty strings and the placeholders ``"None"`` / ``"[None]"`` are dropped.

    Returns:
        list[str]: the option names found in the cell (possibly empty).
    """
    if s is None:
        return []
    # A missing scalar cell arrives as float NaN; str(nan) would otherwise
    # produce the bogus option "nan".
    if isinstance(s, float) and s != s:
        return []
    # List-typed parquet columns come back from pyarrow as numpy arrays.
    # str() of an array has no commas ("['a' 'b']"), so the string fallback
    # below would glue every option of the cell into a single token.
    if isinstance(s, np.ndarray):
        s = s.ravel().tolist()
    if isinstance(s, (list, tuple, set)):
        return [str(x).strip() for x in s if str(x).strip() not in ("", "None", "[None]")]
    s = str(s).strip().replace("[","").replace("]","").replace("'","").replace('"',"")
    if s in ("", "None", "[None]"):
        return []
    return [p.strip() for p in s.split(",") if p.strip() not in ("", "None")]

def expand_multicolumns(df, cols, topN=400, vocab_map=None, fit=True):
    """One-hot encode the options found in several multi-value columns.

    All columns in ``cols`` share one vocabulary: an output column is 1 for a
    row if the option occurs in any of the listed columns of that row.

    Args:
        df: input frame containing ``cols``.
        cols: names of the multi-value columns (cells are parsed by ``split_listlike``).
        topN: when fitting, keep only the ``topN`` most frequent options.
        vocab_map: ``{option: column_index}`` mapping; required when ``fit=False``,
            ignored (rebuilt) when ``fit=True``.
        fit: build the vocabulary from ``df`` (True) or reuse ``vocab_map`` (False).

    Returns:
        (DataFrame, dict): int8 indicator frame indexed like ``df`` with columns
        ``m_<option>``, and the vocabulary mapping that was used.

    Raises:
        ValueError: if ``fit=False`` and no ``vocab_map`` is given.
    """
    cols = list(cols)
    # Tokenise every cell exactly once; both the counting pass and the filling
    # pass below reuse these lists.
    parsed = [df[c].map(split_listlike).tolist() for c in cols]

    if fit:
        cnt = Counter()
        # Column-major, row-minor order is kept so that ties in most_common()
        # are broken the same way on every run.
        for column_tokens in parsed:
            for tokens in column_tokens:
                cnt.update(tokens)
        vocab = [k for k, _ in cnt.most_common(topN)]
        vocab_map = {k: i for i, k in enumerate(vocab)}
    else:
        if vocab_map is None:
            raise ValueError("vocab_map is required when fit=False")
        # Order columns by the stored index rather than by dict insertion order,
        # so column names always line up with the indices written below.
        vocab = sorted(vocab_map, key=vocab_map.get)

    X = np.zeros((len(df), len(vocab)), dtype=np.int8)
    for column_tokens in parsed:
        for i, tokens in enumerate(column_tokens):
            for opt in tokens:
                j = vocab_map.get(opt)
                if j is not None:  # options outside the vocabulary are ignored
                    X[i, j] = 1

    return pd.DataFrame(X, index=df.index, columns=[f"m_{v}" for v in vocab]), vocab_map

# Fit the option vocabulary on train only, then reuse it for test so both frames get the same columns.
mtrain, vocab_map = expand_multicolumns(train, multi_cols, fit=True)
mtest,  _         = expand_multicolumns(test,  multi_cols, vocab_map=vocab_map, fit=False)

# Final tabular frames: ID + numeric + categorical + one-hot option indicators.
train_exp = pd.concat([train[[id_col]+num_cols+cat_cols], mtrain], axis=1)
test_exp  = pd.concat([test[[id_col]+num_cols+cat_cols],  mtest],  axis=1)
print("Expanded:", train_exp.shape, test_exp.shape)


Expanded: (70000, 421) (25000, 421)


In [59]:
# [STEP 4] BASELINE CATBOOST (tabular features)
def to_log(y):
    """Map target values into log space as log(1 + y)."""
    return np.log1p(y)
def from_log(y):
    """Inverse of ``to_log``: map log-space values back as exp(y) - 1."""
    return np.expm1(y)
def median_ape(y_true, y_pred):
    """Median absolute percentage error of ``y_pred`` against ``y_true``.

    The denominator is clipped from below at 1e-9 so zero targets do not
    divide by zero.
    """
    abs_err = np.abs(y_pred - y_true)
    denom = np.clip(y_true, 1e-9, None)
    return np.median(abs_err / denom)

# Hold-out split: rows whose index label is below 85% of the row count are used for fitting,
# the remaining rows for validation (no shuffling).
# NOTE(review): this equals a positional split only if `train` still has its default 0..n-1 index — confirm.
train_mask = train.index < int(0.85*len(train))
valid_mask = ~train_mask

X_tr = train_exp.loc[train_mask].drop(columns=[id_col])
X_va = train_exp.loc[valid_mask].drop(columns=[id_col])
# Targets are modelled in log1p space; predictions are mapped back with from_log before scoring.
y_tr = to_log(train.loc[train_mask, target_col].values)
y_va = to_log(train.loc[valid_mask, target_col].values)

# Positional indices of the categorical columns, as CatBoost's Pool expects.
cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols if c in X_tr.columns]
pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx)
pool_va = Pool(X_va, y_va, cat_features=cat_idx)

# Iteration-based overfitting detector: stop after 200 rounds without improvement on the eval set.
cb_tab = CatBoostRegressor(
    loss_function="RMSE", depth=10, learning_rate=0.05,
    iterations=5000, od_type="Iter", od_wait=200,
    random_seed=42, verbose=200
)
cb_tab.fit(pool_tr, eval_set=pool_va)

# Score on the original price scale.
pred_va_tab = from_log(cb_tab.predict(pool_va))
val_medAPE_tab = median_ape(train.loc[valid_mask, target_col], pred_va_tab)
print("VALID medianAPE (tabular):", val_medAPE_tab)


0:	learn: 0.9761456	test: 0.9706357	best: 0.9706357 (0)	total: 75.6ms	remaining: 6m 17s
200:	learn: 0.4769189	test: 0.4979249	best: 0.4979249 (200)	total: 12s	remaining: 4m 47s
400:	learn: 0.4486526	test: 0.4818907	best: 0.4818907 (400)	total: 23.7s	remaining: 4m 31s
600:	learn: 0.4326295	test: 0.4762329	best: 0.4762329 (600)	total: 35.8s	remaining: 4m 21s
800:	learn: 0.4205640	test: 0.4731046	best: 0.4731046 (800)	total: 47.9s	remaining: 4m 11s
1000:	learn: 0.4102259	test: 0.4717161	best: 0.4717161 (1000)	total: 1m	remaining: 4m
1200:	learn: 0.4010198	test: 0.4704237	best: 0.4704237 (1200)	total: 1m 12s	remaining: 3m 49s
1400:	learn: 0.3925431	test: 0.4692284	best: 0.4692284 (1400)	total: 1m 24s	remaining: 3m 37s
1600:	learn: 0.3849826	test: 0.4683723	best: 0.4683723 (1600)	total: 1m 37s	remaining: 3m 26s
1800:	learn: 0.3783611	test: 0.4679684	best: 0.4679163 (1784)	total: 1m 50s	remaining: 3m 15s
2000:	learn: 0.3719208	test: 0.4674675	best: 0.4674675 (2000)	total: 2m 3s	remaining: 3m

In [60]:
# === TUNING PARAMETERS (EDIT HERE) ===
N_SUB = 30000            # was 20000; 40000 is possible if there is enough RAM
CV_FOLDS = 3             # use 5 for a more honest estimate
ITER_OBJ = 1200          # was 600; number of CatBoost iterations in EACH trial
N_TRIALS = 40            # was 15; number of Optuna trials
OD_WAIT = 80             # early-stopping patience inside the trials
FINAL_ITERS = 2000       # final fit of cb_tab on the WHOLE train set
# =====================================

# NOTE(review): X_tab_all, y_all, cat_idx_tab, KFold, HAS_OPTUNA and optuna are not defined or imported
# in any earlier visible cell (y_all is only assigned in the later [STEP 7] cell) — this cell relies on
# kernel state; confirm where they come from before a Restart & Run All.

# Subsample: a fixed-seed random subset of rows, drawn without replacement, used only for the search.
rng = np.random.RandomState(42)
sub_idx = rng.choice(len(train), size=min(N_SUB, len(train)), replace=False)
X_sub = X_tab_all.iloc[sub_idx].reset_index(drop=True)
y_sub = y_all[sub_idx]

cvk = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)

if HAS_OPTUNA:
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=max(2, CV_FOLDS-1))
    def objective_tab_lite(trial):
        """Optuna objective: cross-validated median APE of a tabular CatBoost on the subsample.

        Each fold's median APE is reported to the trial so the pruner can stop it early;
        the returned value is the median APE over all out-of-fold predictions.
        """
        params = {
            "loss_function": "RMSE",
            "depth": trial.suggest_int("depth", 7, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.09, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2.0, 15.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 0.8),
            "rsm": trial.suggest_float("rsm", 0.85, 1.0),
            "random_strength": trial.suggest_float("random_strength", 0.8, 1.8),
            "iterations": ITER_OBJ,     # <- increased
            "od_type": "Iter",
            "od_wait": OD_WAIT,
            "random_seed": 42,
            "verbose": False,
            "thread_count": -1
        }
        # NaN-initialised so any row missed by the folds would be visible in the result.
        oof = np.full(len(X_sub), np.nan, dtype=float)
        fold = 0
        for tr_idx, va_idx in cvk.split(X_sub):
            fold += 1
            X_tr, X_va = X_sub.iloc[tr_idx], X_sub.iloc[va_idx]
            y_tr, y_va = y_sub[tr_idx], y_sub[va_idx]
            # Both slices come from X_sub, so the two index lists are computed from the same columns.
            cat_idx_tr = [X_tr.columns.get_loc(c) for c in cat_cols if c in X_tr.columns]
            cat_idx_va = [X_va.columns.get_loc(c) for c in cat_cols if c in X_va.columns]
            m = CatBoostRegressor(**params)
            # Fit in log space; the validation fold doubles as the early-stopping eval set.
            m.fit(Pool(X_tr, to_log(y_tr), cat_features=cat_idx_tr),
                  eval_set=Pool(X_va, to_log(y_va), cat_features=cat_idx_va))
            pred_va = from_log(m.predict(Pool(X_va, cat_features=cat_idx_va)))
            oof[va_idx] = pred_va
            trial.report(median_ape(y_va, pred_va), step=fold)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
        return median_ape(y_sub, oof)

    study_tab_lite = optuna.create_study(direction="minimize", pruner=pruner)
    study_tab_lite.optimize(objective_tab_lite, n_trials=N_TRIALS, show_progress_bar=True)
    best_params = study_tab_lite.best_trial.params
    print("TAB-lite best medianAPE (sub, CV):", study_tab_lite.best_value)
    print("TAB-lite best params:", best_params)
else:
    # Fallback hyper-parameters when Optuna is not available.
    best_params = dict(
        depth=10, learning_rate=0.05, l2_leaf_reg=8.0,
        bagging_temperature=0.3, rsm=0.9, random_strength=1.0,
        iterations=ITER_OBJ, od_type="Iter", od_wait=OD_WAIT
    )

# Final CatBoost on the whole train set — iterations are overridden with FINAL_ITERS.
# This rebinds cb_tab, replacing the hold-out model from [STEP 4].
cb_tab = CatBoostRegressor(
    loss_function="RMSE", random_seed=42, verbose=200, thread_count=-1,
    **{**best_params, "iterations": FINAL_ITERS}
)
cb_tab.fit(Pool(X_tab_all, to_log(y_all), cat_features=cat_idx_tab))


[I 2025-10-02 02:57:03,705] A new study created in memory with name: no-name-9b607f44-c6ba-4948-82ce-b57df630cbe5
Best trial: 0. Best value: 0.273032:   2%|‚ñé         | 1/40 [00:50<32:32, 50.07s/it]

[I 2025-10-02 02:57:53,777] Trial 0 finished with value: 0.2730316946732103 and parameters: {'depth': 7, 'learning_rate': 0.03562262385685697, 'l2_leaf_reg': 3.558302467297511, 'bagging_temperature': 0.06175383446705274, 'rsm': 0.9668053333634705, 'random_strength': 1.377941309288312}. Best is trial 0 with value: 0.2730316946732103.


Best trial: 1. Best value: 0.271462:   5%|‚ñå         | 2/40 [01:41<32:16, 50.96s/it]

[I 2025-10-02 02:58:45,356] Trial 1 finished with value: 0.27146157869614007 and parameters: {'depth': 7, 'learning_rate': 0.08373456948979954, 'l2_leaf_reg': 8.516456404218069, 'bagging_temperature': 0.3021700357148615, 'rsm': 0.9644358086249386, 'random_strength': 1.5412626395873874}. Best is trial 1 with value: 0.27146157869614007.


Best trial: 2. Best value: 0.268546:   8%|‚ñä         | 3/40 [03:05<40:41, 65.99s/it]

[I 2025-10-02 03:00:09,239] Trial 2 finished with value: 0.2685455301917272 and parameters: {'depth': 9, 'learning_rate': 0.046196142562088496, 'l2_leaf_reg': 2.9756750122852633, 'bagging_temperature': 0.5567454913488928, 'rsm': 0.8840354137567983, 'random_strength': 0.8844576941948337}. Best is trial 2 with value: 0.2685455301917272.


Best trial: 2. Best value: 0.268546:  10%|‚ñà         | 4/40 [04:19<41:35, 69.32s/it]

[I 2025-10-02 03:01:23,668] Trial 3 finished with value: 0.2690160978753884 and parameters: {'depth': 8, 'learning_rate': 0.04882760771868774, 'l2_leaf_reg': 2.7360820703677198, 'bagging_temperature': 0.25997338214668425, 'rsm': 0.9184391014538555, 'random_strength': 0.9567275561398292}. Best is trial 2 with value: 0.2685455301917272.


Best trial: 4. Best value: 0.267906:  12%|‚ñà‚ñé        | 5/40 [06:16<50:21, 86.34s/it]

[I 2025-10-02 03:03:20,179] Trial 4 finished with value: 0.2679063439535001 and parameters: {'depth': 10, 'learning_rate': 0.06446258915141861, 'l2_leaf_reg': 4.744085752464073, 'bagging_temperature': 0.6099029822140482, 'rsm': 0.9965911020298823, 'random_strength': 1.6700322788137818}. Best is trial 4 with value: 0.2679063439535001.


Best trial: 4. Best value: 0.267906:  15%|‚ñà‚ñå        | 6/40 [06:51<39:01, 68.86s/it]

[I 2025-10-02 03:03:55,121] Trial 5 pruned. 


Best trial: 4. Best value: 0.267906:  18%|‚ñà‚ñä        | 7/40 [07:49<35:58, 65.42s/it]

[I 2025-10-02 03:04:53,453] Trial 6 pruned. 


Best trial: 4. Best value: 0.267906:  20%|‚ñà‚ñà        | 8/40 [08:34<31:20, 58.75s/it]

[I 2025-10-02 03:05:37,921] Trial 7 pruned. 


Best trial: 4. Best value: 0.267906:  22%|‚ñà‚ñà‚ñé       | 9/40 [10:17<37:37, 72.82s/it]

[I 2025-10-02 03:07:21,681] Trial 8 finished with value: 0.2689662175934525 and parameters: {'depth': 10, 'learning_rate': 0.03833698769458851, 'l2_leaf_reg': 6.000571607901351, 'bagging_temperature': 0.7952361392788685, 'rsm': 0.8670402951330227, 'random_strength': 1.6513981964420252}. Best is trial 4 with value: 0.2679063439535001.


Best trial: 4. Best value: 0.267906:  25%|‚ñà‚ñà‚ñå       | 10/40 [11:17<34:17, 68.58s/it]

[I 2025-10-02 03:08:20,769] Trial 9 pruned. 


Best trial: 10. Best value: 0.266301:  28%|‚ñà‚ñà‚ñä       | 11/40 [13:10<39:46, 82.31s/it]

[I 2025-10-02 03:10:14,199] Trial 10 finished with value: 0.2663007765887189 and parameters: {'depth': 10, 'learning_rate': 0.06466753348741862, 'l2_leaf_reg': 4.986095343730952, 'bagging_temperature': 0.4731878364294065, 'rsm': 0.9955992183399607, 'random_strength': 1.7834660196350933}. Best is trial 10 with value: 0.2663007765887189.


Best trial: 10. Best value: 0.266301:  30%|‚ñà‚ñà‚ñà       | 12/40 [15:02<42:36, 91.29s/it]

[I 2025-10-02 03:12:06,037] Trial 11 pruned. 


Best trial: 10. Best value: 0.266301:  32%|‚ñà‚ñà‚ñà‚ñé      | 13/40 [16:18<39:00, 86.67s/it]

[I 2025-10-02 03:13:22,068] Trial 12 pruned. 


Best trial: 10. Best value: 0.266301:  35%|‚ñà‚ñà‚ñà‚ñå      | 14/40 [18:08<40:41, 93.90s/it]

[I 2025-10-02 03:15:12,672] Trial 13 pruned. 


Best trial: 10. Best value: 0.266301:  38%|‚ñà‚ñà‚ñà‚ñä      | 15/40 [19:12<35:21, 84.85s/it]

[I 2025-10-02 03:16:16,541] Trial 14 pruned. 


Best trial: 10. Best value: 0.266301:  40%|‚ñà‚ñà‚ñà‚ñà      | 16/40 [21:16<38:34, 96.44s/it]

[I 2025-10-02 03:18:19,899] Trial 15 finished with value: 0.268151228102235 and parameters: {'depth': 10, 'learning_rate': 0.0568255832048168, 'l2_leaf_reg': 4.5338364767378, 'bagging_temperature': 0.6635767778893444, 'rsm': 0.9842180224802932, 'random_strength': 1.2322101403268149}. Best is trial 10 with value: 0.2663007765887189.


Best trial: 10. Best value: 0.266301:  42%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 17/40 [23:16<39:44, 103.68s/it]

[I 2025-10-02 03:20:20,436] Trial 16 pruned. 


Best trial: 10. Best value: 0.266301:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 18/40 [24:04<31:53, 86.97s/it] 

[I 2025-10-02 03:21:08,490] Trial 17 pruned. 


Best trial: 10. Best value: 0.266301:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 19/40 [25:09<28:03, 80.14s/it]

[I 2025-10-02 03:22:12,734] Trial 18 pruned. 


Best trial: 10. Best value: 0.266301:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 20/40 [27:13<31:08, 93.41s/it]

[I 2025-10-02 03:24:17,077] Trial 19 pruned. 


Best trial: 10. Best value: 0.266301:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 21/40 [28:20<27:02, 85.40s/it]

[I 2025-10-02 03:25:23,777] Trial 20 pruned. 


Best trial: 10. Best value: 0.266301:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 22/40 [30:23<29:00, 96.69s/it]

[I 2025-10-02 03:27:26,814] Trial 21 pruned. 


Best trial: 10. Best value: 0.266301:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 23/40 [32:28<29:49, 105.26s/it]

[I 2025-10-02 03:29:32,066] Trial 22 pruned. 


Best trial: 10. Best value: 0.266301:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 24/40 [33:49<26:09, 98.10s/it] 

[I 2025-10-02 03:30:53,467] Trial 23 pruned. 


Best trial: 10. Best value: 0.266301:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 25/40 [35:14<23:29, 93.95s/it]

[I 2025-10-02 03:32:17,730] Trial 24 pruned. 


Best trial: 10. Best value: 0.266301:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 26/40 [36:21<20:04, 86.01s/it]

[I 2025-10-02 03:33:25,205] Trial 25 pruned. 


Best trial: 10. Best value: 0.266301:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 27/40 [37:42<18:18, 84.48s/it]

[I 2025-10-02 03:34:46,140] Trial 26 pruned. 


Best trial: 10. Best value: 0.266301:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 28/40 [38:49<15:52, 79.34s/it]

[I 2025-10-02 03:35:53,479] Trial 27 pruned. 


Best trial: 10. Best value: 0.266301:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 29/40 [40:12<14:45, 80.49s/it]

[I 2025-10-02 03:37:16,664] Trial 28 pruned. 


Best trial: 10. Best value: 0.266301:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 30/40 [41:01<11:49, 70.95s/it]

[I 2025-10-02 03:38:05,334] Trial 29 pruned. 


Best trial: 10. Best value: 0.266301:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 31/40 [43:02<12:52, 85.86s/it]

[I 2025-10-02 03:40:05,986] Trial 30 pruned. 


Best trial: 10. Best value: 0.266301:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 32/40 [44:39<11:53, 89.14s/it]

[I 2025-10-02 03:41:42,792] Trial 31 pruned. 


Best trial: 10. Best value: 0.266301:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 33/40 [45:43<09:31, 81.70s/it]

[I 2025-10-02 03:42:47,118] Trial 32 pruned. 


Best trial: 10. Best value: 0.266301:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 34/40 [46:20<06:49, 68.20s/it]

[I 2025-10-02 03:43:23,810] Trial 33 pruned. 


Best trial: 10. Best value: 0.266301:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 35/40 [48:21<07:00, 84.19s/it]

[I 2025-10-02 03:45:25,330] Trial 34 pruned. 


Best trial: 10. Best value: 0.266301:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 36/40 [49:28<05:16, 79.00s/it]

[I 2025-10-02 03:46:32,224] Trial 35 pruned. 


Best trial: 10. Best value: 0.266301:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 37/40 [51:25<04:30, 90.28s/it]

[I 2025-10-02 03:48:28,833] Trial 36 pruned. 


Best trial: 10. Best value: 0.266301:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 38/40 [52:03<02:29, 74.65s/it]

[I 2025-10-02 03:49:07,007] Trial 37 pruned. 


Best trial: 10. Best value: 0.266301:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 39/40 [53:14<01:13, 73.66s/it]

[I 2025-10-02 03:50:18,350] Trial 38 pruned. 


Best trial: 10. Best value: 0.266301: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [54:56<00:00, 82.41s/it]


[I 2025-10-02 03:52:00,104] Trial 39 pruned. 
TAB-lite best medianAPE (sub, CV): 0.2663007765887189
TAB-lite best params: {'depth': 10, 'learning_rate': 0.06466753348741862, 'l2_leaf_reg': 4.986095343730952, 'bagging_temperature': 0.4731878364294065, 'rsm': 0.9955992183399607, 'random_strength': 1.7834660196350933}
0:	learn: 0.9701018	total: 77.2ms	remaining: 2m 34s
200:	learn: 0.4750052	total: 13.8s	remaining: 2m 3s
400:	learn: 0.4480871	total: 28.1s	remaining: 1m 51s
600:	learn: 0.4323454	total: 42.9s	remaining: 1m 39s
800:	learn: 0.4203027	total: 58s	remaining: 1m 26s
1000:	learn: 0.4102792	total: 1m 13s	remaining: 1m 12s
1200:	learn: 0.4012831	total: 1m 28s	remaining: 58.8s
1400:	learn: 0.3929938	total: 1m 43s	remaining: 44.4s
1600:	learn: 0.3855942	total: 1m 59s	remaining: 29.7s
1800:	learn: 0.3793007	total: 2m 14s	remaining: 14.9s
1999:	learn: 0.3730631	total: 2m 29s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1634c0e00>

In [63]:
# [STEP 5] UNIVERSAL IMAGE EMBEDDING EXTRACTOR (DIR -> PARQUET CHUNKS) + AGGREGATOR

import os, io
from pathlib import Path
import numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image
import torch, torchvision as tv

# ---- device ----
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
# Disables autograd globally for the rest of the kernel session, not just for this cell.
torch.set_grad_enabled(False)
print("Device for embeddings:", device)

# ---- tiny backbone by default ----
backbone_tiny = tv.models.convnext_tiny(weights=tv.models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1)
# Replace the whole classifier head with a pass-through so the model returns features instead of logits.
backbone_tiny.classifier = torch.nn.Identity()
backbone_tiny.eval().to(device)
# Preprocessing transform bundled with the same pretrained weights.
tfm_tiny = tv.models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1.transforms()

def embed_batch(x: torch.Tensor, model: torch.nn.Module) -> np.ndarray:
    """Run ``model`` on a batch and return its features as a NumPy array.

    If the model returns a list/tuple, only the first element is used.
    4-D outputs are flattened from dim 1 onward, 3-D outputs are averaged over
    the last dim, anything else is returned unchanged.
    """
    with torch.no_grad():
        out = model(x)
        feats = out[0] if isinstance(out, (list, tuple)) else out
        rank = feats.ndim
        if rank == 4:
            # e.g. (N, C, 1, 1) -> (N, C)
            feats = feats.flatten(1)
        elif rank == 3:
            feats = feats.mean(dim=-1)
        return feats.detach().cpu().numpy()

def parse_id_from_name(name: str) -> int:
    """Return the integer that precedes the first underscore in the file stem.

    Raises ValueError if that leading part is not an integer.
    """
    stem = Path(name).stem
    id_part, _, _ = stem.partition("_")
    return int(id_part)

def load_processed_list(ckpt_path: Path):
    """Load the set of already-processed filenames from a checkpoint file.

    Args:
        ckpt_path: text file with one filename per line.

    Returns:
        set[str]: stripped, non-empty lines of the file; an empty set if the
        file does not exist.
    """
    if not ckpt_path.exists():
        return set()
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with ckpt_path.open() as f:
        stripped = (line.strip() for line in f)
        # Blank lines carry no filename and are skipped.
        return {name for name in stripped if name}

def append_processed_list(ckpt_path: Path, names):
    """Append each name in ``names`` to the checkpoint file, one per line."""
    with ckpt_path.open("a") as sink:
        sink.writelines(n + "\n" for n in names)

def next_chunk_path(chunk_dir: Path, split: str, idx: int):
    """Build the parquet path for chunk number ``idx`` of ``split`` inside ``chunk_dir``."""
    filename = f"emb_{split}_chunk_{idx:05d}.parquet"
    return chunk_dir.joinpath(filename)

def _embed_buffered_images(buf_imgs, buf_ids, buf_names, backbone, feats_chunks):
    """Embed the buffered image tensors as one batch, queue the result and empty the buffers."""
    xb = torch.stack(buf_imgs).to(device)
    fb = embed_batch(xb, backbone)
    feats_chunks.append((buf_ids.copy(), buf_names.copy(), fb.copy()))
    buf_imgs.clear(); buf_ids.clear(); buf_names.clear()


def _write_embedding_chunk(feats_chunks, chunk_dir, ckpt_path, split_name, chunk_idx):
    """Write all queued batches as one parquet chunk, checkpoint their filenames, return the row count."""
    all_ids, all_names, all_f = [], [], []
    for ids, names, f in feats_chunks:
        all_ids.extend(ids); all_names.extend(names); all_f.append(f)
    df = pd.DataFrame(np.concatenate(all_f, axis=0))
    df.insert(0, "ID", all_ids)
    df["filename"] = all_names
    df.to_parquet(next_chunk_path(chunk_dir, split_name, chunk_idx), index=False)
    # Filenames are checkpointed only after the parquet file has been written,
    # so an interrupted run never marks unsaved images as processed.
    append_processed_list(ckpt_path, all_names)
    feats_chunks.clear()
    return len(all_names)


def extract_embeddings_from_dir(img_dir: Path, chunk_dir: Path, ckpt_path: Path,
                                split_name="train", batch_size=128, chunk_size=6000,
                                backbone: torch.nn.Module = backbone_tiny,
                                transforms = tfm_tiny):
    """
    STEP 5.1 - Walk a folder of .jpg files, compute embeddings in batches and write
    parquet chunks (ID + features + filename).
    Resumable: files already listed in the checkpoint file are skipped.

    Args:
        img_dir: folder with the images; only files with a ``.jpg`` suffix
            (case-insensitive) are considered.
        chunk_dir: output folder for the parquet chunks (created if missing).
        ckpt_path: text file listing the filenames that are already saved.
        split_name: label used in chunk filenames and in the progress bar.
        batch_size: number of images per forward pass.
        chunk_size: approximate number of images per parquet chunk.
        backbone: feature extractor passed to ``embed_batch``.
        transforms: callable turning a PIL image into a tensor.

    Files whose name does not start with an integer ID are skipped.
    """
    chunk_dir.mkdir(exist_ok=True)
    processed = load_processed_list(ckpt_path)
    all_files = sorted([f for f in img_dir.iterdir() if f.suffix.lower() == ".jpg"])
    # Continue numbering after the chunks that already exist in the folder.
    wrote_total, chunk_idx = 0, len(list(chunk_dir.glob("*.parquet")))

    buf_imgs, buf_ids, buf_names = [], [], []
    feats_chunks = []
    pbar = tqdm(all_files, desc=f"Embeddings {split_name} ({img_dir.name})")
    for fp in pbar:
        fname = fp.name
        if fname in processed:
            continue
        try:
            car_id = parse_id_from_name(fname)
        except ValueError:
            # Only a non-numeric ID prefix is expected here; anything else
            # (including KeyboardInterrupt) must propagate.
            continue

        # Close the file handle as soon as the pixels are decoded; a long run
        # would otherwise leave thousands of handles to the garbage collector.
        with Image.open(fp) as im:
            x = transforms(im.convert("RGB"))
        buf_imgs.append(x); buf_ids.append(car_id); buf_names.append(fname)

        if len(buf_imgs) >= batch_size:
            _embed_buffered_images(buf_imgs, buf_ids, buf_names, backbone, feats_chunks)

        # Flush to disk whenever the running image count crosses a chunk_size boundary.
        current_count = wrote_total + sum(len(ids) for ids, _, _ in feats_chunks) + len(buf_imgs)
        if current_count // chunk_size > wrote_total // chunk_size and feats_chunks:
            wrote_total += _write_embedding_chunk(feats_chunks, chunk_dir, ckpt_path,
                                                  split_name, chunk_idx)
            chunk_idx += 1
            pbar.set_postfix(saved=wrote_total)

    # Remainder: embed the last partial batch and write whatever is still queued.
    if buf_imgs:
        _embed_buffered_images(buf_imgs, buf_ids, buf_names, backbone, feats_chunks)

    if feats_chunks:
        wrote_total += _write_embedding_chunk(feats_chunks, chunk_dir, ckpt_path,
                                              split_name, chunk_idx)

    print(f"[{split_name}] DONE -> {chunk_dir} | processed: {wrote_total}")

def load_and_aggregate_chunks(chunk_dir: Path, id_col="ID"):
    """
    STEP 5.2 - Read every parquet chunk in ``chunk_dir`` and aggregate per ID: mean + std.

    Returns one row per ID with ``img_mean_*`` and ``img_std_*`` columns; a NaN
    standard deviation is replaced by 0. Fails with an AssertionError if the
    folder contains no parquet files.
    """
    files = sorted(chunk_dir.glob("*.parquet"))
    assert files, f"–ù–µ—Ç parquet-—Ñ–∞–π–ª–æ–≤ –≤ {chunk_dir}"
    frames = [pd.read_parquet(p) for p in files]
    stacked = pd.concat(frames, axis=0, ignore_index=True)
    non_feature = (id_col, "filename")
    feat_cols = [c for c in stacked.columns if c not in non_feature]
    per_id = stacked.groupby(id_col)[feat_cols]
    mean_part = per_id.mean().add_prefix("img_mean_")
    std_part = per_id.std().fillna(0).add_prefix("img_std_")
    agg = pd.concat([mean_part, std_part], axis=1).reset_index()
    return agg


Device for embeddings: mps


In [62]:
# [STEP V1] CONVNEXT-BASE EMBEDDINGS (optional but recommended overnight)

# Chunk folders and checkpoint lists for the Base backbone (kept separate from the tiny-backbone ones).
CHUNK_DIR_TRAIN_B = PROJ / "emb_chunks_train_cnvb"; CHUNK_DIR_TRAIN_B.mkdir(exist_ok=True)
CHUNK_DIR_TEST_B  = PROJ / "emb_chunks_test_cnvb";  CHUNK_DIR_TEST_B.mkdir(exist_ok=True)
CKPT_LIST_TRAIN_B = PROJ / "_processed_train_cnvb.txt"
CKPT_LIST_TEST_B  = PROJ / "_processed_test_cnvb.txt"

# backbone base
# NOTE: the model is built and moved to the device even when DO_EMB_BASE is False.
backbone_base = tv.models.convnext_base(weights=tv.models.ConvNeXt_Base_Weights.IMAGENET1K_V1)
backbone_base.classifier = torch.nn.Identity()
backbone_base.eval().to(device)
tfm_base = tv.models.ConvNeXt_Base_Weights.IMAGENET1K_V1.transforms()

# Run the extraction (can be left running overnight).
# img_feat_train_b / img_feat_test_b are only defined when DO_EMB_BASE is True.
if DO_EMB_BASE:
    extract_embeddings_from_dir(TRAIN_IMG_DIR, CHUNK_DIR_TRAIN_B, CKPT_LIST_TRAIN_B,
                                split_name="train_base", batch_size=128, chunk_size=6000,
                                backbone=backbone_base, transforms=tfm_base)
    extract_embeddings_from_dir(TEST_IMG_DIR,  CHUNK_DIR_TEST_B,  CKPT_LIST_TEST_B,
                                split_name="test_base", batch_size=128, chunk_size=6000,
                                backbone=backbone_base, transforms=tfm_base)

    img_feat_train_b = load_and_aggregate_chunks(CHUNK_DIR_TRAIN_B)
    img_feat_test_b  = load_and_aggregate_chunks(CHUNK_DIR_TEST_B)
    print("Base image feats:", img_feat_train_b.shape, img_feat_test_b.shape)


Downloading: "https://download.pytorch.org/models/convnext_base-6075fbad.pth" to /Users/arutyunoff/.cache/torch/hub/checkpoints/convnext_base-6075fbad.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 338M/338M [00:10<00:00, 32.3MB/s] 
Embeddings train_base (train_images):  31%|‚ñà‚ñà‚ñà       | 84991/273873 [3:54:24<8:40:55,  6.04it/s, saved=84096] 


KeyboardInterrupt: 

In [None]:
# [STEP 6] AGGREGATE EMBEDDINGS
def load_and_aggregate_chunks(chunk_dir: Path, id_col="ID"):
    """Read every parquet chunk in ``chunk_dir`` and aggregate the features per ID.

    NOTE: this redefines the function of the same name from the [STEP 5] cell;
    whichever cell ran last is the definition in effect.

    Args:
        chunk_dir: folder containing ``*.parquet`` embedding chunks.
        id_col: name of the ID column to group by.

    Returns:
        DataFrame with one row per ID and ``img_mean_*`` / ``img_std_*`` columns
        (a NaN standard deviation is replaced by 0).

    Raises:
        ValueError: if ``chunk_dir`` contains no parquet files.
    """
    files = sorted(chunk_dir.glob("*.parquet"))
    if not files:
        # Without this guard pd.concat fails with the unrelated-looking
        # "No objects to concatenate"; keep ValueError so the type is unchanged.
        raise ValueError(f"No parquet files found in {chunk_dir}")
    dfs = [pd.read_parquet(p) for p in files]
    big = pd.concat(dfs, axis=0, ignore_index=True)
    feat_cols = [c for c in big.columns if c not in (id_col, "filename")]
    grp = big.groupby(id_col)[feat_cols]
    agg_mean = grp.mean().add_prefix("img_mean_")
    agg_std  = grp.std().fillna(0).add_prefix("img_std_")
    return pd.concat([agg_mean, agg_std], axis=1).reset_index()

# Per-ID image features from the chunks stored in CHUNK_DIR_TRAIN / CHUNK_DIR_TEST.
img_feat_train = load_and_aggregate_chunks(CHUNK_DIR_TRAIN)
img_feat_test  = load_and_aggregate_chunks(CHUNK_DIR_TEST)
print("Image feats:", img_feat_train.shape, img_feat_test.shape)


Image feats: (70000, 1537) (25000, 1537)


In [None]:
# [STEP 7] FUSION CatBoost
# Left-join the per-ID image features onto the tabular frames; every remaining NaN is filled with 0.
train_join = train_exp.merge(img_feat_train, on=id_col, how="left").fillna(0)
test_join  = test_exp.merge(img_feat_test,  on=id_col, how="left").fillna(0)
# [STEP T2] OPTUNA + 5-FOLD OOF FOR FUSION CATBOOST

X_fus = train_join.drop(columns=[id_col]).copy()
# Raw (not log-transformed) targets; the objective applies to_log itself.
y_all = train[target_col].values.copy()
cat_idx_fus = [X_fus.columns.get_loc(c) for c in cat_cols if c in X_fus.columns]
# NOTE(review): KFold is not imported in any visible cell — confirm the import exists.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def objective_fus(trial):
    """Optuna objective for the fusion model: 5-fold out-of-fold median APE.

    Samples CatBoost hyper-parameters from the trial, fits one model per fold
    of ``cv`` in log-target space (the held-out fold is also the eval set for
    early stopping) and scores all out-of-fold predictions on the price scale.
    """
    # Sample in a fixed order so the sampler sees the same parameter sequence.
    depth = trial.suggest_int("depth", 6, 12)
    learning_rate = trial.suggest_float("learning_rate", 0.02, 0.12, log=True)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1.0, 20.0, log=True)
    random_strength = trial.suggest_float("random_strength", 0.5, 2.5)
    bagging_temperature = trial.suggest_float("bagging_temperature", 0.0, 1.0)
    rsm = trial.suggest_float("rsm", 0.6, 1.0)
    od_wait = trial.suggest_int("od_wait", 200, 600)

    params = dict(
        loss_function="RMSE",
        depth=depth,
        learning_rate=learning_rate,
        l2_leaf_reg=l2_leaf_reg,
        random_strength=random_strength,
        bagging_temperature=bagging_temperature,
        rsm=rsm,
        iterations=8000,
        od_type="Iter",
        od_wait=od_wait,
        random_seed=42,
        verbose=False,
    )

    oof_pred = np.zeros(len(X_fus))
    for fit_rows, held_rows in cv.split(X_fus):
        X_fit, X_held = X_fus.iloc[fit_rows], X_fus.iloc[held_rows]
        fit_pool = Pool(X_fit, to_log(y_all[fit_rows]), cat_features=cat_idx_fus)
        held_pool = Pool(X_held, to_log(y_all[held_rows]), cat_features=cat_idx_fus)
        model = CatBoostRegressor(**params)
        model.fit(fit_pool, eval_set=held_pool)
        log_pred = model.predict(Pool(X_held, cat_features=cat_idx_fus))
        oof_pred[held_rows] = from_log(log_pred)
    return median_ape(y_all, oof_pred)

study_fus = optuna.create_study(direction="minimize")
study_fus.optimize(objective_fus, n_trials=80, show_progress_bar=True)
print("BEST FUS params:", study_fus.best_trial.params, "medianAPE:", study_fus.best_value)

# Final fusion model on the whole train set, using the best trial's parameters
# (these include od_wait, which the objective samples as well).
best_fus = CatBoostRegressor(**{**study_fus.best_trial.params,
                                "loss_function":"RMSE","iterations":8000,
                                "od_type":"Iter","random_seed":42,"verbose":200})
best_fus.fit(Pool(X_fus, to_log(y_all), cat_features=cat_idx_fus))


# Hold-out fusion model with fixed hyper-parameters.
# train_mask / valid_mask, y_tr / y_va (log-space targets) and cat_idx are the module-level
# values assigned in the [STEP 4] cell, so that cell must have run first.
X_trj = train_join.loc[train_mask].drop(columns=[id_col])
X_vaj = train_join.loc[valid_mask].drop(columns=[id_col])
pool_trj = Pool(X_trj, y_tr, cat_features=cat_idx)
pool_vaj = Pool(X_vaj, y_va, cat_features=cat_idx)

cb_fusion = CatBoostRegressor(
    loss_function="RMSE", depth=10, learning_rate=0.05,
    iterations=5000, od_type="Iter", od_wait=200,
    random_seed=42, verbose=200
)
cb_fusion.fit(pool_trj, eval_set=pool_vaj)

# Score on the original price scale.
pred_va_fusion = from_log(cb_fusion.predict(pool_vaj))
val_medAPE_fusion = median_ape(train.loc[valid_mask, target_col], pred_va_fusion)
print("VALID medianAPE (fusion):", val_medAPE_fusion)


0:	learn: 0.9776214	test: 0.9729582	best: 0.9729582 (0)	total: 635ms	remaining: 52m 52s
200:	learn: 0.3767605	test: 0.4236425	best: 0.4236425 (200)	total: 2m 16s	remaining: 54m 24s
400:	learn: 0.3130972	test: 0.4034035	best: 0.4034035 (400)	total: 4m 51s	remaining: 55m 41s
600:	learn: 0.2677465	test: 0.3957662	best: 0.3957662 (600)	total: 7m 21s	remaining: 53m 51s
800:	learn: 0.2323946	test: 0.3919557	best: 0.3919557 (800)	total: 9m 56s	remaining: 52m 6s
1000:	learn: 0.2031443	test: 0.3894679	best: 0.3894679 (1000)	total: 12m 38s	remaining: 50m 31s
1200:	learn: 0.1783893	test: 0.3879333	best: 0.3879333 (1200)	total: 15m 15s	remaining: 48m 16s
1400:	learn: 0.1564435	test: 0.3865915	best: 0.3865859 (1394)	total: 17m 59s	remaining: 46m 14s
1600:	learn: 0.1375072	test: 0.3857179	best: 0.3856994 (1589)	total: 20m 51s	remaining: 44m 16s
1800:	learn: 0.1209416	test: 0.3851273	best: 0.3851273 (1800)	total: 23m 37s	remaining: 41m 57s
2000:	learn: 0.1061339	test: 0.3847285	best: 0.3847206 (1998)

In [None]:
# [STEP A1] GET OOF PREDS (TAB & FUSION) WITH BEST PARAMS

def get_oof_preds(model_params, X, y, cat_idx, cv):
    """Train one CatBoost model per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    model_params : dict
        Tuned CatBoost parameters. The fixed settings used here (loss_function,
        iterations, od_type, random_seed, verbose) take precedence if the dict
        contains the same keys.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : array-like
        Targets aligned with the rows of ``X``. Models are fit on ``to_log(y)``.
    cat_idx : list
        Passed to ``Pool`` as ``cat_features``.
    cv : object with ``split(X)``
        Yields ``(train_positions, valid_positions)`` pairs.

    Returns
    -------
    (oof, models)
        ``oof`` is an array of length ``len(X)`` holding ``from_log`` of each
        row's out-of-fold prediction; ``models`` is the list of fitted fold models.

    Notes
    -----
    Each fold's validation part is also passed as ``eval_set`` during fitting,
    so the OOF score is not independent of the stopping decision.
    """
    # cv.split yields positions; a pandas Series with a non-default index would
    # interpret them as labels, so index a plain array instead.
    y_arr = np.asarray(y)

    # Merge instead of passing both **model_params and keywords: a tuned dict
    # that already contains e.g. "iterations" would otherwise raise
    # "got multiple values for keyword argument".
    params = {**model_params, "loss_function": "RMSE", "iterations": 8000,
              "od_type": "Iter", "random_seed": 42, "verbose": False}

    oof = np.zeros(len(X))
    models = []
    for tr_idx, va_idx in cv.split(X):
        tr_pool = Pool(X.iloc[tr_idx], to_log(y_arr[tr_idx]), cat_features=cat_idx)
        va_pool = Pool(X.iloc[va_idx], to_log(y_arr[va_idx]), cat_features=cat_idx)
        m = CatBoostRegressor(**params)
        m.fit(tr_pool, eval_set=va_pool)
        # Reuse the validation pool for prediction instead of building it again.
        oof[va_idx] = from_log(m.predict(va_pool))
        models.append(m)
    return oof, models

# One shared 5-fold splitter so the tabular and fusion OOF vectors use the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions and per-fold models for the tabular model ...
oof_tab, tab_models = get_oof_preds(
    model_params=study_tab.best_trial.params,
    X=X_tab,
    y=y_all,
    cat_idx=cat_idx_tab,
    cv=cv,
)
# ... and for the fusion model.
oof_fus, fus_models = get_oof_preds(
    model_params=study_fus.best_trial.params,
    X=X_fus,
    y=y_all,
    cat_idx=cat_idx_fus,
    cv=cv,
)

tab_oof_score = median_ape(y_all, oof_tab)
fus_oof_score = median_ape(y_all, oof_fus)
print("OOF TAB medianAPE:", tab_oof_score)
print("OOF FUS medianAPE:", fus_oof_score)


–°–µ–≥–º–µ–Ω—Ç–Ω—ã–µ –≤–µ—Å–∞ (—Ñ—Ä–∞–≥–º–µ–Ω—Ç): {'–°–µ–¥–∞–Ω': 0.7974897986676626, '–ú–∏–Ω–∏–≤—ç–Ω': 0.5019377502545568, '–§—É—Ä–≥–æ–Ω': 0.3266626961816229, '–í–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫': 0.5196959284591325, '–•–µ—Ç—á–±—ç–∫': 0.8934386424876263, '–£–Ω–∏–≤–µ—Ä—Å–∞–ª': 0.5375304408970908, '–õ–∏—Ñ—Ç–±–µ–∫': 0.8079228081536266, '–ú–∏–∫—Ä–æ–∞–≤—Ç–æ–±—É—Å': 0.7674456358531248}
‚úîÔ∏è submission_ensemble_segmented.csv –≥–æ—Ç–æ–≤


In [None]:
# [STEP A2] SEGMENTED ENSEMBLE WITH SHRINKAGE (USING OOF) + SUBMISSION

def best_w_ternary(p1, p2, y, iters=60):
    """Ternary-search the blend weight ``w`` in [0, 1] that minimises median_ape.

    The blend is ``(1 - w) * p1 + w * p2``. Returns ``(w, score)`` as floats,
    where ``score`` is median_ape of the blend at the returned weight.
    """
    def blend_score(weight):
        return median_ape(y, (1-weight)*p1 + weight*p2)

    left, right = 0.0, 1.0
    for _ in range(iters):
        third = (right - left)/3
        probe_lo = left + third
        probe_hi = right - third
        # Discard the outer third on the side of the worse probe.
        if blend_score(probe_lo) < blend_score(probe_hi):
            right = probe_hi
        else:
            left = probe_lo

    w = (left+right)/2
    return float(w), float(blend_score(w))

# Global fusion weight, tuned on the out-of-fold predictions.
w_glob, val_glob = best_w_ternary(oof_tab, oof_fus, y_all, iters=80)
print(f"GLOBAL w_fusion={w_glob:.3f} | OOF medianAPE={val_glob:.5f}")

# Segment key: body_type|engine_type|drive_type plus a mileage quantile bin.
seg_cols_cat = ["body_type","engine_type","drive_type"]

# Mileage bin edges are estimated on TRAIN only and reused for test.
# Calling pd.qcut separately on train and test produces different interval
# labels, so test segment keys would not match the keys learned on train and
# rows would silently fall back to the global weight.
_, mileage_edges = pd.qcut(train["mileage"], q=[0,.2,.4,.6,.8,1.0],
                           duplicates='drop', retbins=True)
mileage_edges = np.array(mileage_edges, dtype=float)
# Open-ended outer bins so test values outside the train range still get a bin.
mileage_edges[0], mileage_edges[-1] = -np.inf, np.inf
mileage_labels = [f"q{i}" for i in range(len(mileage_edges) - 1)]


def _mileage_bin(mileage):
    """Return the train-derived mileage bin label of each value as a string ('nan' if missing)."""
    return pd.cut(mileage, bins=mileage_edges, labels=mileage_labels).astype(str)


seg_key_all = train[seg_cols_cat].astype(str).agg("|".join, axis=1)
mileage_bins = _mileage_bin(train["mileage"])
seg_key_all = (seg_key_all + "||" + mileage_bins).astype(str)

seg2w = {}
min_seg_size = 30   # segments smaller than this keep the global weight
min_n = 150         # segments smaller than this get an extra 50/50 pull towards the global weight
shrink_k = 400.0    # shrinkage strength: alpha = n / (n + shrink_k)

# Group row POSITIONS (not index labels) so that indexing the numpy OOF arrays
# is correct whatever index `train` carries.
y_all_arr = np.asarray(y_all)
row_positions = pd.Series(np.arange(len(seg_key_all)))
for key, idx in row_positions.groupby(seg_key_all.to_numpy()):
    idx = idx.to_numpy()
    if len(idx) < min_seg_size:
        continue
    w_raw, _ = best_w_ternary(oof_tab[idx], oof_fus[idx], y_all_arr[idx], iters=40)
    # Shrink the per-segment weight towards the global one; small segments shrink more.
    alpha = len(idx) / (len(idx) + shrink_k)
    w_smooth = alpha*w_raw + (1-alpha)*w_glob
    if len(idx) < min_n:
        w_smooth = 0.5*w_smooth + 0.5*w_glob
    seg2w[key] = float(w_smooth)

# Test predictions: mean over the fold models (averaged after from_log).
# Each test Pool is built once and reused by every fold model.
# NOTE(review): pred_test_tab is also read by the STEP 8C cell — confirm which
# version of it that cell is meant to use.
test_pool_tab = Pool(test_exp.drop(columns=[id_col]), cat_features=cat_idx_tab)
test_pool_fus = Pool(test_join.drop(columns=[id_col]), cat_features=cat_idx_fus)
pred_test_tab = np.mean([from_log(m.predict(test_pool_tab)) for m in tab_models], axis=0)
pred_test_fus = np.mean([from_log(m.predict(test_pool_fus)) for m in fus_models], axis=0)

# Segment weights on test, built with the same train-derived mileage bins.
seg_key_test = test[seg_cols_cat].astype(str).agg("|".join, axis=1)
test_bins = _mileage_bin(test["mileage"])
seg_key_test = (seg_key_test + "||" + test_bins).astype(str)

w_vec = np.array([seg2w.get(k, w_glob) for k in seg_key_test], dtype=float)
n_seg_hits = int(seg_key_test.isin(list(seg2w)).sum())
print(f"Segment-specific weight used for {n_seg_hits}/{len(w_vec)} test rows")
pred_test_seg = (1.0 - w_vec)*pred_test_tab + w_vec*pred_test_fus

# Assemble the submission in sample order (strict "ID,target" format).
pred_df = pd.DataFrame({"ID": test[id_col].astype(int).values, "target": pred_test_seg.astype(float)})
sub = sample[["ID"]].merge(pred_df, on="ID", how="left")
assert list(sub.columns) == ["ID","target"] and len(sub)==len(sample)
# A left merge leaves NaN for any sample ID without a prediction; fail here
# rather than write an invalid file.
assert sub["target"].notna().all(), "some sample IDs have no matching test prediction"
sub.to_csv("submission_ensemble_segmented_oof.csv", index=False, sep=",", float_format="%.6f")
print("‚úîÔ∏è submission_ensemble_segmented_oof.csv saved")


In [None]:
# [STEP 8C] EXACT ENSEMBLE WEIGHT (TERNARY SEARCH ON medianAPE)
import numpy as np

# Ground-truth target values of the rows selected by valid_mask.
y_va_true = train.loc[valid_mask, target_col].values

def median_ape(y, yhat):
    """Median absolute percentage error of ``yhat`` against ``y``.

    Both inputs are converted to float arrays. The denominator is ``y`` clipped
    from below at 1e-9, which avoids division by zero.
    """
    actual = np.asarray(y, float)
    predicted = np.asarray(yhat, float)
    denom = np.clip(actual, 1e-9, None)
    return np.median(np.abs(predicted - actual) / denom)

def best_w_ternary(p1, p2, y, iters=60):
    """Find the blend weight ``w`` in [0, 1] minimising median_ape via ternary search.

    The candidate blend is ``(1 - w) * p1 + w * p2``. After ``iters`` shrinking
    steps the midpoint of the remaining interval is returned together with its
    median_ape, both as Python floats.
    """
    def score_at(weight):
        return median_ape(y, (1-weight)*p1 + weight*p2)

    left, right = 0.0, 1.0
    for _ in range(iters):
        step = (right - left) / 3
        inner_lo = left + step
        inner_hi = right - step
        # Keep the two thirds of the interval next to the better probe.
        if score_at(inner_lo) < score_at(inner_hi):
            right = inner_hi
        else:
            left = inner_lo

    w = (left + right) / 2
    return float(w), float(score_at(w))

# Tune the fusion weight on the hold-out predictions.
w_opt, val_medape_opt = best_w_ternary(pred_va_tab, pred_va_fusion, y_va_true)
print(f"üîß w_fusion (ternary) = {w_opt:.4f}  |  VAL medianAPE = {val_medape_opt:.5f}  |  Score ‚âà {1/(1+val_medape_opt):.4f}")

# [STEP 8C FINAL] SAVE THE SUBMISSION AFTER THE TERNARY WEIGHT SEARCH

# Apply the tuned weight to the test predictions (previously computed twice
# with identical statements).
# NOTE(review): pred_test_tab is also assigned by the STEP A2 cell (mean over
# fold models) — confirm this cell blends the predictions that match
# pred_va_tab / pred_test_fusion.
pred_test_ens = (1.0 - w_opt) * pred_test_tab + w_opt * pred_test_fusion

# Build the ID -> prediction table.
pred_df_ens = pd.DataFrame({
    "ID": test[id_col].astype(int).values,
    "target": pred_test_ens.astype(float)
})

# Align with sample_submission so the IDs come out in the same order.
sub_ens = sample[["ID"]].merge(pred_df_ens, on="ID", how="left")

# A left merge can duplicate rows (repeated IDs) or leave NaN targets
# (unmatched IDs); fail here rather than write an invalid file.
assert len(sub_ens) == len(sample), "row count differs from sample submission"
assert sub_ens["target"].notna().all(), "some sample IDs have no matching test prediction"

# Save to CSV.
sub_ens[["ID", "target"]].to_csv("submission_ensemble_8C.csv",
                                 index=False, sep=",", float_format="%.6f")

print(f"‚úîÔ∏è submission_ensemble_8C.csv —Å–æ—Ö—Ä–∞–Ω—ë–Ω (w_fusion={w_opt:.4f}, w_tab={1-w_opt:.4f})")
