In [143]:
# Optional dependency bootstrap: flip LOAD_LIBS to True to install/upgrade the
# third-party packages used below before running the imports cell.
# NOTE(review): `!pip` runs pip from the shell, which may not be the kernel's
# environment; `%pip install ...` targets the running kernel — consider switching
# and pinning versions.
LOAD_LIBS = False
if LOAD_LIBS:
    !pip install -q -U catboost
    !pip install -q -U lightgbm
    !pip install -q -U xgboost
    !pip install -q -U geopy
    !pip install -q -U phik
    
    

In [144]:
import os
import gc
import json
import time
import logging
import argparse
from tqdm import tqdm
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    log_loss,
    mean_squared_error,
    mean_absolute_error,
    confusion_matrix
)

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb

from PIL import Image

import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader
import open_clip

from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# Geo backend: geopy (if installed), otherwise fall back to haversine
try:
    from geopy.distance import geodesic
    GEO_BACKEND = "geopy"
except ImportError:
    GEO_BACKEND = "haversine"


try:
    import phik
    HAS_PHIK = True
except ImportError:
    HAS_PHIK = False

In [145]:
# Show which optional backends the imports cell detected (geo backend name, phik availability).
GEO_BACKEND, HAS_PHIK

('geopy', True)

In [146]:
# Module-level state for text features; do not touch these by hand.
# NOTE(review): presumably filled in by the text-embedding code further down — confirm.
TEXT_MODEL = None
TEXT_TOKENIZER = None
TEXT_DEVICE = None

# TF-IDF and SVD are kept separately for each text field (so different sources are not mixed)
TFIDF_VECTORIZERS = {}  # field_name -> vectorizer
TFIDF_SVDS = {}         # field_name -> svd


In [147]:
# Central configuration for the whole notebook: paths, task type, feature
# generation switches, CV settings and model hyper-parameters.
DATA_FOLDER_PATH = '.'
CONFIG = {
    # ---------- Paths ----------
    # NOTE(review): the second argument of each os.path.join below is an absolute
    # path, so os.path.join discards DATA_FOLDER_PATH and the result is the
    # /kaggle/... path itself (the logged config output confirms this).
    "train_path": os.path.join(DATA_FOLDER_PATH, "/kaggle/input/petfinder-pawpularity-score/train.csv"),
    "test_path": os.path.join(DATA_FOLDER_PATH, '/kaggle/input/petfinder-pawpularity-score/test.csv'),
    "sep": ",",
    "id_column": "Id",
    "target_column": "Pawpularity",
    "datetime_columns": [],
    "output_dir": "tabular_boosting_output",

    # ---------- Task ----------
    # "binary" / "multiclass" / "regression"
    "task_type": "regression",

    # ---------- Basic features ----------
    "basic_drop_columns": [],
    "basic_as_categorical": [],
    "basic_max_cat_unique": 64,
    "basic_datetime_expand": False,
    "basic_datetime_features": ["year", "month", "day", "dow", "hour"],

    # ---------- Categorical processing ----------
    # minimum "size" of a category (by number of objects and/or by share);
    # all categories below the threshold are merged into 'other'
    "cat_min_count": 20,          # minimum number of objects in a category
    "cat_min_freq": 0.0,          # minimum share (0..1); 0 = ignore
    # if the share of "rare" categories in a feature is >= this threshold, the feature is dropped
    "cat_max_rare_share": 0.98,
    # if there are still too many unique values after merging into other, the feature is dropped
    "cat_max_unique_after_group": 500,

    # ---------- Post-feature service columns ----------
    # service columns that must be removed after feature generation
    # (original coordinates, raw ids, raw addresses, etc.)
    "post_feature_drop_columns": [], # maybe drop category_geo_ref_lat, category_geo_ref_lon

    # ---------- Address processing ----------
    # address column from which the city should be extracted
    "address_column": None,  

    # index of the element after split (e.g. [-1] = the last one)
    # may be given as a negative index
    "address_city_index": -1,  

    # separator used inside the address string
    "address_split_sep": ",",  

    # ---------- Aggregates ----------
    "agg_enable": False,
    "agg_groupby_cols": [],
    "agg_numeric_cols": [],
    "agg_aggs": ["mean", "std", "min", "max", "sum", "median", "nunique", "count"],
    "agg_prefix": "agg",

    # ---------- Geo features ----------
    "geo_enable": False, 
    "geo_lat_from_col": "pickup_lat",
    "geo_lon_from_col": "pickup_lon",
    "geo_lat_to_col": "dropoff_lat",
    "geo_lon_to_col": "dropoff_lon",
    "geo_ref_lat": 'category_geo_ref_lat',
    "geo_ref_lon": 'category_geo_ref_lon',
    "geo_prefix": "geo",
    # extra features (bearing, manhattan, dlat/dlon, midpoint)
    "geo_extra_enable": False,

    "geo_from_coord_col": None,
    "geo_to_coord_col": None,
    "geo_coord_string_sep": ",",

    # ---------- Geo reference config ----------
    "geo_reference": {
        "enabled": False,
        "category_column": "category",
        "coordinates_column": "coordinates",  # "lat,lon" string, used when lat/lon columns are not set
        "lat_column": None,                  # <- if set, latitude is taken from this column
        "lon_column": None,                  # <- if set, longitude is taken from this column
        "output_column": "category_geo_ref",
        "output_lat_column": "category_geo_ref_lat",
        "output_lon_column": "category_geo_ref_lon",
    },

    # ---------- Correlation-based feature filtering ----------
    "corr_enable": False,          # enable/disable correlation-based filtering
    "corr_pearson_min_abs": 0.95,   # |Pearson| threshold; 0.1, 0.05, etc.
    "corr_use_phik": False,        # whether to compute phik
    "corr_phik_min_abs": 0.0,      # |phik| threshold
    
    #----------images---------
    # folder with train images (if image features are used)
    "train_images_dir": "/kaggle/input/petfinder-pawpularity-score/train",
    # folder with test images (if image features are used)
    "test_images_dir": "/kaggle/input/petfinder-pawpularity-score/test",
    # If the image id in the table already contains ".jpg", set image_ext to ""
    # image file extension (when the table holds only ids without an extension)
    "image_ext":".jpg",
    # table column holding the image file name
    "file_names_column": "Id",
    # batch size used when processing images
    "batch_size": 32,
    
    # ---------- Text features ----------
    # enable/disable building text embeddings
    "text_enable": False,  # enable/disable text features

    # any text columns in the main train/test table
    "text_columns": [],  # e.g. ["title", "description"]

    # external text tables (id, text) in parquet/csv/tsv
    # each entry:
    # {
    #   "name": "comments",        # logical name (used in feature names)
    #   "train_path": "train_comments.parquet",
    #   "test_path": "test_comments.parquet",
    #   "format": "parquet",       # "parquet" / "csv" / "tsv" / "auto"
    #   "id_column": "index",      # id column in this table
    #   "text_column": "text",     # text column in this table
    #   "output_column": "comments_text"  # column name in the combined table (optional)
    # }
    # NOTE(review): an entry is configured here although "text_enable" is False —
    # confirm the text code really skips it when disabled.
    "text_external_tables": [{
        "name": "reviews",
        "train_path": "reviews.tsv",
        "test_path": "reviews.tsv",
        "format": "tsv",       # or "auto"
        # name of the column with the unique object identifier
    "id_column": "id",      # in these files
        "text_column": "text",     # text column
        "output_column": "comments_text"
    }],

    # embedding type
    # "bert"      - transformer embeddings
    # "tfidf_svd" - TF-IDF + TruncatedSVD
    # text model type: "bert" or "tfidf_svd"
    "text_model_type": "tfidf_svd",

    # model options (kept as comments so they can be switched quickly)
    # "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # RU + EN (general purpose)
    # "DeepPavlov/rubert-base-cased-sentence"                        # ru-only
    # "sentence-transformers/all-mpnet-base-v2"                      # en-only
    "text_bert_model_name": "DeepPavlov/rubert-base-cased-sentence",

    # BERT settings
    "text_max_length": 256,
    "text_batch_size": 32,
    # TF-IDF + SVD settings
    "text_tfidf_max_features": 50000,
    "text_svd_n_components": 256,

    # ---------- CV ----------
    "cv_n_splits": 5,
    "cv_random_state": 42,
    "cv_shuffle": True,
    "cv_stratified": True,

    # ---------- Models ----------
    "use_catboost": True,
    "use_lightgbm": False,
    "use_xgboost": False,

    # NOTE(review): "task_type" above is "regression", but the loss/objective and
    # metric settings in the three param dicts below are classification ones —
    # confirm the training code overrides them per task type.
    "catboost_params": {
        "iterations": 3500,
        "learning_rate": 0.05,
        "depth": 6,
        "loss_function": "MultiClass", # "Logloss"
        "eval_metric": "AUC",
        "random_seed": 42,
        "verbose": 100
    },


    "lgb_params": {
        "objective": "multiclass", # "binary"
        "eval_metric": "auc", #["auc", "binary_logloss"],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "lambda_l2": 1.0,
        "num_threads": 0,
        "verbose": -1
    },

    "xgb_params": {
        "objective": "multi:softmax", #"binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "lambda": 1.0,
        "alpha": 0.0,
        "tree_method": "hist"
    },

    # ---------- Blending ----------
    "blend_weights": {
        "catboost": 1,
        "lightgbm": 0.35,
        "xgboost": 0.15
    },

    # ---------- Feature importance ----------
    "compute_feature_importance": True,
    "top_features_to_show": 50,

    # ---------- Early stopping ----------
    # early-stopping parameter for the boosting models
    "early_stopping_rounds": 100,# number of rounds without improvement before stopping early
    "device": "cuda",

    "seed": 42,
    "verbose": True
}


In [148]:
def setup_logging(output_dir: str) -> logging.Logger:
    """Create (or reconfigure) the "TABULAR_BOOSTING" logger.

    The logger writes INFO+ records to ``<output_dir>/training.log`` (UTF-8) and
    to the console. Calling the function again replaces the previous handlers
    instead of stacking new ones, and closes the old log file.

    Args:
        output_dir: Directory for ``training.log``; created when missing.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    # Local imports: these modules are not part of the notebook's import cell.
    import codecs
    import re

    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "training.log")

    logger = logging.getLogger("TABULAR_BOOSTING")
    logger.setLevel(logging.INFO)

    # Detach AND close handlers from a previous call; plain `handlers = []`
    # left the old log file open on every re-run of the cell.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    # Do not hand records to the root logger as well: when the environment has
    # a root handler installed, every message was printed twice.
    logger.propagate = False

    # The file is always written as UTF-8 so emoji / non-ASCII text survive.
    fh = logging.FileHandler(log_path, encoding='utf-8')
    fh.setLevel(logging.INFO)

    non_ascii = re.compile(r'[^\x00-\x7F]+')

    class UTF8StreamHandler(logging.StreamHandler):
        """Console handler for non-UTF-8 streams: drops non-ASCII characters."""

        def emit(self, record):
            try:
                msg = non_ascii.sub('', self.format(record))
                self.stream.write(msg + self.terminator)
                self.flush()
            except Exception:
                self.handleError(record)

    def _is_utf8(encoding) -> bool:
        # codecs.lookup normalises spellings such as "UTF-8", "utf8", "UTF_8".
        if not encoding:
            return False
        try:
            return codecs.lookup(encoding).name == "utf-8"
        except LookupError:
            return False

    ch = logging.StreamHandler()
    # Inspect the stream the handler actually writes to (stderr by default),
    # and compare encodings case-insensitively.
    if not _is_utf8(getattr(ch.stream, "encoding", None)):
        ch = UTF8StreamHandler()
    ch.setLevel(logging.INFO)

    fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    fh.setFormatter(fmt)
    ch.setFormatter(fmt)

    logger.addHandler(fh)
    logger.addHandler(ch)

    logger.info("Logger initialized.")
    return logger


def log_config(logger: logging.Logger, config: Dict):
    """Write the configuration to the log as pretty-printed JSON.

    Args:
        logger: Logger that receives the two INFO records (banner + JSON).
        config: Configuration mapping to dump.

    Values that JSON cannot represent natively (paths, numpy scalars, sets, ...)
    are rendered with ``str`` instead of aborting the run with a ``TypeError``.
    """
    logger.info("========== CONFIG ==========")
    logger.info(json.dumps(config, indent=2, ensure_ascii=False, default=str))

In [149]:
# Start the wall-clock timer for the run and initialise logging.
start_time = time.time()

output_dir = CONFIG["output_dir"]
# setup_logging also creates the directory; kept so output_dir exists even
# if logging setup is changed.
os.makedirs(output_dir, exist_ok=True)
logger = setup_logging(output_dir)
# Record the full configuration in the log for reproducibility.
log_config(logger, CONFIG)

2025-11-17 22:38:32,263 - INFO - Logger initialized.
INFO:TABULAR_BOOSTING:Logger initialized.
2025-11-17 22:38:32,266 - INFO - {
  "train_path": "/kaggle/input/petfinder-pawpularity-score/train.csv",
  "test_path": "/kaggle/input/petfinder-pawpularity-score/test.csv",
  "sep": ",",
  "id_column": "Id",
  "target_column": "Pawpularity",
  "datetime_columns": [],
  "output_dir": "tabular_boosting_output",
  "task_type": "regression",
  "basic_drop_columns": [],
  "basic_as_categorical": [],
  "basic_max_cat_unique": 64,
  "basic_datetime_expand": false,
  "basic_datetime_features": [
    "year",
    "month",
    "day",
    "dow",
    "hour"
  ],
  "cat_min_count": 20,
  "cat_min_freq": 0.0,
  "cat_max_rare_share": 0.98,
  "cat_max_unique_after_group": 500,
  "post_feature_drop_columns": [],
  "address_column": null,
  "address_city_index": -1,
  "address_split_sep": ",",
  "agg_enable": false,
  "agg_groupby_cols": [],
  "agg_numeric_cols": [],
  "agg_aggs": [
    "mean",
    "std",
   

### Utils

In [150]:
def _ensure_lat_lon_from_single_column(
    df: pd.DataFrame,
    coord_col: str,
    lat_name: str,
    lon_name: str,
    sep: str = ","
) -> pd.DataFrame:
    """
    coord_col –º–æ–∂–µ—Ç –±—ã—Ç—å:
      - —Å—Ç—Ä–æ–∫–∞ –≤–∏–¥–∞ "55.75,37.62"
      - —Å–ø–∏—Å–æ–∫/–∫–æ—Ä—Ç–µ–∂ [55.75, 37.62]
    —Å–æ–∑–¥–∞—ë–º/–ø–µ—Ä–µ–∑–∞–ø–∏—Å—ã–≤–∞–µ–º –∫–æ–ª–æ–Ω–∫–∏ lat_name, lon_name
    """
    if coord_col not in df.columns:
        return df

    def _parse_one(x):
        if isinstance(x, (list, tuple)) and len(x) >= 2:
            return x[0], x[1]
        if isinstance(x, str):
            parts = x.split(sep)
            if len(parts) >= 2:
                try:
                    return float(parts[0]), float(parts[1])
                except Exception:
                    return np.nan, np.nan
        return np.nan, np.nan

    lat_list, lon_list = zip(*df[coord_col].map(_parse_one))
    df = df.copy()
    df[lat_name] = lat_list
    df[lon_name] = lon_list
    return df

# NEW: sanitize feature names for the boosting models (XGBoost is especially strict)
def sanitize_feature_names(
    df: pd.DataFrame,
    logger: logging.Logger
) -> Tuple[pd.DataFrame, Dict]:
    """Return a copy of ``df`` whose column names are safe, unique strings.

    Every name is converted to ``str``; "[" / "]" become "(" / ")", "<" / ">"
    become "_lt_" / "_gt_", blank names become "feature", and clashes are
    resolved by appending "__1", "__2", ...

    Returns:
        (renamed copy of df, mapping {old_name -> new_name}).
    """
    # One translation table instead of a chain of str.replace calls.
    substitutions = str.maketrans({"[": "(", "]": ")", "<": "_lt_", ">": "_gt_"})

    taken = set()
    renamed: List[str] = []
    mapping: Dict = {}
    changed = False

    for original in list(df.columns):
        candidate = str(original).translate(substitutions)

        # guard against "exotic" (blank) names
        if not candidate.strip():
            candidate = "feature"

        # keep bumping the numeric suffix until the name is free
        stem, suffix = candidate, 1
        while candidate in taken:
            candidate = f"{stem}__{suffix}"
            suffix += 1

        taken.add(candidate)
        renamed.append(candidate)
        mapping[original] = candidate
        changed = changed or (candidate != original)

    result = df.copy()
    result.columns = renamed

    if changed:
        logger.info("Sanitized feature names for boosting models.")
        # log only the first few pairs of the mapping
        preview = ", ".join(
            f"{k} -> {v}" for k, v in list(mapping.items())[:10]
        )
        logger.info("Example of feature name mapping: " + preview)

    return result, mapping

In [151]:
class ImagesDataset:
    """Map-style dataset that loads images from a list of file paths.

    Args:
        file_paths: Indexable collection of image file paths.
        transform: Optional callable applied to the RGB ``PIL.Image``. When it
            is ``None`` the converted image is returned unchanged.
            NOTE(review): un-transformed PIL images presumably cannot be
            batched by a default DataLoader collate — pass a transform that
            returns tensors when batching.
    """

    def __init__(self, file_paths, transform=None):
        self.file_paths = file_paths
        self.transform = transform

    def __getitem__(self, idx):
        """Load item ``idx`` as RGB and apply ``transform`` when one is set."""
        file_path = self.file_paths[idx]
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of leaving it to garbage collection.
        with Image.open(file_path) as raw_image:
            image = raw_image.convert("RGB")
        # transform defaults to None, so it must not be called unconditionally.
        if self.transform is not None:
            image = self.transform(image)
        # The DataLoader adds the batch dimension itself, so no unsqueeze here.
        return image

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.file_paths)

In [152]:
def detect_categorical_columns(
    df: pd.DataFrame,
    max_unique: int,
    force_categorical: List[str]
) -> List[str]:
    """Pick the columns that should be treated as categorical.

    A column is selected when any of the following holds:
      - it is listed in ``force_categorical`` (even if absent from ``df``);
      - its dtype is object, string or ``category``;
      - it has at most ``max_unique`` distinct non-null values.
    Columns whose label is not a string, or is blank, are ignored.

    Args:
        df: Frame to inspect.
        max_unique: Cardinality threshold; ``None`` means the default of 64.
        force_categorical: Names that are always categorical; ``None`` is
            accepted and treated as an empty list.

    Returns:
        Sorted list of categorical column names.
    """
    limit = 64 if max_unique is None else int(max_unique)

    cats = {c for c in (force_categorical or []) if isinstance(c, str)}

    for col in df.columns:
        if not isinstance(col, str):
            continue

        series = df[col]
        if not isinstance(series, pd.Series):
            # Duplicated column label: df[col] is a DataFrame, skip it
            # explicitly rather than via a blanket `except Exception`.
            continue

        dtype = series.dtype
        is_text_like = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            # dedicated string dtypes are not equal to "object"
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_text_like or series.nunique(dropna=True) <= limit:
            cats.add(col)

    # Blank names cannot be used as feature names downstream.
    return sorted(c for c in cats if c.strip())


def _merge_rare_categories(series: pd.Series, rare_values, label: str) -> pd.Series:
    """Replace every value of ``series`` found in ``rare_values`` with ``label``."""
    if isinstance(series.dtype, pd.CategoricalDtype):
        # A categorical column cannot take a label outside its categories.
        series = series.astype(object)
    return series.where(~series.isin(rare_values), label)


def process_categorical_features(
    X: pd.DataFrame,
    test: Optional[pd.DataFrame],
    cat_cols: List[str],
    config: Dict,
    logger: logging.Logger
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], List[str]]:
    """Group rare categories and drop uninformative categorical features.

    For every column in ``cat_cols`` (statistics come from ``X`` only):
      1) categories below ``cat_min_count`` / ``cat_min_freq`` are merged into
         "__OTHER__" in both ``X`` and ``test``;
      2) the feature is dropped when the share of rows in rare categories is
         >= ``cat_max_rare_share``, or when it has more than
         ``cat_max_unique_after_group`` distinct values after merging. The
         cardinality limit is enforced even when nothing had to be merged.

    Args:
        X: Train features (not modified).
        test: Optional test features (not modified).
        cat_cols: Names of categorical columns.
        config: Mapping holding the ``cat_*`` thresholds.
        logger: Logger for progress messages.

    Returns:
        (X_new, test_new, updated_cat_cols)
    """
    if not cat_cols:
        return X, test, cat_cols

    min_count = int(config.get("cat_min_count", 0) or 0)
    min_freq = float(config.get("cat_min_freq", 0.0) or 0.0)
    max_rare_share = float(config.get("cat_max_rare_share", 1.0) or 1.0)
    unique_limit_cfg = config.get("cat_max_unique_after_group")
    max_unique_after = int(unique_limit_cfg or 10**9)

    if (
        min_count <= 0
        and min_freq <= 0.0
        and max_rare_share >= 1.0
        and not unique_limit_cfg
    ):
        # filtering is effectively switched off
        logger.info("Categorical processing: thresholds are trivial, skipping.")
        return X, test, cat_cols

    logger.info(
        "Categorical processing: "
        f"cat_min_count={min_count}, cat_min_freq={min_freq}, "
        f"cat_max_rare_share={max_rare_share}, "
        f"cat_max_unique_after_group={max_unique_after}"
    )

    X_new = X.copy()
    test_new = test.copy() if test is not None else None
    to_drop = []
    other_label = "__OTHER__"

    for col in tqdm(cat_cols, desc="Processing categorical features"):
        if col not in X_new.columns:
            continue

        vc = X_new[col].value_counts(dropna=False)
        total = vc.sum()

        # decide which categories count as "rare"
        rare_mask = np.zeros(len(vc), dtype=bool)
        if min_count > 0:
            rare_mask |= (vc.values < min_count)
        if min_freq > 0.0:
            rare_mask |= (vc.values / total < min_freq)

        rare_cats = vc.index[rare_mask]
        # positional mask avoids label lookups (NaN / integer labels)
        rare_share = vc.values[rare_mask].sum() / total if rare_mask.any() else 0.0

        logger.info(
            f"[cat] {col}: unique={vc.size}, rare_cats={len(rare_cats)}, "
            f"rare_share={rare_share:.4f}"
        )

        # too many rows sit in rare categories: the feature is useless
        if rare_share >= max_rare_share:
            logger.info(
                f"[cat] {col}: rare_share={rare_share:.4f} >= {max_rare_share}, "
                f"dropping whole feature."
            )
            to_drop.append(col)
            continue

        if len(rare_cats) > 0:
            # merge rare categories into a single bucket
            X_new[col] = _merge_rare_categories(X_new[col], rare_cats, other_label)
            if test_new is not None and col in test_new.columns:
                test_new[col] = _merge_rare_categories(test_new[col], rare_cats, other_label)

        # The cardinality check must run even when nothing was merged;
        # otherwise a high-cardinality feature without rare values is never dropped.
        new_unique = X_new[col].nunique(dropna=True)
        if new_unique > max_unique_after:
            logger.info(
                f"[cat] {col}: unique_after={new_unique} > "
                f"cat_max_unique_after_group={max_unique_after}, dropping feature."
            )
            to_drop.append(col)

    if to_drop:
        logger.info(f"Dropping categorical features: {to_drop}")
        X_new = X_new.drop(columns=[c for c in to_drop if c in X_new.columns])
        if test_new is not None:
            test_new = test_new.drop(columns=[c for c in to_drop if c in test_new.columns])
        cat_cols = [c for c in cat_cols if c not in to_drop]

    return X_new, test_new, cat_cols


def expand_datetime_columns(
    df: pd.DataFrame,
    datetime_cols: List[str],
    features: List[str]
) -> pd.DataFrame:
    """Add calendar-part columns for each datetime column.

    For every name in ``datetime_cols`` that exists in the frame, the values
    are parsed with ``pd.to_datetime(errors="coerce")`` and one nullable
    ``Int64`` column ``<col>_<part>`` is added per requested part. Supported
    parts are "year", "month", "day", "dow" and "hour"; other names in
    ``features`` are ignored.

    Returns:
        A copy of ``df`` with the new columns appended.
    """
    # (suffix, accessor) pairs in the order the columns are created
    part_getters = (
        ("year", lambda parsed: parsed.dt.year),
        ("month", lambda parsed: parsed.dt.month),
        ("day", lambda parsed: parsed.dt.day),
        ("dow", lambda parsed: parsed.dt.dayofweek),
        ("hour", lambda parsed: parsed.dt.hour),
    )

    result = df.copy()
    for source in datetime_cols:
        if source in result.columns:
            parsed = pd.to_datetime(result[source], errors="coerce")
            for part, getter in part_getters:
                if part in features:
                    result[f"{source}_{part}"] = getter(parsed).astype("Int64")

    return result


def haversine_distance(
    lat1, lon1, lat2, lon2, radius: float = 6371.0
) -> np.ndarray:
    """Great-circle distance between two sets of points (haversine formula).

    Args:
        lat1, lon1: Latitude/longitude of the first points, in degrees.
        lat2, lon2: Latitude/longitude of the second points, in degrees.
            Each argument may be a scalar, a list, a NumPy array or a pandas
            Series; the usual broadcasting rules apply.
        radius: Sphere radius; the result is in the same unit
            (the default 6371.0 is the mean Earth radius in kilometres).

    Returns:
        Distances with the broadcast shape of the inputs (a Series when Series
        are passed in, otherwise a NumPy array or scalar).
    """
    def _as_float(values):
        # Arrays and Series keep their own type; plain scalars and lists have
        # no .astype, so convert those through NumPy.
        if hasattr(values, "astype"):
            return values.astype(float)
        return np.asarray(values, dtype=float)

    lat1 = np.radians(_as_float(lat1))
    lon1 = np.radians(_as_float(lon1))
    lat2 = np.radians(_as_float(lat2))
    lon2 = np.radians(_as_float(lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would make arcsin return NaN; NaN inputs still propagate.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c


### Data processing

In [153]:
def load_data(config: Dict, logger: logging.Logger) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """Read the train table and, when a test path is configured, the test table.

    Args:
        config: Mapping with "train_path", "test_path" and "sep".
        logger: Logger that receives the progress and shape messages.

    Returns:
        (train, test) where ``test`` is ``None`` if "test_path" is empty.
    """
    train_path, test_path = config["train_path"], config["test_path"]

    logger.info(f"üì• Loading train from {train_path}")
    train = pd.read_csv(train_path, sep=config["sep"])

    test = None
    if test_path:
        logger.info(f"üì• Loading test from {test_path}")
        test = pd.read_csv(test_path, sep=config["sep"])

    logger.info(f"Train shape: {train.shape}")
    if test is not None:
        logger.info(f"Test  shape: {test.shape}")

    return train, test

In [154]:
# Load the raw tables and split the train table into features and target.
train_df, test_df = load_data(CONFIG, logger)

id_col = CONFIG["id_column"]
target_col = CONFIG["target_column"]

# Keep the test ids aside (copied so later edits to test_df do not affect them).
test_ids = None
if test_df is not None and id_col in test_df.columns:
    test_ids = test_df[id_col].copy()

# y is the target; X is every other train column (the id column is still included).
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

2025-11-17 22:38:32,336 - INFO -  Loading train from /kaggle/input/petfinder-pawpularity-score/train.csv
INFO:TABULAR_BOOSTING:üì• Loading train from /kaggle/input/petfinder-pawpularity-score/train.csv
2025-11-17 22:38:32,357 - INFO -  Loading test from /kaggle/input/petfinder-pawpularity-score/test.csv
INFO:TABULAR_BOOSTING:üì• Loading test from /kaggle/input/petfinder-pawpularity-score/test.csv
2025-11-17 22:38:32,359 - INFO - Train shape: (9912, 14)
INFO:TABULAR_BOOSTING:Train shape: (9912, 14)
2025-11-17 22:38:32,360 - INFO - Test  shape: (8, 13)
INFO:TABULAR_BOOSTING:Test  shape: (8, 13)


In [155]:
### Creation of the reference column for geo features goes here
# Based on averaging coordinates when grouping by a categorical column
def fit_geo_reference(df: pd.DataFrame, config: Dict, logger: Optional[logging.Logger] = None) -> pd.DataFrame:
    """Build a per-category table of mean coordinates ("geo reference").

    Settings are read from ``config["geo_reference"]``. Latitude/longitude come
    from ``lat_column`` / ``lon_column`` when both are configured and present in
    ``df``; otherwise they are parsed from the "lat,lon" strings in
    ``coordinates_column``. Values that cannot be parsed become NaN.

    Args:
        df: Frame to fit on (not modified).
        config: Notebook configuration mapping.
        logger: Optional logger for a summary message.

    Returns:
        A frame with one row per category and the columns
        ``category_column``, ``output_lat_column``, ``output_lon_column``
        (floats) and ``output_column`` (the string "lat,lon", or NaN when the
        category has no valid coordinates).

    Raises:
        KeyError: if the category column, or every coordinate source, is
            missing from ``df``.
    """
    geo_cfg = config.get("geo_reference", {})
    cat_col = geo_cfg.get("category_column", "category")
    coord_col = geo_cfg.get("coordinates_column", "coordinates")
    out_col = geo_cfg.get("output_column", f"{cat_col}_geo_ref")
    out_lat_col = geo_cfg.get("output_lat_column", f"{cat_col}_geo_ref_lat")
    out_lon_col = geo_cfg.get("output_lon_column", f"{cat_col}_geo_ref_lon")

    lat_col_cfg = geo_cfg.get("lat_column")
    lon_col_cfg = geo_cfg.get("lon_column")

    # 1) Take lat/lon either from explicit columns or from the "lat,lon" string.
    if (
        lat_col_cfg
        and lon_col_cfg
        and lat_col_cfg in df.columns
        and lon_col_cfg in df.columns
    ):
        lat_raw, lon_raw = df[lat_col_cfg], df[lon_col_cfg]
    else:
        if coord_col not in df.columns:
            raise KeyError(
                f"fit_geo_reference: neither lat/lon columns "
                f"({lat_col_cfg!r}, {lon_col_cfg!r}) nor coordinates column "
                f"{coord_col!r} are present in df"
            )
        # "lat,lon" string; spaces around the comma and signs are allowed
        coords_parsed = (
            df[coord_col]
            .astype(str)
            .str.extract(r"([+-]?\d+\.?\d*)\s*,\s*([+-]?\d+\.?\d*)")
        )
        lat_raw, lon_raw = coords_parsed[0], coords_parsed[1]

    # Only the category column is needed; avoid copying the whole frame.
    df_work = df[[cat_col]].copy()
    df_work["_lat"] = pd.to_numeric(lat_raw, errors="coerce")
    df_work["_lon"] = pd.to_numeric(lon_raw, errors="coerce")

    # 2) Mean coordinates per category.
    geo_ref_df = (
        df_work.groupby(cat_col)[["_lat", "_lon"]]
        .mean()
        .rename(columns={"_lat": out_lat_col, "_lon": out_lon_col})
        .reset_index()
    )

    # 3) String column "lat,lon" (for debugging/export). Categories without
    #    valid coordinates get NaN rather than the literal text "nan,nan", so
    #    that isna()-based missing counts see them.
    lat_mean = geo_ref_df[out_lat_col]
    lon_mean = geo_ref_df[out_lon_col]
    coord_text = lat_mean.round(6).astype(str) + "," + lon_mean.round(6).astype(str)
    geo_ref_df[out_col] = coord_text.where(lat_mean.notna() & lon_mean.notna(), np.nan)

    if logger:
        logger.info(
            f"üåç –û–±—É—á–∏–ª–∏ reference-–≥–µ–æ-—Ü–µ–Ω—Ç—Ä—ã –ø–æ {cat_col}, –≤—Å–µ–≥–æ –≥—Ä—É–ø–ø: {geo_ref_df.shape[0]}"
        )
    return geo_ref_df

def add_geo_reference_column(
    df: pd.DataFrame,
    geo_ref_df: pd.DataFrame,
    config: Dict,
    logger: Optional[logging.Logger] = None
) -> pd.DataFrame:
    """Attach the per-category reference coordinates from ``geo_ref_df`` to ``df``.

    Column names are read from ``config["geo_reference"]``. Rows whose category
    is absent from ``geo_ref_df`` get NaN in the added columns.

    Args:
        df: Frame to enrich (not modified).
        geo_ref_df: Reference table holding the category column and the three
            output columns.
        config: Notebook configuration mapping.
        logger: Optional logger for a summary message.

    Returns:
        A new frame with the same rows, row order and index as ``df`` plus the
        reference columns. Applying the function to its own result replaces the
        reference columns instead of duplicating them.
    """
    geo_cfg = config.get("geo_reference", {})
    cat_col = geo_cfg.get("category_column", "category")
    out_col = geo_cfg.get("output_column", f"{cat_col}_geo_ref")
    out_lat_col = geo_cfg.get("output_lat_column", f"{cat_col}_geo_ref_lat")
    out_lon_col = geo_cfg.get("output_lon_column", f"{cat_col}_geo_ref_lon")
    ref_cols = [out_col, out_lat_col, out_lon_col]

    # One reference row per category keeps the left merge row-preserving.
    reference = geo_ref_df[[cat_col] + ref_cols].drop_duplicates(subset=cat_col)

    # Drop stale copies first: merging onto a frame that already has these
    # columns (e.g. a re-run cell) would create *_x / *_y columns and the
    # lookup of out_col below would raise KeyError.
    base = df.drop(columns=[c for c in ref_cols if c in df.columns])

    merged = base.merge(reference, how="left", on=cat_col)
    # merge() returns a fresh RangeIndex; restore the caller's index so the
    # result stays aligned with anything indexed like df.
    merged.index = df.index

    n_missing = merged[out_col].isna().sum()
    if logger:
        logger.info(
            f"‚úÖ –î–æ–±–∞–≤–ª–µ–Ω—ã reference-–≥–µ–æ-–∫–æ–ª–æ–Ω–∫–∏ ({out_col}, {out_lat_col}, {out_lon_col}) "
            f"–∫ df (geo reference –ø–æ {cat_col}). –ü—Ä–æ–ø—É—Å–∫–æ–≤: {n_missing}"
        )
    return merged


def maybe_apply_geo_reference(
    df: pd.DataFrame,
    config: Dict,
    logger: Optional[logging.Logger] = None,
    fit: bool = False,
    geo_ref_df: Optional[pd.DataFrame] = None
):
    """Train/test wrapper around the geo-reference helpers.

    With ``fit=True`` the reference table is computed from ``df`` and then
    added to it; with ``fit=False`` the supplied ``geo_ref_df`` is applied.
    When ``config["geo_reference"]["enabled"]`` is falsy, ``df`` is passed
    through unchanged.

    Returns:
      - fit=True:  (df_with_ref, geo_ref_df)   (``geo_ref_df`` is None when disabled)
      - fit=False: df_with_ref

    Raises:
        ValueError: enabled, ``fit=False`` and no ``geo_ref_df`` given.
    """
    enabled = config.get("geo_reference", {}).get("enabled", False)

    if not enabled:
        if logger:
            logger.info("üåé –ì–µ–æ-reference –≤—ã–∫–ª—é—á–µ–Ω (config['geo_reference']['enabled']=False)")
        if fit:
            return df, None
        return df

    # apply-only mode (test set)
    if not fit:
        if geo_ref_df is None:
            raise ValueError("geo_ref_df must be passed for applying on test set")
        return add_geo_reference_column(df, geo_ref_df, config, logger)

    # fit mode (train set): learn the centres, then attach them
    fitted_ref = fit_geo_reference(df, config, logger)
    enriched = add_geo_reference_column(df, fitted_ref, config, logger)
    return enriched, fitted_ref

# --- Apply the reference geo features to train and test, if enabled ---
# NOTE(review): X was derived from train_df in an earlier cell, so it does not
# pick up the columns added here unless a later cell rebuilds it — confirm.
if CONFIG.get("geo_reference", {}).get("enabled", False):
    logger.info("üåé –ü—Ä–∏–º–µ–Ω—è–µ–º –≥–µ–æ-reference features...")
    train_df, geo_ref_df = maybe_apply_geo_reference(train_df, CONFIG, logger, fit=True)
    # load_data returns test=None when no test path is configured; skip it then
    # instead of failing inside the merge.
    if test_df is not None:
        test_df = maybe_apply_geo_reference(test_df, CONFIG, logger, fit=False, geo_ref_df=geo_ref_df)

In [156]:
# For classification tasks the target is cast to integer class labels;
# regression targets are left as loaded.
if CONFIG['task_type'] in ['multiclass', 'binary']:
    y = y.astype(int)

ID REMOVAL

In [157]:
# NOTE: ID column is intentionally NOT dropped here.
# It will be dropped later, right before training the boosting models,
# so that the ID stays available for feature generation (images/texts/aggregations).

### Features generation

In [158]:
#!pip install open_clip_torch


In [159]:
import open_clip


In [160]:
def generate_basic_features(
    df: pd.DataFrame,
    config: Dict,
    logger: Optional[logging.Logger] = None
) -> pd.DataFrame:
    """
    Apply the basic, row-local feature steps to a copy of ``df``.

    Steps:
      1. Drop every column listed in ``config["basic_drop_columns"]`` that is
         actually present in the frame.
      2. If ``config["basic_datetime_expand"]`` is truthy and
         ``config["datetime_columns"]`` is non-empty, run the frame through
         ``expand_datetime_columns`` using ``config["basic_datetime_features"]``.

    Args:
        df: Input frame; it is copied and never modified.
        config: Pipeline configuration dict (keys listed above).
        logger: Optional logger for progress messages.

    Returns:
        A new DataFrame with the steps applied.
    """
    result = df.copy()

    for column in config["basic_drop_columns"]:
        if column not in result.columns:
            continue
        if logger:
            logger.info(f"Dropping column: {column}")
        result = result.drop(columns=[column])

    # Same short-circuit as a plain `and`: the column list is only consulted
    # when datetime expansion is switched on.
    if not (config["basic_datetime_expand"] and config["datetime_columns"]):
        return result

    datetime_cols = config["datetime_columns"]
    if logger:
        logger.info(f"Expanding datetime columns: {datetime_cols}")

    return expand_datetime_columns(
        result,
        datetime_cols=datetime_cols,
        features=config["basic_datetime_features"]
    )


def generate_aggregate_features(
    train: pd.DataFrame,
    test: Optional[pd.DataFrame],
    config: Dict,
    logger: Optional[logging.Logger] = None
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Add group-level aggregates of numeric columns to train (and test).

    The statistics are computed on train and test stacked together, grouped by
    ``config["agg_groupby_cols"]``, and then left-joined back onto each frame.

    Config keys read:
        agg_enable        -- master switch; when falsy the inputs are returned as is.
        agg_groupby_cols  -- column name or list of names to group by.
        agg_numeric_cols  -- explicit columns to aggregate; when empty, every
                             numeric column except the target, the id and the
                             group keys themselves is used.
        agg_aggs          -- aggregation spec passed to ``DataFrameGroupBy.agg``.
        agg_prefix        -- prefix for the generated column names.
        target_column, id_column -- excluded in auto-detect mode.

    Returns:
        ``(train_with_aggs, test_with_aggs)``; the second item is ``None`` when
        ``test`` is ``None``. Both frames keep their original row order and index.
    """
    if not config["agg_enable"]:
        if logger:
            logger.info("Aggregate features disabled in CONFIG.")
        return train, test

    groupby_cols = config["agg_groupby_cols"]
    if not groupby_cols:
        if logger:
            logger.info("No agg_groupby_cols specified ‚Äî skipping aggregates.")
        return train, test

    if logger:
        logger.info(f"Generating aggregate features by {groupby_cols}")

    all_df = train if test is None else pd.concat([train, test], axis=0, ignore_index=True)

    if config["agg_numeric_cols"]:
        num_cols = [c for c in config["agg_numeric_cols"] if c in all_df.columns]
    else:
        # Aggregating a group key inside its own group only yields constants,
        # so the keys are excluded together with the target and the id.
        group_keys = [groupby_cols] if isinstance(groupby_cols, str) else list(groupby_cols)
        excluded = group_keys + [config["target_column"], config["id_column"]]
        num_cols = [
            c for c in all_df.columns
            if c not in excluded and pd.api.types.is_numeric_dtype(all_df[c])
        ]

    if not num_cols:
        # groupby(...)[[]].agg(...) would fail; there is simply nothing to add.
        if logger:
            logger.warning("No numeric columns available for aggregation - skipping aggregates.")
        return train, test

    aggs = config["agg_aggs"]
    prefix = config["agg_prefix"]

    if logger:
        logger.info(f"Aggregating numeric cols: {num_cols}")
        logger.info(f"Using aggs: {aggs}, prefix: {prefix}")

    grouped = all_df.groupby(groupby_cols)[num_cols].agg(aggs)
    # A list of aggregations gives (column, agg) tuples; a single aggregation
    # gives plain labels, which must not be joined character by character.
    grouped.columns = [
        f"{prefix}_" + "_".join(map(str, col if isinstance(col, tuple) else (col,))).strip()
        for col in grouped.columns.to_flat_index()
    ]
    grouped = grouped.reset_index()

    if logger:
        logger.info(f"Aggregate frame shape: {grouped.shape}")

    def _attach(frame: pd.DataFrame) -> pd.DataFrame:
        # `grouped` has one row per key, so the left join keeps row count and
        # order; merge() resets the index, so the caller's index is restored.
        merged = frame.merge(grouped, on=groupby_cols, how="left")
        merged.index = frame.index
        return merged

    train_merged = _attach(train)
    test_merged = _attach(test) if test is not None else None

    return train_merged, test_merged


def generate_geo_features(
    train: pd.DataFrame,
    test: Optional[pd.DataFrame],
    config: Dict,
    logger: Optional[logging.Logger] = None
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Add geometry features derived from a "from" and a "to" coordinate pair.

    Base features (``{p}`` is ``config["geo_prefix"]``):
      - {p}_dist_from_to_km
      - {p}_dist_from_ref_km, {p}_dist_to_ref_km (only when both
        ``geo_ref_lat`` and ``geo_ref_lon`` are set)
      - {p}_lat_from_abs / {p}_lon_from_abs / {p}_lat_to_abs / {p}_lon_to_abs

    When ``config["geo_extra_enable"]`` is truthy:
      - {p}_dlat / {p}_dlon
      - {p}_manhattan_km
      - {p}_bearing_deg
      - {p}_mid_lat / {p}_mid_lon

    If ``geo_from_coord_col`` / ``geo_to_coord_col`` are configured, the frames are
    first passed through ``_ensure_lat_lon_from_single_column`` (defined elsewhere
    in this file; presumably it splits a "lat<sep>lon" string column into the
    separate lat/lon columns -- confirm against its definition).

    Distances use ``geopy.distance.geodesic`` when the module-level ``GEO_BACKEND``
    is ``"geopy"``, otherwise the file-level ``haversine_distance`` helper.

    Returns:
        ``(train_with_geo, test_with_geo)``; inputs are returned untouched when
        ``config["geo_enable"]`` is falsy, and the second item is ``None`` when
        ``test`` is ``None``.

    Raises:
        ValueError: if a configured lat/lon column is missing from ``train``.
    """
    if not config["geo_enable"]:
        if logger:
            logger.info("Geo features disabled in CONFIG.")
        return train, test

    lat_from_col = config["geo_lat_from_col"]
    lon_from_col = config["geo_lon_from_col"]
    lat_to_col = config["geo_lat_to_col"]
    lon_to_col = config["geo_lon_to_col"]
    prefix = config["geo_prefix"]

    from_coord_col = config.get("geo_from_coord_col")
    to_coord_col = config.get("geo_to_coord_col")
    coord_sep = config.get("geo_coord_string_sep", ",")

    if from_coord_col is not None:
        train = _ensure_lat_lon_from_single_column(
            train, from_coord_col, lat_from_col, lon_from_col, sep=coord_sep
        )
        if test is not None:
            test = _ensure_lat_lon_from_single_column(
                test, from_coord_col, lat_from_col, lon_from_col, sep=coord_sep
            )

    if to_coord_col is not None:
        train = _ensure_lat_lon_from_single_column(
            train, to_coord_col, lat_to_col, lon_to_col, sep=coord_sep
        )
        if test is not None:
            test = _ensure_lat_lon_from_single_column(
                test, to_coord_col, lat_to_col, lon_to_col, sep=coord_sep
            )

    for col in [lat_from_col, lon_from_col, lat_to_col, lon_to_col]:
        if col and col not in train.columns:
            raise ValueError(f"Column {col} not found in train for geo features")

    if logger:
        logger.info(f"Generating geo features with backend: {GEO_BACKEND}")

    def _distance_rowwise(
        lat1: pd.Series, lon1: pd.Series, lat2: pd.Series, lon2: pd.Series
    ) -> np.ndarray:
        """Pairwise distance between (lat1, lon1) and (lat2, lon2), NaN-safe."""
        if GEO_BACKEND == "geopy":
            def _one(a, b, c, d):
                if pd.isna(a) or pd.isna(b) or pd.isna(c) or pd.isna(d):
                    return np.nan
                return geodesic((a, b), (c, d)).km
            # otypes is required for np.vectorize to accept size-0 inputs
            # (an empty frame); it also fixes the output dtype to float.
            return np.vectorize(_one, otypes=[float])(lat1, lon1, lat2, lon2)
        else:
            return haversine_distance(lat1, lon1, lat2, lon2)

    def _bearing(
        lat1: pd.Series, lon1: pd.Series, lat2: pd.Series, lon2: pd.Series
    ) -> np.ndarray:
        """Initial bearing from point 1 to point 2, in degrees within [0, 360)."""
        lat1_r = np.radians(lat1.astype(float))
        lat2_r = np.radians(lat2.astype(float))
        dlon_r = np.radians(lon2.astype(float) - lon1.astype(float))

        x = np.sin(dlon_r) * np.cos(lat2_r)
        y = np.cos(lat1_r) * np.sin(lat2_r) - np.sin(lat1_r) * np.cos(lat2_r) * np.cos(dlon_r)
        brng = np.degrees(np.arctan2(x, y))
        # arctan2 yields (-180, 180]; shift into [0, 360).
        brng = (brng + 360) % 360
        return brng

    def _apply_geo(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with all configured geo columns added."""
        df = df.copy()

        df[f"{prefix}_dist_from_to_km"] = _distance_rowwise(
            df[lat_from_col], df[lon_from_col],
            df[lat_to_col], df[lon_to_col]
        )

        ref_lat = config["geo_ref_lat"]
        ref_lon = config["geo_ref_lon"]
        if ref_lat is not None and ref_lon is not None:
            # The scalar reference point is broadcast to a Series aligned with df.
            df[f"{prefix}_dist_from_ref_km"] = _distance_rowwise(
                df[lat_from_col], df[lon_from_col],
                pd.Series(ref_lat, index=df.index),
                pd.Series(ref_lon, index=df.index),
            )
            df[f"{prefix}_dist_to_ref_km"] = _distance_rowwise(
                df[lat_to_col], df[lon_to_col],
                pd.Series(ref_lat, index=df.index),
                pd.Series(ref_lon, index=df.index),
            )

        df[f"{prefix}_lat_from_abs"] = df[lat_from_col].abs()
        df[f"{prefix}_lon_from_abs"] = df[lon_from_col].abs()
        df[f"{prefix}_lat_to_abs"] = df[lat_to_col].abs()
        df[f"{prefix}_lon_to_abs"] = df[lon_to_col].abs()

        if config.get("geo_extra_enable", False):
            df[f"{prefix}_dlat"] = df[lat_to_col] - df[lat_from_col]
            df[f"{prefix}_dlon"] = df[lon_to_col] - df[lon_from_col]

            # "Manhattan" path: move along the meridian first (latitude change
            # at the starting longitude), then along the parallel.
            df[f"{prefix}_manhattan_km"] = (
                haversine_distance(df[lat_from_col], df[lon_from_col], df[lat_to_col], df[lon_from_col]) +
                haversine_distance(df[lat_to_col], df[lon_from_col], df[lat_to_col], df[lon_to_col])
            )

            df[f"{prefix}_bearing_deg"] = _bearing(
                df[lat_from_col], df[lon_from_col],
                df[lat_to_col], df[lon_to_col]
            )

            df[f"{prefix}_mid_lat"] = (df[lat_from_col] + df[lat_to_col]) / 2.0
            df[f"{prefix}_mid_lon"] = (df[lon_from_col] + df[lon_to_col]) / 2.0

        return df

    train_geo = _apply_geo(train)
    test_geo = _apply_geo(test) if test is not None else None
    return train_geo, test_geo


def process_address_extract_city(
    df: pd.DataFrame,
    column: str,
    city_index: int,
    sep: str,
    logger: logging.Logger
) -> pd.DataFrame:
    """
    Derive an ``address_city`` column from a delimited address string.

    Each address is split on ``sep``, every part is stripped, and the part at
    position ``city_index`` is taken (e.g. ``-1`` selects the last part). If the
    index cannot be used, the last part is taken instead. Non-string values
    produce ``None``.

    Args:
        df: Input frame; it is not modified.
        column: Name of the address column.
        city_index: Position of the city within the split parts.
        sep: Separator used inside the address string.
        logger: Logger used to warn when ``column`` is missing.

    Returns:
        A copy of ``df`` with ``address_city`` added, or ``df`` itself when
        ``column`` is not present.
    """
    if column not in df.columns:
        logger.warning(f"Address column '{column}' not found, skipping city extraction.")
        return df

    def _city_of(address):
        if not isinstance(address, str):
            return None
        pieces = [chunk.strip() for chunk in address.split(sep)]
        if not pieces:
            return None
        try:
            return pieces[city_index]
        except Exception:
            # Index not usable for this address: fall back to the last part.
            return pieces[-1]

    result = df.copy()
    result["address_city"] = result[column].map(_city_of)
    return result


In [161]:
import os

def _normalize_image_name(name: str, default_ext: str | None = None) -> str:
    """
    –ü—Ä–∏–≤–æ–¥–∏–º –∏–º—è —Ñ–∞–π–ª–∞ –∫ –Ω–æ—Ä–º–∞–ª—å–Ω–æ–º—É –≤–∏–¥—É:
    - —É–±–∏—Ä–∞–µ–º –ø—Ä–æ–±–µ–ª—ã/–ø–µ—Ä–µ–≤–æ–¥—ã —Å—Ç—Ä–æ–∫
    - –µ—Å–ª–∏ –Ω–µ—Ç —Ä–∞—Å—à–∏—Ä–µ–Ω–∏—è, –¥–æ–±–∞–≤–ª—è–µ–º default_ext –∏–ª–∏ CONFIG["image_ext"]
    """
    name = str(name).strip()
    base = os.path.basename(name)
    root, ext = os.path.splitext(base)
    if ext == "":
        if default_ext is None:
            default_ext = CONFIG.get("image_ext", "")
        return root + (default_ext or "")
    return root + ext

def images_features(X, test_df):
    """
    Append CLIP image embeddings (OpenCLIP, image encoder only) to the tabular features.

    CONFIG keys used:
      - 'file_names_column' -- column holding the image file names
      - 'train_images_dir'  -- directory with the train images
      - 'test_images_dir'   -- directory with the test images
      - 'image_ext'         -- optional default extension for names without one
      - 'batch_size'        -- inference batch size (read inside process_images)

    Returns ``(X, test_df)`` with the embedding columns added and the file-name
    column removed; ``test_df`` may be ``None`` and is then returned as ``None``.

    Raises:
        KeyError: if the file-name column is missing from train or test.
    """
    file_name_column = CONFIG['file_names_column']

    if file_name_column not in X.columns:
        raise KeyError(f"CONFIG['file_names_column']={file_name_column!r} –Ω–µ—Ç –≤ train")

    if test_df is not None and file_name_column not in test_df.columns:
        raise KeyError(f"CONFIG['file_names_column']={file_name_column!r} –Ω–µ—Ç –≤ test")

    # ----------------- CLIP backbone (image encoder only) -----------------
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
        "ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    clip_model = clip_model.to(device)
    clip_model.eval()

    def _with_embeddings(frame, dir_key, tag):
        """Embed every image referenced by `frame` and return frame + embeddings."""
        paths = []
        for raw_name in frame[file_name_column]:
            clean_name = _normalize_image_name(raw_name, CONFIG.get("image_ext", ""))
            paths.append(os.path.join(CONFIG[dir_key], clean_name))

        emb_frame = process_images(paths, clip_model, clip_preprocess, name=tag)
        emb_frame.index = frame.index
        combined = pd.concat([frame, emb_frame], axis=1)

        # The file-name column must not leak into the boosting models.
        if file_name_column in combined.columns:
            combined = combined.drop(columns=[file_name_column])
        return combined

    # ----------------- train images, then test images -----------------
    X = _with_embeddings(X, 'train_images_dir', "train_clip")
    if test_df is not None:
        test_df = _with_embeddings(test_df, 'test_images_dir', "test_clip")

    return X, test_df

def process_images(file_paths, model, image_transform, name):
    """
    Compute L2-normalised embeddings for a list of image paths with one model.

    Works with plain CNN-style models (``model(x)``) as well as CLIP-style
    models that expose ``encode_image``. Batches are built by ``ImagesDataset``
    and a non-shuffling ``DataLoader`` with ``CONFIG['batch_size']``.

    Returns a DataFrame with columns ``emb_0 .. emb_{d-1}`` (one row per path, in
    input order), or an empty DataFrame when no embeddings were produced.
    """
    loader = DataLoader(
        ImagesDataset(file_paths, image_transform),
        batch_size=CONFIG['batch_size'],
        shuffle=False,
    )

    # Run inference on whatever device the model already lives on.
    model_device = next(model.parameters()).device
    use_clip_encoder = hasattr(model, "encode_image")

    rows = []
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader), desc=f"Images {name} process..."):
            batch = batch.to(model_device)
            features = model.encode_image(batch) if use_clip_encoder else model(batch)

            # Some backbones return (N, C, 1, 1); collapse to (N, C).
            if features.ndim == 4:
                features = torch.flatten(features, 1)

            features = features.cpu().numpy()
            # Row-wise L2 normalisation; the epsilon guards against zero vectors.
            features = features / (np.linalg.norm(features, axis=1, keepdims=True) + 1e-12)
            rows.extend(features.tolist())

    if not rows:
        # Nothing was embedded (empty path list).
        return pd.DataFrame()

    return pd.DataFrame(rows, columns=[f"emb_{i}" for i in range(len(rows[0]))])


ТЕКСТОВЫЕ ФИЧИ

In [162]:
def get_text_device():
    """Return the torch device used for text models, resolving it on first use.

    The choice (CUDA when available, otherwise CPU) is cached in the
    module-level ``TEXT_DEVICE``.
    """
    global TEXT_DEVICE
    if TEXT_DEVICE is not None:
        return TEXT_DEVICE

    backend = "cuda" if torch.cuda.is_available() else "cpu"
    TEXT_DEVICE = torch.device(backend)
    return TEXT_DEVICE


def init_bert_model():
    """
    Initialise the BERT model and tokenizer once and cache them in module globals.

    The model name comes from ``CONFIG["text_bert_model_name"]`` (with a
    multilingual sentence-transformers default). Subsequent calls are no-ops
    once ``TEXT_MODEL`` is set.

    The globals are published only after the model has been loaded, moved to
    the device and switched to eval mode. If any of those steps fails (e.g. a
    download error or running out of device memory), ``TEXT_MODEL`` stays
    ``None`` and the next call retries instead of reusing a half-initialised model.
    """
    global TEXT_MODEL, TEXT_TOKENIZER, TEXT_DEVICE

    if TEXT_MODEL is not None:
        return

    model_name = CONFIG.get(
        "text_bert_model_name",
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    device = get_text_device()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    model.eval()

    # Publish everything together, only after every step above succeeded.
    TEXT_DEVICE = device
    TEXT_TOKENIZER = tokenizer
    TEXT_MODEL = model
    
def _detect_format_from_path(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".parquet":
        return "parquet"
    if ext == ".csv":
        return "csv"
    if ext == ".tsv":
        return "tsv"
    # –ø–æ —É–º–æ–ª—á–∞–Ω–∏—é —Å—á–∏—Ç–∞–µ–º csv
    return "csv"


def load_text_table(path: str, fmt: str | None, id_column: str, text_column: str) -> pd.DataFrame:
    """
    –ó–∞–≥—Ä—É–∂–∞–µ–º –≤–Ω–µ—à–Ω—é—é —Ç–∞–±–ª–∏—Ü—É —Å —Ç–µ–∫—Å—Ç–æ–º.
    –ü–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º parquet/csv/tsv. format="auto" -> –ø–æ —Ä–∞—Å—à–∏—Ä–µ–Ω–∏—é.
    –í–æ–∑–≤—Ä–∞—â–∞–µ–º —Ç–æ–ª—å–∫–æ —Å—Ç–æ–ª–±—Ü—ã [id_column, text_column].
    """
    if fmt is None or fmt == "auto":
        fmt = _detect_format_from_path(path)

    if fmt == "parquet":
        df = pd.read_parquet(path)
    elif fmt == "tsv":
        df = pd.read_csv(path, sep="\t")
    else:  # "csv" –∏ –≤—Å—ë –æ—Å—Ç–∞–ª—å–Ω–æ–µ –ø–æ-—É–º–æ–ª—á–∞–Ω–∏—é
        df = pd.read_csv(path)

    # –Ø–≤–Ω–æ –ø—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –Ω—É–∂–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏ –µ—Å—Ç—å
    missing = [c for c in (id_column, text_column) if c not in df.columns]
    if missing:
        raise ValueError(
            f"Columns {missing} not found in external text table {path}. "
            f"Available columns: {list(df.columns)}"
        )

    return df[[id_column, text_column]].copy()

def _collapse_duplicate_ids(ext: pd.DataFrame, id_col: str, text_col: str) -> pd.DataFrame:
    """Merge all texts that share an id into one space-separated row."""
    if not ext[id_col].duplicated().any():
        return ext
    return (
        ext
        .groupby(id_col, as_index=False)[text_col]
        # Missing texts are skipped so they do not turn into the literal "nan".
        .agg(lambda s: " ".join(map(str, s.dropna())))
    )


def _merge_external_text(
    frame: pd.DataFrame,
    ext: pd.DataFrame,
    main_id_col: str,
    ext_id_col: str,
    output_col: str,
) -> pd.DataFrame:
    """Left-join the external text column onto ``frame``, keeping its row index."""
    if main_id_col not in frame.columns:
        # The frame has no id column: join on its index (which is preserved).
        return frame.merge(
            ext.set_index(ext_id_col)[[output_col]],
            left_index=True,
            right_index=True,
            how="left"
        )

    if main_id_col == ext_id_col:
        merged = frame.merge(
            ext[[ext_id_col, output_col]],
            on=ext_id_col,
            how="left"
        )
    else:
        # Different id names on both sides: join, then drop the external key.
        merged = frame.merge(
            ext[[ext_id_col, output_col]],
            left_on=main_id_col,
            right_on=ext_id_col,
            how="left"
        ).drop(columns=[ext_id_col])

    # A column-key merge resets the index; restore it when the join was
    # row-preserving so downstream index-based alignment keeps working.
    if len(merged) == len(frame):
        merged.index = frame.index
    return merged


def attach_external_text_tables(X: pd.DataFrame, test_df: pd.DataFrame):
    """Attach the external text tables from CONFIG['text_external_tables'] to X and test_df.

    Each table config provides ``train_path``, ``text_column`` and optionally
    ``test_path`` (defaults to ``train_path``), ``format`` (default "auto"),
    ``id_column`` (defaults to CONFIG['id_column']), ``name`` and
    ``output_column`` (default ``text_ext_<name>``).

    Important: if an external table has several rows for the same id, they are
    first collapsed into ONE row (texts joined with a space) so that the merge
    does NOT inflate the number of rows of the main table.

    When the main id column is not among the frame's columns, the frame index is
    used as the join key instead.

    Returns:
        ``(X, test_df, added_cols)`` where ``added_cols`` lists the text columns
        that were attached. ``test_df`` may be ``None`` and is then passed through.
    """
    external_cfgs = CONFIG.get("text_external_tables", []) or []
    if not external_cfgs:
        return X, test_df, []

    main_id_col = CONFIG.get("id_column", "index")
    added_cols: list[str] = []

    for cfg in external_cfgs:
        name = cfg.get("name", "ext")

        train_path = cfg["train_path"]
        # When no test_path is given, the same file serves both frames.
        test_path = cfg.get("test_path", train_path)

        fmt = cfg.get("format", "auto")

        # Id column of the external table: defaults to the main table's id column.
        ext_id_col = cfg.get("id_column", main_id_col)

        text_col_ext = cfg["text_column"]
        output_col = cfg.get("output_column", f"text_ext_{name}")

        # --- TRAIN ---
        ext_train = load_text_table(train_path, fmt, ext_id_col, text_col_ext)
        ext_train = ext_train.rename(columns={text_col_ext: output_col})
        ext_train = _collapse_duplicate_ids(ext_train, ext_id_col, output_col)
        X = _merge_external_text(X, ext_train, main_id_col, ext_id_col, output_col)

        # --- TEST ---
        if test_df is not None:
            ext_test = load_text_table(test_path, fmt, ext_id_col, text_col_ext)
            ext_test = ext_test.rename(columns={text_col_ext: output_col})
            ext_test = _collapse_duplicate_ids(ext_test, ext_id_col, output_col)
            test_df = _merge_external_text(test_df, ext_test, main_id_col, ext_id_col, output_col)

        added_cols.append(output_col)

    return X, test_df, added_cols


def build_bert_embeddings_for_field(field_name: str,
                                    train_texts: list[str],
                                    test_texts: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """
    Build BERT embeddings for one logical text field.

    The BERT model is shared between all fields, but every field is encoded
    separately. Texts are encoded in batches of ``CONFIG["text_batch_size"]``,
    truncated to ``CONFIG["text_max_length"]`` tokens, mean-pooled over the
    attention mask and L2-normalised.

    Returns:
        ``(train_embs, test_embs)`` as 2-D arrays, one row per input text.
    """
    init_bert_model()

    device = get_text_device()
    max_len = CONFIG.get("text_max_length", 256)
    batch_size = CONFIG.get("text_batch_size", 32)

    def _embed_batch(chunk: list[str]) -> np.ndarray:
        tokens = TEXT_TOKENIZER(
            chunk,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        tokens = {key: tensor.to(device) for key, tensor in tokens.items()}

        with torch.no_grad():
            hidden = TEXT_MODEL(**tokens).last_hidden_state  # [bs, seq_len, hidden]

            # Mean over real tokens only: padding positions carry a zero weight.
            weights = tokens["attention_mask"].unsqueeze(-1).expand(hidden.size())
            token_sum = (hidden * weights).sum(dim=1)
            token_count = weights.sum(dim=1).clamp(min=1)
            pooled = token_sum / token_count  # [bs, hidden]

            vectors = pooled.cpu().numpy()

        # L2 normalisation with a floor on the norm to avoid division by zero.
        lengths = np.clip(np.linalg.norm(vectors, axis=1, keepdims=True), 1e-12, None)
        return vectors / lengths

    def encode_list(texts: list[str], desc: str) -> np.ndarray:
        batch_starts = range(0, len(texts), batch_size)
        chunks = [
            _embed_batch(texts[start:start + batch_size])
            for start in tqdm(batch_starts, desc=f"{desc} [{field_name}]")
        ]
        if len(chunks) == 0:
            return np.zeros((0, TEXT_MODEL.config.hidden_size))
        return np.vstack(chunks)

    train_embs = encode_list(train_texts, "BERT text embeddings (train)")
    test_embs = encode_list(test_texts, "BERT text embeddings (test)")

    return train_embs, test_embs
def build_tfidf_svd_embeddings_for_field(field_name: str,
                                         train_texts: list[str],
                                         test_texts: list[str],
                                         logger: logging.Logger) -> tuple[np.ndarray, np.ndarray]:
    """
    TF-IDF (uni+bi-grams) -> TruncatedSVD to a compact vector -> L2 normalisation.

    Every text field gets its own TF-IDF vectorizer and SVD so that different
    sources are not mixed. The fitted objects are cached per field in the
    module-level ``TFIDF_VECTORIZERS`` / ``TFIDF_SVDS`` and reused on later calls.

    The number of SVD components is ``CONFIG["text_svd_n_components"]`` but is
    capped at the fitted vocabulary size: short fields can yield fewer TF-IDF
    features than requested components, which TruncatedSVD rejects.

    Returns:
        ``(train_embs, test_embs)``; ``test_embs`` has zero rows when
        ``test_texts`` is empty.
    """
    max_features = CONFIG.get("text_tfidf_max_features", 50000)
    n_components = CONFIG.get("text_svd_n_components", 256)
    random_state = CONFIG.get("seed", 42)

    vect = TFIDF_VECTORIZERS.get(field_name)
    svd = TFIDF_SVDS.get(field_name)

    if vect is None or svd is None:
        vect = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1, 2)
        )
        logger.info(f"TF-IDF fitting for field '{field_name}'...")
        train_tfidf = vect.fit_transform(train_texts)

        # The default (randomized) solver requires n_components <= n_features.
        n_features = train_tfidf.shape[1]
        effective_components = max(1, min(n_components, n_features))
        if effective_components != n_components:
            logger.warning(
                f"Field '{field_name}': reducing SVD components from {n_components} "
                f"to {effective_components} (only {n_features} TF-IDF features)."
            )

        svd = TruncatedSVD(
            n_components=effective_components,
            random_state=random_state
        )
        logger.info(f"SVD fitting for field '{field_name}'...")
        train_embs = svd.fit_transform(train_tfidf)

        TFIDF_VECTORIZERS[field_name] = vect
        TFIDF_SVDS[field_name] = svd
    else:
        train_tfidf = vect.transform(train_texts)
        train_embs = svd.transform(train_tfidf)

    if len(test_texts) == 0:
        # Nothing to transform: return an empty block with the right width
        # instead of pushing a zero-row matrix through the transformers.
        test_embs = np.zeros((0, train_embs.shape[1]))
    else:
        test_tfidf = vect.transform(test_texts)
        test_embs = svd.transform(test_tfidf)

    # L2 normalisation; all-zero rows are left as zeros.
    def l2_norm(x: np.ndarray) -> np.ndarray:
        norms = np.linalg.norm(x, axis=1, keepdims=True)
        norms = np.where(norms == 0, 1, norms)
        return x / norms

    train_embs = l2_norm(train_embs)
    test_embs = l2_norm(test_embs)

    return train_embs, test_embs


In [163]:
def text_features(X: pd.DataFrame, test_df: pd.DataFrame, logger: logging.Logger):
    """
    Build text embedding features for train and (optionally) test.

    1) Attach the external text tables (parquet/csv/tsv) by id.
    2) For every text column build a separate embedding (BERT or TF-IDF+SVD,
       selected by ``CONFIG["text_model_type"]``).
    3) Concatenate the embedding columns (``text_<col>_emb_<i>``) to X and test_df.

    ``test_df`` may be ``None``; in that case only X is extended and ``None`` is
    returned for the test frame. Nothing happens unless ``CONFIG["text_enable"]``
    is truthy.

    Raises:
        ValueError: on an unknown ``text_model_type`` or when an encoder returns
            a number of rows that does not match its input.
    """
    if not CONFIG.get("text_enable", False):
        return X, test_df

    logger.info("Text feature extraction starts...")

    # 1. Attach the external text tables.
    X, test_df, external_cols = attach_external_text_tables(X, test_df)
    has_test = test_df is not None

    # 2. Final list of text columns: configured ones that really exist, plus external ones.
    base_text_cols = CONFIG.get("text_columns", []) or []
    base_text_cols = [c for c in base_text_cols if c in X.columns]

    all_text_cols = base_text_cols + external_cols

    if not all_text_cols:
        logger.warning("No text columns found for text_features. Skipping.")
        return X, test_df

    model_type = CONFIG.get("text_model_type", "bert")

    # 3. Build embeddings per text column and concatenate them.
    for col in tqdm(all_text_cols, desc="Building text embeddings"):
        logger.info(f"Building text embeddings for column: {col}")

        # Row order is preserved, so embeddings can be re-indexed with the frame index.
        train_texts = X[col].fillna("").astype(str).tolist()
        test_texts = test_df[col].fillna("").astype(str).tolist() if has_test else []

        # Skip columns that carry no text at all.
        if all(t == "" for t in train_texts) and all(t == "" for t in test_texts):
            logger.warning(f"Column '{col}' has only empty texts. Skipping.")
            continue

        field_name = col  # may be mapped/renamed later if needed

        # The encoders expect at least one test text; when there is no test data
        # a single placeholder is encoded and its row is discarded below.
        encoder_test_texts = test_texts if test_texts else [""]

        if model_type == "bert":
            train_embs, test_embs = build_bert_embeddings_for_field(field_name, train_texts, encoder_test_texts)
        elif model_type == "tfidf_svd":
            train_embs, test_embs = build_tfidf_svd_embeddings_for_field(field_name, train_texts, encoder_test_texts, logger)
        else:
            raise ValueError(f"Unknown text_model_type: {model_type}")

        test_embs = test_embs[:len(test_texts)]

        # Sanity-check the embedding sizes.
        if train_embs.shape[0] != len(X):
            raise ValueError(f"Train embeddings shape {train_embs.shape[0]} != X length {len(X)} for column {col}")
        if has_test and test_embs.shape[0] != len(test_df):
            raise ValueError(f"Test embeddings shape {test_embs.shape[0]} != test_df length {len(test_df)} for column {col}")

        dim = train_embs.shape[1]
        # The prefix carries the column name so the source of each feature is visible.
        prefix = f"text_{field_name}_emb"
        cols = [f"{prefix}_{i}" for i in range(dim)]

        train_text_df = pd.DataFrame(train_embs, columns=cols, index=X.index)
        X = pd.concat([X, train_text_df], axis=1)

        if has_test:
            test_text_df = pd.DataFrame(test_embs, columns=cols, index=test_df.index)
            test_df = pd.concat([test_df, test_text_df], axis=1)

        logger.info(f"Text features for '{col}' added: {dim} dims ({prefix}_*)")

    return X, test_df


In [164]:
def _log_frame_shapes(logger, X, test_df, label):
    """Log the shape of X and, when a test frame is present, of test_df, tagged with `label`."""
    logger.info(f"X shape {label}: {X.shape}")
    if test_df is not None:
        logger.info(f"test_df shape {label}: {test_df.shape}")


def _ensure_test_size(test_df, expected_rows, stage):
    """Raise ValueError if `stage` changed the number of rows of the test frame."""
    if test_df is not None and len(test_df) != expected_rows:
        raise ValueError(f"‚ùå test_df size changed in {stage}: {expected_rows} -> {len(test_df)}")


def prepare_all_features(X, test_df, config, logger):
    """
    Run every feature-engineering stage on the train frame and (optionally) the test frame.

    Stages, in order: basic features, aggregate features, geo features, image
    features (when ``config["train_images_dir"]`` is set), text features (when
    ``config["text_enable"]`` is truthy), city extraction from the address column,
    categorical detection and rare-category processing, removal of service
    columns, and NaN filling.

    Args:
        X: train feature frame.
        test_df: test feature frame, or None when there is no test set.
        config: pipeline configuration dict. All switches are read from this
            argument, not from the module-level CONFIG.
        logger: logger used for progress and shape reporting.

    Returns:
        Tuple ``(X, test_df, cat_cols)`` with the processed frames and the list of
        categorical column names.

    Raises:
        ValueError: if any stage changes the number of rows of ``test_df``.
    """
    # Remember the original sizes so that every stage can be checked against them.
    original_X_size = len(X)
    original_test_size = len(test_df) if test_df is not None else 0
    logger.info(f"üîç ORIGINAL SIZES: X={original_X_size}, test_df={original_test_size}")

    # ----- Basic features -----
    logger.info("üîß Generating basic features...")
    _log_frame_shapes(logger, X, test_df, "before basic")
    X = generate_basic_features(X, config, logger)
    if test_df is not None:
        test_df = generate_basic_features(test_df, config, logger)
    _log_frame_shapes(logger, X, test_df, "after basic")
    _ensure_test_size(test_df, original_test_size, "generate_basic_features")

    # ----- Aggregate features -----
    logger.info("üìä Generating aggregate features...")
    _log_frame_shapes(logger, X, test_df, "before aggregate")
    X, test_df = generate_aggregate_features(X, test_df, config, logger)
    _log_frame_shapes(logger, X, test_df, "after aggregate")
    _ensure_test_size(test_df, original_test_size, "generate_aggregate_features")

    # ----- Geo features -----
    logger.info("üó∫  Generating geo features...")
    _log_frame_shapes(logger, X, test_df, "before geo")
    X, test_df = generate_geo_features(X, test_df, config, logger)
    _log_frame_shapes(logger, X, test_df, "after geo")
    _ensure_test_size(test_df, original_test_size, "generate_geo_features")

    # ----- Image features (optional) -----
    # .get() so that a config without the key simply disables the stage.
    if config.get("train_images_dir"):
        logger.info("Images process starts...")
        _log_frame_shapes(logger, X, test_df, "before images")
        X, test_df = images_features(X, test_df)
        _log_frame_shapes(logger, X, test_df, "after images")
        _ensure_test_size(test_df, original_test_size, "images_features")

    # ----- Text features (optional) -----
    # Use the `config` argument rather than the global CONFIG so the function
    # honours the configuration it was actually called with.
    if config.get("text_enable", False):
        logger.info("Text process starts...")
        _log_frame_shapes(logger, X, test_df, "before text")
        X, test_df = text_features(X, test_df, logger)
        _log_frame_shapes(logger, X, test_df, "after text")
        _ensure_test_size(test_df, original_test_size, "text_features")

    # ----- Address -> city extraction -----
    addr_col = config.get("address_column")
    if addr_col:
        logger.info(f"üèô  Extracting city from address column '{addr_col}' ...")
        _log_frame_shapes(logger, X, test_df, "before address")

        city_index = config.get("address_city_index", -1)
        sep = config.get("address_split_sep", ",")
        X = process_address_extract_city(X, addr_col, city_index, sep, logger)
        if test_df is not None:
            test_df = process_address_extract_city(test_df, addr_col, city_index, sep, logger)

        _log_frame_shapes(logger, X, test_df, "after address")
        _ensure_test_size(test_df, original_test_size, "process_address_extract_city")

    # ----- Categorical detection -----
    logger.info("üîé Detecting categorical columns...")
    _log_frame_shapes(logger, X, test_df, "before categorical detection")

    # Detection runs on train and test together so cardinality reflects both frames.
    concat_df = pd.concat([X] if test_df is None else [X, test_df], ignore_index=False)
    cat_cols = detect_categorical_columns(
        concat_df,
        max_unique=config["basic_max_cat_unique"],
        force_categorical=config["basic_as_categorical"]
    )
    cat_cols = [c for c in cat_cols if c in X.columns]
    logger.info(f"Categorical columns: {cat_cols}")

    # ----- Categorical post-processing: rare -> 'other' -----
    logger.info("üß© Processing categorical features (merge rare into 'other')...")
    _log_frame_shapes(logger, X, test_df, "before categorical processing")

    X, test_df, cat_cols = process_categorical_features(X, test_df, cat_cols, config, logger)
    logger.info(f"Categorical columns after processing: {cat_cols}")

    _log_frame_shapes(logger, X, test_df, "after categorical processing")
    _ensure_test_size(test_df, original_test_size, "process_categorical_features")

    # ----- Post-feature service columns drop -----
    post_drop = config.get("post_feature_drop_columns", [])
    if post_drop:
        logger.info(f"üßπ Dropping post-feature service columns: {post_drop}")
        X = X.drop(columns=[c for c in post_drop if c in X.columns])
        if test_df is not None:
            test_df = test_df.drop(columns=[c for c in post_drop if c in test_df.columns])
        # keep the categorical list in sync with the dropped columns
        cat_cols = [c for c in cat_cols if c not in post_drop]

    # ----- NaN handling: categorical first, then numeric -----
    # Categorical columns get an explicit marker value and are cast to str.
    for col_name in cat_cols:
        if col_name in X.columns:
            X[col_name] = X[col_name].fillna("__MISSING__").astype(str)
        if test_df is not None and col_name in test_df.columns:
            test_df[col_name] = test_df[col_name].fillna("__MISSING__").astype(str)

    # Numeric columns get a sentinel value (overridable via config, same default as before).
    numeric_fill_value = config.get("numeric_fill_value", -99999999)
    numeric_cols = [c for c in X.columns if c not in cat_cols and pd.api.types.is_numeric_dtype(X[c])]
    if numeric_cols:
        X[numeric_cols] = X[numeric_cols].fillna(numeric_fill_value)
        if test_df is not None:
            # only touch the columns that actually exist in the test frame
            numeric_cols_test = [c for c in numeric_cols if c in test_df.columns]
            if numeric_cols_test:
                test_df[numeric_cols_test] = test_df[numeric_cols_test].fillna(numeric_fill_value)

    # ----- Final size / column checks -----
    logger.info(f"Final X shape: {X.shape}")
    if test_df is not None:
        logger.info(f"Final test_df shape: {test_df.shape}")

        # The number of test rows must be unchanged by the whole pipeline.
        if len(test_df) != original_test_size:
            raise ValueError(
                f"‚ùå CRITICAL: test_df size changed from {original_test_size} to {len(test_df)} during prepare_all_features! "
                f"This should not happen. Please check the code above."
            )

        # Report (but tolerate) column differences between train and test.
        X_cols = set(X.columns)
        test_cols = set(test_df.columns)
        missing_in_test = X_cols - test_cols
        extra_in_test = test_cols - X_cols
        if missing_in_test:
            logger.warning(f"Columns in X but not in test_df: {list(missing_in_test)[:10]}")
        if extra_in_test:
            logger.warning(f"Columns in test_df but not in X: {list(extra_in_test)[:10]}")

    return X, test_df, cat_cols

In [165]:
# Run the whole feature pipeline on the train/test frames.
# NOTE: X and test_df are rebound to the processed frames, so re-running this
# cell feeds already-processed data back into the pipeline.
X, test_df, cat_cols = prepare_all_features(X, test_df, CONFIG, logger)

2025-11-17 22:38:32,535 - INFO -  ORIGINAL SIZES: X=9912, test_df=8
INFO:TABULAR_BOOSTING:üîç ORIGINAL SIZES: X=9912, test_df=8
2025-11-17 22:38:32,537 - INFO -  Generating basic features...
INFO:TABULAR_BOOSTING:üîß Generating basic features...
2025-11-17 22:38:32,538 - INFO - X shape before basic: (9912, 13)
INFO:TABULAR_BOOSTING:X shape before basic: (9912, 13)
2025-11-17 22:38:32,540 - INFO - test_df shape before basic: (8, 13)
INFO:TABULAR_BOOSTING:test_df shape before basic: (8, 13)
2025-11-17 22:38:32,542 - INFO - X shape after basic: (9912, 13)
INFO:TABULAR_BOOSTING:X shape after basic: (9912, 13)
2025-11-17 22:38:32,543 - INFO - test_df shape after basic: (8, 13)
INFO:TABULAR_BOOSTING:test_df shape after basic: (8, 13)
2025-11-17 22:38:32,545 - INFO -  Generating aggregate features...
INFO:TABULAR_BOOSTING:üìä Generating aggregate features...
2025-11-17 22:38:32,546 - INFO - X shape before aggregate: (9912, 13)
INFO:TABULAR_BOOSTING:X shape before aggregate: (9912, 13)
2025

### Features filter

In [166]:
def filter_features_by_correlation(
    X: pd.DataFrame,
    y: pd.Series,
    config: Dict,
    logger: logging.Logger
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Select features by the strength of their association with the target.

    1) Pearson: applied to numeric columns only. A numeric column is kept when
       |corr(column, y)| >= ``config["corr_pearson_min_abs"]``; numeric columns
       whose correlation is undefined (NaN) are dropped. Non-numeric columns
       cannot be scored by Pearson and are passed through untouched.
    2) phik: applied when ``config["corr_use_phik"]`` is truthy, the threshold
       ``config["corr_phik_min_abs"]`` is positive and the phik package is
       available. A column is kept when its phik with the target is >= the
       threshold. If phik computation fails, the filter is skipped with a warning.

    The target is matched to the rows of X by position in both steps.

    Args:
        X: feature frame.
        y: target values, one per row of X (Series, array or list).
        config: configuration dict with the ``corr_*`` keys described above.
        logger: logger for progress messages.

    Returns:
        ``(X[selected], selected)`` - the filtered frame and the kept column
        names, in their original order.
    """
    selected = list(X.columns)
    # Positional view of the target; avoids index-misalignment NaNs in .corr()
    # and works for non-Series targets as well.
    y_values = np.asarray(y)

    # --- Pearson ---
    pearson_thr = float(config.get("corr_pearson_min_abs", 0.0) or 0.0)
    if pearson_thr > 0.0:
        logger.info(f"Applying Pearson filter with |corr| >= {pearson_thr}")
        y_series = pd.Series(y_values, index=X.index)
        corr_vals = {}

        for col in tqdm(selected, desc="Computing Pearson correlations"):
            if not pd.api.types.is_numeric_dtype(X[col]):
                # Pearson is not defined for this column; leave the decision to phik.
                continue
            try:
                corr_vals[col] = X[col].corr(y_series)
            except Exception:
                corr_vals[col] = np.nan

        corr_series = pd.Series(corr_vals, dtype=float)
        # Keep features that are at least as correlated as the threshold
        # (NaN compares False, so undefined correlations are dropped).
        passed = set(corr_series.index[corr_series.abs() >= pearson_thr])
        kept = [c for c in selected if c not in corr_vals or c in passed]
        logger.info(
            f"Pearson filter: kept {len(kept)} / {len(selected)} features "
            f"(min abs corr {corr_series.abs().min():.6f}, max {corr_series.abs().max():.6f})"
        )
        selected = kept

    # --- phik ---
    use_phik = bool(config.get("corr_use_phik", False))
    phik_thr = float(config.get("corr_phik_min_abs", 0.0) or 0.0)

    if use_phik and phik_thr > 0.0:
        if not HAS_PHIK:
            logger.warning("corr_use_phik=True, –Ω–æ –±–∏–±–ª–∏–æ—Ç–µ–∫–∞ phik –Ω–µ —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞ ‚Äî –ø—Ä–æ–ø—É—Å–∫–∞–µ–º phik-—Ñ–∏–ª—å—Ç—Ä.")
        else:
            logger.info(f"Applying phik filter with |phik| >= {phik_thr}")
            df_phik = X[selected].copy()
            tmp_target_col = "__target_for_phik__"
            df_phik[tmp_target_col] = y_values

            # numeric columns are declared as interval variables for phik
            interval_cols = [
                c for c in df_phik.columns
                if pd.api.types.is_numeric_dtype(df_phik[c])
            ]
            try:
                phik_matrix = df_phik.phik_matrix(interval_cols=interval_cols)
                target_corr = phik_matrix[tmp_target_col].drop(
                    labels=[tmp_target_col], errors="ignore"
                )
                keep_mask = target_corr.abs() >= phik_thr
                kept_phik = target_corr.index[keep_mask].tolist()
                logger.info(
                    f"phik filter: kept {len(kept_phik)} / {len(target_corr)} features "
                    f"(min abs phik {target_corr.abs().min():.6f}, "
                    f"max {target_corr.abs().max():.6f})"
                )
                # intersect with what already survived the Pearson step
                selected = [c for c in selected if c in kept_phik]
            except Exception as e:
                # best-effort: a phik failure must not abort the pipeline
                logger.warning(f"phik computation failed, skip phik filter. Error: {e}")

    logger.info(f"Total features after correlation filtering: {len(selected)}")
    return X[selected], selected

In [167]:
# NEW: correlation-based feature filtering.
# This cell rebinds the notebook-level X and test_df to their column-filtered versions.
if CONFIG.get("corr_enable", False):
    logger.info("üìâ Applying correlation-based feature filtering...")
    # CRITICAL CHECK: remember the test size before filtering
    test_size_before_corr = len(test_df) if test_df is not None else 0
    logger.info(f"üîç test_df size BEFORE correlation filter: {test_size_before_corr}")
    
    X, selected_cols = filter_features_by_correlation(X, y, CONFIG, logger)

    if test_df is not None:
        # Features chosen on train may be absent from test; report them.
        missing_in_test = [c for c in selected_cols if c not in test_df.columns]
        if missing_in_test:
            logger.warning(
                f"{len(missing_in_test)} features selected by correlation "
                f"missing in test: {missing_in_test[:10]}..."
            )
        # keep only the selected columns that exist in the test frame
        keep_for_test = [c for c in selected_cols if c in test_df.columns]
        test_df = test_df[keep_for_test]
        
        # CRITICAL CHECK: selecting columns must not change the number of rows
        logger.info(f"üîç test_df size AFTER correlation filter: {len(test_df)}")
        if len(test_df) != test_size_before_corr:
            raise ValueError(
                f"‚ùå CRITICAL: test_df size changed from {test_size_before_corr} to {len(test_df)} "
                f"during correlation filtering! This should not happen when filtering columns."
            )

In [168]:
 # NEW: sanitize feature names for all models (especially XGBoost).
# This cell rebinds X, test_df and cat_cols to use the sanitized column names.
logger.info("üßæ Sanitizing feature names...")
# CRITICAL CHECK: remember the test size before sanitizing
test_size_before_sanitize = len(test_df) if test_df is not None else 0
logger.info(f"üîç test_df size BEFORE sanitize: {test_size_before_sanitize}")

X, feature_name_mapping = sanitize_feature_names(X, logger)

if test_df is not None:
    # rename the test columns with the same mapping that was built on train
    test_df = test_df.rename(columns=feature_name_mapping)
    # if the test frame still has extra columns (rare in practice),
    # sanitize could be run once more, but the names would already be safe
    # test, _ = sanitize_feature_names(test, logger)
    
    # CRITICAL CHECK: renaming columns must not change the number of rows
    logger.info(f"üîç test_df size AFTER sanitize: {len(test_df)}")
    if len(test_df) != test_size_before_sanitize:
        raise ValueError(
            f"‚ùå CRITICAL: test_df size changed from {test_size_before_sanitize} to {len(test_df)} "
            f"during sanitize! This should not happen when renaming columns."
        )

# map the categorical column list onto the new (sanitized) names
cat_cols = [feature_name_mapping.get(c, c) for c in cat_cols]

2025-11-17 22:41:16,100 - INFO -  Sanitizing feature names...
INFO:TABULAR_BOOSTING:üßæ Sanitizing feature names...
2025-11-17 22:41:16,101 - INFO -  test_df size BEFORE sanitize: 8
INFO:TABULAR_BOOSTING:üîç test_df size BEFORE sanitize: 8
2025-11-17 22:41:16,145 - INFO -  test_df size AFTER sanitize: 8
INFO:TABULAR_BOOSTING:üîç test_df size AFTER sanitize: 8


### Cross-validation

In [169]:
def make_folds(
    y: pd.Series,
    config: Dict
):
    """
    Build the cross-validation splits for the target `y`.

    Uses StratifiedKFold when ``config["cv_stratified"]`` is truthy and the task is
    "binary" or "multiclass"; otherwise plain KFold.

    Args:
        y: target values; only its length (and, for stratification, its values) is used.
        config: dict with ``cv_n_splits``, ``cv_random_state``, ``cv_shuffle``,
            ``cv_stratified`` and ``task_type``.

    Returns:
        List of ``(train_indices, valid_indices)`` pairs, one per fold.
    """
    n_splits = config["cv_n_splits"]
    seed = config["cv_random_state"]
    shuffle = config["cv_shuffle"]
    stratified = config["cv_stratified"]
    task_type = config["task_type"]

    # scikit-learn raises ValueError when random_state is set while shuffle=False
    # (the seed would have no effect), so only forward it when shuffling.
    if not shuffle:
        seed = None

    use_stratified = stratified and task_type in ["binary", "multiclass"]
    splitter_cls = StratifiedKFold if use_stratified else KFold
    splitter = splitter_cls(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=seed
    )
    # The splitters only need the number of samples from the first argument.
    return list(splitter.split(np.zeros(len(y)), y))


def build_catboost_model(
    config: Dict
):
    """
    Create an unfitted CatBoost estimator from the configuration.

    A copy of ``config["catboost_params"]`` is extended with:
      * ``task_type`` - "GPU" when ``config["device"]`` is "cuda"/"gpu"
        (case-insensitive), otherwise "CPU";
      * RMSE loss/metric for regression, MultiClass loss/metric for multiclass;
        for any other task the loss settings are left as configured.

    Returns a CatBoostRegressor for regression and a CatBoostClassifier otherwise.
    """
    cb_params = config["catboost_params"].copy()

    # CatBoost calls the compute device "task_type" and accepts "GPU" or "CPU"
    gpu_requested = config.get("device", "CPU").upper() in ("CUDA", "GPU")
    cb_params["task_type"] = "GPU" if gpu_requested else "CPU"

    problem = config["task_type"]
    if problem == "regression":
        cb_params.update(loss_function="RMSE", eval_metric="RMSE")
        return CatBoostRegressor(**cb_params)

    if problem == "multiclass":
        cb_params.update(loss_function="MultiClass", eval_metric="MultiClass")
    return CatBoostClassifier(**cb_params)


def build_lgb_params(
    config: Dict,
    num_classes: Optional[int] = None
):
    """
    Assemble the LightGBM parameter dict for the configured task.

    Starts from a copy of ``config["lgb_params"]`` and sets ``device_type``
    ("gpu" for device "cuda"/"gpu", else "cpu") plus ``objective`` / ``metric``:
    regression -> "regression" / mae, binary -> "binary" / auc + binary_logloss,
    anything else is treated as multiclass and additionally gets ``num_class``.

    Raises:
        ValueError: for the multiclass path when `num_classes` is None or <= 1.
    """
    lgb_cfg = config["lgb_params"].copy()
    problem = config["task_type"]

    # ---------- Device (CPU / GPU): LightGBM's key is "device_type" ----------
    wants_gpu = config.get("device", "CPU").upper() in ("CUDA", "GPU")
    lgb_cfg["device_type"] = "gpu" if wants_gpu else "cpu"

    # ---------- Objective + metric for the task ----------
    if problem == "regression":
        # plain regression; LightGBM metrics go under the "metric" key
        lgb_cfg.update(objective="regression", metric=["mae"])
        return lgb_cfg

    if problem == "binary":
        lgb_cfg.update(objective="binary", metric=["auc", "binary_logloss"])
        return lgb_cfg

    # multiclass
    if num_classes is None or num_classes <= 1:
        raise ValueError("num_classes must be > 1 for multiclass")
    lgb_cfg.update(
        objective="multiclass",
        num_class=num_classes,
        metric=["multi_logloss", "multi_error"],
    )
    return lgb_cfg



def build_xgb_params(
    config: Dict,
    num_classes: Optional[int] = None,
):
    """
    Assemble the XGBoost parameter dict for the configured task.

    Starts from a copy of ``config["xgb_params"]`` and sets:
      * ``device`` - "cuda" when ``config["device"]`` is "cuda"/"gpu"
        (case-insensitive), otherwise "cpu";
      * ``tree_method`` - defaults to "hist" on both devices unless the config
        already provides one (the GPU is selected through ``device``; the
        legacy "gpu_hist" value is deprecated in XGBoost 2.x);
      * ``nthread`` - defaults to 0 unless configured;
      * ``objective`` / ``eval_metric`` (and ``num_class`` for multiclass).

    Any task type other than "regression" / "binary" is treated as multiclass.

    Raises:
        ValueError: for the multiclass path when `num_classes` is None.
    """
    params = config["xgb_params"].copy()
    task_type = config["task_type"]

    # ----- clean up possible junk -----
    # a non-string device (e.g. "device": 0) is not valid for XGBoost 2.x; drop it
    if "device" in params and not isinstance(params["device"], str):
        params.pop("device")

    # ----- CPU / GPU -----
    device_str = config.get("device", "CPU").upper()
    # XGBoost 2.x expects a string device ("cuda", "cuda:0", "cpu")
    params["device"] = "cuda" if device_str in ["CUDA", "GPU"] else "cpu"
    # With the `device` parameter the tree method is "hist" on CPU and GPU alike.
    params.setdefault("tree_method", "hist")

    # default thread count unless the config sets one
    params.setdefault("nthread", 0)

    # ----- task type -----
    if task_type == "regression":
        params["objective"] = "reg:squarederror"
        params["eval_metric"] = "mae"           # could be switched to "rmse"
    elif task_type == "binary":
        params["objective"] = "binary:logistic"
        params["eval_metric"] = "auc"
    else:
        # multiclass
        if num_classes is None:
            raise ValueError("num_classes must be provided for multiclass")
        params["objective"] = "multi:softprob"
        params["num_class"] = num_classes
        params["eval_metric"] = "mlogloss"

    return params




def _evaluate_predictions(
    y_true: pd.Series,
    preds: np.ndarray,
    task_type: str,
    name_prefix: str,
    logger: logging.Logger
) -> Dict[str, float]:
    metrics = {}
    if preds is None:
        return metrics

    if task_type == "regression":
        rmse = mean_squared_error(y_true, preds, squared=False)
        mae = mean_absolute_error(y_true, preds)
        metrics[f"{name_prefix}_RMSE"] = rmse
        metrics[f"{name_prefix}_MAE"] = rmse
        
        logger.info(f"{name_prefix} RMSE: {rmse:.6f} MAE: {mae:.6f}")
    elif task_type == "binary":
        auc = roc_auc_score(y_true, preds)
        logloss = log_loss(y_true, preds)

        ## –º–æ–∂–Ω–æ —Å–¥–µ–ª–∞—Ç—å –ø–æ–¥–±–æ—Ä –æ–ø—Ç–∏–º–∞–ª—å–Ω–æ–≥–æ –ø–æ—Ä–æ–≥–∞
        preds_label = (preds >= 0.5).astype(int)
        acc = accuracy_score(y_true, preds_label)
        cm = confusion_matrix(y_true, preds_label)
        metrics[f"{name_prefix}_AUC"] = auc
        metrics[f"{name_prefix}_LogLoss"] = logloss
        metrics[f"{name_prefix}_ACC"] = acc
        logger.info(f"{name_prefix} AUC: {auc:.6f}, LogLoss: {logloss:.6f}, ACC: {acc:.6f}")
        logger.info(f"{name_prefix} confusion matrix:\n{cm}")
    else:
        # multiclass
        # –µ—Å–ª–∏ –ø—Ä–∏—à–ª–∏ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏ (n_samples, n_classes) ‚Äî –±–µ—Ä—ë–º argmax
        if preds.ndim == 2:
            preds_label = np.argmax(preds, axis=1)
        else:
            preds_label = preds

        acc = accuracy_score(y_true, preds_label)
        cm = confusion_matrix(y_true, preds_label)
        metrics[f"{name_prefix}_ACC"] = acc
        logger.info(f"{name_prefix} ACC: {acc:.6f}")
        logger.info(f"{name_prefix} confusion matrix:\n{cm}")

    return metrics


def _blend_oof_predictions(oof_by_model, blend_weights):
    """Weighted average of the OOF arrays that are present and have a non-zero weight.

    Returns None when no model contributes or the total weight is not positive.
    """
    weighted_sum = None
    total_weight = 0.0
    for model_name, oof in oof_by_model.items():
        weight = blend_weights.get(model_name, 0)
        if oof is None or weight == 0:
            continue
        contribution = weight * oof
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
        total_weight += weight
    if weighted_sum is None or total_weight <= 0:
        return None
    return weighted_sum / total_weight


def train_and_evaluate(
    X: pd.DataFrame,
    y: pd.Series,
    cat_cols: List[str],
    config: Dict,
    logger: logging.Logger
) -> Dict:
    """
    Cross-validate the enabled boosting models and evaluate their OOF predictions.

    For every fold produced by `make_folds`, trains CatBoost / LightGBM / XGBoost
    (each gated by ``config["use_catboost"]`` / ``["use_lightgbm"]`` /
    ``["use_xgboost"]``) with early stopping on the validation part, stores the
    out-of-fold predictions, and finally evaluates every model plus their
    weighted blend (``config["blend_weights"]``).

    Args:
        X: feature frame; rows are addressed positionally by the fold indices.
        y: target Series aligned positionally with X.
        cat_cols: names of categorical columns (names missing from X are ignored).
        config: pipeline configuration. Optional keys used here:
            ``early_stopping_rounds`` (default 100) and ``num_boost_round``
            (default 1000, applies to LightGBM and XGBoost).
        logger: logger for progress and metrics.

    Returns:
        Dict with OOF arrays (``oof_catboost``, ``oof_lightgbm``, ``oof_xgboost``,
        ``oof_blend``; None for disabled models), ``metrics``, per-fold model lists
        (``models_*``) and, when available, fold-averaged gain importances
        (``feature_importance_lgb``, ``feature_importance_xgb``).
    """
    task_type = config["task_type"]
    is_multiclass = task_type == "multiclass"
    folds = make_folds(y, config)
    logger.info(f"Using {len(folds)} folds.")

    cat_features_idx = [X.columns.get_loc(c) for c in cat_cols if c in X.columns]

    # Class set is needed up-front to size the multiclass OOF matrices.
    classes_ = np.unique(y) if is_multiclass else None
    num_classes = len(classes_) if is_multiclass else None

    n = len(y)
    oof_shape = (n, num_classes) if is_multiclass else (n,)
    oof_preds_cat = np.zeros(oof_shape) if config["use_catboost"] else None
    oof_preds_lgb = np.zeros(oof_shape) if config["use_lightgbm"] else None
    oof_preds_xgb = np.zeros(oof_shape) if config["use_xgboost"] else None

    early_stopping_rounds = config.get("early_stopping_rounds", 100)
    num_boost_round = config.get("num_boost_round", 1000)

    # XGBoost consumes the integer codes of pandas categoricals. Converting the
    # train and validation parts separately would derive the codes from each
    # part's own value set, so the same label could get different codes.
    # Build one categorical dtype per object column from the full frame instead.
    xgb_cat_dtypes = {}
    if config["use_xgboost"]:
        xgb_cat_dtypes = {
            c: X[c].astype("category").dtype
            for c in X.columns
            if X[c].dtype == "object"
        }

    models_cat: List = []
    models_lgb: List = []
    models_xgb: List = []

    feature_importances_lgb = []
    feature_importances_xgb = []

    for fold_idx, (tr_idx, val_idx) in enumerate(tqdm(folds, desc="Cross-validation folds")):
        logger.info(f"========== Fold {fold_idx + 1}/{len(folds)} ==========")

        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        if config["use_catboost"]:
            logger.info("Training CatBoost...")
            train_pool = Pool(
                X_tr, y_tr,
                cat_features=cat_features_idx
            )
            val_pool = Pool(
                X_val, y_val,
                cat_features=cat_features_idx
            )
            model_cat = build_catboost_model(config)
            model_cat.fit(
                train_pool,
                eval_set=val_pool,
                early_stopping_rounds=early_stopping_rounds,
                use_best_model=True
            )

            if task_type == "regression":
                preds_val = model_cat.predict(val_pool)
            else:
                proba = model_cat.predict_proba(val_pool)
                # binary keeps the positive-class column, multiclass the full matrix
                preds_val = proba[:, 1] if task_type == "binary" else proba

            oof_preds_cat[val_idx] = preds_val
            models_cat.append(model_cat)

        if config["use_lightgbm"]:
            logger.info("Training LightGBM...")
            lgb_params = build_lgb_params(config, num_classes=num_classes)
            logger.debug(f"LightGBM params: {lgb_params}")

            X_tr_lgb = X_tr.copy()
            X_val_lgb = X_val.copy()
            for c in cat_cols:
                if c in X_tr_lgb.columns:
                    X_tr_lgb[c] = X_tr_lgb[c].astype("category")
                    X_val_lgb[c] = X_val_lgb[c].astype("category")

            lgb_train = lgb.Dataset(X_tr_lgb, label=y_tr)
            lgb_val = lgb.Dataset(X_val_lgb, label=y_val)
            model_lgb = lgb.train(
                lgb_params,
                lgb_train,
                num_boost_round=num_boost_round,
                valid_sets=[lgb_train, lgb_val],
                valid_names=["train", "valid"],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=early_stopping_rounds),
                    lgb.log_evaluation(100)
                ]
            )

            # Same call for every task type: values for regression,
            # probabilities for binary / multiclass.
            oof_preds_lgb[val_idx] = model_lgb.predict(
                X_val_lgb, num_iteration=model_lgb.best_iteration
            )
            models_lgb.append(model_lgb)
            feature_importances_lgb.append(model_lgb.feature_importance(importance_type="gain"))

        if config["use_xgboost"]:
            logger.info("Training XGBoost...")
            xgb_params = build_xgb_params(config, num_classes=num_classes)
            logger.debug(f"XGBoost params: {xgb_params}")

            X_tr_xgb = X_tr.copy()
            X_val_xgb = X_val.copy()
            for c, cat_dtype in xgb_cat_dtypes.items():
                # shared dtype -> identical category codes in both parts
                X_tr_xgb[c] = X_tr_xgb[c].astype(cat_dtype)
                X_val_xgb[c] = X_val_xgb[c].astype(cat_dtype)

            dtrain = xgb.DMatrix(X_tr_xgb, label=y_tr, enable_categorical=True)
            dvalid = xgb.DMatrix(X_val_xgb, label=y_val, enable_categorical=True)

            # the last entry of `evals` is the one used for early stopping
            evals = [(dtrain, "train"), (dvalid, "valid")]
            model_xgb = xgb.train(
                params=xgb_params,
                dtrain=dtrain,
                num_boost_round=num_boost_round,
                evals=evals,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=100
            )

            # predict with the trees up to (and including) the best iteration
            oof_preds_xgb[val_idx] = model_xgb.predict(
                dvalid,
                iteration_range=(0, model_xgb.best_iteration + 1)
            )
            models_xgb.append(model_xgb)

            # get_score omits features never used in a split; fill those with 0
            fi_xgb = model_xgb.get_score(importance_type="gain")
            fi_vec = np.array([fi_xgb.get(f, 0.0) for f in X.columns])
            feature_importances_xgb.append(fi_vec)

        gc.collect()

    oof_by_model = {
        "catboost": oof_preds_cat,
        "lightgbm": oof_preds_lgb,
        "xgboost": oof_preds_xgb,
    }

    metrics_all: Dict[str, float] = {}
    metrics_all.update(_evaluate_predictions(y, oof_preds_cat, task_type, "CatBoost", logger))
    metrics_all.update(_evaluate_predictions(y, oof_preds_lgb, task_type, "LightGBM", logger))
    metrics_all.update(_evaluate_predictions(y, oof_preds_xgb, task_type, "XGBoost", logger))

    logger.info("Evaluating blended OOF predictions...")
    blend_weights = config["blend_weights"]

    blend_preds = None
    if is_multiclass and all(arr is None for arr in oof_by_model.values()):
        logger.warning("No OOF predictions for multiclass blend.")
    else:
        blend_preds = _blend_oof_predictions(oof_by_model, blend_weights)
        if blend_preds is not None:
            metrics_all.update(_evaluate_predictions(y, blend_preds, task_type, "Blend", logger))
        else:
            logger.warning("No models contributions to blend; blend_den is zero everywhere.")
    logger.info("========== OOF metrics ==========")
    logger.info(metrics_all)

    results = {
        "oof_catboost": oof_preds_cat,
        "oof_lightgbm": oof_preds_lgb,
        "oof_xgboost": oof_preds_xgb,
        "oof_blend": blend_preds,
        "metrics": metrics_all,
        "models_catboost": models_cat,
        "models_lightgbm": models_lgb,
        "models_xgboost": models_xgb
    }

    if feature_importances_lgb:
        fi_lgb = np.mean(feature_importances_lgb, axis=0)
        results["feature_importance_lgb"] = fi_lgb

    if feature_importances_xgb:
        fi_xgb_mean = np.mean(feature_importances_xgb, axis=0)
        results["feature_importance_xgb"] = fi_xgb_mean

    return results


def _mean_fold_predictions(fold_models: List, predict_one) -> Optional[np.ndarray]:
    """Average ``predict_one(model)`` over the fold models; ``None`` for an empty list."""
    total = None
    for model in fold_models:
        # Accumulate in float64 regardless of the dtype the library returns.
        fold_pred = np.asarray(predict_one(model), dtype=float)
        total = fold_pred if total is None else total + fold_pred
    if total is None:
        return None
    return total / len(fold_models)


def _blend_weighted_predictions(
    preds_by_model: Dict[str, Optional[np.ndarray]],
    blend_weights: Dict,
    n_rows: int
) -> Optional[np.ndarray]:
    """Weighted average of the available predictions; works for (n,) and (n, C) arrays.

    Models whose predictions are ``None`` or whose weight is missing/zero are
    skipped. Returns ``None`` when nothing contributes, when the summed weight
    is not positive, or when there are no rows.
    """
    weighted_sum = None
    total_weight = 0.0
    for model_name, preds in preds_by_model.items():
        weight = blend_weights.get(model_name, 0)
        if preds is None or weight == 0:
            continue
        contribution = weight * preds
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
        total_weight += weight

    if weighted_sum is None or n_rows == 0 or not total_weight > 0:
        return None
    return weighted_sum / total_weight


def ensemble_predict(
    models: Dict[str, List],
    X: pd.DataFrame,
    cat_cols: List[str],
    config: Dict
) -> Dict[str, Optional[np.ndarray]]:
    """
    Predict with the boosting ensemble:
      - for every boosting library, average the predictions of its fold models
      - build the blended prediction as a weighted average of the libraries

    Args:
        models: fold models keyed by "catboost" / "lightgbm" / "xgboost";
            a missing key or an empty list disables that library.
        X: feature frame to predict on (it is not modified).
        cat_cols: categorical column names; names absent from ``X`` are ignored.
        config: must provide "task_type" ("regression", "binary" or
            "multiclass") and "blend_weights" (library name -> weight).

    Returns:
        {
          "catboost": np.ndarray or None,
          "lightgbm": np.ndarray or None,
          "xgboost": np.ndarray or None,
          "blend": np.ndarray or None,
        }
        Arrays have shape (n,) for regression/binary and (n, C) for multiclass.
        "blend" is None when no library with a non-zero weight produced
        predictions.
    """
    task_type = config["task_type"]
    blend_weights = config["blend_weights"]
    is_regression = task_type == "regression"

    n = len(X)
    preds_cat = None
    preds_lgb = None
    preds_xgb = None

    # CatBoost
    catboost_models = models.get("catboost")
    if catboost_models:
        cat_features_idx = [X.columns.get_loc(c) for c in cat_cols if c in X.columns]
        test_pool = Pool(X, cat_features=cat_features_idx)

        def _predict_catboost(model):
            if is_regression:
                return model.predict(test_pool)            # (n,)
            proba_full = model.predict_proba(test_pool)    # (n, 2) or (n, C)
            # Binary tasks keep only the positive-class probability.
            return proba_full[:, 1] if task_type == "binary" else proba_full

        preds_cat = _mean_fold_predictions(catboost_models, _predict_catboost)

    # LightGBM
    lightgbm_models = models.get("lightgbm")
    if lightgbm_models:
        # The categorical copy is only built when LightGBM models actually exist.
        X_lgb = X.copy()
        for c in cat_cols:
            if c in X_lgb.columns:
                X_lgb[c] = X_lgb[c].astype("category")

        def _predict_lightgbm(model):
            # Output is (n,) for regression/binary and (n, C) for multiclass.
            return model.predict(X_lgb, num_iteration=model.best_iteration)

        preds_lgb = _mean_fold_predictions(lightgbm_models, _predict_lightgbm)

    # XGBoost
    xgboost_models = models.get("xgboost")
    if xgboost_models:
        X_xgb = X.copy()
        for c in X_xgb.columns:
            if X_xgb[c].dtype == "object":
                X_xgb[c] = X_xgb[c].astype("category")
        dtest = xgb.DMatrix(X_xgb, enable_categorical=True)

        def _predict_xgboost(model):
            # A booster trained without early stopping may not expose
            # `best_iteration`; in that case predict with all trees.
            best_iteration = getattr(model, "best_iteration", None)
            if best_iteration is None:
                return model.predict(dtest)
            return model.predict(dtest, iteration_range=(0, best_iteration + 1))

        preds_xgb = _mean_fold_predictions(xgboost_models, _predict_xgboost)

    # Blending: one code path for both 1D and (n, C) predictions.
    blend_preds = _blend_weighted_predictions(
        {"catboost": preds_cat, "lightgbm": preds_lgb, "xgboost": preds_xgb},
        blend_weights,
        n,
    )

    return {
        "catboost": preds_cat,
        "lightgbm": preds_lgb,
        "xgboost": preds_xgb,
        "blend": blend_preds
    }


In [170]:
# Drop the ID column only right before training the boosters, so that the ID
# does not leak into the models but stays available for feature generation
# (images, texts, etc.) up to this point.
id_col = CONFIG["id_column"]

if id_col in X.columns:
    logger.info(f"Dropping ID column '{id_col}' before boosting")
    X = X.drop(columns=[id_col])

if test_df is not None and id_col in test_df.columns:
    test_df = test_df.drop(columns=[id_col])

# Just in case, also exclude the ID from the list of categorical columns.
if "cat_cols" in globals():
    cat_cols = [col for col in cat_cols if col != id_col]

results = train_and_evaluate(X, y, cat_cols, CONFIG, logger)

# ----- Save OOF, metrics, FI -----
logger.info("üíæ Saving artifacts...")
# Each OOF array is stored as "<key>.npy"; models that were not trained are skipped.
for oof_key in ("oof_catboost", "oof_lightgbm", "oof_xgboost", "oof_blend"):
    oof_array = results[oof_key]
    if oof_array is not None:
        np.save(os.path.join(output_dir, oof_key + ".npy"), oof_array)

metrics_path = os.path.join(output_dir, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(results["metrics"], f, indent=2, ensure_ascii=False)


2025-11-17 22:41:16,205 - INFO - Using 5 folds.
INFO:TABULAR_BOOSTING:Using 5 folds.
2025-11-17 22:41:16,222 - INFO - Training CatBoost...
INFO:TABULAR_BOOSTING:Training CatBoost...


0:	learn: 20.3509813	test: 20.8968497	best: 20.8968497 (0)	total: 49ms	remaining: 2m 51s
100:	learn: 17.1722585	test: 18.6767809	best: 18.6767809 (100)	total: 2.53s	remaining: 1m 25s
200:	learn: 16.1641711	test: 18.4464673	best: 18.4464673 (200)	total: 4.84s	remaining: 1m 19s
300:	learn: 15.3545042	test: 18.3727145	best: 18.3571554 (267)	total: 7.13s	remaining: 1m 15s
400:	learn: 14.6745026	test: 18.2805158	best: 18.2805158 (400)	total: 9.4s	remaining: 1m 12s
500:	learn: 14.0644080	test: 18.2433360	best: 18.2392514 (477)	total: 11.7s	remaining: 1m 9s
600:	learn: 13.5331456	test: 18.2240427	best: 18.2240427 (600)	total: 14s	remaining: 1m 7s
700:	learn: 13.0542636	test: 18.2249049	best: 18.2116408 (622)	total: 16.3s	remaining: 1m 4s
bestTest = 18.21164083
bestIteration = 622
Shrink model to first 623 iterations.


2025-11-17 22:41:34,042 - INFO - Training CatBoost...
INFO:TABULAR_BOOSTING:Training CatBoost...


0:	learn: 20.4647170	test: 20.4394947	best: 20.4394947 (0)	total: 26.7ms	remaining: 1m 33s
100:	learn: 17.1269989	test: 18.5709513	best: 18.5709513 (100)	total: 2.45s	remaining: 1m 22s
200:	learn: 16.1446972	test: 18.4335277	best: 18.4277459 (191)	total: 4.73s	remaining: 1m 17s
300:	learn: 15.3075787	test: 18.3989811	best: 18.3869003 (275)	total: 7.05s	remaining: 1m 14s
400:	learn: 14.6230574	test: 18.3657423	best: 18.3640716 (395)	total: 9.3s	remaining: 1m 11s
500:	learn: 14.0326319	test: 18.3367599	best: 18.3360801 (498)	total: 11.6s	remaining: 1m 9s
600:	learn: 13.5162672	test: 18.3127423	best: 18.3045903 (584)	total: 13.8s	remaining: 1m 6s
bestTest = 18.30459026
bestIteration = 584
Shrink model to first 585 iterations.


2025-11-17 22:41:50,845 - INFO - Training CatBoost...
INFO:TABULAR_BOOSTING:Training CatBoost...


0:	learn: 20.5785771	test: 20.0000473	best: 20.0000473 (0)	total: 26.8ms	remaining: 1m 33s
100:	learn: 17.3018962	test: 17.9568837	best: 17.9568837 (100)	total: 2.42s	remaining: 1m 21s
200:	learn: 16.2963588	test: 17.7994578	best: 17.7994578 (200)	total: 4.69s	remaining: 1m 17s
300:	learn: 15.4766334	test: 17.7283084	best: 17.7283084 (300)	total: 6.94s	remaining: 1m 13s
400:	learn: 14.7961053	test: 17.7114247	best: 17.7065973 (397)	total: 9.2s	remaining: 1m 11s
bestTest = 17.70659728
bestIteration = 397
Shrink model to first 398 iterations.


2025-11-17 22:42:03,262 - INFO - Training CatBoost...
INFO:TABULAR_BOOSTING:Training CatBoost...


0:	learn: 20.4767392	test: 20.4635707	best: 20.4635707 (0)	total: 25.1ms	remaining: 1m 27s
100:	learn: 17.2467077	test: 18.3303449	best: 18.3303449 (100)	total: 2.42s	remaining: 1m 21s
200:	learn: 16.2350071	test: 18.1432834	best: 18.1432834 (200)	total: 4.7s	remaining: 1m 17s
300:	learn: 15.4192109	test: 18.0421962	best: 18.0421962 (300)	total: 6.96s	remaining: 1m 14s
400:	learn: 14.7411514	test: 17.9901886	best: 17.9901886 (400)	total: 9.18s	remaining: 1m 10s
500:	learn: 14.1436286	test: 17.9704277	best: 17.9668801 (470)	total: 11.4s	remaining: 1m 8s
600:	learn: 13.5918707	test: 17.9593210	best: 17.9522655 (562)	total: 13.7s	remaining: 1m 5s
700:	learn: 13.1169694	test: 17.9453434	best: 17.9437891 (644)	total: 15.9s	remaining: 1m 3s
800:	learn: 12.6550173	test: 17.9422469	best: 17.9376478 (740)	total: 18.2s	remaining: 1m 1s
bestTest = 17.93764778
bestIteration = 740
Shrink model to first 741 iterations.


2025-11-17 22:42:23,414 - INFO - Training CatBoost...
INFO:TABULAR_BOOSTING:Training CatBoost...


0:	learn: 20.4288712	test: 20.5685614	best: 20.5685614 (0)	total: 26.8ms	remaining: 1m 33s
100:	learn: 17.1953137	test: 18.4641369	best: 18.4641369 (100)	total: 2.38s	remaining: 1m 20s
200:	learn: 16.1935210	test: 18.3105309	best: 18.3105309 (200)	total: 4.63s	remaining: 1m 16s
300:	learn: 15.4001359	test: 18.2430975	best: 18.2429264 (290)	total: 6.89s	remaining: 1m 13s
400:	learn: 14.7449529	test: 18.2265263	best: 18.2175527 (381)	total: 9.1s	remaining: 1m 10s
500:	learn: 14.1073176	test: 18.1987846	best: 18.1987846 (500)	total: 11.3s	remaining: 1m 7s
600:	learn: 13.5640121	test: 18.1915819	best: 18.1825104 (520)	total: 13.6s	remaining: 1m 5s
700:	learn: 13.0778895	test: 18.1746853	best: 18.1729450 (696)	total: 15.8s	remaining: 1m 2s
800:	learn: 12.6253132	test: 18.1784890	best: 18.1682038 (730)	total: 18s	remaining: 1m
bestTest = 18.16820375
bestIteration = 730
Shrink model to first 731 iterations.


Cross-validation folds: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:26<00:00, 17.38s/it]
2025-11-17 22:42:43,088 - INFO - CatBoost RMSE: 18.067071 MAE: 13.183403
INFO:TABULAR_BOOSTING:CatBoost RMSE: 18.067071 MAE: 13.183403
2025-11-17 22:42:43,089 - INFO - Evaluating blended OOF predictions...
INFO:TABULAR_BOOSTING:Evaluating blended OOF predictions...
2025-11-17 22:42:43,092 - INFO - Blend RMSE: 18.067071 MAE: 13.183403
INFO:TABULAR_BOOSTING:Blend RMSE: 18.067071 MAE: 13.183403
2025-11-17 22:42:43,095 - INFO - {'CatBoost_RMSE': 18.06707139565178, 'CatBoost_MAE': 18.06707139565178, 'Blend_RMSE': 18.06707139565178, 'Blend_MAE': 18.06707139565178}
INFO:TABULAR_BOOSTING:{'CatBoost_RMSE': 18.06707139565178, 'CatBoost_MAE': 18.06707139565178, 'Blend_RMSE': 18.06707139565178, 'Blend_MAE': 18.06707139565178}
2025-11-17 22:42:43,101 - INFO -  Saving artifacts...
INFO:TABULAR_BOOSTING:üíæ Saving artifacts...


### Importances

In [171]:
def _export_feature_importance(importance, csv_name, header):
    """Rank features by gain, write the table to `output_dir` and log its top rows."""
    table = pd.DataFrame({
        "feature": X.columns,
        "importance_gain": importance
    })
    table = table.sort_values("importance_gain", ascending=False)
    table.to_csv(os.path.join(output_dir, csv_name), index=False)
    logger.info(header)
    logger.info(table.head(CONFIG["top_features_to_show"]))
    return table


# LightGBM importances (present only when LightGBM models were trained).
if "feature_importance_lgb" in results:
    fi = results["feature_importance_lgb"]
    fi_df = _export_feature_importance(
        fi, "feature_importance_lgb.csv", "Top features (LGB, gain):"
    )

# XGBoost importances (present only when XGBoost models were trained).
if "feature_importance_xgb" in results:
    fi_xgb = results["feature_importance_xgb"]
    fi_xgb_df = _export_feature_importance(
        fi_xgb, "feature_importance_xgb.csv", "Top features (XGB, gain):"
    )

### Submission

In [172]:
# ----- Test predictions (ensemble) -----
if test_df is not None:
    # Align the categorical values of test with train: every value not seen in
    # train is mapped to the train's "other" category when it exists, otherwise
    # to the train mode.
    if cat_cols:
        logger.info("üß© Aligning categorical values in test with train for XGBoost...")
        for c in cat_cols:
            if c not in test_df.columns or c not in X.columns:
                continue

            train_vals = X[c].dropna()
            if train_vals.empty:
                continue

            # `known` holds no missing values, so missing values in test are
            # flagged as unseen as well.
            known = set(train_vals.unique())
            mask_new = ~test_df[c].isin(known)
            if not mask_new.any():
                continue

            # Look for an "other" category (case-insensitive) and reuse its
            # exact spelling from train.
            other_candidates = [v for v in known if str(v).lower() == "other"]
            if other_candidates:
                replacement = other_candidates[0]
            else:
                # No "other" category: fall back to the train mode.
                replacement = train_vals.mode(dropna=True).iloc[0]

            logger.info(
                f"üîÑ Column '{c}': mapping {mask_new.sum()} unseen categories ‚Üí '{replacement}'"
            )
            test_df.loc[mask_new, c] = replacement

    logger.info("üì§ Making ensemble predictions for test...")
    # Critical check: make sure test_df has the expected size.
    logger.info(f"test_df shape before ensemble_predict: {test_df.shape}")
    logger.info(f"test_ids length: {len(test_ids) if test_ids is not None else 'None'}")

    # Critical check: a test_df whose size differs from test_ids is an error.
    if test_ids is not None:
        expected_test_size = len(test_ids)
        actual_test_size = len(test_df)
        if expected_test_size != actual_test_size:
            error_msg = (
                f"‚ùå CRITICAL ERROR: test_df size ({actual_test_size}) != test_ids size ({expected_test_size})! "
                f"This indicates a problem in data processing. "
                f"test_df should have {expected_test_size} rows, but has {actual_test_size}. "
                f"Please check your data processing pipeline, especially generate_aggregate_features."
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

    fold_models = {
        "catboost": results["models_catboost"],
        "lightgbm": results["models_lightgbm"],
        "xgboost": results["models_xgboost"],
    }
    pred_dict = ensemble_predict(
        models=fold_models,
        X=test_df,
        cat_cols=cat_cols,
        config=CONFIG
    )

    # Without stored IDs, fall back to a positional 0..n-1 identifier.
    if test_ids is not None:
        test_ids_series = test_ids
    else:
        test_ids_series = pd.Series(np.arange(len(test_df)), name=id_col)

    # Size check before building the submission files.
    logger.info(f"Test DataFrame shape: {test_df.shape}")
    logger.info(f"Test IDs length: {len(test_ids_series)}")

    for name, preds in pred_dict.items():
        if preds is None:
            continue

        is_array = isinstance(preds, np.ndarray)

        # Check the number of predicted rows.
        if is_array:
            pred_len = preds.shape[0]
            logger.info(f"Predictions '{name}' shape: {preds.shape}, length: {pred_len}")

            if pred_len != len(test_df):
                raise ValueError(
                    f"Predictions length {pred_len} != test_df length {len(test_df)} for {name}. "
                    f"Test_df shape: {test_df.shape}, predictions shape: {preds.shape}"
                )

        if is_array and preds.ndim == 2:
            # Class-probability matrix: store the predicted class plus one
            # probability column per class.
            n_classes = preds.shape[1]
            data = {
                id_col: test_ids_series.values,  # .values guarantees a plain array of the right size
                "prediction": np.argmax(preds, axis=1),  # predicted class
            }
            data.update({f"proba_class_{k}": preds[:, k] for k in range(n_classes)})
            out_df = pd.DataFrame(data)
        else:
            # Make sure the sizes match.
            preds_flat = preds.flatten() if is_array else preds
            if len(preds_flat) != len(test_ids_series):
                raise ValueError(
                    f"Predictions length {len(preds_flat)} != test_ids length {len(test_ids_series)} for {name}"
                )
            out_df = pd.DataFrame({
                id_col: test_ids_series.values,  # use .values
                "prediction": preds_flat
            })

        out_df.to_csv(os.path.join(output_dir, f"pred_{name}.csv"), index=False)
        logger.info(f"Saved predictions: pred_{name}.csv")

elapsed = time.time() - start_time
logger.info(f"‚è± Finished in {elapsed:.1f} seconds")

2025-11-17 22:42:43,135 - INFO -  Aligning categorical values in test with train for XGBoost...
INFO:TABULAR_BOOSTING:üß© Aligning categorical values in test with train for XGBoost...
2025-11-17 22:42:43,165 - INFO -  Making ensemble predictions for test...
INFO:TABULAR_BOOSTING:üì§ Making ensemble predictions for test...
2025-11-17 22:42:43,167 - INFO - test_df shape before ensemble_predict: (8, 524)
INFO:TABULAR_BOOSTING:test_df shape before ensemble_predict: (8, 524)
2025-11-17 22:42:43,169 - INFO - test_ids length: 8
INFO:TABULAR_BOOSTING:test_ids length: 8
2025-11-17 22:42:43,207 - INFO - Test DataFrame shape: (8, 524)
INFO:TABULAR_BOOSTING:Test DataFrame shape: (8, 524)
2025-11-17 22:42:43,209 - INFO - Test IDs length: 8
INFO:TABULAR_BOOSTING:Test IDs length: 8
2025-11-17 22:42:43,211 - INFO - Predictions 'catboost' shape: (8,), length: 8
INFO:TABULAR_BOOSTING:Predictions 'catboost' shape: (8,), length: 8
2025-11-17 22:42:43,214 - INFO - Saved predictions: pred_catboost.csv
INF

In [173]:
# Display the per-model and blended test predictions returned by ensemble_predict.
pred_dict

{'catboost': array([39.95485419, 43.05118036, 42.0404833 , 42.20302056, 41.35563384,
        39.69841935, 40.951625  , 42.01597805]),
 'lightgbm': None,
 'xgboost': None,
 'blend': array([39.95485419, 43.05118036, 42.0404833 , 42.20302056, 41.35563384,
        39.69841935, 40.951625  , 42.01597805])}

In [174]:
# ----- FINAL SUBMISSION -----
logger.info("üìù Preparing final submission file...")

# Pick the final ensemble prediction: the weighted blend is preferred, and a
# single model is used only when no blend could be built.
final_preds = pred_dict.get("blend")

if final_preds is None:
    logger.warning("Blend predictions are None. Falling back to first available model.")
    for name in ["catboost", "lightgbm", "xgboost"]:
        if pred_dict.get(name) is not None:
            final_preds = pred_dict[name]
            logger.info(f"Using '{name}' predictions for submission.")
            break

if final_preds is None:
    raise ValueError("No predictions available to form submission.")

# Create the DataFrame with the ids.
submission = pd.DataFrame({id_col: test_ids_series})

# Now add the target column.
task_type = CONFIG["task_type"]

if task_type == "multiclass":
    # final_preds is an (n, C) matrix; submit the most probable class.
    submission[CONFIG["target_column"]] = np.argmax(final_preds, axis=1)

elif task_type == "binary":
    # final_preds is the probability of class 1; it is converted to a label
    # with a 0.5 threshold.
    submission[CONFIG["target_column"]] = (final_preds >= 0.5).astype(int)

else:  # regression
    submission[CONFIG["target_column"]] = final_preds

# Save the file.
sub_path = os.path.join(output_dir, "submission.csv")
if "index" in submission.columns:
    # Reassign instead of mutating in place so re-running the cell stays predictable.
    submission = submission.drop(columns="index")
submission.to_csv(sub_path, index=False)

logger.info(f"‚úÖ Submission file saved: {sub_path}")
print(submission.head())

2025-11-17 22:42:43,242 - INFO -  Preparing final submission file...
INFO:TABULAR_BOOSTING:üìù Preparing final submission file...
2025-11-17 22:42:43,246 - INFO -  Submission file saved: tabular_boosting_output/submission.csv
INFO:TABULAR_BOOSTING:‚úÖ Submission file saved: tabular_boosting_output/submission.csv


                                 Id  Pawpularity
0  4128bae22183829d2b5fea10effdb0c3    39.954854
1  43a2262d7738e3d420d453815151079e    43.051180
2  4e429cead1848a298432a0acad014c9d    42.040483
3  80bc3ccafcc51b66303c2c263aa38486    42.203021
4  8f49844c382931444e68dffbe20228f4    41.355634


In [175]:
# Show up to the first 35 rows of the submission table for a visual check.
submission.head(35)

Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,39.954854
1,43a2262d7738e3d420d453815151079e,43.05118
2,4e429cead1848a298432a0acad014c9d,42.040483
3,80bc3ccafcc51b66303c2c263aa38486,42.203021
4,8f49844c382931444e68dffbe20228f4,41.355634
5,b03f7041962238a7c9d6537e22f9b017,39.698419
6,c978013571258ed6d4637f6e8cc9d6a3,40.951625
7,e0de453c1bffc20c22b072b34b54e50f,42.015978


In [176]:
# ============================
# Optuna: CPU-only hyperparameter tuning
# ============================
import optuna

def _metric_for_optuna(y_true, y_pred, task_type: str):
    """
    –í–Ω—É—Ç—Ä–µ–Ω–Ω—è—è –º–µ—Ç—Ä–∏–∫–∞ –¥–ª—è Optuna: —á–µ–º –ë–û–õ–¨–®–ï, —Ç–µ–º –ª—É—á—à–µ.
    - regression:  -RMSE
    - binary/multiclass: accuracy
    """
    import numpy as np
    from sklearn.metrics import mean_squared_error, accuracy_score

    if task_type == "regression":
        rmse = mean_squared_error(y_true, y_pred, squared=False)
        return -rmse
    else:
        if isinstance(y_pred, np.ndarray) and y_pred.ndim == 2:
            y_hat = np.argmax(y_pred, axis=1)
        else:
            y_hat = y_pred
        acc = accuracy_score(y_true, y_hat)
        return acc


def tune_all_boostings_optuna(
    X,
    y,
    cat_cols,
    CONFIG,
    logger,
    timeout: int = 3600,
    tune_cat: bool = True,
    tune_lgb: bool = True,
    tune_xgb: bool = True,
):
    """
    –ü–æ–¥–±–æ—Ä –≥–∏–ø–µ—Ä–æ–≤ CatBoost, LightGBM –∏ XGBoost –Ω–∞ CPU —Å –ø–æ–º–æ—â—å—é Optuna.

    tune_cat / tune_lgb / tune_xgb ‚Äî —Ñ–ª–∞–≥–∏, –∫–∞–∫–∏–µ –º–æ–¥–µ–ª–∏ —Ç—é–Ω–∏—Ç—å.
    –ù–∞–ø—Ä–∏–º–µ—Ä, —á—Ç–æ–±—ã —Ç—é–Ω–∏—Ç—å —Ç–æ–ª—å–∫–æ CatBoost:
        tune_cat=True, tune_lgb=False, tune_xgb=False
    """
    import numpy as np
    from sklearn.model_selection import StratifiedKFold, KFold
    from catboost import CatBoostClassifier, CatBoostRegressor, Pool
    import lightgbm as lgb
    import xgboost as xgb

    task_type = CONFIG.get("task_type", "regression")

    # ====================
    # CV-—Å–ø–ª–∏—Ç—Ç–µ—Ä
    # ====================
    n_splits = CONFIG.get("cv_n_splits", 3)
    cv_shuffle = CONFIG.get("cv_shuffle", True)
    cv_random_state = CONFIG.get("cv_random_state", 42)
    cv_stratified = CONFIG.get("cv_stratified", True)

    if task_type == "regression" or not cv_stratified:
        kf = KFold(
            n_splits=n_splits,
            shuffle=cv_shuffle,
            random_state=cv_random_state,
        )
    else:
        kf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=cv_shuffle,
            random_state=cv_random_state,
        )

    # ======================================================
    # 1) CatBoost (CPU, —Å –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–æ–π –Ω–∞—Å—Ç—Ä–æ–π–∫–æ–π loss/metric)
    # ======================================================
    if tune_cat and CONFIG.get("use_catboost", False):
        logger.info("üîç Optuna tuning for CatBoost (CPU)...")

        def objective_cat(trial):
            base_params = CONFIG["catboost_params"].copy()

            # –ø—Ä–∏–Ω—É–¥–∏—Ç–µ–ª—å–Ω–æ –Ω–∞ CPU (Optuna –Ω–µ —Ç—Ä–æ–≥–∞–µ—Ç —Ç–≤–æ—ë GPU-–æ–±—É—á–µ–Ω–∏–µ)
            base_params["task_type"] = "CPU"
            base_params["thread_count"] = 0
            base_params["verbose"] = False

            # –≥–∏–ø–µ—Ä—ã –¥–ª—è —Ç—é–Ω–∏–Ω–≥–∞
            params = {
                **base_params,
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "depth": trial.suggest_int("depth", 4, 10),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10.0, log=True),
                "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
                "border_count": trial.suggest_int("border_count", 32, 255),
            }

            # ----- loss_function / eval_metric –ø–æ–¥ task_type -----
            if task_type == "regression":
                params["loss_function"] = "MAE"
                params["eval_metric"] = "MAE"
                ModelCls = CatBoostRegressor
            elif task_type == "binary":
                params["loss_function"] = "Logloss"
                params["eval_metric"] = "AUC"
                ModelCls = CatBoostClassifier
            elif task_type == "multiclass":
                params["loss_function"] = "MultiClass"
                params["eval_metric"] = "MultiClass"
                ModelCls = CatBoostClassifier
            else:
                raise ValueError(f"Unsupported task_type '{task_type}' for CatBoost")

            oof_preds = None

            for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
                X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
                y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

                train_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
                valid_pool = Pool(X_va, y_va, cat_features=cat_cols)

                model = ModelCls(**params)

                model.fit(
                    train_pool,
                    eval_set=valid_pool,
                    early_stopping_rounds=CONFIG.get("early_stopping_rounds", 100),
                    verbose=False,
                )

                if task_type == "regression":
                    preds = model.predict(X_va)
                else:
                    proba = model.predict_proba(X_va)
                    if task_type == "binary":
                        preds = proba[:, 1]
                    else:
                        preds = proba

                if oof_preds is None:
                    if isinstance(preds, np.ndarray) and preds.ndim == 2:
                        oof_preds = np.zeros((len(y), preds.shape[1]), dtype=float)
                    else:
                        oof_preds = np.zeros(len(y), dtype=float)
                oof_preds[va_idx] = preds

            score = _metric_for_optuna(y.values, oof_preds, task_type)
            return score

        study_cat = optuna.create_study(direction="maximize")
        study_cat.optimize(objective_cat, timeout=timeout)
        logger.info(f"CatBoost best value: {study_cat.best_value}")
        logger.info(f"CatBoost best params: {study_cat.best_params}")

        CONFIG["catboost_params"].update({
            "learning_rate": study_cat.best_params["learning_rate"],
            "depth": study_cat.best_params["depth"],
            "l2_leaf_reg": study_cat.best_params["l2_leaf_reg"],
            "bagging_temperature": study_cat.best_params["bagging_temperature"],
            "border_count": study_cat.best_params["border_count"],
        })

    # ======================================================
    # 2) LightGBM (CPU, objective/metric –ø–æ task_type)
    # ======================================================
    if tune_lgb and CONFIG.get("use_lightgbm", False):
        logger.info("üîç Optuna tuning for LightGBM (CPU)...")

        def objective_lgb(trial):
            base_params = CONFIG["lgb_params"].copy()

            base_params["device_type"] = "cpu"
            base_params["num_threads"] = 0

            if task_type == "regression":
                base_params["objective"] = "regression"
                base_params["metric"] = ["mae"]
            elif task_type == "binary":
                base_params["objective"] = "binary"
                base_params["metric"] = ["auc", "binary_logloss"]
            elif task_type == "multiclass":
                num_classes = len(np.unique(y))
                base_params["objective"] = "multiclass"
                base_params["num_class"] = num_classes
                base_params["metric"] = ["multi_logloss", "multi_error"]
            else:
                raise ValueError(f"Unsupported task_type '{task_type}' for LightGBM")

            params = {
                **base_params,
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 31, 255),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
                "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
                "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
            }

            oof_preds = None
            for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
                X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
                y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

                train_set = lgb.Dataset(X_tr, label=y_tr)
                valid_set = lgb.Dataset(X_va, label=y_va)

                model = lgb.train(
                    params,
                    train_set,
                    num_boost_round=5000,
                    valid_sets=[valid_set],
                    valid_names=["valid"],
                    early_stopping_rounds=CONFIG.get("early_stopping_rounds", 100),
                    verbose_eval=False,
                )

                preds = model.predict(X_va, num_iteration=model.best_iteration)

                if oof_preds is None:
                    if isinstance(preds, np.ndarray) and preds.ndim == 2:
                        oof_preds = np.zeros((len(y), preds.shape[1]), dtype=float)
                    else:
                        oof_preds = np.zeros(len(y), dtype=float)
                oof_preds[va_idx] = preds

            score = _metric_for_optuna(y.values, oof_preds, task_type)
            return score

        study_lgb = optuna.create_study(direction="maximize")
        study_lgb.optimize(objective_lgb, timeout=timeout)
        logger.info(f"LightGBM best value: {study_lgb.best_value}")
        logger.info(f"LightGBM best params: {study_lgb.best_params}")

        CONFIG["lgb_params"].update({
            "learning_rate": study_lgb.best_params["learning_rate"],
            "num_leaves": study_lgb.best_params["num_leaves"],
            "feature_fraction": study_lgb.best_params["feature_fraction"],
            "bagging_fraction": study_lgb.best_params["bagging_fraction"],
            "bagging_freq": study_lgb.best_params["bagging_freq"],
            "min_data_in_leaf": study_lgb.best_params["min_data_in_leaf"],
            "lambda_l1": study_lgb.best_params["lambda_l1"],
            "lambda_l2": study_lgb.best_params["lambda_l2"],
        })

    # ======================================================
    # 3) XGBoost (CPU, objective/metric chosen by task_type)
    # ======================================================
    if tune_xgb and CONFIG.get("use_xgboost", False):
        logger.info("üîç Optuna tuning for XGBoost (CPU)...")

        def objective_xgb(trial):
            """Optuna objective: out-of-fold score for one XGBoost hyper-parameter draw.

            Builds the parameter set from ``CONFIG["xgb_params"]`` plus values
            sampled from ``trial``, trains one booster per ``kf`` fold with early
            stopping on the held-out part, stitches the held-out predictions into
            a single out-of-fold array and scores it with ``_metric_for_optuna``.

            Args:
                trial: Optuna trial used to sample the hyper-parameters.

            Returns:
                Whatever ``_metric_for_optuna(y.values, oof, task_type)`` returns.
                NOTE(review): the enclosing study uses direction="maximize", so
                the helper is presumably "higher is better" — confirm.

            Raises:
                ValueError: if ``task_type`` is not "regression", "binary" or
                    "multiclass".
            """
            # Shallow copy: the per-trial overrides below must not leak into CONFIG.
            xgb_base = CONFIG["xgb_params"].copy()

            # CPU is forced; tree_method / nthread are only filled in when absent.
            xgb_base["device"] = "cpu"
            xgb_base.setdefault("tree_method", "hist")
            xgb_base.setdefault("nthread", 0)

            # task_type -> (objective, eval_metric)
            task_setup = {
                "regression": ("reg:squarederror", "mae"),
                "binary": ("binary:logistic", "auc"),
                "multiclass": ("multi:softprob", "mlogloss"),
            }
            if task_type not in task_setup:
                raise ValueError(f"Unsupported task_type '{task_type}' for XGBoost")
            xgb_objective, xgb_metric = task_setup[task_type]

            xgb_base["objective"] = xgb_objective
            if task_type == "multiclass":
                # The class count is taken from the distinct labels in y.
                xgb_base["num_class"] = len(np.unique(y))
            xgb_base["eval_metric"] = xgb_metric

            # Sampled values take precedence over same-named keys in the base config.
            sampled = {
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 10.0, log=True),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "lambda": trial.suggest_float("lambda", 0.0, 10.0),
                "alpha": trial.suggest_float("alpha", 0.0, 10.0),
            }
            trial_params = {**xgb_base, **sampled}

            patience = CONFIG.get("early_stopping_rounds", 100)

            oof = None
            for fit_rows, holdout_rows in kf.split(X, y):
                fit_matrix = xgb.DMatrix(X.iloc[fit_rows], label=y.iloc[fit_rows])
                holdout_matrix = xgb.DMatrix(X.iloc[holdout_rows], label=y.iloc[holdout_rows])

                booster = xgb.train(
                    params=trial_params,
                    dtrain=fit_matrix,
                    num_boost_round=4000,
                    evals=[(holdout_matrix, "valid")],
                    early_stopping_rounds=patience,
                    verbose_eval=False,
                )

                # Predict using only the rounds up to (and including) the best one.
                fold_pred = booster.predict(
                    holdout_matrix,
                    iteration_range=(0, booster.best_iteration + 1),
                )

                if oof is None:
                    # Allocate lazily: the first fold's prediction tells us whether
                    # the output is one column per row or a 2-D per-class matrix.
                    is_matrix = isinstance(fold_pred, np.ndarray) and fold_pred.ndim == 2
                    oof_shape = (len(y), fold_pred.shape[1]) if is_matrix else len(y)
                    oof = np.zeros(oof_shape, dtype=float)
                oof[holdout_rows] = fold_pred

            return _metric_for_optuna(y.values, oof, task_type)

        study_xgb = optuna.create_study(direction="maximize")
        study_xgb.optimize(objective_xgb, timeout=timeout)
        logger.info(f"XGBoost best value: {study_xgb.best_value}")
        logger.info(f"XGBoost best params: {study_xgb.best_params}")

        CONFIG["xgb_params"].update({
            "learning_rate": study_xgb.best_params["learning_rate"],
            "max_depth": study_xgb.best_params["max_depth"],
            "min_child_weight": study_xgb.best_params["min_child_weight"],
            "subsample": study_xgb.best_params["subsample"],
            "colsample_bytree": study_xgb.best_params["colsample_bytree"],
            "lambda": study_xgb.best_params["lambda"],
            "alpha": study_xgb.best_params["alpha"],
        })

    logger.info("‚úÖ Optuna tuning finished. CONFIG –æ–±–Ω–æ–≤–ª—ë–Ω –ª—É—á—à–∏–º–∏ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏.")
    return CONFIG

# Example call that tunes ONLY CatBoost:
#CONFIG = tune_all_boostings_optuna(X, y, cat_cols, CONFIG, logger, timeout=3600,tune_cat=True, tune_lgb=False, tune_xgb=False)
