In [1]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
! unzip /content/train.tsv.zip
! unzip /content/test.tsv.zip
! unzip /content/reviews.txv.zip

Archive:  /content/train.tsv.zip
  inflating: train.tsv               
Archive:  /content/test.tsv.zip
  inflating: test.tsv                
Archive:  /content/reviews.txv.zip
  inflating: reviews.tsv             


In [4]:
import os
os.environ["WANDB_DISABLED"] = "true"

import warnings
warnings.filterwarnings("ignore")

import gc
import ast
import math
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import MiniBatchKMeans
from sklearn.neighbors import BallTree
from sklearn.isotonic import IsotonicRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import lightgbm as lgb

# =============== SEED ===============
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# =============== DATA LOADING ===============
print("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")
train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")
reviews = pd.read_csv("reviews.tsv", sep="\t")

print(f"–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä train: {len(train)}")
print(f"–†–∞–∑–º–µ—Ä test: {len(test)}")
print(f"–†–∞–∑–º–µ—Ä reviews: {len(reviews)}")

# Drop rows with target=0 (uninformative)
print("üßπ –£–¥–∞–ª—è–µ–º –æ–±—ä–µ–∫—Ç—ã —Å target=0 –∏–∑ train...")
removed = int((train["target"] == 0).sum())
train = train[train["target"] != 0].reset_index(drop=True)
print(f"–£–¥–∞–ª–µ–Ω–æ: {removed} | –ù–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä train: {len(train)}")

# =============== REVIEW AGGREGATION ===============
print("üìä –ê–≥–≥—Ä–µ–≥–∞—Ü–∏—è –æ—Ç–∑—ã–≤–æ–≤ –ø–æ id...")
reviews_agg = reviews.groupby('id', as_index=False).agg(
    all_reviews_text=('text', lambda x: ' '.join(str(t) for t in x if pd.notna(t)))
)
review_counts = reviews.groupby('id', as_index=False).size().rename(columns={'size': 'review_count'})
review_counts['review_count'] = np.log1p(review_counts['review_count'])

for df in (train, test):
    df.drop(columns=[c for c in ["all_reviews_text", "review_count"] if c in df.columns], inplace=True, errors='ignore')

train = train.merge(reviews_agg, on='id', how='left')
train = train.merge(review_counts, on='id', how='left')
test = test.merge(reviews_agg, on='id', how='left')
test = test.merge(review_counts, on='id', how='left')

for df in [train, test]:
    df['all_reviews_text'] = df['all_reviews_text'].fillna('')
    df['review_count'] = df['review_count'].fillna(0.0).astype(np.float32)

# =============== COORDINATES AND BASIC GEOGRAPHY ===============
print("üìç –ö–æ–æ—Ä–¥–∏–Ω–∞—Ç—ã –∏ –±–∞–∑–æ–≤–∞—è –≥–µ–æ–≥—Ä–∞—Ñ–∏—è...")
def parse_coords_to_lon_lat(s):
    """Extract a ``(lon, lat)`` pair from a raw coordinates value.

    Strings are parsed with ``ast.literal_eval``; any other value is indexed
    directly. The first element is returned as longitude and the second as
    latitude.

    Returns:
        ``(lon, lat)`` on success, ``(np.nan, np.nan)`` if parsing or
        indexing fails for any reason (best-effort by design).
    """
    try:
        pair = ast.literal_eval(s) if isinstance(s, str) else s
        return pair[0], pair[1]
    except Exception:
        # Malformed / missing coordinates are mapped to NaN, not raised.
        return np.nan, np.nan

def haversine_km(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula with a sphere radius of 6371 km. Arguments may
    be scalars or NumPy arrays (they are passed through ``np.radians`` and
    NumPy ufuncs, so array inputs are handled element-wise).
    """
    lam1, phi1, lam2, phi2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))
    d_lam = lam2 - lam1
    d_phi = phi2 - phi1
    hav = np.sin(d_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam / 2.0) ** 2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 6371.0 * central_angle

for df in [train, test]:
    lon_lat = df["coordinates"].apply(parse_coords_to_lon_lat)
    df["lon"] = lon_lat.apply(lambda x: x[0]).astype(float)
    df["lat"] = lon_lat.apply(lambda x: x[1]).astype(float)

for df in [train, test]:
    df["lon"] = df["lon"].fillna(df["lon"].median())
    df["lat"] = df["lat"].fillna(df["lat"].median())

# distance to the centre of Moscow
moscow_center_lat, moscow_center_lon = 55.7558, 37.6173
for df in [train, test]:
    df["dist_to_center_km"] = haversine_km(
        df["lon"].values, df["lat"].values,
        np.full(len(df), moscow_center_lon), np.full(len(df), moscow_center_lat)
    ).astype(np.float32)
    df["log_dist_to_center"] = np.log1p(df["dist_to_center_km"]).astype(np.float32)

# Geo clustering
all_coords = pd.concat([train[["lat", "lon"]], test[["lat", "lon"]]], axis=0).reset_index(drop=True)
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(all_coords.values)
kmeans = MiniBatchKMeans(n_clusters=50, random_state=SEED, batch_size=2048, n_init=10, max_no_improvement=100)
clusters = kmeans.fit_predict(coords_scaled)
train["geo_cluster"] = clusters[:len(train)]
test["geo_cluster"] = clusters[len(train):]

# Extended geo features
print("üß≠ –†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–µ –≥–µ–æ-–ø—Ä–∏–∑–Ω–∞–∫–∏...")
for df in [train, test]:
    df["lat_rad"] = np.deg2rad(df["lat"].astype(float))
    df["lon_rad"] = np.deg2rad(df["lon"].astype(float))
    df["sin_lat"] = np.sin(df["lat_rad"]).astype(np.float32)
    df["cos_lat"] = np.cos(df["lat_rad"]).astype(np.float32)
    df["sin_lon"] = np.sin(df["lon_rad"]).astype(np.float32)
    df["cos_lon"] = np.cos(df["lon_rad"]).astype(np.float32)
    # simple Mercator projection
    df["merc_x"] = df["lon_rad"].astype(np.float32)
    merc_y = np.log(np.tan(np.pi/4 + df["lat_rad"]/2))
    merc_y = np.where(np.isfinite(merc_y), merc_y, 0.0)
    df["merc_y"] = merc_y.astype(np.float32)

# distance to the geo-cluster centre
geo_centers = train.groupby("geo_cluster")[["lat","lon"]].mean()
def dist_to_cluster_center_row(row):
    """Haversine distance (km) from one row to the centre of its geo cluster.

    Looks the row's ``geo_cluster`` up in the module-level ``geo_centers``
    frame (indexed by cluster id, with ``lat``/``lon`` columns) and returns
    ``np.nan`` when that cluster id is not present there.
    """
    cluster_id = row["geo_cluster"]
    if cluster_id not in geo_centers.index:
        return np.nan
    centre = geo_centers.loc[cluster_id]
    return haversine_km(row["lon"], row["lat"], centre["lon"], centre["lat"])

for df in [train, test]:
    df["dist_to_cluster_center"] = df.apply(dist_to_cluster_center_row, axis=1).astype(np.float32)
    df["dist_to_cluster_center"] = df["dist_to_cluster_center"].fillna(df["dist_to_cluster_center"].median())

# =============== *_300m/_1000m FEATURES (ratio/diff/log) ===============
def build_scope_features(train, test, max_bases=200):
    """Derive log / difference / ratio features from ``*_300m`` / ``*_1000m`` pairs.

    A "base" is any prefix ``X`` for which both ``X_300m`` and ``X_1000m``
    exist in *both* frames. For every base four float32 columns are added:
    ``X_300m_log1p``, ``X_1000m_log1p``, ``X_diff_1000_300`` and
    ``X_ratio_300_1000`` (``(v300 + 1) / (v1000 + 1)``).

    Args:
        train, test: input frames; they are copied, never mutated.
        max_bases: keep at most this many bases (alphabetically first).

    Returns:
        ``(train_out, test_out, base_pairs)`` where ``base_pairs`` is the
        sorted list of bases that were used.
    """
    suffix_near, suffix_wide = "_300m", "_1000m"
    shared_cols = set(train.columns) & set(test.columns)

    # Always sort: iterating a set of strings yields a hash-seed dependent
    # order, which previously made the generated column order (and therefore
    # downstream feature order) differ between kernel restarts.
    base_pairs = sorted(
        col[:-len(suffix_near)]
        for col in shared_cols
        if col.endswith(suffix_near) and col[:-len(suffix_near)] + suffix_wide in shared_cols
    )[:max_bases]

    def add_feats(df):
        for base in base_pairs:
            # Both columns are guaranteed to exist (bases come from shared_cols).
            near = df[base + suffix_near].astype(float)
            wide = df[base + suffix_wide].astype(float)
            df[f"{base}_300m_log1p"] = np.log1p(near).astype(np.float32)
            df[f"{base}_1000m_log1p"] = np.log1p(wide).astype(np.float32)
            df[f"{base}_diff_1000_300"] = (wide - near).astype(np.float32)
            df[f"{base}_ratio_300_1000"] = ((near + 1.0) / (wide + 1.0)).astype(np.float32)
        return df

    train_out = add_feats(train.copy())
    test_out = add_feats(test.copy())
    return train_out, test_out, base_pairs

train, test, scope_bases = build_scope_features(train, test)
print(f"üîé –°—Ñ–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–æ {len(scope_bases)} –ø–∞—Ä *_300m/_1000m.")

# =============== OOF TARGET ENCODING (utility) ===============
def oof_target_encoding(train_df, test_df, col, target_col, n_splits=5, groups=None, smoothing=10.0, global_mean=None):
    """Smoothed mean target encoding: out-of-fold for train, full-fit for test.

    For each category value the encoding is
    ``(mean * count + global_mean * smoothing) / (count + smoothing)``.

    Args:
        train_df, test_df: frames containing ``col`` (and ``target_col`` in train).
        col: column to encode.
        target_col: target column name in ``train_df``.
        n_splits: number of folds.
        groups: optional group labels; when given, ``GroupKFold`` is used,
            otherwise a shuffled ``KFold`` seeded with ``SEED``.
        smoothing: prior weight of ``global_mean``.
        global_mean: prior value; defaults to the train target mean.

    Returns:
        ``(te_train, te_test)`` float32 arrays. Values unseen in the fitting
        data fall back to ``global_mean``.

    Note:
        If ``groups`` coincides with ``col`` itself, no validation row ever
        sees its own value in the training folds, so ``te_train`` is
        constantly ``global_mean``.
    """
    if global_mean is None:
        global_mean = train_df[target_col].mean()

    def smoothed_map(frame):
        # Per-value smoothed mean computed on `frame` only.
        stats = frame.groupby(col, as_index=True)[target_col].agg(['mean', 'count'])
        smoothed = (stats['mean'] * stats['count'] + global_mean * smoothing) / (stats['count'] + smoothing)
        return smoothed.to_dict()

    te_train = np.zeros(len(train_df), dtype=np.float32)
    splitter = GroupKFold(n_splits=n_splits) if groups is not None else KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for trn_idx, val_idx in splitter.split(train_df, train_df[target_col], groups=groups):
        fold_map = smoothed_map(train_df.iloc[trn_idx])
        te_train[val_idx] = train_df.iloc[val_idx][col].map(fold_map).fillna(global_mean).values.astype(np.float32)

    # Test rows are encoded with statistics from the whole train set. (A stray
    # line that divided by the last fold's counts here was dead code and has
    # been removed.)
    full_map = smoothed_map(train_df)
    te_test = test_df[col].map(full_map).fillna(global_mean).values.astype(np.float32)
    return te_train, te_test

global_mean = train["target"].mean()

# =============== GEO TILES + OOF-TE ===============
print("üß± –ì–µ–æ-—Ç–∞–π–ª—ã 500–º/1000–º –∏ OOF-TE...")
def make_geo_tiles(df):
    """Add grid-cell id columns ``tile_500m`` and ``tile_1000m`` to ``df`` in place.

    Each id is ``"<lat_bin>_<lon_bin>"`` where the bins are
    ``floor(coord / step)`` with a step of 0.005 and 0.010 degrees
    respectively (the column names suggest these are meant to approximate
    500 m / 1000 m cells).

    Returns:
        The list of created column names.
    """
    tile_steps = {"tile_500m": 0.005, "tile_1000m": 0.010}
    for tile_name, step in tile_steps.items():
        lat_bin = np.floor(df["lat"].astype(float) / step).astype(int).astype(str)
        lon_bin = np.floor(df["lon"].astype(float) / step).astype(int).astype(str)
        df[tile_name] = lat_bin + "_" + lon_bin
    return list(tile_steps)

tile_cols = make_geo_tiles(train)
_ = make_geo_tiles(test)

for col in tile_cols:
    te_tr, te_te = oof_target_encoding(
        train, test, col=col, target_col="target",
        n_splits=5, groups=train["geo_cluster"], smoothing=10.0, global_mean=global_mean
    )
    train[f"{col}_te"] = te_tr
    test[f"{col}_te"] = te_te

    for df in [train, test]:
        df[f"{col}_x_category"] = df[col].astype(str) + "|" + df["category"].astype(str)
    te_tr2, te_te2 = oof_target_encoding(
        train, test, col=f"{col}_x_category", target_col="target",
        n_splits=5, groups=train["geo_cluster"], smoothing=20.0, global_mean=global_mean
    )
    train[f"{col}_x_category_te"] = te_tr2
    test[f"{col}_x_category_te"] = te_te2

# =============== TF-IDF FOR NAMES (char) + SVD ===============
print("üî§ TF-IDF –ø–æ –Ω–∞–∑–≤–∞–Ω–∏—è–º (char 3‚Äì5) + SVD...")
all_names = pd.concat([train['name'].fillna(''), test['name'].fillna('')], axis=0).astype(str).tolist()
vec_name = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), min_df=2, max_features=30000, lowercase=True, sublinear_tf=True)
X_name = vec_name.fit_transform(all_names)
svd_name = TruncatedSVD(n_components=16, random_state=SEED)
X_name_svd = svd_name.fit_transform(X_name)
name_svd_cols = [f"name_tfidf_svd_{i}" for i in range(X_name_svd.shape[1])]
name_svd_train = pd.DataFrame(X_name_svd[:len(train)], columns=name_svd_cols, index=train.index).astype(np.float32)
name_svd_test  = pd.DataFrame(X_name_svd[len(train):], columns=name_svd_cols, index=test.index).astype(np.float32)
del X_name, X_name_svd; gc.collect()
print(f"‚úÖ EVR (name): {svd_name.explained_variance_ratio_.sum():.3f}")

# =============== SIMPLIFIED REVIEW PREPROCESSING + TF-IDF (word/char) + SVD + KEYWORDS ===============
print("üßº –£–ø—Ä–æ—â–µ–Ω–Ω–∞—è –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –æ—Ç–∑—ã–≤–æ–≤ –∏ TF-IDF –ø–æ —Ç–µ–∫—Å—Ç–∞–º...")

import re
for df in [train, test]:
    df["reviews_exclamations"] = df["all_reviews_text"].fillna("").astype(str).str.count("!").astype(np.int32)
    df["reviews_questions"]   = df["all_reviews_text"].fillna("").astype(str).str.count(r"\?").astype(np.int32)
    lens = df["all_reviews_text"].fillna("").astype(str).str.len().replace(0, 1)
    df["reviews_upper_frac"]  = (df["all_reviews_text"].fillna("").astype(str).apply(lambda s: sum(c.isupper() for c in s)) / lens).astype(np.float32)

def simple_clean(text: str) -> str:
    """Normalise review text to lowercase Latin/Cyrillic letters, digits and single spaces.

    Steps: lowercase, drop URLs and e-mail-like tokens, fold ``ё`` into ``е``,
    replace every other character with a space, collapse whitespace.

    The Cyrillic literals are restored here: in their mis-encoded form the
    character class contained a reversed range and ``re.sub`` raised
    ``re.error`` on the first call.
    """
    s = str(text).lower()
    s = re.sub(r'https?://\S+|www\.\S+|\S+@\S+', ' ', s)
    s = s.replace('ё', 'е')  # 'ё' lies outside the а-я range, so fold it first
    s = re.sub(r'[^a-zа-я0-9 ]+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

train["reviews_clean"] = train["all_reviews_text"].fillna("").astype(str).apply(simple_clean)
test["reviews_clean"]  = test["all_reviews_text"].fillna("").astype(str).apply(simple_clean)

# Keyword stem patterns per topic, matched against the cleaned (lowercase,
# 'ё' -> 'е') review text. The Cyrillic stems are restored from their
# mis-encoded form, which could never match Russian text.
KW_PATTERNS = {
    "kw_pos":       [r"\bотличн\w*", r"\bклассн\w*", r"\bхорош\w*", r"\bвкусн\w*", r"\bрекоменд\w*", r"\bлюбим\w*", r"\bнрав\w*", r"\bчист\w*", r"\bудобн\w*", r"\bкомфорт\w*"],
    "kw_neg":       [r"\bужас\w*", r"\bплох\w*", r"\bневкус\w*", r"\bгряз\w*", r"\bхам\w*", r"\bобман\w*", r"\bразочар\w*", r"\bдорог\w*", r"\bкошмар\w*"],
    "kw_service":   [r"\bвежлив\w*", r"\bперсонал\w*", r"\bофициант\w*", r"\bадминистратор\w*", r"\bсервис\w*", r"\bконсультант\w*", r"\bотношен\w*"],
    "kw_speed":     [r"\bбыстр\w*", r"\bмедлен\w*", r"\bдолг\w*", r"\bочеред\w*", r"\bожидан\w*", r"\bоператив\w*", r"\bзатяж\w*"],
    "kw_price_pos": [r"\bдешев\w*", r"\bвыгод\w*", r"\bакци\w*", r"\bскидк\w*", r"\bлояль\w*", r"\bбонус\w*"],
    "kw_price_neg": [r"\bдорог\w*", r"\bзавышенн\w*", r"\bценник\w*", r"\bпереплат\w*", r"\bцен\w*"],
    "kw_delivery":  [r"\bдоставк\w*", r"\bкурьер\w*", r"\bпривез\w*", r"\bопазд\w*", r"\bсрок\w*", r"\bзаказ\w*", r"\bпривоз\w*"],
    "kw_quality":   [r"\bкачеств\w*", r"\bсвеж\w*", r"\bгоряч\w*", r"\bхолод\w*", r"\bпросроч\w*", r"\bбрак\w*", r"\bисправн\w*", r"\bоригинал\w*", r"\bподдел\w*"],
    "kw_taste":     [r"\bвкусн\w*", r"\bсолен\w*", r"\bпересолен\w*", r"\bпересуш\w*", r"\bостр\w*", r"\bжирн\w*", r"\bпорци\w*"],
}
# Compile once so the per-document counting loop does not re-parse patterns.
KW_PATTERNS = {k: [re.compile(p, flags=re.U) for p in v] for k, v in KW_PATTERNS.items()}

def kw_counts_regex(text: str):
    """Count keyword-pattern hits per topic in ``text``.

    For every key ``k`` of the module-level ``KW_PATTERNS`` the result holds
    ``{k}_cnt`` (total matches over that topic's patterns) and ``{k}_share``
    (count divided by the whitespace token count, floored at 1). The token
    count itself is returned under ``reviews_tokens``.
    """
    n_tokens = max(1, len(text.split()))
    feats = {}
    for topic, compiled in KW_PATTERNS.items():
        hits = sum(len(rx.findall(text)) for rx in compiled)
        feats[f"{topic}_cnt"] = int(hits)
        feats[f"{topic}_share"] = float(hits) / n_tokens
    feats["reviews_tokens"] = n_tokens
    return feats

def add_kw_simple(df):
    """Return ``df`` with keyword count/share columns appended.

    Applies ``kw_counts_regex`` to every value of ``reviews_clean`` (missing
    values treated as empty strings), downcasts float64 -> float32 and
    int64 -> int32, and concatenates the new columns to the right of ``df``.
    """
    records = [kw_counts_regex(doc) for doc in df["reviews_clean"].fillna("")]
    kw_frame = pd.DataFrame(records, index=df.index)
    narrow = {"float64": np.float32, "int64": np.int32}
    casts = {
        name: narrow[str(dtype)]
        for name, dtype in kw_frame.dtypes.items()
        if str(dtype) in narrow
    }
    return pd.concat([df, kw_frame.astype(casts)], axis=1)

train = add_kw_simple(train)
test = add_kw_simple(test)

# —Ä—É—Å—Å–∫–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞ (–°–ü–ò–°–û–ö! —Ç.–∫. sklearn –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–µ—Ç set)
RUSSIAN_STOPWORDS = [
    "–∏","–≤","–≤–æ","–Ω–µ","—á—Ç–æ","–æ–Ω","–æ–Ω–∞","–æ–Ω–∏","–æ–Ω–æ","–∫–∞–∫","–∫","–∫–æ","–Ω–∞","–∏–∑","–∑–∞","–æ—Ç","–¥–æ","–ø–æ","—Å","—Å–æ",
    "—É","–æ","–æ–±","–æ–±–æ","–ø—Ä–∏","–¥–ª—è","–±–µ–∑","–ø—Ä–æ","—ç—Ç–æ","—ç—Ç–∞","—ç—Ç–æ—Ç","—ç—Ç–∏","—Ç–æ—Ç","—Ç–∞","—Ç–µ","–∂–µ","–±—ã","–ª–∏",
    "—É–∂","–≤–µ–¥—å","–µ—â–µ","—É–∂–µ","–∫–æ–≥–¥–∞","–≥–¥–µ","–∫—É–¥–∞","–æ—Ç–∫—É–¥–∞","–ø–æ—Ç–æ–º—É","–∫–æ—Ç–æ—Ä—ã–π","–∫–æ—Ç–æ—Ä–∞—è","–∫–æ—Ç–æ—Ä—ã–µ","—Ç–∞–∫–∂–µ",
    "—Ç–∞–∫","—Ç—É—Ç","—Ç–∞–º","—Ç–æ–≥–¥–∞","–ª–∏—à—å","—Ç–æ–ª—å–∫–æ","–æ—á–µ–Ω—å","—Å–æ–≤—Å–µ–º","–ø–æ—á—Ç–∏","–µ—Å–ª–∏","—Ç–æ","–≤—Å–µ","–≤–µ—Å—å","–≤—Å—è",
    "–º–æ–π","–º–æ—è","–º–æ–∏","—Ç–≤–æ–π","—Ç–≤–æ—è","—Ç–≤–æ–∏","–≤–∞—à","–≤–∞—à–∞","–≤–∞—à–∏","–Ω–∞—à","–Ω–∞—à–∞","–Ω–∞—à–∏","–∏—Ö","—Å–≤–æ–π","–∏–ª–∏",
    "–ª–∏–±–æ","–Ω–∏","–¥–∞","–Ω–µ—Ç","–Ω—É","–≤–æ—Ç","–¥–∞–∂–µ","—á—Ç–æ–±—ã","—á—Ç–æ–±","–º–µ–∂–¥—É","–Ω–∞–¥","–ø–æ–¥","–ø–æ—Ç–æ–º","–∑–∞—Ç–µ–º","–æ–ø—è—Ç—å",
    "—Å–Ω–æ–≤–∞","–≤—Å–µ–≥–¥–∞","–±—ã–ª","–±—ã–ª–∞","–±—ã–ª–æ","–±—ã–ª–∏","–±—ã—Ç—å","–µ—Å—Ç—å","–±—É–¥—É—Ç","–±—É–¥–µ–º","–Ω–µ—Ç—É","–Ω–µ—Ç","–µ—Å—Ç—å"
]

# TF-IDF (word 1-2 grams) + SVD
all_reviews_clean = pd.concat([train['reviews_clean'], test['reviews_clean']], axis=0).tolist()
vec_rev_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=2,
    max_features=100_000,
    stop_words=RUSSIAN_STOPWORDS,  # –í–ê–ñ–ù–û: —Å–ø–∏—Å–æ–∫, –∞ –Ω–µ set
    lowercase=False,
    sublinear_tf=True
)
X_rev_word = vec_rev_word.fit_transform(all_reviews_clean)
svd_rev_word = TruncatedSVD(n_components=32, random_state=SEED)
X_rev_word_svd = svd_rev_word.fit_transform(X_rev_word)
revW_svd_cols = [f"reviews_tfidfW_svd_{i}" for i in range(X_rev_word_svd.shape[1])]
revW_svd_train = pd.DataFrame(X_rev_word_svd[:len(train)], columns=revW_svd_cols, index=train.index).astype(np.float32)
revW_svd_test  = pd.DataFrame(X_rev_word_svd[len(train):], columns=revW_svd_cols, index=test.index).astype(np.float32)
del X_rev_word, X_rev_word_svd; gc.collect()

# TF-IDF (char 3-5 grams) + SVD
vec_rev_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    min_df=2,
    max_features=120_000,
    lowercase=False,
    sublinear_tf=True
)
X_rev_char = vec_rev_char.fit_transform(all_reviews_clean)
svd_rev_char = TruncatedSVD(n_components=24, random_state=SEED)
X_rev_char_svd = svd_rev_char.fit_transform(X_rev_char)
revC_svd_cols = [f"reviews_tfidfC_svd_{i}" for i in range(X_rev_char_svd.shape[1])]
revC_svd_train = pd.DataFrame(X_rev_char_svd[:len(train)], columns=revC_svd_cols, index=train.index).astype(np.float32)
revC_svd_test  = pd.DataFrame(X_rev_char_svd[len(train):], columns=revC_svd_cols, index=test.index).astype(np.float32)
del X_rev_char, X_rev_char_svd; gc.collect()

print(f"‚úÖ EVR (reviews word): {svd_rev_word.explained_variance_ratio_.sum():.3f}")
print(f"‚úÖ EVR (reviews char): {svd_rev_char.explained_variance_ratio_.sum():.3f}")

# =============== TEXT STATS ===============
for df in [train, test]:
    df["name_len"] = np.log1p(df["name"].fillna("").str.len().astype(float)).astype(np.float32)
    df["reviews_text_len"] = np.log1p(df["all_reviews_text"].fillna("").str.len().astype(float)).astype(np.float32)

# =============== BRAND normalization + chain size ===============
import re as _re
def normalize_brand(x: str) -> str:
    """Reduce a venue name to a normalised brand key.

    Lowercases, folds ``ё`` into ``е`` (as ``simple_clean`` does; previously
    ``ё`` was turned into a space and split words), keeps only Latin/Cyrillic
    letters, digits and spaces, removes generic legal-form / venue-type words
    and collapses whitespace.

    The Cyrillic literals are restored here: in their mis-encoded form the
    character class contained a reversed range and raised ``re.error``.
    """
    s = str(x).lower().replace('ё', 'е')
    s = _re.sub(r'[^a-zа-я0-9 ]+', ' ', s)
    s = _re.sub(r'\b(ооо|ип|ooo|oao|zao|магазин|кафе|бар|аптека|салон|clinic|salon|restaurant|rest|grill)\b', ' ', s)
    s = _re.sub(r'\s+', ' ', s).strip()
    return s

for df in [train, test]:
    df['brand'] = df['name'].map(normalize_brand)

brand_size = pd.concat([train['brand'], test['brand']]).value_counts()
for df in [train, test]:
    df['brand_size'] = df['brand'].map(brand_size).fillna(1).astype(np.int32)

# =============== OOF TARGET ENCODING (category, geo_cluster, brand) ===============
print("üéØ Target encoding (OOF) –ø–æ category/geo/brand...")
# category / brand: folds grouped by geo_cluster, so the encoding of a row is
# never fitted on rows from its own cluster.
te_cat_tr, te_cat_te = oof_target_encoding(train, test, col="category", target_col="target",
                                           n_splits=5, groups=train["geo_cluster"], smoothing=5.0, global_mean=global_mean)
# geo_cluster: must NOT be grouped by geo_cluster. With such folds a validation
# cluster is never present in the fitting folds, so every train value fell back
# to global_mean (a constant feature) while test received real encodings.
# groups=None switches oof_target_encoding to its shuffled row-wise KFold.
te_geo_tr, te_geo_te = oof_target_encoding(train, test, col="geo_cluster", target_col="target",
                                           n_splits=5, groups=None, smoothing=10.0, global_mean=global_mean)
te_brand_tr, te_brand_te = oof_target_encoding(train, test, col="brand", target_col="target",
                                               n_splits=5, groups=train["geo_cluster"], smoothing=5.0, global_mean=global_mean)

train["category_te"] = te_cat_tr; test["category_te"] = te_cat_te
train["geo_cluster_te"] = te_geo_tr; test["geo_cluster_te"] = te_geo_te
train["brand_te"] = te_brand_tr; test["brand_te"] = te_brand_te

# =============== KNN OOF GEO FEATURES (0.5 and 1.0 km; all/same_cat + IDW + HHI) ===============
print("üó∫Ô∏è KNN OOF –≥–µ–æ-—Ñ–∏—á–∏ (0.5/1.0 –∫–º)...")
EARTH_R = 6371.0

def _knn_radius_stats(tree, ref_targets, ref_cats, query_coords, query_cats,
                      radius_km, idw_eps, fallback, with_hhi):
    """Neighbourhood target statistics for each query point within ``radius_km``.

    Args:
        tree: ``BallTree`` (haversine metric) built on the reference points.
        ref_targets, ref_cats: target / category arrays aligned with the
            tree's rows.
        query_coords: ``(n, 2)`` array of ``[lat_rad, lon_rad]``.
        query_cats: category of every query point.
        radius_km: search radius in km (converted with ``EARTH_R``).
        idw_eps: km added to every distance before inverting it into a weight.
        fallback: value used for means when there is no (matching) neighbour.
        with_hhi: also return the Herfindahl index of neighbour categories.

    Returns:
        dict of float32 arrays keyed by ``count_all``, ``mean_all``,
        ``idw_mean_all``, ``count_same``, ``mean_same``, ``idw_mean_same``,
        ``ratio_same_all`` and, if requested, ``hhi``.
    """
    from collections import Counter  # imported once, not once per row

    n_query = len(query_cats)
    stats = {
        "count_all": np.zeros(n_query, dtype=np.float32),
        "mean_all": np.full(n_query, fallback, dtype=np.float32),
        "idw_mean_all": np.full(n_query, fallback, dtype=np.float32),
        "count_same": np.zeros(n_query, dtype=np.float32),
        "mean_same": np.full(n_query, fallback, dtype=np.float32),
        "idw_mean_same": np.full(n_query, fallback, dtype=np.float32),
    }
    hhi = np.zeros(n_query, dtype=np.float32)

    ind, dist = tree.query_radius(query_coords, r=radius_km / EARTH_R,
                                  return_distance=True, sort_results=True)
    for i, (nbr_idx, nbr_dist) in enumerate(zip(ind, dist)):
        if len(nbr_idx) == 0:
            continue  # defaults already hold: counts 0, means = fallback, hhi 0
        nbr_vals = ref_targets[nbr_idx]
        nbr_cats = ref_cats[nbr_idx]
        weights = 1.0 / (nbr_dist * EARTH_R + idw_eps)

        stats["count_all"][i] = len(nbr_idx)
        stats["mean_all"][i] = np.mean(nbr_vals)
        stats["idw_mean_all"][i] = np.average(nbr_vals, weights=weights)

        same = nbr_cats == query_cats[i]
        if same.any():
            same_vals = nbr_vals[same]
            stats["count_same"][i] = len(same_vals)
            stats["mean_same"][i] = np.mean(same_vals)
            stats["idw_mean_same"][i] = np.average(same_vals, weights=weights[same])

        if with_hhi:
            n_nbrs = len(nbr_cats)
            shares = np.array([c / n_nbrs for c in Counter(nbr_cats).values()], dtype=np.float32)
            hhi[i] = float(np.sum(shares ** 2))

    stats["ratio_same_all"] = ((stats["count_same"] + 1.0) / (stats["count_all"] + 1.0)).astype(np.float32)
    if with_hhi:
        stats["hhi"] = hhi
    return stats


def knn_geo_oof_features_simple(train_df, test_df, radii_km=(0.5, 1.0), idw_eps=0.05, global_mean=None):
    """Radius-neighbour target features: out-of-fold for train, full-fit for test.

    For each radius ``r`` the columns ``knn_{count,mean,idw_mean}_{all,same}_{r}km``
    and ``knn_ratio_same_all_{r}km`` are produced ("same" = neighbours sharing
    the query row's ``category``). ``knn_hhi_1km`` is filled only for a radius
    equal to ``1.0`` and stays zero otherwise.

    Train rows are scored with 5-fold ``GroupKFold`` over ``geo_cluster`` so
    that neighbours always come from other folds; test rows use all of train.

    Args:
        train_df: needs ``lat_rad``, ``lon_rad``, ``category``, ``geo_cluster``, ``target``.
        test_df: needs ``lat_rad``, ``lon_rad``, ``category``.
        radii_km: radii in kilometres.
        idw_eps: km offset added to distances for inverse-distance weights.
        global_mean: fallback for empty neighbourhoods; defaults to the train
            target mean (previously read from a module-level global).

    Returns:
        ``(oof_feats, test_feats)``: dicts mapping column name -> float32 array.
    """
    if global_mean is None:
        global_mean = train_df["target"].mean()

    stat_names = ("count_all", "mean_all", "idw_mean_all",
                  "count_same", "mean_same", "idw_mean_same", "ratio_same_all")

    def empty_feats(n):
        feats = {f"knn_{name}_{r}km": np.zeros(n, dtype=np.float32)
                 for r in radii_km for name in stat_names}
        feats["knn_hhi_1km"] = np.zeros(n, dtype=np.float32)
        return feats

    def feat_key(name, r):
        return "knn_hhi_1km" if name == "hhi" else f"knn_{name}_{r}km"

    def coords(frame):
        return np.c_[frame["lat_rad"], frame["lon_rad"]]

    # Positional arrays avoid per-row label lookups into the DataFrame.
    targets = train_df["target"].to_numpy()
    cats = train_df["category"].to_numpy()

    # OOF part: neighbours are drawn only from the other folds.
    oof_feats = empty_feats(len(train_df))
    for trn_idx, val_idx in GroupKFold(n_splits=5).split(train_df, groups=train_df["geo_cluster"]):
        tree = BallTree(coords(train_df.iloc[trn_idx]), metric='haversine')
        val_coords = coords(train_df.iloc[val_idx])
        for r in radii_km:
            fold_stats = _knn_radius_stats(tree, targets[trn_idx], cats[trn_idx],
                                           val_coords, cats[val_idx],
                                           r, idw_eps, global_mean, with_hhi=(r == 1.0))
            for name, values in fold_stats.items():
                oof_feats[feat_key(name, r)][val_idx] = values

    # Test part: neighbours are drawn from the whole train set.
    test_feats = empty_feats(len(test_df))
    tree_full = BallTree(coords(train_df), metric='haversine')
    test_coords = coords(test_df)
    test_cats = test_df["category"].to_numpy()
    for r in radii_km:
        full_stats = _knn_radius_stats(tree_full, targets, cats,
                                       test_coords, test_cats,
                                       r, idw_eps, global_mean, with_hhi=(r == 1.0))
        for name, values in full_stats.items():
            test_feats[feat_key(name, r)] = values

    return oof_feats, test_feats

knn_oof_tr, knn_te = knn_geo_oof_features_simple(train, test, radii_km=(0.5, 1.0))
for k, v in knn_oof_tr.items(): train[k] = v
for k, v in knn_te.items():     test[k] = v

# =============== DEMO ALIGNMENT (OOF) + ENTROPIES ===============
print("üß¨ Demo alignment + —ç–Ω—Ç—Ä–æ–ø–∏–∏ –¥–µ–º–æ–≥—Ä–∞—Ñ–∏–∏...")
def build_demo_matrix(df):
    """Build a frame of demographic share features from ``*_1000m`` columns.

    Columns, in order: one ``<age col>_share`` per ``age_*_1000m`` column
    (value divided by the row sum of all age columns), followed by
    ``female_share``, ``married_share``, ``has_children_share``,
    ``employed_share`` and ``higher_edu_share``. A binary share is
    ``pos / (pos + neg + eps)``; when either source column is absent it is a
    constant 0.5. Remaining NaNs are filled with the column median.
    """
    eps = 1e-9
    present = df.columns

    age_cols = [c for c in present if c.startswith("age_") and c.endswith("_1000m")]
    if len(age_cols) > 0:
        age_total = df[age_cols].astype(float).sum(axis=1) + eps
    else:
        age_total = pd.Series(1.0, index=df.index)
    age_shares = {
        c + "_share": (df[c].astype(float) / age_total).astype(np.float32)
        for c in age_cols
    }

    # (output name, "positive" column, complementary column)
    binary_specs = [
        ("female_share", "female_1000m", "male_1000m"),
        ("married_share", "married_1000m", "not_married_1000m"),
        ("has_children_share", "has_children_1000m", "no_children_1000m"),
        ("employed_share", "employed_1000m", "unemployed_1000m"),
        ("higher_edu_share", "higher_education_1000m", "no_higher_education_1000m"),
    ]

    def binary_share(out_name, pos_col, neg_col):
        if pos_col in present and neg_col in present:
            pos = df[pos_col].astype(float)
            total = pos + df[neg_col].astype(float) + eps
            return (pos / total).astype(np.float32).rename(out_name)
        return pd.Series(0.5, index=df.index, name=out_name)

    parts = [pd.DataFrame(age_shares)] if len(age_shares) > 0 else []
    parts.extend(binary_share(*spec) for spec in binary_specs)

    demo_df = pd.concat(parts, axis=1)
    return demo_df.fillna(demo_df.median())

def _cosine_alignment(demo_values, proto_values, fallback_proto):
    """Row-wise cosine similarity between ``demo_values`` and ``proto_values``.

    Rows whose similarity is NaN (e.g. the category had no prototype) are
    scored against ``fallback_proto`` instead. Returns a float32 array.
    """
    unit_rows = demo_values / (np.linalg.norm(demo_values, axis=1, keepdims=True) + 1e-9)
    unit_protos = proto_values / (np.linalg.norm(proto_values, axis=1, keepdims=True) + 1e-9)
    sim = np.sum(unit_rows * unit_protos, axis=1)
    missing = np.isnan(sim)
    if missing.any():
        unit_fallback = fallback_proto / (np.linalg.norm(fallback_proto) + 1e-9)
        sim[missing] = np.sum(unit_rows[missing] * unit_fallback, axis=1)
    return sim.astype(np.float32)


def demo_alignment_oof(train_df, test_df, category_col="category"):
    """Cosine similarity of each row's demographic profile to its category prototype.

    A prototype is the mean ``build_demo_matrix`` row of a category. Train
    rows are scored out-of-fold (5-fold ``GroupKFold`` over ``geo_cluster``);
    test rows are scored against prototypes from the whole train set.
    Categories without a prototype fall back to the overall mean profile of
    the fitting data.

    Returns:
        ``(align_train, align_test)`` float32 arrays.
    """
    # Built once; previously the train matrix was recomputed three times.
    demo_train = build_demo_matrix(train_df)
    demo_test = build_demo_matrix(test_df)
    train_cats = train_df[category_col]

    align_train = np.zeros(len(train_df), dtype=np.float32)
    for trn_idx, val_idx in GroupKFold(n_splits=5).split(train_df, groups=train_df["geo_cluster"]):
        demo_trn = demo_train.iloc[trn_idx]
        protos = demo_trn.groupby(train_cats.iloc[trn_idx]).mean()
        align_train[val_idx] = _cosine_alignment(
            demo_train.iloc[val_idx].values,
            protos.reindex(train_cats.iloc[val_idx]).values,
            demo_trn.mean().values,
        )

    protos_full = demo_train.groupby(train_cats).mean()
    align_test = _cosine_alignment(
        demo_test.values,
        protos_full.reindex(test_df[category_col]).values,
        demo_train.mean().values,
    )
    return align_train, align_test

demo_align_tr, demo_align_te = demo_alignment_oof(train, test, category_col="category")
train["demo_alignment"] = demo_align_tr.astype(np.float32)
test["demo_alignment"] = demo_align_te.astype(np.float32)

def entropy_from_cols(df, cols):
    """Row-wise Shannon entropy (natural log) of the shares formed by `cols`.

    Each row of ``df[cols]`` is divided by its own sum (plus a small epsilon)
    to obtain shares, and ``-sum(share * log(share + eps))`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed in `cols`; values must be castable
        to float.
    cols : sequence of str
        Column names whose values are treated as unnormalised weights.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``len(df)``. All zeros when `cols` is empty.
    """
    if len(cols) == 0:
        return np.zeros(len(df), dtype=np.float32)

    tiny = 1e-9  # guards both the division and the logarithm against zeros
    weights = df[cols].astype(float).values
    row_totals = weights.sum(axis=1, keepdims=True) + tiny
    shares = weights / row_totals
    weighted_logs = shares * np.log(shares + tiny)
    return (-np.sum(weighted_logs, axis=1)).astype(np.float32)

# Column groups used as entropy inputs, selected by name pattern from train:
# "age_*_1000m" columns and "*_income_1000m" columns.
age_cols_1000 = [
    name for name in train.columns
    if name.startswith("age_") and name.endswith("_1000m")
]
income_cols_1000 = [name for name in train.columns if name.endswith("_income_1000m")]

# Both column lists are fixed before any entropy column is written, so the new
# "age_entropy_1000m" feature never feeds back into its own inputs.
for df in (train, test):
    age_entropy = entropy_from_cols(df, age_cols_1000)
    df["age_entropy_1000m"] = age_entropy
    income_entropy = entropy_from_cols(df, income_cols_1000)
    df["income_entropy_1000m"] = income_entropy

# =============== MERGE TEXT FEATURES + INTERACTIONS ===============
print("üîÑ –û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö —Ñ–∏—á...")
# Append the three SVD frames column-wise.
# NOTE(review): an axis=1 concat aligns on the index — this assumes the SVD
# frames carry the same row index as train/test; confirm where they are built.
train = pd.concat(
    [train, name_svd_train, revW_svd_train, revC_svd_train],
    axis=1,
)
test = pd.concat(
    [test, name_svd_test, revW_svd_test, revC_svd_test],
    axis=1,
)
# The standalone SVD frames are no longer needed once merged.
del name_svd_train, name_svd_test, revW_svd_train, revW_svd_test, revC_svd_train, revC_svd_test
gc.collect()

# Interactions: the first three components of each SVD block multiplied by
# `review_count`, `category_te` and the inverse-distance feature.
svd_prefixes = ("name_tfidf_svd_", "reviews_tfidfW_svd_", "reviews_tfidfC_svd_")
interaction_partners = (
    ("_x_rcount", "review_count"),
    ("_x_catTE", "category_te"),
    ("_x_invDist", "inv_dist"),
)
for df in (train, test):
    df["inv_dist"] = (1.0 / (1.0 + df["dist_to_center_km"])).astype(np.float32)
    for col in (f"{base}{i}" for base in svd_prefixes for i in range(3)):
        if col not in df.columns:
            continue
        for suffix, partner in interaction_partners:
            df[col + suffix] = (df[col] * df[partner]).astype(np.float32)

# =============== TARGET TRANSFORM ===============
print("üìä –ö–≤–∞–Ω—Ç–∏–ª—å–Ω–æ–µ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ target...")
# One quantile per five training rows, bounded to the range [10, 2000].
n_target_quantiles = min(2000, max(10, len(train)//5))
qt = QuantileTransformer(
    n_quantiles=n_target_quantiles,
    output_distribution="normal",
    random_state=SEED,
)
# The model is trained on the normal-scores version of the target; `qt` is kept
# so predictions can be mapped back with `inverse_transform`.
train["target_qt"] = qt.fit_transform(train[["target"]]).astype(np.float32)

# =============== FEATURES: list and dtypes ===============
# Identifier, raw-text, target and tile-key columns that must not be fed to the model.
ignore_cols = {
    "id", "name", "category", "address", "coordinates", "brand",
    "all_reviews_text", "reviews_clean",
    "target", "target_qt",
    "tile_500m", "tile_1000m",
    "tile_500m_x_category", "tile_1000m_x_category",
}
num_cols = [col_name for col_name in train.columns if col_name not in ignore_cols]

# Halve memory: 64-bit floats/ints are downcast to their 32-bit counterparts.
downcast_rules = (("float64", np.float32), ("int64", np.int32))
for df in (train, test):
    for c in num_cols:
        if c not in df.columns:
            continue
        for wide_dtype, narrow_dtype in downcast_rules:
            if df[c].dtype == wide_dtype:
                df[c] = df[c].astype(narrow_dtype)

# Only features that also exist in test can be used at prediction time.
test_columns = test.columns
available_features = [c for c in num_cols if c in test_columns]
print(f"üéØ –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è {len(available_features)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")
tfidf_prefixes = ("name_tfidf_svd_", "reviews_tfidfW_svd_", "reviews_tfidfC_svd_")
tfidf_features_count = sum(1 for f in available_features if f.startswith(tfidf_prefixes))
print(f" - TF-IDF SVD –ø—Ä–∏–∑–Ω–∞–∫–∏: {tfidf_features_count}")

# =============== LightGBM (CV) + monotone constraints ===============
params = dict(
    objective="regression",
    metric="mae",
    learning_rate=0.05,
    num_leaves=127,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0.2,
    lambda_l2=0.3,
    min_data_in_leaf=20,
    max_depth=-1,
    n_estimators=400,
    random_state=SEED,
    verbosity=-1,
    n_jobs=-1,
)

# Direction of the enforced monotone relationship per feature:
# +1 = prediction may not decrease as the feature grows, -1 = may not increase.
monotone_increasing = [
    'review_count',
    'traffic_300m',
    'traffic_1000m',
    'inv_dist',
    'knn_count_all_0.5km',
    'knn_count_all_1.0km',
]
monotone_decreasing = ['dist_to_center_km']
monotone_map = {
    **dict.fromkeys(monotone_increasing, 1),
    **dict.fromkeys(monotone_decreasing, -1),
}

# One entry per feature, in the exact order of `available_features`
# (0 = unconstrained); the model is later fitted on columns in that same order.
constraints = [
    monotone_map[name] if name in monotone_map else 0
    for name in available_features
]
params["monotone_constraints"] = constraints

folds = GroupKFold(n_splits=5)
oof_predictions = np.zeros(len(train), dtype=np.float32)
best_iterations = []


def _fit_with_early_stopping(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit `estimator` on the fit split, monitoring MAE on the eval split.

    Training stops early after 200 rounds without improvement and the
    evaluation metric is logged every 100 rounds. Returns the fitted estimator.
    """
    estimator.fit(
        X_fit, y_fit,
        eval_set=[(X_eval, y_eval)],
        eval_metric="mae",
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=100),
        ],
    )
    return estimator


def _qt_to_target(values_qt):
    """Map a 1-D array from quantile-transformed space back to the target scale."""
    return qt.inverse_transform(values_qt.reshape(-1, 1)).ravel()


print("\nüéØ –û–±—É—á–µ–Ω–∏–µ LightGBM (CV)...")
# Folds are grouped by `geo_cluster`, so a cluster never appears on both sides of a split.
cv_splits = folds.split(train, train["target_qt"], groups=train["geo_cluster"])
for fold, (trn_idx, val_idx) in enumerate(cv_splits):
    print(f"Fold {fold+1}/5")
    fit_rows, eval_rows = train.iloc[trn_idx], train.iloc[val_idx]
    X_tr, y_tr = fit_rows[available_features], fit_rows["target_qt"]
    X_val, y_val = eval_rows[available_features], eval_rows["target_qt"]

    model = lgb.LGBMRegressor(**params)
    try:
        _fit_with_early_stopping(model, X_tr, y_tr, X_val, y_val)
    except lgb.basic.LightGBMError:
        # Retry this fold without monotone constraints; `params` itself is left
        # untouched, so the next fold tries the constrained model again.
        print("‚ö†Ô∏è Monotone constraints –Ω–µ –ø–æ–¥–¥–µ—Ä–∂–∞–Ω—ã. –û–±—É—á–∞–µ–º –±–µ–∑ –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–π.")
        p2 = {key: value for key, value in params.items() if key != "monotone_constraints"}
        model = _fit_with_early_stopping(lgb.LGBMRegressor(**p2), X_tr, y_tr, X_val, y_val)

    fold_best_iteration = model.best_iteration_
    preds_val = model.predict(X_val, num_iteration=fold_best_iteration)
    oof_predictions[val_idx] = preds_val.astype(np.float32)
    best_iterations.append(fold_best_iteration)

    # Fold metric is reported on the original target scale, not in qt space.
    fold_mae = mean_absolute_error(_qt_to_target(y_val.values), _qt_to_target(preds_val))
    print(f"  Fold MAE: {fold_mae:.4f}")

oof_pred = _qt_to_target(oof_predictions)
oof_mae = mean_absolute_error(train["target"].values, oof_pred)
print(f"\nüéØ Final OOF MAE (–¥–æ –∫–∞–ª–∏–±—Ä–æ–≤–∫–∏): {oof_mae:.4f}")

# =============== ISOTONIC CALIBRATION ON OOF ===============
print("üìà –ò–∑–æ—Ç–æ–Ω–∏—á–µ—Å–∫–∞—è –∫–∞–ª–∏–±—Ä–æ–≤–∫–∞ –ø–æ OOF...")
calib_target = train['target'].values

# Calibrator applied to the test predictions: fitted on every OOF prediction.
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(oof_pred, calib_target)

# Scoring `iso` on the very rows it was fitted on gives an optimistic
# (in-sample) MAE. Instead, estimate the calibrated error by cross-fitting:
# each row is calibrated by an isotonic model that never saw it, using the
# same geo-grouped split scheme as the model CV.
oof_pred_cal = np.zeros(len(oof_pred), dtype=np.float64)
calib_folds = GroupKFold(n_splits=5)
for fit_idx, eval_idx in calib_folds.split(oof_pred, calib_target, groups=train["geo_cluster"]):
    fold_iso = IsotonicRegression(out_of_bounds='clip')
    fold_iso.fit(oof_pred[fit_idx], calib_target[fit_idx])
    oof_pred_cal[eval_idx] = fold_iso.transform(oof_pred[eval_idx])

# Clip to [1, 5] exactly as is done for the test predictions.
oof_mae_cal = mean_absolute_error(calib_target, np.clip(oof_pred_cal, 1.0, 5.0))
print(f"üéØ OOF MAE –ø–æ—Å–ª–µ –∫–∞–ª–∏–±—Ä–æ–≤–∫–∏: {oof_mae_cal:.4f}")

# =============== FINAL MODEL ===============
print("\nüöÄ –û–±—É—á–µ–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏...")
# Tree count: mean early-stopped iteration over the CV folds, scaled by 1.1
# (presumably to account for the larger full training set) and bounded to
# the range [200, 1000].
final_n_estimators = int(np.clip(np.mean(best_iterations) * 1.1, 200, 1000))
print(f"–ò—Å–ø–æ–ª—å–∑—É–µ–º n_estimators={final_n_estimators}")

final_params = {**params, "n_estimators": final_n_estimators}
final_model = lgb.LGBMRegressor(**final_params)
try:
    final_model.fit(train[available_features], train["target_qt"])
except lgb.basic.LightGBMError:
    # Mirror the CV loop's fallback: if LightGBM rejects the constrained
    # configuration, refit without monotone constraints rather than crash
    # after the whole cross-validation has already run.
    print("Monotone constraints rejected by LightGBM for the final model; refitting without them.")
    final_params.pop("monotone_constraints", None)
    final_model = lgb.LGBMRegressor(**final_params)
    final_model.fit(train[available_features], train["target_qt"])

# Importances of the final model paired with feature names, highest first.
feature_importance = (
    pd.DataFrame({
        'feature': available_features,
        'importance': final_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("\nüìä –¢–æ–ø-20 –≤–∞–∂–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤:")
top_20 = feature_importance.head(20)
print(top_20)
# How many of the 20 strongest features have 'tfidf' in their name.
tfidf_in_top = sum(top_20['feature'].str.contains('tfidf'))
print(f"üìù TF-IDF –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ —Ç–æ–ø-20: {tfidf_in_top}")

# =============== PREDICTION + CALIBRATION ===============
print("\nüßÆ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –Ω–∞ —Ç–µ—Å—Ç–µ...")
test_pred_qt = final_model.predict(test[available_features])
# Pipeline: quantile space -> original target scale -> isotonic calibration
# -> clip to the [1, 5] range.
test_pred = np.clip(
    iso.transform(qt.inverse_transform(test_pred_qt.reshape(-1, 1)).ravel()),
    1.0,
    5.0,
)

# =============== SAVE ===============
# Two-column submission: the test ids alongside the calibrated, clipped predictions.
sub = test[["id"]].assign(target=test_pred)
sub.to_csv("submission_tfidf_geo_plus.csv", index=False)

# Closing summary: output file, raw vs. calibrated OOF error, feature count.
n_features_used = len(available_features)
print(f"\n‚úÖ submission_tfidf_geo_plus.csv –≥–æ—Ç–æ–≤!")
print(f"üéØ Final OOF MAE (raw): {oof_mae:.4f} | (calibrated): {oof_mae_cal:.4f}")
print(f"üîß –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ {n_features_used} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")

üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä train: 41105
–†–∞–∑–º–µ—Ä test: 9276
–†–∞–∑–º–µ—Ä reviews: 440082
üßπ –£–¥–∞–ª—è–µ–º –æ–±—ä–µ–∫—Ç—ã —Å target=0 –∏–∑ train...
–£–¥–∞–ª–µ–Ω–æ: 3938 | –ù–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä train: 37167
üìä –ê–≥–≥—Ä–µ–≥–∞—Ü–∏—è –æ—Ç–∑—ã–≤–æ–≤ –ø–æ id...
üìç –ö–æ–æ—Ä–¥–∏–Ω–∞—Ç—ã –∏ –±–∞–∑–æ–≤–∞—è –≥–µ–æ–≥—Ä–∞—Ñ–∏—è...
üß≠ –†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–µ –≥–µ–æ-–ø—Ä–∏–∑–Ω–∞–∫–∏...
üîé –°—Ñ–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–æ 140 –ø–∞—Ä *_300m/_1000m.
üß± –ì–µ–æ-—Ç–∞–π–ª—ã 500–º/1000–º –∏ OOF-TE...
üî§ TF-IDF –ø–æ –Ω–∞–∑–≤–∞–Ω–∏—è–º (char 3‚Äì5) + SVD...
‚úÖ EVR (name): 0.096
üßº –£–ø—Ä–æ—â–µ–Ω–Ω–∞—è –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –æ—Ç–∑—ã–≤–æ–≤ –∏ TF-IDF –ø–æ —Ç–µ–∫—Å—Ç–∞–º...
‚úÖ EVR (reviews word): 0.055
‚úÖ EVR (reviews char): 0.118
üéØ Target encoding (OOF) –ø–æ category/geo/brand...
üó∫Ô∏è KNN OOF –≥–µ–æ-—Ñ–∏—á–∏ (0.5/1.0 –∫–º)...
üß¨ Demo alignment + —ç–Ω—Ç—Ä–æ–ø–∏–∏ –¥–µ–º–æ–≥—Ä–∞—Ñ–∏–∏...
üîÑ –û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö —Ñ–∏—á...
ü