In [10]:
# –®–ê–ì 1: –ò–º–ø–æ—Ä—Ç—ã, –∫–æ–Ω—Ñ–∏–≥ –∏ –±–∞–∑–æ–≤—ã–µ –∫–æ–Ω—Å—Ç–∞–Ω—Ç—ã  (–î–û–ë–ê–í–õ–ï–ù tqdm)
import os, gc, math, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import Ridge
from sklearn.neighbors import BallTree

from catboost import CatBoostRegressor, Pool
from tqdm import tqdm  # ‚Üê –≤–æ—Ç –æ–Ω

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# –§–ª–∞–≥–∏ —Ç—è–∂—ë–ª—ã—Ö –±–ª–æ–∫–æ–≤
RUN_RUBERT             = True         # ruBERT-—ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –æ—Ç–∑—ã–≤–æ–≤ (—Å –∫—ç—à–µ–º)
RUN_RUBERT_SVD         = True         # SVD-—Å–∂–∞—Ç–∏–µ ruBERT
RUBERT_SVD_COMPONENTS  = 64
RUN_SENTIMENT          = True         # –±—ã—Å—Ç—Ä—ã–π sentiment (POS/NEG/NEU) —Å –∫—ç—à–µ–º
MAX_REVIEWS_PER_PLACE  = 3

# –ì–µ–æ-–Ω–∞—Å—Ç—Ä–æ–π–∫–∏
RADII_M = [100, 300, 600, 1000, 1500]
EARTH_RADIUS_M  = 6371000.0
EARTH_RADIUS_KM = 6371.0088


In [2]:
# –®–ê–ì 2: –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
train = pd.read_csv('train.tsv', sep='\t')
test  = pd.read_csv('test.tsv',  sep='\t')
reviews = pd.read_csv('reviews.tsv', sep='\t')

for df in (train, test, reviews):
    df['id'] = df['id'].astype(str)

print("Shapes:", train.shape, test.shape, reviews.shape)
print(train.head(2))


Shapes: (41105, 286) (9276, 285) (440082, 2)
     id                                     name             coordinates  \
0  1365  –ì–æ—Ä–æ–¥—Å–∫–∞—è –ø–æ–ª–∏–∫–ª–∏–Ω–∏–∫–∞ ‚Ññ 109, —Ñ–∏–ª–∏–∞–ª ‚Ññ 2  [37.735049, 55.719667]   
1  8230                       Wellness Club Nebo  [37.537083, 55.749511]   

        category                                   address  target  \
0         health  –ì—Ä–∞–π–≤–æ—Ä–æ–Ω–æ–≤—Å–∫–∞—è —É–ª., 18, –∫–æ—Ä–ø. 1, –ú–æ—Å–∫–≤–∞     4.1   
1  swimming_pool              –ü—Ä–µ—Å–Ω–µ–Ω—Å–∫–∞—è –Ω–∞–±., 12, –ú–æ—Å–∫–≤–∞     3.6   

   traffic_300m    homes_300m    works_300m  female_300m  ...  doramas_1000m  \
0         75429  16113.582471  15756.246444      51316.0  ...         4668.0   
1        246535   8578.458740  31315.672794     192547.0  ...         3431.0   

   computer_components_1000m  humor_1000m  car_market_1000m  \
0                     7718.0      33389.0           18306.0   
1                    11463.0      61107.0           23662.0   

   no_h

In [3]:
# –®–ê–ì 3: –£—Ç–∏–ª–∏—Ç—ã (–≥–µ–æ/–Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è/—Å–ø–ª–∏—Ç—ã)
EARTH_RADIUS_M  = 6371000.0
EARTH_RADIUS_KM = 6371.0088

def meters_to_radians(m): 
    return m / EARTH_RADIUS_M

def safe_ratio(a, b): 
    return a / (b + 1e-6)

def l2norm(X):
    n = np.linalg.norm(X, axis=1, keepdims=True) + 1e-12
    return X / n

def parse_lonlat(df):
    out = df.copy()
    if 'coordinates' in out.columns:
        def _parse(val):
            if isinstance(val, str):
                s = val.strip().replace('[','').replace(']','')
                parts = [p.strip() for p in s.split(',')]
                if len(parts) >= 2:
                    try:   return float(parts[0]), float(parts[1])
                    except: return np.nan, np.nan
            if isinstance(val, (list, tuple)) and len(val)>=2:
                return float(val[0]), float(val[1])
            return np.nan, np.nan
        lonlat = out['coordinates'].map(_parse)
        out['lon'] = [t[0] for t in lonlat]
        out['lat'] = [t[1] for t in lonlat]
    out['lon'] = out.get('lon', np.nan)
    out['lat'] = out.get('lat', np.nan)
    return out

# Fourier —Å –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏, —Å–Ω—è—Ç—ã–º–∏ –ø–æ train
def add_fourier_train_test(train_df, test_df, fourier_k=8):
    tr = train_df.copy(); te = test_df.copy()
    for col in ['lon','lat']:
        v_tr = tr[col].astype(float)
        mn, mx = np.nanmin(v_tr), np.nanmax(v_tr)
        if not np.isfinite(mn) or not np.isfinite(mx) or mn==mx:
            norm_tr = np.zeros_like(v_tr)
            norm_te = np.zeros_like(te[col].astype(float))
        else:
            norm_tr = (v_tr - mn) / (mx - mn)
            norm_te = (te[col].astype(float) - mn) / (mx - mn)
        for k in range(1, fourier_k+1):
            tr[f'{col}_sin_{k}'] = np.sin(2*np.pi*k*norm_tr)
            tr[f'{col}_cos_{k}'] = np.cos(2*np.pi*k*norm_tr)
            te[f'{col}_sin_{k}'] = np.sin(2*np.pi*k*norm_te)
            te[f'{col}_cos_{k}'] = np.cos(2*np.pi*k*norm_te)
    return tr, te

# KMeans –ø–æ –≥–µ–æ (–≤ —Ä–∞–¥–∏–∞–Ω–∞—Ö) + —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ –¥–æ —Ü–µ–Ω—Ç—Ä–æ–∏–¥–∞ –∫–ª–∞—Å—Ç–µ—Ä–∞
def add_geo_kmeans(train_df, test_df, n_clusters=64):
    tr = train_df.copy(); te = test_df.copy()
    lat_tr = np.deg2rad(tr['lat'].astype(float).values)
    lon_tr = np.deg2rad(tr['lon'].astype(float).values)
    lat_te = np.deg2rad(te['lat'].astype(float).values)
    lon_te = np.deg2rad(te['lon'].astype(float).values)
    XY_tr = np.c_[lat_tr, lon_tr]
    XY_te = np.c_[lat_te, lon_te]
    km = MiniBatchKMeans(n_clusters=n_clusters, random_state=RANDOM_SEED, batch_size=4096)
    km.fit(XY_tr)
    tr['geo_cluster'] = km.labels_.astype(int)
    te['geo_cluster'] = km.predict(XY_te).astype(int)
    centers = km.cluster_centers_
    ctr_tr = centers[tr['geo_cluster'].values]
    ctr_te = centers[te['geo_cluster'].values]
    tr['geo_dist2centroid_km'] = np.sqrt(((XY_tr-ctr_tr)**2).sum(axis=1)) * EARTH_RADIUS_KM
    te['geo_dist2centroid_km'] = np.sqrt(((XY_te-ctr_te)**2).sum(axis=1)) * EARTH_RADIUS_KM
    return tr, te

# –ë–∞–∑–æ–≤—ã–µ 5-—Ñ–æ–ª–¥–æ–≤—ã–µ —Å–ø–ª–∏—Ç—ã –ø–æ —Ä–∞–∑–º–µ—á–µ–Ω–Ω—ã–º —Å—Ç—Ä–æ–∫–∞–º (–æ–±—â–∏–µ –¥–ª—è –≤—Å–µ—Ö OOF)
def build_base_splits(train_df, n_splits=5, seed=42):
    lab_idx = train_df.index[(train_df['target']>0) & train_df['target'].notna()].to_numpy()
    y = train_df.loc[lab_idx, 'target'].astype(float)
    bins = pd.qcut(y, q=min(10, max(2, y.nunique())), duplicates='drop').cat.codes
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    base_splits = [(lab_idx[tr], lab_idx[va]) for tr, va in skf.split(lab_idx, bins)]
    return base_splits, lab_idx


In [4]:
# –®–ê–ì 4: –ì–µ–æ-–±–∞–∑–∞ (lon/lat, Fourier, KMeans)
train_geo = parse_lonlat(train)
test_geo  = parse_lonlat(test)
train_geo, test_geo = add_fourier_train_test(train_geo, test_geo, fourier_k=8)
train_geo, test_geo = add_geo_kmeans(train_geo, test_geo, n_clusters=64)


In [5]:
# –®–ê–ì 5: –¢–µ–∫—Å—Ç–æ–≤—ã–µ –ª–∞—Ç–µ–Ω—Ç—ã TF-IDF+SVD (–∞–≥—Ä–µ–≥–∞—Ü–∏—è –ø–æ id) + –ø—Ä–æ—Å—Ç—ã–µ —Ç–µ–∫—Å—Ç-—Å—Ç–∞—Ç—ã
def build_text_latents(reviews: pd.DataFrame, n_components=300, max_features=120_000):
    if reviews.empty:
        return pd.DataFrame(columns=['id'])
    rv = reviews.copy()
    rv['text'] = rv['text'].fillna('')
    vectorizer = TfidfVectorizer(
        lowercase=True,
        token_pattern=r"[A-Za-z–ê-–Ø–∞-—è–Å—ë0-9_]+",
        ngram_range=(1,2),
        max_features=max_features,
        min_df=2
    )
    X = vectorizer.fit_transform(rv['text'])  # —Ç—Ä–∞–Ω—Å–¥—É–∫—Ç–∏–≤–Ω–æ –æ–∫
    n_components = min(n_components, max(1, X.shape[1]-1))
    svd = TruncatedSVD(n_components=n_components, random_state=RANDOM_SEED)
    X_svd = svd.fit_transform(X)
    df = pd.DataFrame(X_svd, columns=[f'txt_svd_{i}' for i in range(n_components)])
    df.insert(0, 'id', rv['id'].values)
    agg = df.groupby('id').mean().reset_index()
    stats = (rv.assign(len_char=rv['text'].str.len(),
                       len_tok=rv['text'].str.split().map(lambda x: len(x) if isinstance(x, list) else 0))
             .groupby('id')
             .agg(n_reviews=('text','count'),
                  mean_len_char=('len_char','mean'),
                  mean_len_tok=('len_tok','mean'))
             .reset_index())
    return agg.merge(stats, on='id', how='left')

txt_lat = build_text_latents(reviews)
train_geo = train_geo.merge(txt_lat, on='id', how='left')
test_geo  = test_geo.merge(txt_lat, on='id',  how='left')


In [6]:
# –®–ê–ì 6: –ë–ª–æ–∫ —Ñ–∏—á –ø–æ —Ä–∞–¥–∏—É—Å–∞–º (–¥–æ–ª–∏/ratio/–∏–Ω–¥–µ–∫—Å—ã + –ª–æ–≥–∞—Ä–∏—Ñ–º—ã/–Ω–∞–≥—Ä—É–∑–∫–∏)
def make_block_features(df):
    out = df.copy()
    # –±–∞–∑–æ–≤—ã–µ ratio/diff
    for base in ['traffic','homes','works','mean_income']:
        for r in ['300m','1000m']:
            col = f'{base}_{r}'
            if col not in out.columns:
                out[col] = np.nan
        a = out[f'{base}_300m']; b = out[f'{base}_1000m']
        out[f'{base}_ratio'] = safe_ratio(a, b)
        out[f'{base}_diff']  = (a - b)

    # –¥–µ–º–æ–≥—Ä–∞—Ñ–∏—è/—Å–æ—Ü-—ç–∫–æ –ø–æ —Ä–∞–¥–∏—É—Å–∞–º
    for r in ['300m','1000m']:
        female = out.get(f'female_{r}',0).fillna(0)
        male   = out.get(f'male_{r}',0).fillna(0)
        gtot = female + male
        out[f'female_share_{r}'] = safe_ratio(female, gtot)

        age_cols = [c for c in out.columns if c.startswith('age_') and c.endswith(f'_{r}')]
        if age_cols:
            atot = out[age_cols].sum(axis=1)
            youth = out[[c for c in age_cols if ('18-24' in c) or ('25-34' in c)]].sum(axis=1)
            elderly = out[[c for c in age_cols if ('>55' in c)]].sum(axis=1)
            out[f'youth_share_{r}'] = safe_ratio(youth, atot)
            out[f'elderly_share_{r}'] = safe_ratio(elderly, atot)

        he = out.get(f'higher_education_{r}',0).fillna(0)
        nhe= out.get(f'no_higher_education_{r}',0).fillna(0)
        out[f'higher_ed_share_{r}'] = safe_ratio(he, he+nhe)

        emp = out.get(f'employed_{r}',0).fillna(0)
        une = out.get(f'unemployed_{r}',0).fillna(0)
        out[f'employment_rate_{r}'] = safe_ratio(emp, emp+une)

        hc  = out.get(f'has_children_{r}',0).fillna(0)
        nc  = out.get(f'no_children_{r}',0).fillna(0)
        out[f'children_share_{r}'] = safe_ratio(hc, hc+nc)

        mar = out.get(f'married_{r}',0).fillna(0)
        nmar= out.get(f'not_married_{r}',0).fillna(0)
        out[f'married_share_{r}'] = safe_ratio(mar, mar+nmar)

        # –¥–æ—Ö–æ–¥–Ω—ã–µ –¥–æ–ª–∏
        buckets = [f'below_average_income_{r}', f'average_income_{r}', f'above_average_income_{r}', f'high_income_{r}', f'premium_income_{r}']
        pres = [c for c in buckets if c in out.columns]
        if pres:
            tot = out[pres].sum(axis=1)
            hi = out[[c for c in pres if ('above_average' in c) or ('high_income' in c) or ('premium' in c)]].sum(axis=1)
            out[f'high_income_share_{r}'] = safe_ratio(hi, tot)

    # cross-radius
    for name in ['female_share','youth_share','elderly_share','higher_ed_share','employment_rate','children_share','married_share','high_income_share']:
        a = out.get(f'{name}_300m'); b = out.get(f'{name}_1000m')
        if a is not None and b is not None:
            out[f'{name}_ratio'] = safe_ratio(a, b)
            out[f'{name}_diff']  = (a - b)

    # –∫–æ–º–ø–æ–∑–∏—Ç–Ω—ã–µ –∏–Ω–¥–µ–∫—Å—ã
    out['socio_index']  = 0.5*out.get('high_income_share_300m',0).fillna(0) + 0.3*out.get('higher_ed_share_300m',0).fillna(0) + 0.2*out.get('employment_rate_300m',0).fillna(0)
    out['family_index'] = 0.6*out.get('children_share_300m',0).fillna(0)     + 0.4*out.get('married_share_300m',0).fillna(0)

    # –ª–æ–≥–∞—Ä–∏—Ñ–º—ã –∏ –Ω–∞–≥—Ä—É–∑–∫–∏
    base_cols = [
        'traffic_300m','homes_300m','works_300m','mean_income_300m',
        'traffic_1000m','homes_1000m','works_1000m','mean_income_1000m'
    ]
    for c in base_cols:
        if c in out.columns:
            out[c+'_log1p'] = np.log1p(out[c])

    if {'female_300m','male_300m'}.issubset(out.columns):
        out['pop_300m'] = out['female_300m'] + out['male_300m']
        out['sex_ratio_300m'] = safe_ratio(out['female_300m'], out['male_300m'])
    if {'female_1000m','male_1000m'}.issubset(out.columns):
        out['pop_1000m'] = out['female_1000m'] + out['male_1000m']
        out['sex_ratio_1000m'] = safe_ratio(out['female_1000m'], out['male_1000m'])

    if {'traffic_300m','homes_300m'}.issubset(out.columns):
        out['traffic_per_home_300m'] = safe_ratio(out['traffic_300m'], out['homes_300m'])
    if {'works_300m','homes_300m'}.issubset(out.columns):
        out['works_per_home_300m'] = safe_ratio(out['works_300m'], out['homes_300m'])
    if {'traffic_1000m','homes_1000m'}.issubset(out.columns):
        out['traffic_per_home_1000m'] = safe_ratio(out['traffic_1000m'], out['homes_1000m'])
    if {'works_1000m','homes_1000m'}.issubset(out.columns):
        out['works_per_home_1000m'] = safe_ratio(out['works_1000m'], out['homes_1000m'])

    if {'mean_income_300m','pop_300m'}.issubset(out.columns):
        out['income_per_capita_300m'] = safe_ratio(out['mean_income_300m'], out['pop_300m'])
    if {'mean_income_1000m','pop_1000m'}.issubset(out.columns):
        out['income_per_capita_1000m'] = safe_ratio(out['mean_income_1000m'], out['pop_1000m'])

    return out

train_fe = make_block_features(train_geo)
test_fe  = make_block_features(test_geo)


In [7]:
# –®–ê–ì 7: –ò–Ω—Ç–µ—Ä–µ—Å—ã ‚Äî —ç–Ω—Ç—Ä–æ–ø–∏—è + —Ä–∞–∑–¥–µ–ª—å–Ω—ã–π PCA –¥–ª—è 300–º –∏ 1000–º
def get_interest_cols(df, radius_tag):
    excl = ['traffic','homes','works','mean_income','female','male','age_','married','not_married',
            'has_children','no_children','employed','unemployed','higher_education','no_higher_education',
            'below_average_income','average_income','above_average_income','high_income','premium_income']
    cols = [c for c in df.columns if c.endswith(f'_{radius_tag}') and not any(c.startswith(p) for p in excl)]
    return cols

def add_interest_entropy_and_pca(train_df, test_df, n_components=12):
    tr = train_df.copy(); te = test_df.copy()
    for radius in ['300m','1000m']:
        use_cols = sorted(set(get_interest_cols(tr, radius) + get_interest_cols(te, radius)))
        if not use_cols:
            continue
        def normalize_block(df):
            X = df[use_cols].fillna(0.0).astype(float)
            row_sum = X.sum(axis=1).replace(0, np.nan)
            Xn = X.div(row_sum, axis=0).fillna(0.0)
            # —ç–Ω—Ç—Ä–æ–ø–∏—è
            p = Xn.values
            ent = -(p * np.log(p + 1e-12)).sum(axis=1)
            return Xn, ent
        Xtr, ent_tr = normalize_block(tr)
        Xte, ent_te = normalize_block(te)
        tr[f'int_entropy_{radius}'] = ent_tr
        te[f'int_entropy_{radius}'] = ent_te
        pca = PCA(n_components=min(n_components, max(1, min(Xtr.shape[1], 64))), random_state=RANDOM_SEED)
        pca.fit(pd.concat([Xtr, Xte], axis=0))
        tr_lat = pca.transform(Xtr); te_lat = pca.transform(Xte)
        for i in range(tr_lat.shape[1]):
            tr[f'int_{radius}_lat_{i}'] = tr_lat[:, i]
            te[f'int_{radius}_lat_{i}'] = te_lat[:, i]
    return tr, te

train_fe, test_fe = add_interest_entropy_and_pca(train_fe, test_fe, n_components=12)


In [8]:
# –®–ê–ì 8: ruBERT —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –æ—Ç–∑—ã–≤–æ–≤ (–∫—ç—à) + SVD + "–≤–µ—Å" –ø–æ —á–∏—Å–ª—É –æ—Ç–∑—ã–≤–æ–≤
rubert_cols = []
if RUN_RUBERT:
    try:
        import torch
        from sentence_transformers import SentenceTransformer
        RUBERT_PKL = 'rubert_latents.pkl'
        if os.path.exists(RUBERT_PKL):
            print("üìÇ –ó–∞–≥—Ä—É–∂–∞—é ruBERT –∏–∑ –∫—ç—à–∞‚Ä¶")
            rubert_latents = pd.read_pickle(RUBERT_PKL)
        else:
            print("‚öôÔ∏è –°—Ç—Ä–æ—é ruBERT —ç–º–±–µ–¥–¥–∏–Ω–≥–∏‚Ä¶")
            device = 'cuda' if torch.cuda.is_available() else ('mps' if getattr(torch.backends,'mps',None) and torch.backends.mps.is_available() else 'cpu')
            model = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence', device=device)
            rv = reviews[['id','text']].copy()
            rv['text'] = rv['text'].fillna('')
            embs = model.encode(rv['text'].tolist(),
                                show_progress_bar=True, batch_size=64,
                                convert_to_numpy=True, normalize_embeddings=True)
            df_emb = pd.DataFrame(embs)
            df_emb['id'] = rv['id'].values
            rubert_latents = df_emb.groupby('id').mean().reset_index()
            rubert_latents.columns = ['id'] + [f'rubert_{i}' for i in range(embs.shape[1])]
            rubert_latents.to_pickle(RUBERT_PKL)
            print("üíæ ruBERT –∫—ç—à —Å–æ—Ö—Ä–∞–Ω—ë–Ω:", RUBERT_PKL)

        train_fe = train_fe.merge(rubert_latents, on='id', how='left')
        test_fe  = test_fe.merge(rubert_latents,  on='id', how='left')
        rubert_cols = [c for c in train_fe.columns if c.startswith('rubert_')]
    except Exception as e:
        print("‚ö†Ô∏è –û—à–∏–±–∫–∞ ruBERT:", e)
        RUN_RUBERT = False

# SVD-—Å–∂–∞—Ç–∏–µ ruBERT (–∏ —Ç–æ–ª—å–∫–æ –µ–≥–æ –∏—Å–ø–æ–ª—å–∑—É–µ–º –∫–∞–∫ –ø—Ä–∏–∑–Ω–∞–∫–∏)
if RUN_RUBERT and RUN_RUBERT_SVD and rubert_cols:
    SVD_PKL = "rubert_svd_latents.pkl"
    if os.path.exists(SVD_PKL):
        print("üìÇ –ó–∞–≥—Ä—É–∂–∞—é SVD-–ø—Ä–æ–µ–∫—Ü–∏–∏ –∏–∑ –∫—ç—à–∞‚Ä¶")
        svd_lat = pd.read_pickle(SVD_PKL)
        svd_train = svd_lat.query("split=='train'").drop(columns=["split"])
        svd_test  = svd_lat.query("split=='test'").drop(columns=["split"])
    else:
        tr_mat = train_fe[rubert_cols].astype("float32").fillna(0.0).values
        te_mat = test_fe[rubert_cols].astype("float32").fillna(0.0).values
        max_comp = min(RUBERT_SVD_COMPONENTS, tr_mat.shape[1], max(2, tr_mat.shape[0]-1))
        svd = TruncatedSVD(n_components=max_comp, random_state=RANDOM_SEED)
        tr_proj = svd.fit_transform(tr_mat)
        te_proj = svd.transform(te_mat)
        svd_train = pd.DataFrame(tr_proj, columns=[f"rubert_svd_{i}" for i in range(tr_proj.shape[1])])
        svd_test  = pd.DataFrame(te_proj,  columns=[f"rubert_svd_{i}" for i in range(te_proj.shape[1])])
        svd_train["id"] = train_fe["id"].values
        svd_test["id"]  = test_fe["id"].values
        svd_save = pd.concat([svd_train.assign(split="train"), svd_test.assign(split="test")], ignore_index=True)
        svd_save.to_pickle(SVD_PKL)
        print("üíæ SVD –∫—ç—à —Å–æ—Ö—Ä–∞–Ω—ë–Ω:", SVD_PKL)

    train_fe = train_fe.merge(svd_train, on="id", how="left")
    test_fe  = test_fe.merge(svd_test,  on="id", how="left")
    # –≤–µ—Å –ø–æ —á–∏—Å–ª—É –æ—Ç–∑—ã–≤–æ–≤ (–µ—Å–ª–∏ n_reviews –µ—Å—Ç—å)
    if 'n_reviews' in train_fe.columns:
        for c in [col for col in train_fe.columns if col.startswith('rubert_svd_')]:
            train_fe[c] = train_fe[c] * np.log1p(train_fe['n_reviews'].fillna(0).values)
            test_fe[c]  = test_fe[c]  * np.log1p(test_fe['n_reviews'].fillna(0).values)
    # —É–¥–∞–ª–∏–º —Å—ã—Ä—ã–µ rubert_*
    train_fe.drop(columns=[c for c in rubert_cols if c in train_fe.columns], inplace=True, errors='ignore')
    test_fe.drop(columns=[c for c in rubert_cols if c in test_fe.columns], inplace=True, errors='ignore')


üìÇ –ó–∞–≥—Ä—É–∂–∞—é ruBERT –∏–∑ –∫—ç—à–∞‚Ä¶
üìÇ –ó–∞–≥—Ä—É–∂–∞—é SVD-–ø—Ä–æ–µ–∫—Ü–∏–∏ –∏–∑ –∫—ç—à–∞‚Ä¶


In [12]:
# –®–ê–ì 9: –ë—ã—Å—Ç—Ä—ã–π SENTIMENT v2 (–±–µ–∑ pipeline, –±—ã—Å—Ç—Ä–µ–µ –Ω–∞ MPS/CPU, —Å tqdm)
import os, torch
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

if RUN_SENTIMENT:
    os.environ["TOKENIZERS_PARALLELISM"] = "true"  # —Ä–∞–∑—Ä–µ—à–∏–º —Ä–∞—Å–ø–∞—Ä–∞–ª–ª–µ–ª–∏–≤–∞–Ω–∏–µ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞
    FAST_PKL = "sent_latents_fast_v3.pkl"
    sent_latents = None

    if os.path.exists(FAST_PKL):
        print("üìÇ –ó–∞–≥—Ä—É–∂–∞—é sentiment –∏–∑ –∫—ç—à–∞‚Ä¶")
        sent_latents = pd.read_pickle(FAST_PKL)
    else:
        # 1) –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∫–æ—Ä–ø—É—Å–∞
        rv = reviews[['id','text']].copy()
        rv['text'] = rv['text'].fillna('')
        rv['len_char'] = rv['text'].str.len()
        rv_sampled = (rv.sort_values(['id','len_char'], ascending=[True, False])
                        .groupby('id', as_index=False)
                        .head(MAX_REVIEWS_PER_PLACE)
                        .reset_index(drop=True))
        print(f"üîé –î–ª—è –∞–Ω–∞–ª–∏–∑–∞ –æ—Ç–æ–±—Ä–∞–Ω–æ {len(rv_sampled):,} –æ—Ç–∑—ã–≤–æ–≤ (‚â§{MAX_REVIEWS_PER_PLACE} –Ω–∞ id).")

        # 2) –î–µ–≤–∞–π—Å
        if torch.cuda.is_available():
            device = torch.device("cuda"); dev_name = "CUDA"
        elif getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available():
            device = torch.device("mps"); dev_name = "Apple MPS"
        else:
            device = torch.device("cpu"); dev_name = "CPU"
        print(f"üñ•Ô∏è  –ò–Ω—Ñ–µ—Ä–µ–Ω—Å –Ω–∞: {dev_name}")

        # 3) –ú–æ–¥–µ–ª—å/—Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä
        model_name = "blanchefort/rubert-base-cased-sentiment"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
        model.eval()
        torch.set_grad_enabled(False)

        # 4) –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞
        MAX_LEN = 256      # –±—ã–ª–æ 512 ‚Äî —Å–æ–∫—Ä–∞—â–∞–µ–º –≤ 2 —Ä–∞–∑–∞
        BATCH_TRY = [256, 192, 128, 64]  # –≤—ã–±–µ—Ä–µ–º –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ –≤–æ–∑–º–æ–∂–Ω—ã–π
        texts = rv_sampled['text'].tolist()

        # 5) –§—É–Ω–∫—Ü–∏—è –±–∞—Ç—á-–∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞
        def infer_batch(batch_texts):
            enc = tokenizer(batch_texts,
                            padding=True, truncation=True, max_length=MAX_LEN,
                            return_tensors="pt")
            enc = {k: v.to(device) for k, v in enc.items()}
            with torch.no_grad():
                out = model(**enc)
                prob = out.logits.softmax(dim=-1).cpu().numpy()
            return prob

        # 6) –ü–æ–¥–±–æ—Ä –±–µ–∑–æ–ø–∞—Å–Ω–æ–≥–æ —Ä–∞–∑–º–µ—Ä–∞ –±–∞—Ç—á–∞ + –ø—Ä–æ–≥–æ–Ω
        for BATCH in BATCH_TRY:
            try:
                _ = infer_batch(texts[:min(BATCH, len(texts))])
                break
            except RuntimeError as e:
                if "out of memory" in str(e).lower():
                    torch.cuda.empty_cache() if device.type == "cuda" else None
                    continue
                else:
                    raise
        print(f"üì¶ batch_size={BATCH}, max_len={MAX_LEN}")

        # 7) –û—Å–Ω–æ–≤–Ω–æ–π —Ü–∏–∫–ª
        probs = []
        for i in tqdm(range(0, len(texts), BATCH), desc="üé≠ Sentiment v2", unit="batch"):
            probs.append(infer_batch(texts[i:i+BATCH]))
            if device.type == "mps":
                torch.mps.synchronize()
        probs = np.vstack(probs)  # shape: [N, 3]

        # 8) –°–æ–±–µ—Ä—ë–º –º–µ—Ç–∫–∏/—Å–∫–æ—Ä—ã
        label_ids = probs.argmax(1)
        labels_map = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}  # –ø–æ—Ä—è–¥–æ–∫ —É —ç—Ç–æ–π –º–æ–¥–µ–ª–∏ —Ç–∞–∫–æ–π
        labels = [labels_map[i] for i in label_ids]
        scores = probs.max(1)

        rv_sampled['label'] = labels
        rv_sampled['score'] = scores

        # 9) –ê–≥—Ä–µ–≥–∞—Ç—ã –ø–æ –∑–∞–≤–µ–¥–µ–Ω–∏—é (–∫–∞–∫ —Ä–∞–Ω—å—à–µ + –∏–Ω–¥–µ–∫—Å —Ç–æ–Ω–∞)
        g = rv_sampled.groupby('id')
        sent_latents = pd.DataFrame({
            'sent_n_sampled'     : g.size(),
            'sent_pos_cnt'       : g['label'].apply(lambda s: (s=='POSITIVE').sum()),
            'sent_neg_cnt'       : g['label'].apply(lambda s: (s=='NEGATIVE').sum()),
            'sent_neu_cnt'       : g['label'].apply(lambda s: (s=='NEUTRAL').sum()),
            'sent_mean_conf'     : g['score'].mean(),
            'sent_strong_cnt'    : g['score'].apply(lambda s: (s>0.8).sum()),
            'sent_mean_len_char' : g['len_char'].mean(),
        }).reset_index()

        total_cnt = reviews.groupby('id').size().rename('sent_total_reviews').reset_index()
        sent_latents = sent_latents.merge(total_cnt, on='id', how='left')

        sent_latents['sent_pos_share'] = (sent_latents['sent_pos_cnt'] / sent_latents['sent_n_sampled']).fillna(0.0)
        total = (sent_latents['sent_pos_cnt'] + sent_latents['sent_neg_cnt'] + sent_latents['sent_neu_cnt']).replace(0, np.nan)
        sent_latents['sent_tone_index'] = ((sent_latents['sent_pos_cnt'] - sent_latents['sent_neg_cnt']) / total).fillna(0.0)

        sent_latents.to_pickle(FAST_PKL)
        print("üíæ –°–æ—Ö—Ä–∞–Ω–∏–ª sentiment –∫—ç—à:", FAST_PKL)

    # merge –≤ —Ñ–∏—á–∏
    train_fe = train_fe.merge(sent_latents, on='id', how='left')
    test_fe  = test_fe.merge(sent_latents,  on='id', how='left')


üîé –î–ª—è –∞–Ω–∞–ª–∏–∑–∞ –æ—Ç–æ–±—Ä–∞–Ω–æ 115,657 –æ—Ç–∑—ã–≤–æ–≤ (‚â§3 –Ω–∞ id).
üñ•Ô∏è  –ò–Ω—Ñ–µ—Ä–µ–Ω—Å –Ω–∞: Apple MPS
üì¶ batch_size=256, max_len=256


üé≠ Sentiment v2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 452/452 [30:01<00:00,  3.99s/batch]  


üíæ –°–æ—Ö—Ä–∞–Ω–∏–ª sentiment –∫—ç—à: sent_latents_fast_v3.pkl


In [14]:
# –®–ê–ì 10: –ì–µ–æ-–æ–∫—Ä—É–∂–µ–Ω–∏–µ (OOF) ‚Äî allcat/samecat cnt + mean/weighted_mean + min_dist/flags
# –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∫–æ–æ—Ä–¥–∏–Ω–∞—Ç/–∫–∞—Ç–µ–≥–æ—Ä–∏–π
train_coords_rad = np.c_[np.deg2rad(train_fe['lat'].astype(float).values),
                         np.deg2rad(train_fe['lon'].astype(float).values)]
test_coords_rad  = np.c_[np.deg2rad(test_fe['lat'].astype(float).values),
                         np.deg2rad(test_fe['lon'].astype(float).values)]
train_cat = train_fe['category'].astype('category') if 'category' in train_fe.columns else pd.Series(['_']*len(train_fe)).astype('category')
test_cat  = (test_fe['category'].astype('category') if 'category' in test_fe.columns else pd.Series(['_']*len(test_fe)).astype('category')).cat.set_categories(train_cat.cat.categories)
train_cat_codes = train_cat.cat.codes.values
test_cat_codes  = test_cat.cat.codes.values

base_splits, lab_idx = build_base_splits(train_fe, n_splits=5, seed=RANDOM_SEED)
y_full = train_fe['target'].astype(float).values
valid_y = (y_full > 0) & ~np.isnan(y_full)
y_masked = np.where(valid_y, y_full, np.nan)

# –∑–∞–≥–æ—Ç–æ–≤–∫–∞ –∫–æ–ª–æ–Ω–æ–∫
for r in RADII_M:
    for pref in ['geo_allcat','geo_samecat']:
        for stat in ['cnt','mean_target','wmean_target','min_dist_m','has_any']:
            col = f'{pref}_{stat}_{r}m'
            train_fe[col] = np.nan
            test_fe[col]  = np.nan

# OOF –¥–ª—è train
for fold, (tr_idx, va_idx) in enumerate(base_splits, 1):
    tree = BallTree(train_coords_rad[tr_idx], metric='haversine')
    y_tr  = y_masked[tr_idx]
    cat_tr= train_cat_codes[tr_idx]
    XY_va = train_coords_rad[va_idx]
    cat_va= train_cat_codes[va_idx]

    for r in RADII_M:
        rad = meters_to_radians(r)
        ind_list = tree.query_radius(XY_va, r=rad, return_distance=True)
        cnt_all = np.zeros(len(va_idx)); mean_all = np.full(len(va_idx), np.nan)
        wmean_all= np.full(len(va_idx), np.nan); mind_all = np.full(len(va_idx), np.nan)
        has_all = np.zeros(len(va_idx))
        cnt_same = np.zeros(len(va_idx)); mean_same = np.full(len(va_idx), np.nan)
        wmean_same= np.full(len(va_idx), np.nan); mind_same = np.full(len(va_idx), np.nan)
        has_same = np.zeros(len(va_idx))

        for i, (neigh_local, dist) in enumerate(zip(*ind_list)):
            # ALL
            if neigh_local.size > 0:
                vals = y_tr[neigh_local]
                cnt_all[i] = neigh_local.size
                has_all[i] = 1.0
                if np.isfinite(vals).any():
                    mean_all[i] = np.nanmean(vals)
                    d_m = dist * EARTH_RADIUS_M
                    w = 1.0 / (d_m + 50.0)  # 50 –º —Å–≥–ª–∞–∂–∏–≤–∞–Ω–∏–µ
                    w[np.isnan(vals)] = 0.0
                    if 'sent_total_reviews' in train_fe.columns:
                        w *= np.log1p(train_fe['sent_total_reviews'].values[tr_idx][neigh_local])
                    s = w.sum()
                    wmean_all[i] = np.nan if s==0 else float(np.nansum(vals*w)/s)
                mind_all[i] = (dist.min()*EARTH_RADIUS_M) if dist.size>0 else np.nan
            # SAME CAT
            mask_same = (cat_tr[neigh_local] == cat_va[i]) if neigh_local.size>0 else np.array([], dtype=bool)
            neigh_same = neigh_local[mask_same]
            dist_same  = dist[mask_same]
            if neigh_same.size > 0:
                vals = y_tr[neigh_same]
                cnt_same[i] = neigh_same.size
                has_same[i] = 1.0
                if np.isfinite(vals).any():
                    mean_same[i] = np.nanmean(vals)
                    d_m = dist_same * EARTH_RADIUS_M
                    w = 1.0 / (d_m + 50.0)
                    w[np.isnan(vals)] = 0.0
                    if 'sent_total_reviews' in train_fe.columns:
                        w *= np.log1p(train_fe['sent_total_reviews'].values[tr_idx][neigh_same])
                    s = w.sum()
                    wmean_same[i] = np.nan if s==0 else float(np.nansum(vals*w)/s)
                mind_same[i] = (dist_same.min()*EARTH_RADIUS_M) if dist_same.size>0 else np.nan

        train_fe.loc[va_idx, f'geo_allcat_cnt_{r}m']          = cnt_all
        train_fe.loc[va_idx, f'geo_allcat_mean_target_{r}m']  = mean_all
        train_fe.loc[va_idx, f'geo_allcat_wmean_target_{r}m'] = wmean_all
        train_fe.loc[va_idx, f'geo_allcat_min_dist_m_{r}m']   = mind_all
        train_fe.loc[va_idx, f'geo_allcat_has_any_{r}m']      = has_all

        train_fe.loc[va_idx, f'geo_samecat_cnt_{r}m']          = cnt_same
        train_fe.loc[va_idx, f'geo_samecat_mean_target_{r}m']  = mean_same
        train_fe.loc[va_idx, f'geo_samecat_wmean_target_{r}m'] = wmean_same
        train_fe.loc[va_idx, f'geo_samecat_min_dist_m_{r}m']   = mind_same
        train_fe.loc[va_idx, f'geo_samecat_has_any_{r}m']      = has_same

# –î–ª—è test ‚Äî –¥–µ—Ä–µ–≤–æ –Ω–∞ –≤—Å—ë–º train
full_tree = BallTree(train_coords_rad, metric='haversine')
for r in RADII_M:
    rad = meters_to_radians(r)
    ind_list = full_tree.query_radius(test_coords_rad, r=rad, return_distance=True)
    cnt_all = np.zeros(len(test_fe)); mean_all = np.full(len(test_fe), np.nan)
    wmean_all= np.full(len(test_fe), np.nan); mind_all = np.full(len(test_fe), np.nan)
    has_all = np.zeros(len(test_fe))
    cnt_same= np.zeros(len(test_fe)); mean_same= np.full(len(test_fe), np.nan)
    wmean_same= np.full(len(test_fe), np.nan); mind_same= np.full(len(test_fe), np.nan)
    has_same = np.zeros(len(test_fe))
    for i, (neigh, dist) in enumerate(zip(*ind_list)):
        if neigh.size>0:
            vals = y_masked[neigh]
            cnt_all[i] = neigh.size; has_all[i] = 1.0
            if np.isfinite(vals).any():
                mean_all[i] = np.nanmean(vals)
                d_m = dist * EARTH_RADIUS_M
                w = 1.0 / (d_m + 50.0)
                w[np.isnan(vals)] = 0.0
                if 'sent_total_reviews' in train_fe.columns:
                    w *= np.log1p(train_fe['sent_total_reviews'].values[neigh])
                s = w.sum()
                wmean_all[i] = np.nan if s==0 else float(np.nansum(vals*w)/s)
            mind_all[i] = (dist.min()*EARTH_RADIUS_M) if dist.size>0 else np.nan
        mask_same = (train_cat_codes[neigh] == test_cat_codes[i]) if neigh.size>0 else np.array([], dtype=bool)
        neigh_s = neigh[mask_same]; dist_s = dist[mask_same]
        if neigh_s.size>0:
            vals = y_masked[neigh_s]
            cnt_same[i] = neigh_s.size; has_same[i] = 1.0
            if np.isfinite(vals).any():
                mean_same[i] = np.nanmean(vals)
                d_m = dist_s * EARTH_RADIUS_M
                w = 1.0 / (d_m + 50.0)
                w[np.isnan(vals)] = 0.0
                if 'sent_total_reviews' in train_fe.columns:
                    w *= np.log1p(train_fe['sent_total_reviews'].values[neigh_s])
                s = w.sum()
                wmean_same[i] = np.nan if s==0 else float(np.nansum(vals*w)/s)
            mind_same[i] = (dist_s.min()*EARTH_RADIUS_M) if dist_s.size>0 else np.nan

    test_fe[f'geo_allcat_cnt_{r}m']          = cnt_all
    test_fe[f'geo_allcat_mean_target_{r}m']  = mean_all
    test_fe[f'geo_allcat_wmean_target_{r}m'] = wmean_all
    test_fe[f'geo_allcat_min_dist_m_{r}m']   = mind_all
    test_fe[f'geo_allcat_has_any_{r}m']      = has_all

    test_fe[f'geo_samecat_cnt_{r}m']          = cnt_same
    test_fe[f'geo_samecat_mean_target_{r}m']  = mean_same
    test_fe[f'geo_samecat_wmean_target_{r}m'] = wmean_same
    test_fe[f'geo_samecat_min_dist_m_{r}m']   = mind_same
    test_fe[f'geo_samecat_has_any_{r}m']      = has_same


In [None]:
# –®–ê–ì 11: –ö–æ—Å–∏–Ω—É—Å –∫ —Ü–µ–Ω—Ç—Ä–æ–∏–¥—É —Å–æ—Å–µ–¥–µ–π —Ç–æ–π –∂–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ (–≤–∑–≤–µ—à–µ–Ω–Ω—ã–π)
emb_cols = [c for c in train_fe.columns if c.startswith('rubert_svd_')]
if not emb_cols:
    emb_cols = [c for c in train_fe.columns if c.startswith('rubert_') and ('_pos' not in c and '_neg' not in c)]

if emb_cols:
    Z_tr = l2norm(train_fe[emb_cols].fillna(0.0).values.astype('float32'))
    Z_te = l2norm(test_fe[emb_cols].fillna(0.0).values.astype('float32'))

    for r in RADII_M:
        train_fe[f'geo_samecat_sim_{r}m'] = np.nan
        test_fe[f'geo_samecat_sim_{r}m']  = np.nan

    # OOF
    for tr_idx, va_idx in base_splits:
        tree = BallTree(train_coords_rad[tr_idx], metric='haversine')
        trZ  = Z_tr[tr_idx]
        trC  = train_cat_codes[tr_idx]
        vaZ  = Z_tr[va_idx]
        vaC  = train_cat_codes[va_idx]
        vaXY = train_coords_rad[va_idx]
        for r in RADII_M:
            rad = meters_to_radians(r)
            ind_list = tree.query_radius(vaXY, r=rad, return_distance=True)
            sims = np.zeros(len(va_idx), dtype='float32')
            for i, (neigh_local, dist) in enumerate(zip(*ind_list)):
                mask = (trC[neigh_local] == vaC[i])
                neigh_local = neigh_local[mask]
                dist = dist[mask]
                if neigh_local.size == 0:
                    sims[i] = 0.0
                else:
                    w = 1.0 / (dist*EARTH_RADIUS_M + 50.0)
                    if 'sent_total_reviews' in train_fe.columns:
                        w *= np.log1p(train_fe['sent_total_reviews'].values[tr_idx][neigh_local])
                    w = w / (w.sum() + 1e-12)
                    centroid = (trZ[neigh_local] * w[:,None]).sum(axis=0)
                    sims[i] = float(np.dot(vaZ[i], centroid / (np.linalg.norm(centroid)+1e-12)))
            train_fe.loc[va_idx, f'geo_samecat_sim_{r}m'] = sims

    # test
    full_tree = BallTree(train_coords_rad, metric='haversine')
    for r in RADII_M:
        rad = meters_to_radians(r)
        ind_list = full_tree.query_radius(test_coords_rad, r=rad, return_distance=True)
        sims = np.zeros(len(test_fe), dtype='float32')
        for i, (neigh, dist) in enumerate(zip(*ind_list)):
            mask = (train_cat_codes[neigh] == test_cat_codes[i])
            neigh = neigh[mask]; dist = dist[mask]
            if neigh.size == 0:
                sims[i] = 0.0
            else:
                w = 1.0 / (dist*EARTH_RADIUS_M + 50.0)
                if 'sent_total_reviews' in train_fe.columns:
                    w *= np.log1p(train_fe['sent_total_reviews'].values[neigh])
                w = w / (w.sum() + 1e-12)
                centroid = (Z_tr[neigh] * w[:,None]).sum(axis=0)
                sims[i] = float(np.dot(Z_te[i], centroid / (np.linalg.norm(centroid)+1e-12)))
        test_fe[f'geo_samecat_sim_{r}m'] = sims
else:
    print("‚ÑπÔ∏è –ù–µ—Ç —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ ‚Äî —à–∞–≥ 11 –ø—Ä–æ–ø—É—â–µ–Ω.")


In [None]:
# –®–ê–ì 12: Target Encoding (OOF meta) –¥–ª—è category –∏ category√ógeo_cluster
def oof_target_encode_meta(train_df, test_df, cols_key, target_col='target', n_splits=5, seed=42, prior=200):
    tr_mask = (train_df[target_col] > 0) & train_df[target_col].notna()
    tr = train_df.loc[tr_mask].copy()
    key = tr[cols_key].apply(lambda r: tuple(r), axis=1)
    gmean = tr[target_col].mean()

    # OOF
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    bins = pd.qcut(tr[target_col], q=min(10, tr[target_col].nunique()), duplicates='drop').cat.codes
    te_oof = np.zeros(len(tr))
    for tr_idx, va_idx in skf.split(tr, bins):
        mean_by_key = tr.iloc[tr_idx].groupby(key.iloc[tr_idx])[target_col].agg(['mean','count'])
        mean_by_key['smooth'] = (mean_by_key['mean']*mean_by_key['count'] + gmean*prior) / (mean_by_key['count'] + prior)
        map_dict = mean_by_key['smooth'].to_dict()
        te_oof[va_idx] = [map_dict.get(tuple(k), gmean) for k in key.iloc[va_idx]]

    # FULL map -> –ø—Ä–∏–º–µ–Ω—è–µ–º –∫ train/test
    key_full_tr = train_df[cols_key].apply(lambda r: tuple(r), axis=1)
    key_full_te = test_df[cols_key].apply(lambda r: tuple(r), axis=1)
    mean_by_key_full = tr.groupby(key)[target_col].agg(['mean','count'])
    mean_by_key_full['smooth'] = (mean_by_key_full['mean']*mean_by_key_full['count'] + gmean*prior) / (mean_by_key_full['count'] + prior)
    map_full = mean_by_key_full['smooth'].to_dict()
    te_train_full = np.array([map_full.get(tuple(k), gmean) for k in key_full_tr])
    te_test_full  = np.array([map_full.get(tuple(k), gmean) for k in key_full_te])

    # meta: train=OOF (–Ω–∞ —Ä–∞–∑–º–µ—á–µ–Ω–Ω—ã—Ö), test=FULL
    meta_tr = np.zeros(len(train_df))
    meta_tr[tr_mask.values] = te_oof
    meta_te = te_test_full
    return meta_tr, meta_te

# TE: category
train_fe['TE_category_meta'], test_fe['TE_category_meta'] = oof_target_encode_meta(train_fe, test_fe, ['category'], prior=200)

# TE: category √ó geo_cluster (–µ—Å–ª–∏ –µ—Å—Ç—å)
if 'geo_cluster' in train_fe.columns:
    train_fe['TE_cat_cluster_meta'], test_fe['TE_cat_cluster_meta'] = oof_target_encode_meta(train_fe, test_fe, ['category','geo_cluster'], prior=300)


In [None]:
# –®–ê–ì 13: TF-IDF (char_wb 3‚Äì5) ‚Üí Ridge (OOF meta –ø—Ä–∏–∑–Ω–∞–∫)
MAX_CHARS = 10000
place_text = reviews.groupby('id', as_index=False)['text'].apply(lambda s: ' '.join(s.fillna(''))).rename(columns={'text':'agg_text'})
place_text['agg_text'] = place_text['agg_text'].str[:MAX_CHARS]

tr_text = train_fe[['id']].merge(place_text, on='id', how='left')['agg_text'].fillna('')
te_text = test_fe[['id'] ].merge(place_text, on='id', how='left')['agg_text'].fillna('')

tfidf = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(3,5), analyzer='char_wb', max_features=200_000)
X_tr = tfidf.fit_transform(tr_text)
X_te = tfidf.transform(te_text)

y_full = train_fe['target'].astype(float).values
mask = (y_full > 0) & ~np.isnan(y_full)
X_tr_m = X_tr[mask]
y_m    = y_full[mask]

kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros(X_tr_m.shape[0]); test_pred = np.zeros(X_te.shape[0])
for tr_idx, va_idx in kf.split(X_tr_m):
    model = Ridge(alpha=2.0, random_state=RANDOM_SEED)
    model.fit(X_tr_m[tr_idx], y_m[tr_idx])
    oof[va_idx] = model.predict(X_tr_m[va_idx])
    test_pred += model.predict(X_te) / kf.get_n_splits()

train_fe['tfidf_meta'] = 0.0
train_fe.loc[mask, 'tfidf_meta'] = oof
test_fe['tfidf_meta'] = test_pred


In [None]:
# –®–ê–ì 14: ¬´–¶–µ–Ω—Ç—Ä –≥–æ—Ä–æ–¥–∞¬ª –±–µ–∑ —Ö–∞—Ä–¥–∫–æ–¥–∞ ‚Äî —á–µ—Ä–µ–∑ KMeans –ø–æ –∫–æ–æ—Ä–¥–∏–Ω–∞—Ç–∞–º
XY_all = np.c_[np.deg2rad(pd.concat([train_fe['lat'], test_fe['lat']]).astype(float).values),
               np.deg2rad(pd.concat([train_fe['lon'], test_fe['lon']]).astype(float).values)]
km_c = MiniBatchKMeans(n_clusters=5, random_state=RANDOM_SEED, batch_size=4096).fit(XY_all)
centers = km_c.cluster_centers_

def min_dist_to_centers(df):
    pts = np.c_[np.deg2rad(df['lat'].astype(float).values), np.deg2rad(df['lon'].astype(float).values)]
    d = ((pts[:,None,:]-centers[None,:,:])**2).sum(axis=2)**0.5
    return d.min(axis=1) * EARTH_RADIUS_KM

train_fe['dist_center_km'] = min_dist_to_centers(train_fe)
test_fe['dist_center_km']  = min_dist_to_centers(test_fe)


In [None]:
# –®–ê–ì 15: –§–∏–Ω–∞–ª—å–Ω–∞—è —Å–∏–Ω—Ö—Ä–æ–Ω–∏–∑–∞—Ü–∏—è train/test, —Å–±–æ—Ä —Å–ø–∏—Å–∫–∞ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_categorical_dtype

# –ö–∞—Ç–µ–≥–æ—Ä–∏—è —Å—Ç—Ä–æ–∫–æ–π –¥–ª—è CatBoost
if 'category' in train_fe.columns:
    train_fe['category'] = train_fe['category'].astype(str)
    test_fe['category']  = test_fe['category'].astype(str)

drop_cols = {'target','name','address','coordinates'}
features = [c for c in train_fe.columns if c not in drop_cols]

# –ò—Å–∫–ª—é—á–∏–º —è–≤–Ω—ã–µ –Ω–µ-—Å–∫–∞–ª—è—Ä–Ω—ã–µ –ø–æ–ª—è, –µ—Å–ª–∏ —Å–ª—É—á–∞–π–Ω–æ –ø–æ–ø–∞–ª–∏
def _is_complex(x): return isinstance(x, (list, tuple, dict, set))
complex_cols = []
for c in list(features):
    if c in train_fe.columns and (train_fe[c].apply(_is_complex).any() or (c in test_fe.columns and test_fe[c].apply(_is_complex).any())):
        complex_cols.append(c); features.remove(c)
if complex_cols: print("üóëÔ∏è –£–±—Ä–∞–Ω—ã –Ω–µ-—Å–∫–∞–ª—è—Ä–Ω—ã–µ —Ñ–∏—á–∏:", complex_cols[:10], "‚Ä¶")

# –°–∏–Ω—Ö—Ä–æ–Ω–∏–∑–∞—Ü–∏—è –Ω–∞–±–æ—Ä–∞ –∫–æ–ª–æ–Ω–æ–∫
common_cols = sorted(list(set(train_fe.columns) & set(test_fe.columns)))
if 'target' in common_cols: common_cols.remove('target')
train_cols_final = common_cols + (['target'] if 'target' in train_fe.columns else [])
train_fe = train_fe[train_cols_final].copy()
test_fe  = test_fe[common_cols].copy()

# –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ NaN: –±–µ—Ä–µ–∂–Ω–æ ‚Äî NaN –≤ geo_/TE_/tfidf_meta –æ—Å—Ç–∞–≤–ª—è–µ–º (CatBoost —É–º–µ–µ—Ç)
num_common = [c for c in common_cols if is_numeric_dtype(train_fe[c])]
protect_prefixes = ('geo_', 'TE_', 'tfidf_meta')
safe_fill = [c for c in num_common if not any(c.startswith(p) for p in protect_prefixes)]
train_fe[safe_fill] = train_fe[safe_fill].fillna(0)
test_fe[safe_fill]  = test_fe[safe_fill].fillna(0)

# –∑–∞—á–∏—Å—Ç–∫–∞ inf
for df in (train_fe, test_fe):
    arr = df[num_common].to_numpy()
    mask_bad = ~np.isfinite(arr)
    if mask_bad.any():
        arr[mask_bad] = 0.0
        df[num_common] = arr

# –ü–µ—Ä–µ—Å–±–æ—Ä–∫–∞ features –ø–æ —Ñ–∞–∫—Ç—É
features = [c for c in common_cols if c != 'id']
num_features = [c for c in features if is_numeric_dtype(train_fe[c])]
cat_features = [c for c in features if c not in num_features]

# –ö–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–µ –∫ —Å—Ç—Ä–æ–∫–∞–º/–∫–∞—Ç–µ–≥–æ—Ä–∏—è–º, –±–µ–∑ NaN
for df_name, df in (('train', train_fe), ('test', test_fe)):
    for c in cat_features:
        if df[c].isna().any():
            df[c] = df[c].fillna("<unknown>")
        if not (is_categorical_dtype(df[c]) or is_string_dtype(df[c])):
            df[c] = df[c].astype('string')

print(f"üìä –§–∏–Ω–∞–ª—å–Ω—ã–µ —Ä–∞–∑–º–µ—Ä—ã: train={train_fe.shape}, test={test_fe.shape}")
print(f"üß© –§–∏—á –≤—Å–µ–≥–æ: {len(features)} | —á–∏—Å–ª–æ–≤—ã—Ö: {len(num_features)} | –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö: {len(cat_features)}")


In [None]:
# –®–ê–ì 16: 5-fold CatBoost CV (MAE) + –∏–Ω—Ñ–µ—Ä–µ–Ω—Å –Ω–∞ test
params = dict(
    loss_function='MAE',
    eval_metric='MAE',
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=5.0,
    random_seed=RANDOM_SEED,
    iterations=10000,
    od_type='Iter',
    od_wait=200,
    verbose=200,
    allow_writing_files=False
)

# –±–µ—Ä—ë–º —Ç–æ–ª—å–∫–æ —Ä–∞–∑–º–µ—á–µ–Ω–Ω—ã–µ —Å—Ç—Ä–æ–∫–∏
labeled = train_fe.loc[(train_fe['target']>0) & train_fe['target'].notna()].reset_index(drop=True)
X = labeled[features]
y = labeled['target'].astype(float).values

# —Å—Ç—Ä–∞—Ç–∏—Ñ–∏–∫–∞—Ü–∏—è –ø–æ –±–∏–Ω–∞–º —Ç–∞—Ä–≥–µ—Ç–∞
labeled['target_bin'] = pd.qcut(labeled['target'], q=min(10, labeled['target'].nunique()), duplicates='drop').cat.codes
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

oof = np.zeros(len(labeled), dtype=float)
test_pred = np.zeros(len(test_fe), dtype=float)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, labeled['target_bin']), 1):
    tr_pool = Pool(X.iloc[tr_idx], label=y[tr_idx], cat_features=cat_features)
    va_pool = Pool(X.iloc[va_idx], label=y[va_idx], cat_features=cat_features)
    model = CatBoostRegressor(**params)
    print(f"\n--- –§–æ–ª–¥ {fold}/5 ---")
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
    oof[va_idx] = model.predict(va_pool)
    test_pred += model.predict(Pool(test_fe[features], cat_features=cat_features)) / 5.0

mae = mean_absolute_error(y, oof)
print(f"\n‚úÖ OOF MAE: {mae:.6f}")

final_pred = np.clip(test_pred, 1, 5)
print("pred stats:", float(final_pred.min()), float(final_pred.mean()), float(final_pred.max()))


In [None]:
# –®–ê–ì 17: –°–∞–±–º–∏—Ç
sub_name = f"submission_cat_5fold_seed{RANDOM_SEED}_d8.csv"
pd.DataFrame({'id': test_fe['id'].astype(str), 'target': final_pred.astype(float)}).to_csv(sub_name, index=False)
print("üíæ Saved:", sub_name)
