In [None]:
import ast
import re
from typing import Tuple, List

import geopandas as gpd
import h3
import numpy as np
import optuna
import pandas as pd
import rasterio
from catboost import CatBoostRegressor, Pool, cv
from geopy.distance import geodesic
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [45]:
# import pandas as pd
# from sklearn.cluster import KMeans

# def add_multiple_kmeans_clusters(train_path: str, test_path: str, random_state: int = 42):
#     train_df = pd.read_csv(train_path)
#     test_df = pd.read_csv(test_path)

#     feature_cols = [c for c in train_df.columns if c.startswith('feat_')]
#     X_train = train_df[feature_cols].values
#     X_test = test_df[feature_cols].values

#     cluster_counts = [2, 10, 20]

#     for n_clusters in cluster_counts:
#         print(f"Обучаем KMeans для k={n_clusters}...")
#         kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')
#         kmeans.fit(X_train)

#         train_df[f'cluster_{n_clusters}'] = pd.Categorical(kmeans.predict(X_train))
#         test_df[f'cluster_{n_clusters}'] = pd.Categorical(kmeans.predict(X_test))

#     train_df.to_csv('train_with_multi_clusters.csv', index=False)
#     test_df.to_csv('test_with_multi_clusters.csv', index=False)

#     print("Файлы сохранены: train_with_multi_clusters.csv и test_with_multi_clusters.csv")

# add_multiple_kmeans_clusters("train_bert_features.csv", "test_bert_features.csv")


In [46]:
# Load the three tab-separated input tables (train objects, test objects, reviews).
train, test, reviews = (
    pd.read_csv(path, sep='\t')
    for path in ("train.tsv", "test.tsv", "reviews.tsv")
)

In [47]:
# Parse the textual `coordinates` column into numeric `lon` / `lat` columns.
# `ast.literal_eval` only accepts Python literals, so a malformed or malicious
# cell cannot execute code the way `eval` could.
# NOTE(review): assumes every value is a two-element "(lon, lat)" / "[lon, lat]" literal — confirm against the data.
test[['lon', 'lat']] = pd.DataFrame(
    test['coordinates'].map(ast.literal_eval).tolist(), index=test.index
)
train[['lon', 'lat']] = pd.DataFrame(
    train['coordinates'].map(ast.literal_eval).tolist(), index=train.index
)

In [48]:
# The raw `coordinates` text is no longer needed once `lon` / `lat` exist.
for frame in (train, test):
    frame.drop(columns=['coordinates'], inplace=True)

In [49]:
def apply_custom_features(df: pd.DataFrame, lat_bins=50, lon_bins=50) -> pd.DataFrame:
    """Return a copy of ``df`` extended with hand-crafted ratio / index features.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the ``*_300m`` / ``*_1000m`` columns referenced below
        (a missing column raises ``KeyError``). ``lat`` / ``lon`` are optional.
    lat_bins, lon_bins : int or sequence of bin edges, default 50
        Passed to ``pd.cut`` as ``bins``. An integer derives the edges from the
        range of *this* frame, so bin ids from two different frames are not
        comparable; pass the same explicit edges for train and test to get
        consistent ``lat_bin`` / ``lon_bin`` / ``location_cluster`` values.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()

    def ratio(a, b):
        # Element-wise a / b with 0 wherever the denominator is exactly 0.
        return np.where(b != 0, a / b, 0)

    # --- density and demographic ratios -------------------------------------
    df['traffic_density_ratio'] = ratio(df['traffic_300m'], df['traffic_1000m'])
    df['homes_density_ratio'] = ratio(df['homes_300m'], df['homes_1000m'])
    df['works_density_ratio'] = ratio(df['works_300m'], df['works_1000m'])
    df['female_ratio'] = ratio(df['female_300m'], df['female_1000m'])
    df['employed_ratio'] = ratio(df['employed_300m'], df['unemployed_300m'] + 1)
    df['children_ratio'] = ratio(df['has_children_300m'], df['no_children_300m'] + 1)

    # Weighted age buckets divided by (sum of buckets + 1).
    df['mean_age_index'] = (
        17 * df['age_<17_300m'] + 21 * df['age_18-24_300m'] +
        30 * df['age_25-34_300m'] + 40 * df['age_35-44_300m'] +
        50 * df['age_45-54_300m'] + 60 * df['age_>55_300m']
    ) / (
        df[['age_<17_300m', 'age_18-24_300m', 'age_25-34_300m',
            'age_35-44_300m', 'age_45-54_300m', 'age_>55_300m']].sum(axis=1) + 1
    )

    df['income_balance_index'] = (
        df['premium_income_300m'] * 3 + df['high_income_300m'] * 2 +
        df['above_average_income_300m'] - df['below_average_income_300m']
    )
    df['education_index'] = (
        df['higher_education_300m'] * 3 +
        df['secondary_education_300m'] * 2 -
        df['no_higher_education_300m']
    )

    # --- interest / commerce aggregates -------------------------------------
    df['food_index'] = (
        df['restaurants_cafes_bars_300m'] + df['food_delivery_300m'] +
        df['pizza_delivery_300m'] + df['sushi_delivery_300m'] +
        df['coffee_shops_300m'] + df['grocery_stores_300m']
    )
    df['shopping_index'] = (
        df['online_shops_300m'] + df['clothing_shoes_accessories_300m'] +
        df['home_goods_300m'] + df['beauty_salons_300m'] +
        df['cosmetics_and_perfumes_300m'] + df['mens_clothing_300m'] +
        df['womens_clothing_300m']
    )
    df['health_index'] = (
        df['clinics_300m'] + df['pharmacy_300m'] +
        df['beauty_and_health_devices_300m'] + df['childrens_medicine_300m']
    )
    df['education_activity_index'] = (
        df['education_300m'] + df['courses_300m'] + df['language_courses_300m']
    )

    df['food_to_shops_ratio'] = ratio(df['food_index'], df['shopping_index'] + 1)
    df['health_to_education_ratio'] = ratio(df['health_index'], df['education_activity_index'] + 1)
    df['cafe_to_population_ratio'] = ratio(df['restaurants_cafes_bars_300m'], df['homes_300m'] + 1)
    df['shops_to_population_ratio'] = ratio(df['shopping_index'], df['homes_300m'] + 1)

    df['car_enthusiasm_index'] = (
        df['car_owners_300m'] + df['car_services_300m'] +
        df['car_parts_300m'] + df['car_market_300m'] +
        df['interest_in_buying_new_car_300m']
    )
    df['public_transport_ratio'] = ratio(df['train_ticket_order_300m'], df['car_owners_300m'] + 1)

    df['leisure_index'] = (
        df['leisure_and_entertainment_300m'] + df['sports_300m'] +
        df['cultural_leisure_events_300m'] + df['movies_and_series_300m'] +
        df['restaurants_cafes_300m'] + df['bars_300m']
    )
    df['culture_to_leisure_ratio'] = ratio(df['culture_300m'], df['leisure_index'] + 1)

    df['female_to_male_ratio'] = ratio(df['female_300m'], df['male_300m'] + 1)
    df['married_to_unmarried_ratio'] = ratio(df['married_300m'], df['not_married_300m'] + 1)
    df['employed_to_population_ratio'] = ratio(df['employed_300m'], df['homes_300m'] + 1)
    df['income_per_home'] = ratio(df['mean_income_300m'], df['homes_300m'] + 1)

    df['traffic_per_shop'] = ratio(df['traffic_300m'], df['shopping_index'] + 1)
    df['traffic_per_food'] = ratio(df['traffic_300m'], df['food_index'] + 1)
    df['traffic_per_leisure'] = ratio(df['traffic_300m'], df['leisure_index'] + 1)
    df['shops_density_1000_to_300'] = ratio(df['shopping_index'], df['online_shops_1000m'] + 1)

    df['total_commercial_index'] = (
        df['food_index'] + df['shopping_index'] +
        df['health_index'] + df['education_activity_index']
    )
    df['social_activity_index'] = (
        df['culture_300m'] + df['society_300m'] + df['politics_300m'] + df['events_300m']
    )
    df['digital_activity_index'] = (
        df['online_video_300m'] + df['computer_games_300m'] + df['manga_300m'] +
        df['anime_300m'] + df['doramas_300m'] + df['k-pop_300m']
    )

    # --- coarse location grid -----------------------------------------------
    if 'lat' in df.columns and 'lon' in df.columns:
        df['lat_bin'] = pd.cut(df['lat'], bins=lat_bins, labels=False)
        df['lon_bin'] = pd.cut(df['lon'], bins=lon_bins, labels=False)
        # pd.cut yields NaN for missing / out-of-range coordinates; a plain
        # .astype(int) would raise on those rows, so they get the sentinel -1.
        df['location_cluster'] = (
            (df['lat_bin'] * 100 + df['lon_bin']).fillna(-1).astype(int)
        )
        df['lat_lon_sum'] = df['lat'] + df['lon']
        df['lat_lon_diff'] = df['lat'] - df['lon']

    for col in ['traffic_300m', 'homes_300m', 'works_300m', 'food_index', 'shopping_index']:
        df[f'log_{col}'] = np.log1p(df[col])

    return df

In [50]:
# Add the hand-crafted ratio / index features to both splits.
train, test = map(apply_custom_features, (train, test))

In [51]:
# Read the precomputed per-id BERT feature tables, then left-join them on `id`.
train_bert_features, test_bert_features = (
    pd.read_csv(path)
    for path in ("train_bert_features.csv", "test_bert_features.csv")
)
train = pd.merge(train, train_bert_features, how="left", on="id")
test = pd.merge(test, test_bert_features, how="left", on="id")
# intfloat_features = pd.read_csv("intfloat_features.csv")
# train = train.merge(intfloat_features, on="id", how="left")
# test = test.merge(intfloat_features, on="id", how="left")

In [52]:
def add_knn_features(train_df, test_df, feature_prefix="feat_", target_col="target", n_neighbors=20):
    """Add target statistics of the cosine-nearest train rows as features.

    For every row whose ``feature_prefix`` columns are all non-null, the
    mean / std / min / max of ``target_col`` over its nearest train rows is
    written to ``knn_target_{mean,std,min,max}``. Other rows keep NaN.

    Train rows are queried with ``kneighbors()`` *without* a query matrix, which
    makes scikit-learn exclude each point from its own neighbour list. Querying
    the fitted data explicitly would return every row as its own closest
    neighbour and leak the row's own target into its feature.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Modified in place (new columns are added) and also returned.
    feature_prefix : str
        Columns of ``train_df`` starting with this prefix form the embedding.
    target_col : str
        Target column in ``train_df``.
    n_neighbors : int
        Requested neighbour count; capped at (number of usable train rows - 1).

    Returns
    -------
    (train_df, test_df)
    """
    feat_cols = [col for col in train_df.columns if col.startswith(feature_prefix)]
    stat_funcs = {"mean": np.mean, "std": np.std, "min": np.min, "max": np.max}

    for stat_name in stat_funcs:
        train_df[f"knn_target_{stat_name}"] = np.nan
        test_df[f"knn_target_{stat_name}"] = np.nan

    valid_train = train_df[feat_cols].notna().all(axis=1)
    valid_test = test_df[feat_cols].notna().all(axis=1)

    # Nothing to fit on: no embedding columns, or fewer than two usable rows
    # (a row needs at least one neighbour other than itself).
    n_fit = int(valid_train.sum())
    if not feat_cols or n_fit < 2:
        return train_df, test_df

    k = min(n_neighbors, n_fit - 1)
    model = NearestNeighbors(n_neighbors=k, metric="cosine")
    model.fit(train_df.loc[valid_train, feat_cols])
    fit_targets = train_df.loc[valid_train, target_col].to_numpy()

    def _write_stats(frame, row_mask, neighbour_idx):
        # neighbour_idx indexes into the rows the model was fitted on.
        neigh_targets = fit_targets[neighbour_idx]
        for stat_name, func in stat_funcs.items():
            frame.loc[row_mask, f"knn_target_{stat_name}"] = func(neigh_targets, axis=1)

    # X omitted -> neighbours of the fitted points, self excluded.
    _, train_idxs = model.kneighbors()
    _write_stats(train_df, valid_train, train_idxs)

    if valid_test.any():
        _, test_idxs = model.kneighbors(test_df.loc[valid_test, feat_cols])
        _write_stats(test_df, valid_test, test_idxs)

    return train_df, test_df

# Append the kNN target statistics to both splits.
train, test = add_knn_features(train_df=train, test_df=test)

In [53]:
tfidf_features = pd.read_csv("id_features.csv")
train = train.merge(tfidf_features, on="id", how="left")
test = test.merge(tfidf_features, on="id", how="left")

In [54]:
R_EARTH = 6_371_000.0

def _safe_div(a, b):
    b = np.asarray(b)
    return np.where(b != 0, np.asarray(a) / b, np.nan)

# (name, lat, lon) triples of metro stations used as reference points.
# Each station must appear once: a repeated entry is counted twice in the
# radius counts and pulls the k-nearest mean distances towards that station.
METRO_STATIONS = [
    ("Боровицкая",55.75034,37.60857), ("Тверская",55.7652,37.60352),
    ("Каширская",55.65412,37.64738), ("Сокол",55.80518,37.51495),
    ("Южная",55.62122,37.60752), ("Митино (стр.)",55.84589,37.35909),
    ("Новокузнецкая",55.74212,37.62901), ("Проспект Вернадского",55.67613,37.5045),
    ("ВДНХ",55.82177,37.64107), ("Динамо",55.78867,37.55936),
    ("Щелковская",55.80955,37.79884), ("Театральная",55.75857,37.6177),
    ("Савеловская",55.79421,37.58666), ("Царицино",55.62011,37.66939),
    ("Волгоградский проспект",55.7243,37.68795), ("Пушкинская",55.76565,37.60417),
    ("Китай город",55.75634,37.63002), ("Беговая",55.77378,37.54412),
    ("Рижская",55.79222,37.63557), ("Беляево",55.64371,37.52762),
    ("Крылатское",55.75879,37.40633), ("Электрозаводская",55.78177,37.70471),
    ("Полянка",55.73654,37.61856), ("Аннино (стр.)",55.581818,37.594978),
    ("Домодедовская",55.61009,37.71612), ("Измайловская",55.78768,37.78329),
    ("Тургеневская",55.7646,37.63623), ("Перово",55.75109,37.78854),
    ("Новослободская",55.77921,37.6009), ("Маяковская",55.76909,37.59635),
    ("Комсомольская",55.77717,37.655689), ("Авиамоторная",55.75208,37.71677),
    ("Улица 1905 года",55.76355,37.56375), ("Отрадное",55.86417,37.60488),
    ("Калужская",55.65566,37.53923), ("Аэропорт",55.79981,37.53412),
    ("Парк Победы (стр.)",55.736559,37.512591), ("Шаболовская",55.71886,37.60797),
    ("Октябрьское поле",55.793615,37.493496), ("Войковская",55.81811,37.49905),
    ("Киевская",55.74388,37.56673), ("Фили",55.74673,37.51384),
    ("Александровский сад",55.75219,37.60836), ("Марьина роща (стр.)",55.793602,37.615762),
    ("Парк культуры",55.73512,37.59328), ("Таганская",55.74255,37.65389),
    ("Лубянка",55.75876,37.62573), ("Октябрьская",55.729,37.61139),
    ("Печатники",55.69252,37.7295), ("Тимирязевская",55.81842,37.57571),
    ("Юго-западная",55.66464,37.48421), ("Владыкино",55.84669,37.59251),
    ("Орехово",55.61214,37.69584), ("Цветной бульвар",55.7716,37.62058),
    ("Баррикадная",55.76027,37.58111), ("Павелецкая",55.7313,37.63612),
    ("Теплый стан",55.61814,37.50814), ("Черкизовская",55.802,37.74438),
    ("Академическая",55.68808,37.57501),
    ("Бибирево",55.88294,37.60523), ("Кантемировская",55.6343,37.65632),
    ("Волоколамская (стр.)",55.83459,37.38367), ("Новогиреево",55.75111,37.81564),
    ("Южнопортовая (нов.)",55.70622,37.68899),
    ("Рассказовка",55.63282,37.33274),
    ("Косино",55.7029,37.8513),
    ("Нижегородская",55.7325,37.7287),
    ("Беломорская",55.8647,37.4743),
    ("Мичуринский проспект",55.6878,37.4906),
    ("Ломоносовский проспект",55.6982,37.5191),
    ("Хорошевская",55.7767,37.5251),
    ("Окружная",55.8489,37.5732),
    ("Зюзино",55.6568,37.5821)
]


# drop_duplicates is a guard so that a station accidentally re-added to the
# list later cannot be double-counted.
metro_df = (
    pd.DataFrame(METRO_STATIONS, columns=["name","lat","lon"])
    .drop_duplicates(subset=["lat", "lon"])
    .reset_index(drop=True)
)

def enrich_with_metro_features(train_df, test_df, metro_df,
                               lat_col="lat", lon_col="lon",
                               ks=(1, 3, 5), radii=(300, 500, 1000),
                               prefix="metro"):
    """Add distance-to-station and station-count features to both frames in place.

    For every ``k`` in ``ks`` the mean haversine distance (metres) to the ``k``
    nearest reference points is stored in ``{prefix}_mean_k{k}_m``; when the
    effective ``k`` is 1 the nearest distance is additionally stored in
    ``{prefix}_dist_min_m``. For every ``r`` in ``radii`` the number of reference
    points within ``r`` metres is stored in ``{prefix}_cnt_r{r}``. Rows with
    non-finite coordinates get NaN. Returns ``None``; prints a message and does
    nothing when ``metro_df`` is missing/empty or has no finite coordinates.
    """
    if metro_df is None or metro_df.empty:
        print(f"[{prefix}] dataset is empty — skipped")
        return

    station_lat = pd.to_numeric(metro_df["lat"], errors="coerce").to_numpy()
    station_lon = pd.to_numeric(metro_df["lon"], errors="coerce").to_numpy()
    usable = np.isfinite(station_lat) & np.isfinite(station_lon)
    coords_rad = np.column_stack(
        (np.deg2rad(station_lat[usable]), np.deg2rad(station_lon[usable]))
    )

    if coords_rad.size == 0:
        print(f"[{prefix}] no valid coordinates — skipped")
        return

    tree = BallTree(coords_rad, metric="haversine")
    n_stations = coords_rad.shape[0]

    def _scatter(values, valid_rows, n_rows):
        # Place per-valid-row values into a float32 column, NaN elsewhere.
        column = np.full(n_rows, np.nan, dtype=np.float32)
        column[valid_rows] = values
        return column

    for frame in (train_df, test_df):
        n_rows = len(frame)
        lat = pd.to_numeric(frame[lat_col], errors="coerce").to_numpy()
        lon = pd.to_numeric(frame[lon_col], errors="coerce").to_numpy()
        valid_rows = np.isfinite(lat) & np.isfinite(lon)
        query_rad = np.deg2rad(np.column_stack((lat[valid_rows], lon[valid_rows])))

        for k in ks:
            # Never ask for more neighbours than there are reference points.
            k_use = min(k, n_stations)
            dist_rad, _ = tree.query(query_rad, k=k_use)
            dist_m = dist_rad * R_EARTH

            if k_use == 1:
                frame[f"{prefix}_dist_min_m"] = _scatter(dist_m[:, 0], valid_rows, n_rows)

            frame[f"{prefix}_mean_k{k_use}_m"] = _scatter(dist_m.mean(axis=1), valid_rows, n_rows)

        for r in radii:
            neighbours = tree.query_radius(query_rad, r / R_EARTH)
            counts = np.array([len(found) for found in neighbours], dtype=np.int32)
            frame[f"{prefix}_cnt_r{r}"] = _scatter(counts, valid_rows, n_rows)

# Add metro distance / count features to both splits (frames are modified in place).
enrich_with_metro_features(
    train_df=train,
    test_df=test,
    metro_df=metro_df,
    ks=(1, 3, 5),
    radii=(300, 500, 1000),
    prefix="metro",
)


In [55]:
# Reference point (degrees) from which bearings are measured.
KREMLIN_CENTER_LAT, KREMLIN_CENTER_LON = 55.752023, 37.617499


def add_center_angle_features(train_df, test_df, n_sectors=16):
    """Add direction-from-centre features to both frames in place.

    For each row the angle ``atan2(lat - centre_lat, lon - centre_lon)`` is
    computed in raw degree space and stored as ``center_angle_sin`` /
    ``center_angle_cos``, plus a sector index ``center_sector_{n_sectors}`` in
    ``0 .. n_sectors - 1``. Rows with non-numeric coordinates get NaN.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Must contain ``lat`` and ``lon``; new columns are added in place.
    n_sectors : int
        Number of equal angular sectors the circle is split into.
    """
    for df in (train_df, test_df):
        lat = pd.to_numeric(df["lat"], errors="coerce").values
        lon = pd.to_numeric(df["lon"], errors="coerce").values
        ang = np.arctan2(lat - KREMLIN_CENTER_LAT, lon - KREMLIN_CENTER_LON)
        df["center_angle_sin"] = np.sin(ang).astype("float32")
        df["center_angle_cos"] = np.cos(ang).astype("float32")
        sec = np.floor(((ang + np.pi) / (2 * np.pi)) * n_sectors)
        # ang == +pi (point due west of the centre) maps to n_sectors, one past
        # the last valid index; it is the same direction as -pi, i.e. sector 0.
        sec = np.where(sec >= n_sectors, 0, sec)
        df[f"center_sector_{n_sectors}"] = sec.astype("float32")

# Add bearing-from-centre features to both splits (frames are modified in place).
add_center_angle_features(train_df=train, test_df=test, n_sectors=16)

In [56]:
def add_category_diversity(train_df, test_df, radii=(200,300)):
    """Add category-diversity features of each point's neighbourhood, in place.

    Train and test points are pooled. For every radius ``r`` (metres) and every
    point with finite coordinates, the categories of the *other* pooled points
    within ``r`` are summarised as Shannon entropy (``cat_entropy_r{r}``) and
    Herfindahl index (``cat_hhi_r{r}``). Points with no neighbours or with
    non-finite coordinates get NaN.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Must contain ``lat``, ``lon`` and ``category``.
    radii : iterable of numbers
        Neighbourhood radii in metres.
    """
    all_df = pd.concat([train_df[["lat","lon","category"]],
                        test_df[["lat","lon","category"]]], ignore_index=True)
    lat = pd.to_numeric(all_df["lat"], errors="coerce").values
    lon = pd.to_numeric(all_df["lon"], errors="coerce").values
    # Fill before casting: astype(str) would turn missing values into "nan"
    # and leave nothing for fillna to replace.
    cats = all_df["category"].fillna("Unknown").astype(str).values
    ok = np.isfinite(lat) & np.isfinite(lon)
    ok_idx = np.where(ok)[0]
    ntr = len(train_df)

    tree = None
    if ok_idx.size:
        X_ok = np.c_[np.deg2rad(lat[ok]), np.deg2rad(lon[ok])]
        tree = BallTree(X_ok, metric="haversine")

    for r in radii:
        ent = np.full(len(all_df), np.nan, dtype=np.float32)
        hhi = np.full(len(all_df), np.nan, dtype=np.float32)

        if tree is not None:
            ind = tree.query_radius(X_ok, r=r/R_EARTH, return_distance=False)
            for i, nb_local in enumerate(ind):
                # `i` and `nb_local` index the valid-coordinate subset; map
                # them back to rows of all_df before reading or writing.
                gidx = ok_idx[i]
                nb = ok_idx[nb_local]
                nb = nb[nb != gidx]
                if nb.size == 0:
                    continue
                _, cnts = np.unique(cats[nb], return_counts=True)
                p = cnts / cnts.sum()
                ent[gidx] = float(-(p * np.log(p + 1e-12)).sum())   # Shannon
                hhi[gidx] = float((p**2).sum())                     # Herfindahl

        # Split the pooled result back into train / test.
        train_df[f"cat_entropy_r{r}"] = ent[:ntr]
        test_df[f"cat_entropy_r{r}"]  = ent[ntr:]
        train_df[f"cat_hhi_r{r}"]     = hhi[:ntr]
        test_df[f"cat_hhi_r{r}"]      = hhi[ntr:]

# Add neighbourhood category-diversity features (frames are modified in place).
add_category_diversity(train_df=train, test_df=test, radii=(200, 300))

In [57]:
# import pandas as pd
# import folium
# from folium.plugins import FastMarkerCluster

# def df_to_map(df, lon_col="lon", lat_col="lat", output_html="map.html"):
#     """
#     """
#     if lon_col not in df.columns or lat_col not in df.columns:
#         raise KeyError(f"DataFrame должен содержать колонки '{lon_col}' и '{lat_col}'")

#     df_clean = df.copy()
#     df_clean = df_clean.dropna(subset=[lon_col, lat_col])
#     df_clean = df_clean[
#         (df_clean[lon_col].between(-180, 180)) & 
#         (df_clean[lat_col].between(-90, 90))
#     ]

#     points = df_clean[[lat_col, lon_col]].values.tolist()

#     m = folium.Map(location=[0, 0], zoom_start=2, tiles="CartoDB positron", control_scale=True)

#     FastMarkerCluster(points).add_to(m)

#     if points:
#         lats = [p[0] for p in points]
#         lons = [p[1] for p in points]
#         m.fit_bounds([[min(lats), min(lons)], [max(lats), max(lons)]])

#     m.save(output_html)
#     return output_html


# df_to_map(train)

In [58]:
# (lat, lon) reference point used for the distance feature.
moscow_center = (55.751244, 37.618423)


def calculate_distance(lat, lon):
    """Return the geodesic distance in metres from ``moscow_center`` to ``(lat, lon)``."""
    point = (lat, lon)
    return geodesic(moscow_center, point).meters

In [59]:
def take_h3_indx(lat, long, size):
    """Return the H3 cell for latitude ``lat`` / longitude ``long`` at resolution ``size``."""
    return h3.latlng_to_cell(lat, long, size)

In [60]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between points 1 and 2 (inputs in degrees)."""
    earth_radius_km = 6371.0088
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    cos_product = np.cos(np.radians(lat1)) * np.cos(np.radians(lat2))
    hav = np.sin(half_dlat) ** 2 + cos_product * np.sin(half_dlon) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

In [61]:
def preprocess_geo_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with H3, distance, address and raster features.

    Adds:
      * ``h3_first_indx_{small,mid,norm,large,super_large}`` — H3 cells at
        resolutions 11, 9, 8, 7 and 6;
      * ``distance_to_moscow`` — result of ``calculate_distance(lat, lon)``;
      * ``is_moscow`` — whether ``address`` contains the substring 'Москва';
      * ``rasterio_density`` / ``rasterio_population`` — values sampled at each
        point from the two GeoTIFF files, with nodata and negative values
        replaced by NaN.

    ``df`` must contain ``lat``, ``lon`` and ``address``; it is not modified.
    """
    result_df = df.copy()
    lat_values = result_df['lat'].to_numpy()
    lon_values = result_df['lon'].to_numpy()

    for size, name in [
        (11, 'small'),
        (9, 'mid'),
        (8, 'norm'),
        (7, 'large'),
        (6, 'super_large')
    ]:
        # take_h3_indx expects (lat, lon, resolution). Passing the pair as
        # (lon, lat) silently indexes a different place on the globe.
        result_df[f"h3_first_indx_{name}"] = [
            take_h3_indx(lat, lon, size)
            for lat, lon in zip(lat_values, lon_values)
        ]

    result_df['distance_to_moscow'] = [
        calculate_distance(lat, lon)
        for lat, lon in zip(lat_values, lon_values)
    ]

    result_df['is_moscow'] = (
        result_df['address'].str.contains('Москва', na=False)
    )

    # The raster sampler is given (lon, lat) pairs, i.e. x first, then y.
    coords = list(zip(result_df["lon"], result_df["lat"]))

    def _sample_raster(path, desc):
        # First-band value at every point; the file's nodata marker becomes NaN.
        with rasterio.open(path) as dataset:
            return [
                (val[0] if val[0] != dataset.nodata else np.nan)
                for val in tqdm(dataset.sample(coords), total=len(coords), desc=desc)
            ]

    result_df["rasterio_density"] = _sample_raster("rus_pd_2020_1km.tif", "Processing density")
    result_df.loc[result_df["rasterio_density"] < 0, "rasterio_density"] = np.nan

    result_df["rasterio_population"] = _sample_raster("rus_ppp_2020_constrained.tif", "Processing population")
    result_df.loc[result_df["rasterio_population"] < 0, "rasterio_population"] = np.nan

    return result_df


In [62]:
# Geo / raster features for the train split.
processed_train = preprocess_geo_data(df=train)

Processing density: 100%|██████████| 41105/41105 [00:02<00:00, 17768.87it/s]
Processing population: 100%|██████████| 41105/41105 [00:04<00:00, 9748.85it/s] 


In [63]:
# Geo / raster features for the test split.
processed_test = preprocess_geo_data(df=test)

Processing density: 100%|██████████| 9276/9276 [00:00<00:00, 17484.51it/s]
Processing population: 100%|██████████| 9276/9276 [00:02<00:00, 4363.85it/s] 


In [64]:
def add_text_features(df: pd.DataFrame, reviews_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join per-``id`` review statistics onto ``df``.

    ``reviews_df`` must have ``id`` and ``text`` columns. Per id this computes
    the number of non-null reviews, the mean and max review length, and
    case-insensitive counts of several Russian sentiment markers (positive
    forms are counted only when not directly preceded by the particle "не").
    Ids without reviews get NaN in all added columns.

    Returns a new frame; neither input is modified.
    """
    # The lookbehind is anchored with \b so that only the standalone particle
    # "не" negates the following word. Without it any word ending in "не"
    # (e.g. "мне нравится") was treated as a negation and not counted.
    count_patterns = {
        "count_no": r"\bно\b",
        "count_ne_nrav": r"\bне\s+нравится\b",
        "count_nrav": r"(?<!\bне\s)\bнравится\b",
        "count_horosh": r"(?<!\bне\s)\bхорош\w*\b",
        "count_ne_horosh": r"\bне\s+хорош\w*\b",
        "count_otlich": r"(?<!\bне\s)\bотличн\w*\b",
        "count_hotya": r"\bхотя\b",
        "count_ponrav": r"(?<!\bне\s)\bпонрав\w*\b",
        "count_ne_ponrav": r"\bне\s+понрав\w*\b",
        "count_priyatn": r"(?<!\bне\s)\bприятн\w*\b",
        "count_ne_priyatn": r"\bне\s+приятн\w*\b",
    }

    aggregations = {
        "text_count": "count",
        "text_mean_len": lambda texts: texts.str.len().mean(),
        "text_max_len": lambda texts: texts.str.len().max(),
    }
    # Real lambdas are used (not named functions) so that pandas can
    # disambiguate them; the pattern is bound through a default argument.
    for column, pattern in count_patterns.items():
        aggregations[column] = (
            lambda texts, _pattern=pattern:
            texts.str.count(_pattern, flags=re.IGNORECASE).sum()
        )

    agg_df = (
        reviews_df
        .groupby("id")["text"]
        .agg(**aggregations)
        .reset_index()
    )

    return df.merge(agg_df, on="id", how="left")

In [65]:
# Join the per-id review statistics onto both splits.
processed_train = add_text_features(df=processed_train, reviews_df=reviews)
processed_test = add_text_features(df=processed_test, reviews_df=reviews)

In [None]:
# text_features = ['text_count', 'text_mean_len', 'text_max_len', 'count_no', 'count_ne_nrav', 'count_nrav', 'count_horosh', 'count_ne_horosh', 'count_otlich', 'count_hotya', 'count_ponrav', 'count_ne_ponrav', 'count_priyatn', 'count_ne_priyatn']

# text_train = pd.read_csv("text_features_train.csv")
# text_test = pd.read_csv("text_features_test.csv")

# processed_train = pd.concat([processed_train, text_train], axis=1)
# processed_test = pd.concat([processed_test, text_test], axis=1)


In [33]:
# text_train = processed_train[text_features]
# text_test = processed_test[text_features]
# text_train.to_csv("text_features_train.csv", index=False)
# text_test.to_csv("text_features_test.csv", index=False)


In [66]:
def add_geo_features(processed_train: pd.DataFrame, processed_test: pd.DataFrame):
    """Add neighbourhood category features computed against the train points.

    For radii 300, 600 and 1000 m, each row gets ``geo_samecat_sim_{r}m`` (the
    share of train points within the radius that have the row's category); for
    300 and 1000 m it also gets ``geo_samecat_cnt_{r}m`` and
    ``geo_allcat_cnt_{r}m``. For train rows the point itself is excluded from
    its own neighbourhood. Rows with no neighbours get 0 in all three columns.

    Both frames must contain ``lat``, ``lon`` and ``category``. Returns
    ``(processed_train, processed_test)`` as new frames with the feature
    columns concatenated.
    """
    def compute_features(df_base, df_ref, radius_m_list, use_target=True, desc_prefix="", exclude_self=False):
        # `use_target` is accepted for compatibility; no target-based feature
        # is produced here, so it does not affect the output.
        coords_ref = np.radians(df_ref[['lat', 'lon']].values)
        tree = BallTree(coords_ref, metric='haversine')
        coords_base = np.radians(df_base[['lat', 'lon']].values)

        # Read categories positionally: the loop counter below is a row
        # position, not an index label, so label-based `.loc[i]` is only
        # correct for a default RangeIndex.
        base_cats = df_base['category'].to_numpy()
        ref_cats = df_ref['category'].to_numpy()
        n_ref = len(df_ref)

        features = pd.DataFrame(index=df_base.index)

        for r_m in radius_m_list:
            tqdm_desc = f"{desc_prefix}{r_m}m"
            r = r_m / 6371000  # metres -> radians on a sphere of radius 6371 km

            ind_in_radius = tree.query_radius(coords_base, r)

            samecat_cnt, allcat_cnt, samecat_sim = [], [], []

            for i, inds in tqdm(enumerate(ind_in_radius), total=len(ind_in_radius), desc=tqdm_desc, leave=False):
                if exclude_self and i < n_ref:
                    # With df_base being df_ref, position i is the point itself.
                    inds = inds[inds != i]

                if len(inds) == 0:
                    samecat_cnt.append(0)
                    allcat_cnt.append(0)
                    samecat_sim.append(0)
                    continue

                mask_same = ref_cats[inds] == base_cats[i]
                samecat_cnt.append(mask_same.sum())
                allcat_cnt.append(len(inds))
                samecat_sim.append(mask_same.mean())

            features[f'geo_samecat_sim_{r_m}m'] = samecat_sim

            if r_m in [300, 1000]:
                features[f'geo_samecat_cnt_{r_m}m'] = samecat_cnt
                features[f'geo_allcat_cnt_{r_m}m'] = allcat_cnt

        return features

    radii = [300, 600, 1000]

    print("▶️ Calculating features for TRAIN (self excluded)...")
    train_features = compute_features(processed_train, processed_train, radii,
                                      use_target=True, desc_prefix="train ", exclude_self=True)

    print("▶️ Calculating features for TEST (using train targets)...")
    test_features = compute_features(processed_test, processed_train, radii,
                                     use_target=True, desc_prefix="test ", exclude_self=False)

    processed_train = pd.concat([processed_train, train_features], axis=1)
    processed_test = pd.concat([processed_test, test_features], axis=1)
    return processed_train, processed_test


In [35]:
# processed_train, processed_test = add_geo_features(processed_train, processed_test)

In [36]:
# new_cols = [c for c in processed_train_geo.columns if c not in processed_train.columns]
# train_geo_features = processed_train_geo[new_cols]
# test_geo_features = processed_test_geo[new_cols]

# train_geo_features.to_csv("geo_features_train.csv", index=False)
# test_geo_features.to_csv("geo_features_test.csv", index=False)

# print(f"✅ Saved geo_features_train.csv ({train_geo_features.shape})")
# print(f"✅ Saved geo_features_test.csv ({test_geo_features.shape})")

In [67]:
# Load the cached geo feature tables and append them column-wise.
# NOTE(review): concatenation aligns on the index — presumably both sides carry a default 0..n-1 index; confirm.
geo_train, geo_test = (
    pd.read_csv(path)
    for path in ("geo_features_train.csv", "geo_features_test.csv")
)

processed_train = pd.concat((processed_train, geo_train), axis="columns")
processed_test = pd.concat((processed_test, geo_test), axis="columns")

In [68]:
# Target-derived geo columns present in the cached files that are excluded from modelling.
not_needed_geo_features = [
    'geo_samecat_mean_target_300m',
    'geo_allcat_mean_target_300m',
    'geo_samecat_wmean_target_300m',
    'geo_allcat_wmean_target_300m',
    'geo_samecat_mean_target_600m',
    'geo_allcat_mean_target_600m',
    'geo_samecat_mean_target_1000m',
    'geo_allcat_mean_target_1000m',
    'geo_samecat_wmean_target_1000m',
    'geo_allcat_wmean_target_1000m',
]
processed_train = processed_train.drop(not_needed_geo_features, axis=1)
processed_test = processed_test.drop(not_needed_geo_features, axis=1)

In [39]:
# selected_features_loaded = pd.read_csv('selected_features.csv')['selected_features'].tolist()
# selected_features_loaded.append('target')

# processed_train = processed_train[selected_features_loaded]

# selected_features_loaded.remove('target')
# processed_test = processed_test[selected_features_loaded]

In [69]:
# Keep only train rows whose target is at least 1.
processed_train = processed_train.loc[processed_train['target'].ge(1)]

In [70]:
def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """Drop identifier columns and normalise categorical columns for modelling.

    * removes ``id``, ``name`` and ``address`` (``KeyError`` if one is missing);
    * in ``category`` columns, replaces missing values with the category
      ``'missing'`` (added only if it is not already one of the categories);
    * converts ``object`` columns to ``category`` after filling missing values
      with ``"missing"``.

    Returns
    -------
    (result_df, cat_features)
        A new frame and the names of all its ``category`` columns, in column
        order. The input frame is not modified.
    """
    result_df = df.drop(columns=['id', 'name', 'address'])

    for col in result_df.select_dtypes(include=['category']).columns:
        column = result_df[col]
        # add_categories raises ValueError if the category already exists.
        if 'missing' not in column.cat.categories:
            column = column.cat.add_categories(['missing'])
        result_df[col] = column.fillna('missing')

    for col in result_df.select_dtypes(include=['object']).columns:
        result_df[col] = result_df[col].fillna("missing").astype("category")

    cat_features = result_df.select_dtypes(include=['category']).columns.tolist()
    return result_df, cat_features

In [71]:
# Final model-ready frames; the categorical column list is taken from the train split.
processed_train, cat_features = preprocess_data(df=processed_train)
processed_test, _ = preprocess_data(df=processed_test)

In [72]:
# Separate the target from the feature matrix.
y_train = processed_train['target']
X_train = processed_train.drop(columns=['target'])

In [None]:
# Fixed, shuffled 5-fold splitter shared by all trials.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def objective(trial):
    """Optuna objective: mean validation MAE of a CatBoost regressor over the ``kf`` folds.

    Samples ``learning_rate``, ``depth``, ``l2_leaf_reg`` and
    ``min_data_in_leaf`` from ``trial``; reads ``X_train``, ``y_train`` and
    ``cat_features`` from the notebook namespace.
    """
    # Keyword arguments are evaluated left to right, so the order of the
    # suggest_* calls matches the order in which they are written.
    sampled = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.02, 0.3, step=0.01),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0, step=0.5),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 64),
    )
    params = {
        "loss_function": "MAE",
        "eval_metric": "MAE",
        **sampled,
        "iterations": 2000,
        "task_type": "GPU",
        "random_seed": 42,
        "verbose": False,
        "early_stopping_rounds": 100,
    }

    def _fold_mae(fit_idx, holdout_idx):
        # Train on one fold split (early stopping on the hold-out part) and score it.
        X_fit, X_holdout = X_train.iloc[fit_idx], X_train.iloc[holdout_idx]
        y_fit, y_holdout = y_train.iloc[fit_idx], y_train.iloc[holdout_idx]

        fit_pool = Pool(X_fit, y_fit, cat_features=cat_features)
        holdout_pool = Pool(X_holdout, y_holdout, cat_features=cat_features)

        regressor = CatBoostRegressor(**params)
        regressor.fit(fit_pool, eval_set=holdout_pool)

        return mean_absolute_error(y_holdout, regressor.predict(X_holdout))

    fold_scores = [_fold_mae(fit_idx, holdout_idx) for fit_idx, holdout_idx in kf.split(X_train)]
    return np.mean(fold_scores)


# Minimise the cross-validated MAE with a short sequential search.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=2, n_jobs=1)


In [37]:
# Hyper-parameters of the best trial found by the Optuna study.
best_params = study.best_params

In [None]:
# Train one CatBoost model per fold with the tuned hyper-parameters and keep
# every fitted model for later averaging.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fixed_params = dict(
    loss_function="MAE",
    eval_metric="MAE",
    iterations=2000,
    task_type="GPU",
    random_seed=42,
    verbose=100,
    early_stopping_rounds=100,
)

models, scores, mae_scores = [], [], []

for train_idx, val_idx in kf.split(X_train):
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    model = CatBoostRegressor(**fixed_params, **best_params)
    model.fit(train_pool, eval_set=val_pool, verbose=False)

    preds = model.predict(X_val)
    score = model.score(X_val, y_val)
    mae = mean_absolute_error(y_val, preds)
    print(mae)

    models.append(model)
    scores.append(score)
    mae_scores.append(mae)

print("Средний R² скор:", np.mean(scores))
print("Средний MAE:", np.mean(mae_scores))


In [None]:
# Collect per-fold feature importances and average them per feature.
feature_importances = [
    fold_model.get_feature_importance(prettified=True)
    .rename(columns={'Feature Id': 'feature', 'Importances': 'importance'})
    [['feature', 'importance']]
    for fold_model in models
]

df_fi = pd.concat(feature_importances)
mean_fi = df_fi.groupby('feature', as_index=False)['importance'].mean()
mean_fi = mean_fi.sort_values('importance', ascending=False)

In [None]:
# Average the test predictions of all fold models.
fold_predictions = [m.predict(processed_test) for m in models]
catboost_predictions = np.mean(fold_predictions, axis=0)


In [None]:
# Pair every test id with its averaged prediction.
submission_df = test[['id']].assign(target=catboost_predictions)

In [50]:
# Write the submission file without the index column.
submission_df.to_csv(path_or_buf='catboost_submission49.csv', index=False)

In [1]:
# import lightgbm as lgb
# from sklearn.metrics import mean_absolute_error

# params = {
#     "objective": "regression_l1",
#     "metric": "mae",
#     "learning_rate": 0.02,
#     "max_depth": 7,
#     "num_leaves": 2**7 - 1,
#     "n_estimators": 800,
#     "random_state": 42,
#     "verbosity": -1,
# }

# train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
# val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_features)

# model_lgb = lgb.train(
#     params,
#     train_data,
#     valid_sets=[val_data],
#     callbacks=[
#         lgb.early_stopping(500),
#         lgb.log_evaluation(100),
#     ],
# )

# y_pred = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
# mae = mean_absolute_error(y_val, y_pred)
# print(f"Validation MAE: {mae:.4f}")


In [None]:
# lgb_preds = model_lgb.predict(processed_test)

In [None]:
# submission_df = pd.DataFrame({'id': test['id'], 'target': lgb_preds})

In [None]:
# submission_df.to_csv('lgb3.csv', index=False)

In [None]:
# mixed_submission = pd.DataFrame(
#     {
#     'id': test['id'], 
#     'target': (lgb_preds * 0.2 + catboost_predictions * 0.8)
#     }
# )
# mixed_submission.to_csv('combined_submission.csv', index=False)