In [2]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor, Pool

# Column groups are declared once and reused for both the column selection
# (`cols`) and the dtype mapping (`dtypes`), so the two cannot drift apart.

# weather measurements -> float32
weather_cols = [
    "tmax", "tmin", "tavg", "depart", "dewpoint", "wetbulb", "heat", "cool",
    "sunrise", "sunset",
    "snowfall", "preciptotal", "stnpressure", "sealevel",
    "resultspeed", "resultdir", "avgspeed",
]

# calendar fields and flag columns -> int16
calendar_flag_cols = [
    "year", "week",
    "BCFG", "BLDU", "BLSN", "BR", "DU", "DZ", "FG", "FU",
    "FZDZ", "FZFG", "FZRA", "GR", "GS", "HZ", "MIFG", "PL", "PRFG", "RA",
    "SG", "SN", "SQ", "TS", "TSRA", "TSSN", "UP", "VCFG", "VCTS",
    "day_of_week", "week_of_year", "month", "is_weekend", "is_holiday",
    "rain_streak", "dry_streak",
]

# look-ahead features -> float32
lookahead_cols = ["avg_temp_next_day", "rain_next_day"]

# Columns to read: target first, then categorical codes, then the groups above.
cols = [
    "units",                            # target
    "store_code", "store_item_code",    # categorical
    *weather_cols,
    *calendar_flag_cols,
    *lookahead_cols,
]

# Column name -> dtype used when parsing the CSV.
dtypes = {
    "units": "int16",               # target; int16 range is -32 768 … 32 767
    "store_code": "category",       # categorical codes
    "store_item_code": "category",
}
dtypes.update(dict.fromkeys(weather_cols + lookahead_cols, "float32"))
dtypes.update(dict.fromkeys(calendar_flag_cols, "int16"))

# Read only the columns listed in `cols`, parsing each one with the dtype
# given for it in `dtypes`.
full_table_df = pd.read_csv("./data/big_full_table.csv", usecols=cols, dtype=dtypes)

### Обучение модели

In [3]:

# ---------- 1. build X / y ----------
# full_table_df is expected to be in memory already (from the loading step)
X = full_table_df.drop(columns=["units"])
y = full_table_df["units"]

cat_cols = ["store_code", "store_item_code"]  # handed to CatBoost as cat_features

# ---------- 2. train / valid / test ----------
# NOTE(review): this is a random row-level split; the notebook's TODO list
# plans to replace it with a time-based split.
# Hold out 25 % of the rows first, then cut that hold-out in half.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42
)
# result ≈ 75 / 12.5 / 12.5 %

# ---------- 3. CatBoost ----------
# One Pool per split; the categorical columns are declared explicitly.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols)

catboost_params = {
    "loss_function": "RMSE",       # RMSE is the optimised objective
    "eval_metric": "RMSE",         # the log-scale RMSE (RMSLE) is computed separately
    "learning_rate": 0.05,
    "depth": 8,
    "iterations": 3000,
    "early_stopping_rounds": 200,
    "random_seed": 42,
    "verbose": 200,                # log every 200 iterations
}
model = CatBoostRegressor(**catboost_params)

# The validation pool is the eval set that early stopping monitors.
model.fit(train_pool, eval_set=valid_pool)

# ---------- 4. RMSLE evaluation ----------
# Negative predictions are clipped to zero before the log-based metric is
# computed. numpy (`np`) is already imported at the top of the notebook.
y_pred = np.clip(model.predict(test_pool), 0, None)
msle = mean_squared_log_error(y_test, y_pred)
rmsle = np.sqrt(msle)
print(f"RMSLE on test = {rmsle:.4f}")

0:	learn: 38.3942143	test: 36.0998801	best: 36.0998801 (0)	total: 256ms	remaining: 12m 48s
200:	learn: 21.7511902	test: 18.0880524	best: 18.0880524 (200)	total: 30.8s	remaining: 7m 9s
400:	learn: 19.9465650	test: 16.5308284	best: 16.5308284 (400)	total: 1m 1s	remaining: 6m 39s
600:	learn: 18.7422247	test: 15.7323367	best: 15.7323367 (600)	total: 1m 32s	remaining: 6m 10s
800:	learn: 17.8761996	test: 15.3492421	best: 15.3490882 (799)	total: 2m 6s	remaining: 5m 46s
1000:	learn: 17.3497386	test: 15.0746884	best: 15.0746884 (1000)	total: 2m 41s	remaining: 5m 23s
1200:	learn: 16.9120805	test: 14.8432383	best: 14.8432383 (1200)	total: 3m 29s	remaining: 5m 13s
1400:	learn: 16.6078183	test: 14.6937647	best: 14.6937647 (1400)	total: 4m 3s	remaining: 4m 38s
1600:	learn: 16.2283236	test: 14.5624976	best: 14.5624976 (1600)	total: 4m 38s	remaining: 4m 3s
1800:	learn: 15.9199211	test: 14.4758379	best: 14.4756863 (1797)	total: 5m 22s	remaining: 3m 34s
2000:	learn: 15.6239330	test: 14.4153467	best: 14.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

# ── 1. numeric output
# Importances are computed on the training pool and paired with the
# training-frame column names.
feat_names = X_train.columns
importances = model.get_feature_importance(
    data=train_pool, type="PredictionValuesChange"
)

imp_df = pd.DataFrame({"feature": feat_names, "importance": importances})
imp_df = imp_df.sort_values(by="importance", ascending=False)

print(imp_df.head(20))   # top-20 to the console


              feature  importance
1     store_item_code   44.956097
0          store_code   19.786625
14        stnpressure   12.800558
19               year    2.985862
48        day_of_week    2.502447
15           sealevel    1.946131
20               week    1.387837
55  avg_temp_next_day    1.330867
11             sunset    1.322102
50              month    1.213924
5              depart    1.207614
10            sunrise    1.204074
49       week_of_year    0.851160
2                tmax    0.803249
17          resultdir    0.781204
51         is_weekend    0.674209
54         dry_streak    0.493591
7             wetbulb    0.480115
16        resultspeed    0.444003
8                heat    0.414978


TODO:
- Лог‑таргет и калибровка
y_log = np.log1p(units) → обучаем, на выводе pred = np.expm1(raw).clip(0)

- Временной split вместо random, чтобы оценка была по-честному (без утечки будущего в train)

- Лаги продаж - units_t-1, t-7, t-14, rolling_mean_4w

- Скользящие агрегаты погоды: temp_avg_7d, rain_sum_14d. Реакция спроса часто не мгновенная.

- days_to_holiday: сколько дней до ближайшего праздника

- Кластерные признаки магазинов/товаров store_cluster, item_family

- HyperOpt / Optuna для CatBoost
искать depth, l2_leaf_reg, bagging_temperature, learning_rate
Часто даёт −3 … −10 % RMSLE.

- GrowPolicy = Lossguide / Depthwise
Lossguide быстрее и иногда точнее на высокой кардинальности.

- Stacking / Blending
CatBoost + LightGBM + Prophet → meta‑model
Смешанные модели ловят разные аспекты; rmsle ↓.
