In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor, Pool

# Column groups. Each group is declared once and reused below to build both
# the `usecols` list and the `dtype` mapping passed to `pd.read_csv`.

# target
TARGET_COLS = ["units"]

# past sales (lags)
# TODO: "rolling_mean_4w" is deliberately not loaded - the feature is broken and
# has to be rebuilt so that it does not include the day being predicted.
LAG_COLS = ["units_yesterday", "units_prev_week"]

# categorical codes
CATEGORICAL_COLS = ["store_code", "store_item_code"]

# weather measurements (float32)
WEATHER_COLS = [
    "tmax", "tmin", "tavg", "depart", "dewpoint", "wetbulb", "heat", "cool",
    "sunrise", "sunset",
    "snowfall", "preciptotal", "stnpressure", "sealevel",
    "resultspeed", "resultdir", "avgspeed",
]

# calendar and flag columns (int16)
YEAR_WEEK_COLS = ["year", "week"]
# NOTE(review): these look like weather-phenomenon codes - confirm against the data source
FLAG_COLS = [
    "BCFG", "BLDU", "BLSN", "BR", "DU", "DZ", "FG", "FU",
    "FZDZ", "FZFG", "FZRA", "GR", "GS", "HZ", "MIFG", "PL", "PRFG", "RA",
    "SG", "SN", "SQ", "TS", "TSRA", "TSSN", "UP", "VCFG", "VCTS",
]
CALENDAR_AND_STREAK_COLS = [
    "day_of_week", "month", "is_weekend", "is_holiday",
    "rain_streak", "dry_streak",
]

# look-ahead features
LOOKAHEAD_FLOAT_COLS = ["avg_temp_next_day", "rain_next_day"]
LOOKAHEAD_INT_COLS = ["days_to_holiday"]

# Columns actually read from the CSV.
cols = [
    *TARGET_COLS,
    *LAG_COLS,
    *CATEGORICAL_COLS,
    *WEATHER_COLS,
    *YEAR_WEEK_COLS,
    *FLAG_COLS,
    *CALENDAR_AND_STREAK_COLS,
    *LOOKAHEAD_FLOAT_COLS,
    *LOOKAHEAD_INT_COLS,
]

# Compact dtypes for every column.
dtypes = {
    # target: int16 covers -32 768 ... 32 767
    **dict.fromkeys(TARGET_COLS, "int16"),
    # past sales -> float32 (the entry for "rolling_mean_4w" is kept even though
    # the column is not in `cols` at the moment)
    **dict.fromkeys([*LAG_COLS, "rolling_mean_4w"], "float32"),
    # categorical codes
    **dict.fromkeys(CATEGORICAL_COLS, "category"),
    # weather and float look-ahead features -> float32
    **dict.fromkeys([*WEATHER_COLS, *LOOKAHEAD_FLOAT_COLS], "float32"),
    # calendar / flag columns and the integer look-ahead feature -> int16
    **dict.fromkeys(
        [
            *YEAR_WEEK_COLS,
            *CALENDAR_AND_STREAK_COLS,
            *FLAG_COLS,
            *LOOKAHEAD_INT_COLS,
        ],
        "int16",
    ),
}

full_table_df = pd.read_csv("./data/big_full_table.csv", usecols=cols, dtype=dtypes)

### Обучение модели

In [2]:

# ---------- 1. features / target ----------
# `full_table_df` must already be in memory (it is loaded by the previous cell).
import numpy as np  # also imported in the first cell; kept so this cell states its own dependency

X, y = full_table_df.drop(columns="units"), full_table_df["units"]

# Passed to every Pool below so CatBoost treats these columns as categorical.
cat_cols = ["store_code", "store_item_code"]

# ---------- 2. train / valid / test ----------
# NOTE(review): rows are split at random although the features include lagged
# sales ("units_yesterday", "units_prev_week"); if the rows form a time series,
# a chronological split may give a more honest estimate - confirm.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42
)
# resulting shares are roughly 75 / 12.5 / 12.5 %

# ---------- 3. CatBoost ----------
train_pool, valid_pool, test_pool = (
    Pool(features, target, cat_features=cat_cols)
    for features, target in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

# NOTE(review): the model minimises plain RMSE while the reported score is
# RMSLE; training on a log-transformed target might match the metric better -
# to be evaluated.
catboost_params = {
    "loss_function": "RMSE",         # objective: minimise RMSE
    "eval_metric": "RMSE",           # log-RMSE is computed by hand in step 4
    "learning_rate": 0.05,
    "depth": 8,
    "iterations": 3000,
    "early_stopping_rounds": 200,    # monitored on the validation pool
    "random_seed": 42,
    "verbose": 200,                  # log every 200 iterations
}
model = CatBoostRegressor(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

# ---------- 4. RMSLE on the held-out test part ----------
raw_pred = model.predict(test_pool)
# Negative predictions are clipped to zero before the log-based metric is applied.
y_pred = np.clip(raw_pred, 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(f"RMSLE on test = {rmsle:.4f}")

0:	learn: 36.4648308	test: 36.1887455	best: 36.1887455 (0)	total: 233ms	remaining: 11m 38s
200:	learn: 14.4521310	test: 14.6438356	best: 14.6438356 (200)	total: 44.5s	remaining: 10m 20s
400:	learn: 13.1916634	test: 14.3587645	best: 14.3587645 (400)	total: 1m 29s	remaining: 9m 43s
600:	learn: 12.3205511	test: 14.1341522	best: 14.1336345 (593)	total: 2m 7s	remaining: 8m 29s
800:	learn: 11.7647193	test: 14.0031653	best: 14.0031653 (800)	total: 2m 47s	remaining: 7m 39s
1000:	learn: 11.3411381	test: 13.9341598	best: 13.9340456 (997)	total: 3m 33s	remaining: 7m 7s
1200:	learn: 10.9785644	test: 13.8666043	best: 13.8654985 (1199)	total: 4m 13s	remaining: 6m 19s
1400:	learn: 10.6934362	test: 13.8192489	best: 13.8192489 (1400)	total: 4m 50s	remaining: 5m 31s
1600:	learn: 10.4298514	test: 13.7886120	best: 13.7886120 (1600)	total: 5m 27s	remaining: 4m 46s
1800:	learn: 10.1911307	test: 13.7553127	best: 13.7550055 (1799)	total: 6m 10s	remaining: 4m 6s
2000:	learn: 9.9712103	test: 13.7324169	best: 13

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# -- 1. numeric summary of feature importances
# Importances are computed on the training pool with CatBoost's
# "PredictionValuesChange" importance type.
feat_names = X_train.columns
importances = model.get_feature_importance(
    data=train_pool,
    type="PredictionValuesChange",
)

# One row per feature, most important first.
importance_table = pd.DataFrame(
    {"feature": feat_names, "importance": importances}
)
imp_df = importance_table.sort_values(by="importance", ascending=False)

top_features = imp_df.head(20)
print(top_features)   # top-20 as plain text


              feature  importance
2     units_yesterday   29.466343
3     units_prev_week   22.972018
1     store_item_code    5.524464
50        day_of_week    3.851140
16        stnpressure    3.415536
58    days_to_holiday    3.257297
17           sealevel    2.834928
19          resultdir    2.117419
21               year    2.088160
22               week    2.068444
0          store_code    2.043223
9             wetbulb    1.927825
51              month    1.840241
13             sunset    1.609161
18        resultspeed    1.431440
56  avg_temp_next_day    1.416673
4                tmax    1.279764
7              depart    1.247146
20           avgspeed    1.233116
55         dry_streak    1.059168


Мнение чата:

| Позиция          | RMSLE (≈)       | Что обычно отличает команды                                                 |
| ---------------- | --------------- | --------------------------------------------------------------------------- |
| 🥇 1‑е место     | **0.38 – 0.44** | сильная фич‑инженерия (лаги, сезонность, промо), тонкий hyper‑opt, ансамбли |
| 🥈 Топ‑3         | **0.44 – 0.48** | 1 × градиентный бустинг + набор умных признаков                             |
| 🥉 Топ‑10        | **0.48 – 0.55** | базовый бустинг, минимальный тюнинг, ограниченное число фич                 |
| Середина таблицы | 0.55 – 0.70     | «из коробки» модели, мало фич                                               |

У тебя уже 0.494 — граница входа в условный топ‑10. 
Оптимистичный прогноз: доведёшь до 0.42 – 0.45 — и шансы на победу вполне реальные.

Где ещё взять пару сотых:

- Лаги + rolling‑median по погоде (t‑7, t‑14 для tavg, precip).
- Target encoding категорий: item_nbr → средняя продажа по товару, store_nbr → сезонный коэффициент.
- Hyper‑opt (Optuna, 200 итераций) для depth, l2_leaf_reg, bagging_temperature.
- Blending CatBoost + LightGBM + Linear Reg (по лог‑таргету).

В сумме эти штрихи обычно дают −0.03 … −0.06 RMSLE.

Так что ориентир «< 0.45» держи как цель.

In [6]:
# Persist the trained model in CatBoost's binary format.
# The target directory is created first so that a long training run is not
# lost to a missing "../ml-models" folder (e.g. on a fresh checkout).
from pathlib import Path

MODEL_PATH = "../ml-models/CatBoost v1.cbm"
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
model.save_model(MODEL_PATH)