In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from pandas.api.types import CategoricalDtype
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

# Columns to load from the CSV, assembled group by group in the order listed.
cols = (
    # target
    ["units"]
    # past sales (lags); "rolling_mean_4w" stays excluded: the feature is broken and
    # has to be rebuilt so that it does not include the day being predicted
    + ["units_yesterday", "units_prev_week"]
    # categorical codes
    + ["store_code", "store_item_code"]
    # weather (float32)
    + [
        "tmax", "tmin", "tavg", "depart", "dewpoint", "wetbulb", "heat", "cool",
        "sunrise", "sunset",
        "snowfall", "preciptotal", "stnpressure", "sealevel",
        "resultspeed", "resultdir", "avgspeed",
    ]
    # calendar fields and flags (int16)
    + ["year", "week"]
    + [
        "BCFG", "BLDU", "BLSN", "BR", "DU", "DZ", "FG", "FU",
        "FZDZ", "FZFG", "FZRA", "GR", "GS", "HZ", "MIFG", "PL", "PRFG", "RA",
        "SG", "SN", "SQ", "TS", "TSRA", "TSSN", "UP", "VCFG", "VCTS",
    ]
    + ["day_of_week", "month", "is_weekend", "is_holiday"]
    + ["rain_streak", "dry_streak"]
    # look-ahead features
    + ["avg_temp_next_day", "rain_next_day", "days_to_holiday"]
)

# Column -> dtype map handed to pd.read_csv.
# Each (dtype, columns) pair below is expanded into one entry per column, in order.
dtypes = {
    column: dtype
    for dtype, columns in (
        # target; int16 covers -32 768 ... 32 767
        ("int16", ["units"]),
        # past sales -> float32 ("rolling_mean_4w" keeps its entry here although it is
        # currently not among the columns listed in `cols`)
        ("float32", ["units_yesterday", "units_prev_week", "rolling_mean_4w"]),
        # categorical codes
        ("category", ["store_code", "store_item_code"]),
        # weather -> float32
        ("float32", [
            "tmax", "tmin", "tavg", "depart", "dewpoint", "wetbulb", "heat", "cool",
            "sunrise", "sunset",
            "snowfall", "preciptotal", "stnpressure", "sealevel",
            "resultspeed", "resultdir", "avgspeed",
            "avg_temp_next_day", "rain_next_day",
        ]),
        # calendar fields / flags -> int16
        ("int16", [
            "year", "week", "day_of_week", "month",
            "is_weekend", "is_holiday", "rain_streak", "dry_streak",
            "BCFG", "BLDU", "BLSN", "BR", "DU", "DZ", "FG", "FU", "FZDZ", "FZFG",
            "FZRA", "GR", "GS", "HZ", "MIFG", "PL", "PRFG", "RA", "SG", "SN", "SQ",
            "TS", "TSRA", "TSSN", "UP", "VCFG", "VCTS", "days_to_holiday",
        ]),
    )
    for column in columns
}

# Read only the columns named in `cols`, casting each one to the dtype declared in `dtypes`.
full_table_df = pd.read_csv("./data/big_full_table.csv", dtype=dtypes, usecols=cols)

### Обучение модели

In [2]:

# ---------- 1. prepare X / y ----------
# full_table_df is already in memory (loaded in the previous step)
X = full_table_df.drop(columns=["units"])  # every loaded column except the target
y = full_table_df["units"]                 # target

# categorical feature columns; CatBoost handles them on its own
cat_cols = ["store_code", "store_item_code"]

# ---------- 2. train / valid / test ----------
# First hold out 25 % of the rows, then cut that hold-out in half:
# the result is roughly 75 / 12.5 / 12.5 %.
# NOTE(review): this is a random row-level split while the features contain lagged
# sales (units_yesterday, units_prev_week); if rows are consecutive days, a
# time-based split may give a more honest estimate - confirm.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, random_state=42, test_size=0.25
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, random_state=42, test_size=0.5
)

# ---------- 3. CatBoost ----------
# One Pool per split, built in train / valid / test order with the same categorical columns.
train_pool, valid_pool, test_pool = (
    Pool(features, target, cat_features=cat_cols)
    for features, target in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

catboost_params = dict(
    loss_function="RMSE",        # the objective being minimised is RMSE
    eval_metric="RMSE",          # log-RMSE (RMSLE) is computed by hand afterwards
    learning_rate=0.05,
    depth=8,
    iterations=3000,
    early_stopping_rounds=200,
    random_seed=42,
    verbose=200,                 # progress is logged every 200 iterations
)
model = CatBoostRegressor(**catboost_params)
# the validation pool is the eval set; the test pool is not used during fitting
model.fit(train_pool, eval_set=valid_pool)

# ---------- 4. RMSLE evaluation ----------
# numpy is already imported in the first cell, so it is not re-imported here.
# RMSLE is only defined for non-negative values, so negative predictions are clipped to 0.
y_pred = model.predict(test_pool).clip(min=0)
# root of the mean squared log error on the held-out test split
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(f"RMSLE on test = {rmsle:.4f}")

0:	learn: 36.4648308	test: 36.1887455	best: 36.1887455 (0)	total: 233ms	remaining: 11m 38s
200:	learn: 14.4521310	test: 14.6438356	best: 14.6438356 (200)	total: 44.5s	remaining: 10m 20s
400:	learn: 13.1916634	test: 14.3587645	best: 14.3587645 (400)	total: 1m 29s	remaining: 9m 43s
600:	learn: 12.3205511	test: 14.1341522	best: 14.1336345 (593)	total: 2m 7s	remaining: 8m 29s
800:	learn: 11.7647193	test: 14.0031653	best: 14.0031653 (800)	total: 2m 47s	remaining: 7m 39s
1000:	learn: 11.3411381	test: 13.9341598	best: 13.9340456 (997)	total: 3m 33s	remaining: 7m 7s
1200:	learn: 10.9785644	test: 13.8666043	best: 13.8654985 (1199)	total: 4m 13s	remaining: 6m 19s
1400:	learn: 10.6934362	test: 13.8192489	best: 13.8192489 (1400)	total: 4m 50s	remaining: 5m 31s
1600:	learn: 10.4298514	test: 13.7886120	best: 13.7886120 (1600)	total: 5m 27s	remaining: 4m 46s
1800:	learn: 10.1911307	test: 13.7553127	best: 13.7550055 (1799)	total: 6m 10s	remaining: 4m 6s
2000:	learn: 9.9712103	test: 13.7324169	best: 13

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# -- 1. numeric output
# Per-feature importance of the fitted model, computed on the training pool.
feat_names = X_train.columns
importances = model.get_feature_importance(train_pool, type="PredictionValuesChange")

# one row per feature, most important first
imp_df = pd.DataFrame({"feature": feat_names, "importance": importances})
imp_df = imp_df.sort_values(by="importance", ascending=False)

# top 20 to the console
print(imp_df.head(20))


              feature  importance
2     units_yesterday   29.466343
3     units_prev_week   22.972018
1     store_item_code    5.524464
50        day_of_week    3.851140
16        stnpressure    3.415536
58    days_to_holiday    3.257297
17           sealevel    2.834928
19          resultdir    2.117419
21               year    2.088160
22               week    2.068444
0          store_code    2.043223
9             wetbulb    1.927825
51              month    1.840241
13             sunset    1.609161
18        resultspeed    1.431440
56  avg_temp_next_day    1.416673
4                tmax    1.279764
7              depart    1.247146
20           avgspeed    1.233116
55         dry_streak    1.059168


Мнение чата:

| Позиция          | RMSLE (≈)       | Что обычно отличает команды                                                 |
| ---------------- | --------------- | --------------------------------------------------------------------------- |
| 🥇 1-е место     | **0.38 – 0.44** | сильная фич-инженерия (лаги, сезонность, промо), тонкий hyper-opt, ансамбли |
| 🥈 Топ-3         | **0.44 – 0.48** | 1 × градиентный бустинг + набор умных признаков                             |
| 🥉 Топ-10        | **0.48 – 0.55** | базовый бустинг, минимальный тюнинг, ограниченное число фич                 |
| Середина таблицы | 0.55 – 0.70     | модели «из коробки», мало фич                                               |

У тебя уже 0.494 — граница входа в условный топ-10.
Оптимистичный прогноз: доведёшь до 0.42 – 0.45 — и шансы на победу вполне реальные.

Где ещё взять пару сотых:

- Лаги + rolling-median по погоде (t-7, t-14 для tavg, precip).
- Target encoding категорий: item_nbr → средняя продажа по товару, store_nbr → сезонный коэффициент.
- Hyper-opt (Optuna, 200 итераций) для depth, l2_leaf_reg, bagging_temperature.
- Blending CatBoost + LightGBM + Linear Reg (по лог-таргету).

В сумме эти штрихи обычно дают −0.03 … −0.06 RMSLE.

Так что ориентир «< 0.45» держи как цель.

In [6]:
# Persist the trained model to disk.
model_path = Path("../ml-models/CatBoost v1.cbm")
# Create the target folder first so that a missing directory cannot make the save fail
# after the long training run.
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save_model(str(model_path))