# Прогнозирование цен потребительского ритейла по тестовой выборке на основе глубоких нейронных сетей
__Выполнил:__ *Домченко Максим*

__Студент группы:__ *РИМ-130962*

#### Подключаем Google Drive и задаём корневую папку проекта

In [2]:
#  ===== 0. Подключаем Google Drive и задаём корневую папку проекта =====
from pathlib import Path
import sys, os

# Resolve GDRIVE_ROOT: first try the Colab mount; if the google.colab import
# fails or a ValueError is raised inside the try body, fall back to a locally
# synced Google Drive folder.
try:
    # Colab runtime
    from google.colab import drive
    drive.mount('/content/drive')
    GDRIVE_ROOT = Path("/content/drive/MyDrive")
except (ModuleNotFoundError, ValueError):
    # Local Jupyter + Google Drive for desktop
    #   (check where "My Drive" is actually mounted on your machine)
    possible = [
        Path.home() / "Google Drive",
        Path.home() / "Мой диск",  # folder name used by the Russian client
    ]
    # Pick the first candidate that exists on disk; None means "not found".
    GDRIVE_ROOT = None
    for candidate in possible:
        if candidate.exists():
            GDRIVE_ROOT = candidate
            break
    if GDRIVE_ROOT is None:
        sys.exit("Папка Google Drive не найдена. Проверьте путь.")

# ────────────────────────────────────────────────────────────────────────
PROJECT_DIR = GDRIVE_ROOT / "price_forecasting"
PROJECT_DIR.mkdir(parents=True, exist_ok=True)

# Single entry point from which every other path is derived
ROOT = PROJECT_DIR

# Data side: raw and intermediate datasets
DATA = ROOT / "data"
MODEL_READY = DATA / "model_ready"
SPLITS_WF = MODEL_READY / "splits_wf"

# Artifact side: models, metrics, images
ARTIFACTS = ROOT / "artifacts"
PLOTS = ARTIFACTS / "plots"
MODELS = ARTIFACTS / "models"
PREDICTIONS = ARTIFACTS / "predictions"
METRICS = ARTIFACTS / "metrics"

# Create the whole directory tree up front (no error if it already exists).
for d in [
    DATA,
    MODEL_READY,
    SPLITS_WF,
    ARTIFACTS,
    PLOTS,
    MODELS,
    PREDICTIONS,
    METRICS,
]:
    d.mkdir(parents=True, exist_ok=True)

print(f"Все файлы читаем/пишем в: {ROOT}")

Mounted at /content/drive
Все файлы читаем/пишем в: /content/drive/MyDrive/price_forecasting


In [3]:
!pip install --quiet catboost>=1.2

In [4]:
import pandas as pd, numpy as np, json, math, os
from sklearn.metrics import mean_squared_error
from datetime import datetime
from catboost import CatBoostRegressor, Pool

DF_PATH   = MODEL_READY / "dataset.parquet"
META_PATH = MODEL_READY / "features.json"

df = pd.read_parquet(DF_PATH)

# Read the feature metadata through a context manager so the file handle is
# closed deterministically; the explicit encoding keeps the read independent
# of the platform's locale default.
with open(META_PATH, encoding="utf-8") as meta_file:
    meta = json.load(meta_file)

FEATURES = meta["all_features"]
TARGET   = meta["target_log"]

# CatBoost: feature columns with "category"/"object" dtype are cast to str
# and later passed to Pool as categorical features.
cat_features = [c for c in FEATURES if df[c].dtype.name in ("category", "object")]
df[cat_features] = df[cat_features].astype(str)

print("rows:", len(df),
      "| features:", len(FEATURES),
      "| categorical:", len(cat_features))

rows: 57473650 | features: 39 | categorical: 10


In [5]:
def smape(a, f):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|f - a| / ((|a| + |f|) / 2)``; the result is 100 times the
    mean of those terms.

    Args:
        a: Actual values (array-like).
        f: Forecast values (array-like, broadcastable against ``a``).

    Returns:
        The sMAPE as a NumPy float scalar.

    Notes:
        Inputs are converted to float arrays, so plain lists are accepted.
        A term whose actual and forecast are both zero is counted as 0
        (a perfect prediction) instead of evaluating 0/0, which would turn
        the whole metric into NaN.
    """
    a = np.asarray(a, dtype=float)
    f = np.asarray(f, dtype=float)
    abs_err = np.abs(f - a)
    half_sum = (np.abs(a) + np.abs(f)) / 2
    # Divide only where the denominator is non-zero; remaining terms stay 0.
    terms = np.divide(
        abs_err,
        half_sum,
        out=np.zeros_like(abs_err),
        where=half_sum != 0,
    )
    return 100 * np.mean(terms)

# CatBoost-specific output folders for saved models and fold predictions.
MODELS_CAT = MODELS / "catboost"
MODELS_CAT.mkdir(parents=True, exist_ok=True)

PRED_CAT = PREDICTIONS / "catboost"
PRED_CAT.mkdir(parents=True, exist_ok=True)

In [6]:
def make_pool(idxs):
    """Build a CatBoost ``Pool`` from the rows of ``df`` selected by ``idxs``.

    Rows are selected with label-based ``df.loc``; the ``FEATURES`` columns
    become the data, the ``TARGET`` column becomes the label, and
    ``cat_features`` is forwarded as the list of categorical columns.

    Args:
        idxs: Index labels accepted by ``df.loc``.

    Returns:
        A ``Pool`` holding the selected features and labels.
    """
    features = df.loc[idxs, FEATURES]
    labels = df.loc[idxs, TARGET]
    return Pool(data=features, label=labels, cat_features=cat_features)

In [7]:
# Keyword arguments forwarded to CatBoostRegressor(**params).
params = dict(
    loss_function="RMSE",
    iterations=2000,  # upper bound; the overfitting detector is expected to stop earlier
    depth=10,
    learning_rate=0.05,
    l2_leaf_reg=3,
    bootstrap_type="Bernoulli",
    subsample=0.8,
    task_type="GPU",
    devices="0",
    random_seed=42,
    verbose=200,
    early_stopping_rounds=100,
)

In [8]:
# Per-fold metric records, appended in fold order.
all_metrics = []

for k in range(3):
    # Load the saved train / validation row indices for fold k.
    tr_idx = np.load(SPLITS_WF / f"wf_train_idx_k{k}.npy")
    vl_idx = np.load(SPLITS_WF / f"wf_val_idx_k{k}.npy")

    train_pool, val_pool = make_pool(tr_idx), make_pool(vl_idx)

    # Train with the validation pool as eval set; use_best_model=True keeps
    # only the iterations up to the best validation score.
    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    # -- artifacts: fitted model, then validation predictions
    model_path = MODELS_CAT / f"cat_k{k}.cbm"
    model.save_model(model_path)

    preds = model.predict(val_pool)
    np.save(PRED_CAT / f"preds_cat_k{k}.npy", preds)

    # -- metrics on the validation fold (labels fetched once, reused twice)
    y_val = val_pool.get_label()
    rmse = math.sqrt(mean_squared_error(y_val, preds))
    s_mp = smape(y_val, preds)

    fold_record = dict(fold=k, rmse_log=float(rmse), smape=float(s_mp))
    all_metrics.append(fold_record)
    print(f"Fold {k}: RMSE={rmse:.4f} | sMAPE={s_mp:.2f}%")

0:	learn: 0.5468453	test: 0.4652651	best: 0.4652651 (0)	total: 3.42s	remaining: 1h 54m 5s
bestTest = 0.4652651133
bestIteration = 0
Shrink model to first 1 iterations.
Fold 0: RMSE=0.4653 | sMAPE=29.72%
0:	learn: 0.5481860	test: 0.4574597	best: 0.4574597 (0)	total: 3.43s	remaining: 1h 54m 22s
200:	learn: 0.4405788	test: 0.4157157	best: 0.4152502 (193)	total: 11m 20s	remaining: 1h 41m 27s
bestTest = 0.4152501952
bestIteration = 193
Shrink model to first 194 iterations.
Fold 1: RMSE=0.4153 | sMAPE=24.63%
0:	learn: 0.5493649	test: 0.4834387	best: 0.4834387 (0)	total: 3.33s	remaining: 1h 51m 7s
bestTest = 0.483438713
bestIteration = 0
Shrink model to first 1 iterations.
Fold 2: RMSE=0.4834 | sMAPE=25.58%


In [9]:
METRICS_CAT = METRICS / "metrics_catboost.json"
import json
import pandas as pd

# Write the per-fold metrics together with a timestamp. The context manager
# flushes and closes the file as soon as the dump finishes, instead of leaving
# an open handle whose buffered content may not yet be on disk.
with open(METRICS_CAT, "w", encoding="utf-8") as metrics_file:
    json.dump(
        {"timestamp": datetime.now().isoformat(timespec='seconds'),
         "results":   all_metrics},
        metrics_file,
        indent=2,
    )

# Show the same records as a captioned table in the notebook output.
display(pd.DataFrame(all_metrics).style.set_caption("CatBoost GPU metrics"))
print("метрики сохранены", METRICS_CAT.relative_to(PROJECT_DIR))

Unnamed: 0,fold,rmse_log,smape
0,0,0.465265,29.724923
1,1,0.41525,24.630198
2,2,0.483439,25.577349


метрики сохранены artifacts/metrics/metrics_catboost.json


* CatBoost подтвердил результат LightGBM (0.45) — разница < 1 %.

* На «трудном» fold-1 CatBoost немного уступает LGBM (RMSE 0.416 против 0.401; чем меньше RMSE, тем лучше), но разрыв невелик.

* Значит, обе модели ценны для финального ансамбля.