# Прогнозирование цен потребительского ритейла по тестовой выборке на основе глубоких нейронных сетей
__Выполнил:__ *Домченко Максим*

__Студент группы:__ *РИМ-130962*

#### Подключаем Google Drive и задаём корневую папку проекта

In [1]:
#  ===== 0. Подключаем Google Drive и задаём корневую папку проекта =====
from pathlib import Path
import sys, os

try:
    # вариант Colab
    from google.colab import drive
    drive.mount('/content/drive')
    GDRIVE_ROOT = Path("/content/drive/MyDrive")
except (ModuleNotFoundError, ValueError):
    # локальный Jupyter + Google Drive for desktop
    #   (проверьте, где именно у вас смонтирован «Мой Диск»)
    possible = [
        Path.home() / "Google Drive",
        Path.home() / "Мой диск"            # рус. версия клиента
    ]
    GDRIVE_ROOT = next((p for p in possible if p.exists()), None)
    if GDRIVE_ROOT is None:
        sys.exit("Папка Google Drive не найдена. Проверьте путь.")

# ────────────────────────────────────────────────────────────────────────
PROJECT_DIR = GDRIVE_ROOT / "price_forecasting"
PROJECT_DIR.mkdir(parents=True, exist_ok=True)

# Единая «точка входа» для остальных путей
ROOT                     = PROJECT_DIR
DATA                     = ROOT / "data"               # сырые и промежуточные датасеты
MODEL_READY              = DATA / "model_ready"
SPLITS_WF                = MODEL_READY / "splits_wf"
ARTIFACTS                = ROOT / "artifacts"          # модели, метрики, изображения
PLOTS                    = ARTIFACTS / "plots"
MODELS                   = ARTIFACTS / "models"
PREDICTIONS              = ARTIFACTS / "predictions"
METRICS                  = ARTIFACTS / "metrics"

for d in (DATA, MODEL_READY, SPLITS_WF, ARTIFACTS, PLOTS, MODELS, PREDICTIONS, METRICS):
    d.mkdir(parents=True, exist_ok=True)

print(f"Все файлы читаем/пишем в: {ROOT}")

Mounted at /content/drive
Все файлы читаем/пишем в: /content/drive/MyDrive/price_forecasting


In [2]:
!pip install --quiet xgboost>=2.0

In [3]:
import pandas as pd, numpy as np, json, os, math
from pandas.api.types import is_datetime64_any_dtype
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import datetime

df   = pd.read_parquet(MODEL_READY / "dataset.parquet")
meta = json.load(open(MODEL_READY / "features.json"))

# datetime → int32 (сек.)
for c in meta["all_features"]:
    if is_datetime64_any_dtype(df[c]):
        df[c] = (df[c].view("int64") // 10**9).astype("int32")

cat_cols = [c for c in meta["all_features"] if df[c].dtype.name in ("category","object")]
for c in cat_cols:
    df[c] = df[c].astype("category").cat.codes.astype("int32")

num_cols = [c for c in meta["all_features"] if c not in cat_cols]
df[num_cols] = df[num_cols].astype(np.float32)

FEATURES = meta["all_features"]
TARGET   = meta["target_log"]

  df[c] = (df[c].view("int64") // 10**9).astype("int32")


In [4]:
def make_qdmatrix(idxs):
    X = df.loc[idxs, FEATURES]
    y = df.loc[idxs, TARGET]
    return xgb.QuantileDMatrix(data=X, label=y,
                               max_bin=256,  # как в params
                               enable_categorical=False)

In [5]:
params_gpu = {
    "tree_method":      "gpu_hist",
    "objective":        "reg:squarederror",
    "eval_metric":      "rmse",
    "learning_rate":    0.05,
    "max_depth":        10,
    "subsample":        0.8,
    "colsample_bytree": 0.8,
    "max_bin":          256,
    "lambda":           1,
    "alpha":            0,
    "random_state":     42
}
NUM_ROUND = 2000
EARLY_STOP = 100

In [6]:
MODELS_XGB = MODELS / "xgb";      MODELS_XGB.mkdir(parents=True, exist_ok=True)
PRED_XGB   = PREDICTIONS / "xgb"; PRED_XGB.mkdir(parents=True, exist_ok=True)

In [7]:
all_metrics = []

for k in range(3):
    tr_idx = np.load(SPLITS_WF / f"wf_train_idx_k{k}.npy")
    vl_idx = np.load(SPLITS_WF / f"wf_val_idx_k{k}.npy")

    dtrain = make_qdmatrix(tr_idx)
    dval   = make_qdmatrix(vl_idx)

    booster = xgb.train(
        params_gpu,
        dtrain,
        num_boost_round=NUM_ROUND,
        evals=[(dtrain,"train"),(dval,"val")],
        early_stopping_rounds=EARLY_STOP,
        verbose_eval=200
    )

    booster.save_model(MODELS_XGB / f"xgb_k{k}.json")

    preds = booster.predict(dval, iteration_range=(0, booster.best_iteration+1))
    np.save(PRED_XGB / f"preds_xgb_k{k}.npy", preds)

    rmse = math.sqrt(mean_squared_error(dval.get_label(), preds))
    smp  = 100*np.mean(np.abs(preds - dval.get_label()) /
                       ((np.abs(preds)+np.abs(dval.get_label()))/2))
    all_metrics.append({"fold":k, "rmse_log":float(rmse), "smape":float(smp)})
    print(f"Fold {k}: RMSE={rmse:.4f} | sMAPE={smp:.2f}%")


    E.g. tree_method = "hist", device = "cuda"



[0]	train-rmse:0.54480	val-rmse:0.46760
[200]	train-rmse:0.34043	val-rmse:0.45000
[400]	train-rmse:0.26880	val-rmse:0.42916
[600]	train-rmse:0.22434	val-rmse:0.41983
[800]	train-rmse:0.19123	val-rmse:0.41535
[924]	train-rmse:0.17612	val-rmse:0.41581



    E.g. tree_method = "hist", device = "cuda"



Fold 0: RMSE=0.4140 | sMAPE=26.66%



    E.g. tree_method = "hist", device = "cuda"



[0]	train-rmse:0.54624	val-rmse:0.45658
[200]	train-rmse:0.34357	val-rmse:0.40352
[400]	train-rmse:0.26814	val-rmse:0.38575
[600]	train-rmse:0.22576	val-rmse:0.37908
[800]	train-rmse:0.19366	val-rmse:0.37062
[1000]	train-rmse:0.16899	val-rmse:0.36728
[1200]	train-rmse:0.14972	val-rmse:0.36375
[1266]	train-rmse:0.14439	val-rmse:0.36411



    E.g. tree_method = "hist", device = "cuda"



Fold 1: RMSE=0.3636 | sMAPE=22.11%



    E.g. tree_method = "hist", device = "cuda"



[0]	train-rmse:0.54724	val-rmse:0.48325
[200]	train-rmse:0.34870	val-rmse:0.45115
[400]	train-rmse:0.27372	val-rmse:0.42489
[600]	train-rmse:0.22758	val-rmse:0.41075
[800]	train-rmse:0.19657	val-rmse:0.40184
[1000]	train-rmse:0.17228	val-rmse:0.39449
[1200]	train-rmse:0.15292	val-rmse:0.39018
[1400]	train-rmse:0.13748	val-rmse:0.38677
[1600]	train-rmse:0.12502	val-rmse:0.38382
[1800]	train-rmse:0.11476	val-rmse:0.38184
[1999]	train-rmse:0.10513	val-rmse:0.38046



    E.g. tree_method = "hist", device = "cuda"



Fold 2: RMSE=0.3804 | sMAPE=18.95%


In [8]:
METRICS_XGB = METRICS / "metrics_xgb.json"
import json, pandas as pd
json.dump({"timestamp":datetime.now().isoformat(timespec='seconds'),
           "results":all_metrics},
          open(METRICS_XGB,"w"), indent=2)

display(pd.DataFrame(all_metrics).style.set_caption("XGBoost GPU metrics"))
print("✔ metrics saved →", METRICS_XGB.relative_to(PROJECT_DIR))

Unnamed: 0,fold,rmse_log,smape
0,0,0.413958,26.659771
1,1,0.363619,22.107506
2,2,0.380393,18.950853


✔ metrics saved → artifacts/metrics/metrics_xgb.json


XGBoost дал ещё −15 % к ошибке относительно LightGBM/CatBoost. Отличная база для ансамбля.