# Прогнозирование цен потребительского ритейла по тестовой выборке на основе глубоких нейронных сетей
__Выполнил:__ *Домченко Максим*

__Студент группы:__ *РИМ-130962*

#### Подключаем Google Drive и задаём корневую папку проекта

In [1]:
#  ===== 0. Подключаем Google Drive и задаём корневую папку проекта =====
from pathlib import Path
import sys, os

try:
    # вариант Colab
    from google.colab import drive
    drive.mount('/content/drive')
    GDRIVE_ROOT = Path("/content/drive/MyDrive")
except (ModuleNotFoundError, ValueError):
    # локальный Jupyter + Google Drive for desktop
    #   (проверьте, где именно у вас смонтирован «Мой Диск»)
    possible = [
        Path.home() / "Google Drive",
        Path.home() / "Мой диск"            # рус. версия клиента
    ]
    GDRIVE_ROOT = next((p for p in possible if p.exists()), None)
    if GDRIVE_ROOT is None:
        sys.exit("Папка Google Drive не найдена. Проверьте путь.")

# ────────────────────────────────────────────────────────────────────────
PROJECT_DIR = GDRIVE_ROOT / "price_forecasting"
PROJECT_DIR.mkdir(parents=True, exist_ok=True)

# Единая «точка входа» для остальных путей
ROOT                     = PROJECT_DIR
DATA                     = ROOT / "data"               # сырые и промежуточные датасеты
MODEL_READY              = DATA / "model_ready"
SPLITS_WF                = MODEL_READY / "splits_wf"
ARTIFACTS                = ROOT / "artifacts"          # модели, метрики, изображения
PLOTS                    = ARTIFACTS / "plots"
MODELS                   = ARTIFACTS / "models"
PREDICTIONS              = ARTIFACTS / "predictions"
METRICS                  = ARTIFACTS / "metrics"

for d in (DATA, MODEL_READY, SPLITS_WF, ARTIFACTS, PLOTS, MODELS, PREDICTIONS, METRICS):
    d.mkdir(parents=True, exist_ok=True)

print(f"Все файлы читаем/пишем в: {ROOT}")

Mounted at /content/drive
Все файлы читаем/пишем в: /content/drive/MyDrive/price_forecasting


In [2]:
!pip install --quiet lightgbm>=4.1 optuna

In [3]:
import pandas as pd, numpy as np, json, math, os, pickle
from pandas.api.types import is_datetime64_any_dtype
import lightgbm as lgb, optuna
from sklearn.metrics import mean_squared_error
from datetime import datetime

DF_PATH   = MODEL_READY / "dataset.parquet"
META_PATH = MODEL_READY / "features.json"

df   = pd.read_parquet(DF_PATH)
meta = json.load(open(META_PATH))

RAW_FEATURES = [c for c in meta["all_features"] if not is_datetime64_any_dtype(df[c])]

# --- разделяем на "узкие" и "широкие" категориальные -----------------
MAX_BIN = 255
SMALL_CAT = [c for c in RAW_FEATURES
             if df[c].dtype.name == "category" and df[c].nunique() <= MAX_BIN]
LARGE_CAT = [c for c in RAW_FEATURES
             if df[c].dtype.name == "category" and df[c].nunique() >  MAX_BIN]

# закодируем большие категориальные в int32  (cat.codes)
for col in LARGE_CAT:
    df[col] = df[col].cat.codes.astype("int32")

FEATURES = RAW_FEATURES            # тот же список, данные уже скорректированы
CAT_COLS = SMALL_CAT               # только "безопасные" категориальные
NUM_COLS = [c for c in FEATURES if c not in CAT_COLS]

TARGET   = meta["target_log"]

print(f"rows {len(df)} | feats {len(FEATURES)} "
      f"| cat_small {len(CAT_COLS)} | cat_large {len(LARGE_CAT)}")

rows 57473650 | feats 38 | cat_small 9 | cat_large 1


In [4]:
def smape(a, f):
    return 100*np.mean(np.abs(f-a)/((np.abs(a)+np.abs(f))/2))

MODELS_LGBM = MODELS / "lgbm";        MODELS_LGBM.mkdir(parents=True, exist_ok=True)
PRED_LGBM   = PREDICTIONS / "lgbm";   PRED_LGBM.mkdir(parents=True, exist_ok=True)
PLOTS_LGBM  = PLOTS / "lgbm";         PLOTS_LGBM.mkdir(parents=True, exist_ok=True)

In [5]:
rng = np.random.default_rng(42)
sample_sku  = rng.choice(df['id'].unique(),
                         size=int(0.05*df['id'].nunique()),
                         replace=False)
SAMPLE_MASK = df['id'].isin(sample_sku).values

In [6]:
def make_ds(indices, *, sample=False):
    if sample:
        indices = np.intersect1d(indices, df.index[SAMPLE_MASK])

    X = df.loc[indices, FEATURES].copy()
    # числовые фичи → float32
    X[NUM_COLS] = X[NUM_COLS].astype(np.float32)
    y = df.loc[indices, TARGET].astype(np.float32)

    return lgb.Dataset(X, y,
                       free_raw_data=False,
                       categorical_feature=CAT_COLS)

In [7]:
def objective(trial, tr_idx, vl_idx):
    params = {
        "objective":"poisson", "metric":"rmse",
        "device_type":"cpu", "num_threads": os.cpu_count(),
        "learning_rate": trial.suggest_float("lr", 0.01, 0.2, log=True),
        "num_leaves":    trial.suggest_int("leaves", 63, 511, step=32),
        "min_data_in_leaf": trial.suggest_int("min_data", 50, 400),
        "feature_fraction": trial.suggest_float("ff", 0.6, 1.0),
        "bagging_fraction":0.8, "bagging_freq":1,
        "seed":42, "verbose":-1
    }
    dtr = make_ds(tr_idx, sample=True)
    dvl = make_ds(vl_idx, sample=True)

    mdl = lgb.train(params, dtr,
                    num_boost_round=600,
                    valid_sets=[dvl],
                    callbacks=[lgb.early_stopping(50, verbose=False)])
    pred = mdl.predict(dvl.data)
    return math.sqrt(mean_squared_error(dvl.label, pred))

In [8]:
all_metrics = []

for k in range(3):
    tr_idx = np.load(SPLITS_WF/f"wf_train_idx_k{k}.npy")
    vl_idx = np.load(SPLITS_WF/f"wf_val_idx_k{k}.npy")

    # ---- Optuna search (15 trial) ----
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(lambda t: objective(t, tr_idx, vl_idx),
                   n_trials=15, timeout=15*60, show_progress_bar=False)

    best = study.best_trial.params | {
        "objective":"poisson", "metric":"rmse",
        "device_type":"gpu",
        "gpu_platform_id":0, "gpu_device_id":0,
        "force_row_wise":True,
        "bagging_fraction":0.8, "bagging_freq":1,
        "max_bin":MAX_BIN,
        "histogram_pool_size":12.0,      # GB, безопасно при 40 GB VRAM
        "seed":42, "verbose":-1
    }

    dtr = make_ds(tr_idx, sample=False)
    dvl = make_ds(vl_idx, sample=False)

    mdl = lgb.train(best, dtr,
                    num_boost_round=1500,
                    valid_sets=[dvl],
                    callbacks=[lgb.early_stopping(100),
                               lgb.log_evaluation(200)])

    mdl.save_model((MODELS_LGBM/f"lgbm_gpu_full_k{k}.txt").as_posix())

    y_pred = mdl.predict(dvl.data, num_iteration=mdl.best_iteration)
    np.save(PRED_LGBM/f"preds_lgbm_gpu_full_k{k}.npy", y_pred)

    rmse = math.sqrt(mean_squared_error(dvl.label, y_pred))
    s_mp = smape(dvl.label, y_pred)
    all_metrics.append({"fold":k, "rmse_log":float(rmse), "smape":float(s_mp)})
    print(f"Fold {k}: RMSE={rmse:.4f} | sMAPE={s_mp:.2f}%")

    # ---- feature importance
    import matplotlib.pyplot as plt
    gain = mdl.feature_importance(importance_type="gain")
    top  = np.argsort(gain)[-30:]
    plt.figure(figsize=(6,8))
    plt.barh(range(30), gain[top])
    plt.yticks(range(30), np.array(FEATURES)[top])
    plt.title(f"LGBM GPU importance k{k}")
    plt.tight_layout()
    plt.savefig(PLOTS_LGBM/f"fi_lgbm_k{k}.png")
    plt.close()

[I 2025-05-11 16:16:25,209] A new study created in memory with name: no-name-52b59b92-14d9-4edf-969e-5c38f7cad403
[I 2025-05-11 16:16:46,320] Trial 0 finished with value: 0.46768505602363136 and parameters: {'lr': 0.030710573677773714, 'leaves': 511, 'min_data': 306, 'ff': 0.8394633936788146}. Best is trial 0 with value: 0.46768505602363136.
[I 2025-05-11 16:17:01,778] Trial 1 finished with value: 0.46723172717810985 and parameters: {'lr': 0.015958237752949748, 'leaves': 127, 'min_data': 70, 'ff': 0.9464704583099741}. Best is trial 1 with value: 0.46723172717810985.
[I 2025-05-11 16:17:18,818] Trial 2 finished with value: 0.46733505623456667 and parameters: {'lr': 0.06054365855469246, 'leaves': 383, 'min_data': 57, 'ff': 0.9879639408647978}. Best is trial 1 with value: 0.46723172717810985.
[I 2025-05-11 16:17:36,245] Trial 3 finished with value: 0.46964148000458733 and parameters: {'lr': 0.12106896936002161, 'leaves': 159, 'min_data': 113, 'ff': 0.6733618039413735}. Best is trial 1 wit

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's rmse: 0.463115
Fold 0: RMSE=0.4631 | sMAPE=29.59%


[I 2025-05-11 16:23:46,633] A new study created in memory with name: no-name-8f14f421-2ed2-4f87-a9fb-8c42dc5a63be
[I 2025-05-11 16:25:26,619] Trial 0 finished with value: 0.33930726008907086 and parameters: {'lr': 0.030710573677773714, 'leaves': 511, 'min_data': 306, 'ff': 0.8394633936788146}. Best is trial 0 with value: 0.33930726008907086.
[I 2025-05-11 16:26:34,025] Trial 1 finished with value: 0.3890112236022584 and parameters: {'lr': 0.015958237752949748, 'leaves': 127, 'min_data': 70, 'ff': 0.9464704583099741}. Best is trial 0 with value: 0.33930726008907086.
[I 2025-05-11 16:27:51,562] Trial 2 finished with value: 0.3178479690675019 and parameters: {'lr': 0.06054365855469246, 'leaves': 383, 'min_data': 57, 'ff': 0.9879639408647978}. Best is trial 2 with value: 0.3178479690675019.
[I 2025-05-11 16:28:45,126] Trial 3 finished with value: 0.3276468115509723 and parameters: {'lr': 0.12106896936002161, 'leaves': 159, 'min_data': 113, 'ff': 0.6733618039413735}. Best is trial 2 with va

Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.402383
Early stopping, best iteration is:
[186]	valid_0's rmse: 0.400701
Fold 1: RMSE=0.4007 | sMAPE=23.88%


[I 2025-05-11 16:44:23,346] A new study created in memory with name: no-name-735157a4-3827-486b-be1b-e7993a07dff2
[I 2025-05-11 16:44:43,419] Trial 0 finished with value: 0.48157318909348434 and parameters: {'lr': 0.030710573677773714, 'leaves': 511, 'min_data': 306, 'ff': 0.8394633936788146}. Best is trial 0 with value: 0.48157318909348434.
[I 2025-05-11 16:44:59,648] Trial 1 finished with value: 0.4812557534505432 and parameters: {'lr': 0.015958237752949748, 'leaves': 127, 'min_data': 70, 'ff': 0.9464704583099741}. Best is trial 1 with value: 0.4812557534505432.
[I 2025-05-11 16:45:16,344] Trial 2 finished with value: 0.48184265472016335 and parameters: {'lr': 0.06054365855469246, 'leaves': 383, 'min_data': 57, 'ff': 0.9879639408647978}. Best is trial 1 with value: 0.4812557534505432.
[I 2025-05-11 16:45:32,648] Trial 3 finished with value: 0.4830669866675557 and parameters: {'lr': 0.12106896936002161, 'leaves': 159, 'min_data': 113, 'ff': 0.6733618039413735}. Best is trial 1 with va

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.48223
Fold 2: RMSE=0.4822 | sMAPE=25.50%


In [10]:
METRICS_LGBM = METRICS / "metrics_lgbm_gpu_full.json"
json.dump({"timestamp":datetime.now().isoformat(timespec='seconds'),
           "results":all_metrics},
          open(METRICS_LGBM,"w"), indent=2)

import pandas as pd
display(pd.DataFrame(all_metrics)
        .style.set_caption("LightGBM GPU"))
print("✔ метрики сохранены →", METRICS_LGBM.relative_to(PROJECT_DIR))

Unnamed: 0,fold,rmse_log,smape
0,0,0.463115,29.588389
1,1,0.400701,23.876497
2,2,0.48223,25.496307


✔ метрики сохранены → artifacts/metrics/metrics_lgbm_gpu_full.json


Средний RMSE ≈ 0.45 против 0.55 (Prophet) и 0.65 (Naïve) → -≈ 30 % ошибок.
sMAPE упал с 36 % до ~26 % — хороший рывок.