# Прогнозирование цен потребительского ритейла по тестовой выборке на основе глубоких нейронных сетей
__Выполнил:__ *Домченко Максим*

__Студент группы:__ *РИМ-130962*

#### Подключаем Google Drive и задаём корневую папку проекта

In [1]:
#  ===== 0. Подключаем Google Drive и задаём корневую папку проекта =====
from pathlib import Path
import sys, os

# Resolve GDRIVE_ROOT: mount Drive when running in Colab, otherwise look for
# a locally synced Google Drive folder in the user's home directory.
try:
    # Colab runtime
    from google.colab import drive
    drive.mount('/content/drive')
    GDRIVE_ROOT = Path("/content/drive/MyDrive")
except (ModuleNotFoundError, ValueError):
    # Local Jupyter + Google Drive for desktop
    #   (check where "My Drive" is actually mounted on your machine)
    home_dir = Path.home()
    possible = [
        home_dir / "Google Drive",
        home_dir / "Мой диск",            # Russian-localised client
    ]
    # Take the first candidate folder that exists on disk.
    GDRIVE_ROOT = None
    for candidate in possible:
        if candidate.exists():
            GDRIVE_ROOT = candidate
            break
    if GDRIVE_ROOT is None:
        # Nothing found: stop with a message instead of continuing without a root.
        sys.exit("Папка Google Drive не найдена. Проверьте путь.")

# ────────────────────────────────────────────────────────────────────────
# Project folder inside Google Drive; created on first run.
PROJECT_DIR = GDRIVE_ROOT.joinpath("price_forecasting")
PROJECT_DIR.mkdir(parents=True, exist_ok=True)

# Single "entry point" from which every other path is derived
ROOT = PROJECT_DIR

# Raw and intermediate datasets
DATA = ROOT.joinpath("data")
MODEL_READY = DATA.joinpath("model_ready")
SPLITS_WF = MODEL_READY.joinpath("splits_wf")

# Models, metrics, images
ARTIFACTS = ROOT.joinpath("artifacts")
PLOTS = ARTIFACTS.joinpath("plots")
MODELS = ARTIFACTS.joinpath("models")
PREDICTIONS = ARTIFACTS.joinpath("predictions")
METRICS = ARTIFACTS.joinpath("metrics")

# Create the whole tree up front so later cells can write without checks.
for d in (
    DATA,
    MODEL_READY,
    SPLITS_WF,
    ARTIFACTS,
    PLOTS,
    MODELS,
    PREDICTIONS,
    METRICS,
):
    d.mkdir(parents=True, exist_ok=True)

print(f"Все файлы читаем/пишем в: {ROOT}")

Mounted at /content/drive
Все файлы читаем/пишем в: /content/drive/MyDrive/price_forecasting


In [2]:
!pip install --quiet scikit-learn joblib

In [3]:
import pandas as pd, numpy as np, json, os, math, joblib, warnings
from pandas.api.types import is_datetime64_any_dtype
# Silence library warnings for the rest of the notebook.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings.
warnings.filterwarnings("ignore")

# Load the prepared dataset and its feature description.
df = pd.read_parquet(MODEL_READY / "dataset.parquet")
# Context manager closes the handle deterministically; explicit UTF-8 avoids
# depending on the platform's default encoding.
with open(MODEL_READY / "features.json", encoding="utf-8") as fh:
    meta = json.load(fh)

# Datetime features -> whole seconds since the Unix epoch, stored as int32.
# The conversion goes through numpy's datetime64[s] so it does not depend on
# the column's stored resolution (ns/us/ms) and avoids the deprecated
# Series.view().
for c in meta["all_features"]:
    if is_datetime64_any_dtype(df[c]):
        col = df[c]
        if getattr(col.dtype, "tz", None) is not None:
            # tz-aware -> naive UTC, so to_numpy() yields a datetime64 array
            col = col.dt.tz_convert(None)
        epoch_s = col.to_numpy().astype("datetime64[s]").astype("int64")
        df[c] = epoch_s.astype("int32")

# Categorical / object features -> integer category codes (missing values -> -1).
cat_cols = [c for c in meta["all_features"] if df[c].dtype.name in ("category", "object")]
for c in cat_cols:
    df[c] = df[c].astype("category").cat.codes.astype("int32")

# Every remaining feature is cast to float32.
# NOTE(review): this includes the epoch-second columns produced above; float32
# cannot represent present-day epoch seconds exactly, so those values get
# rounded. Confirm that this precision is acceptable.
num_cols = [c for c in meta["all_features"] if c not in cat_cols]
df[num_cols] = df[num_cols].astype(np.float32)

FEATURES = meta["all_features"]
TARGET   = meta["target_log"]

print(f"rows {len(df)} | feats {len(FEATURES)} | cat {len(cat_cols)}")

rows 57473650 | feats 39 | cat 10


In [4]:
def sample_indices(idxs, n_rows=3_000_000, seed=42):
    """Return at most ``n_rows`` entries of ``idxs``.

    When ``idxs`` already has ``n_rows`` elements or fewer it is returned
    unchanged (the same object, not a copy). Otherwise ``n_rows`` elements
    are drawn without replacement using a generator seeded with ``seed``,
    so the draw is reproducible for a given seed.
    """
    needs_subsampling = len(idxs) > n_rows
    if not needs_subsampling:
        return idxs

    generator = np.random.default_rng(seed)
    return generator.choice(idxs, size=n_rows, replace=False)

In [5]:
from sklearn.ensemble import ExtraTreesRegressor
# Dedicated sub-folders for ExtraTrees model files and saved predictions.
MODELS_ET = MODELS / "extratrees"
MODELS_ET.mkdir(parents=True, exist_ok=True)
PRED_ET = PREDICTIONS / "extratrees"
PRED_ET.mkdir(parents=True, exist_ok=True)

# Hyper-parameters shared by every fold; random_state is supplied per fold
# where the estimator is constructed.
ET_BASE = {
    "n_estimators": 200,
    "max_depth": 18,
    "max_features": 0.5,
    "min_samples_leaf": 10,
    "n_jobs": -1,
}

In [6]:
from sklearn.metrics import mean_squared_error
from datetime import datetime, timezone
import json, pandas as pd

def smape(a, f):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|f - a| / ((|f| + |a|) / 2)``; the result is 100 times the
    mean of the terms.

    Args:
        a: actual values (array-like).
        f: forecast values (array-like, broadcastable against ``a``).

    Returns:
        sMAPE as a float64 scalar. Empty input yields NaN.

    Positions where both the actual and the forecast value are zero have a
    zero denominator; they are counted as zero error instead of producing
    NaN (0/0) and poisoning the whole mean.
    """
    actual = np.asarray(a, dtype=np.float64)
    forecast = np.asarray(f, dtype=np.float64)

    abs_err = np.abs(forecast - actual)
    denom = (np.abs(forecast) + np.abs(actual)) / 2

    # Divide only where the denominator is non-zero; the rest stays 0.
    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    return 100 * np.mean(ratio)

# One record per fold; written to JSON by a later cell.
metrics = []

for k in range(3):
    # Precomputed train / validation index arrays for fold k.
    tr_full = np.load(SPLITS_WF / f"wf_train_idx_k{k}.npy")
    vl_idx = np.load(SPLITS_WF / f"wf_val_idx_k{k}.npy")

    # Cap the training set at 3M rows; the sampling seed differs per fold.
    tr_idx = sample_indices(tr_full, 3_000_000, seed=42 + k)

    # NOTE(review): .loc is label-based, so this assumes the stored indices
    # are labels of df's index — confirm against the code that wrote them.
    X_tr = df.loc[tr_idx, FEATURES]
    y_tr = df.loc[tr_idx, TARGET]
    X_val = df.loc[vl_idx, FEATURES]
    y_val = df.loc[vl_idx, TARGET]

    # Fit a fresh model for this fold and persist it.
    et = ExtraTreesRegressor(random_state=42 + k, **ET_BASE)
    et.fit(X_tr, y_tr)
    joblib.dump(et, MODELS_ET / f"et_k{k}.joblib")

    # Validation predictions are stored as float32.
    preds = et.predict(X_val).astype(np.float32)
    np.save(PRED_ET / f"preds_et_k{k}.npy", preds)

    # Both metrics are computed directly on the TARGET column values
    # (meta["target_log"]), i.e. without any inverse transform.
    rmse = math.sqrt(mean_squared_error(y_val, preds))
    s_mp = smape(y_val, preds)
    fold_record = dict(fold=k, rmse_log=float(rmse), smape=float(s_mp))
    metrics.append(fold_record)

    print(f"Fold {k}: RMSE={rmse:.4f} | sMAPE={s_mp:.2f}%")

Fold 0: RMSE=0.5115 | sMAPE=31.60%
Fold 1: RMSE=0.4129 | sMAPE=24.33%
Fold 2: RMSE=0.5249 | sMAPE=26.75%


In [7]:
# Persist the per-fold metrics together with a UTC timestamp.
METRICS_ET = METRICS / "metrics_extratrees.json"
metrics_payload = {
    "timestamp": datetime.now(timezone.utc).isoformat(timespec='seconds'),
    "results": metrics,
}
# Context manager guarantees the file is flushed and closed after writing.
with open(METRICS_ET, "w", encoding="utf-8") as fh:
    json.dump(metrics_payload, fh, indent=2)

print("metrics saved ->", METRICS_ET.relative_to(PROJECT_DIR))

# Kept as the last expression of the cell so the notebook actually renders
# the table; placed before print() it was built and silently discarded.
pd.DataFrame(metrics).style.set_caption("ExtraTrees (fast) metrics")

metrics saved -> artifacts/metrics/metrics_extratrees.json
