# Прогнозирование цен потребительского ритейла по тестовой выборке на основе глубоких нейронных сетей
__Выполнил:__ *Домченко Максим*

__Студент группы:__ *РИМ-130962*

In [1]:
# Подключение Google Drive
from pathlib import Path
from google.colab import drive

drive.mount('/content/drive')
ROOT = Path('/content/drive/MyDrive')

# Настройка структуры папок проекта
PROJECT_DIR = ROOT / 'price_forecasting'
DATA = PROJECT_DIR / 'data'
RAW = DATA / 'raw'
PROCESSED = DATA / 'processed'
MODELS = PROJECT_DIR / 'models'
MODELS.mkdir(exist_ok=True)

print('Проектная папка подключена:', PROJECT_DIR)

Mounted at /content/drive
Проектная папка подключена: /content/drive/MyDrive/price_forecasting


In [2]:
# Установка xgboost с поддержкой GPU
!pip install xgboost -U

# Импорт библиотек
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
import pickle

Collecting xgboost
  Downloading xgboost-3.0.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.1-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.4
    Uninstalling xgboost-2.1.4:
      Successfully uninstalled xgboost-2.1.4
Successfully installed xgboost-3.0.1


In [3]:
import xgboost as xgb

print(xgb.__version__)
print(xgb.build_info()['USE_CUDA'])  # Если True, то GPU поддерживается

3.0.1
True


In [4]:
# Загрузка подготовленных данных
train_df = pd.read_parquet(PROCESSED / 'train_features_final.parquet')
val_df = pd.read_parquet(PROCESSED / 'val_features_final.parquet')
test_df = pd.read_parquet(PROCESSED / 'test_features_final.parquet')

# Проверка загруженных данных
print(f"Размер тренировочной выборки: {train_df.shape}")
print(f"Размер валидационной выборки: {val_df.shape}")
print(f"Размер тестовой выборки: {test_df.shape}")

display(train_df.head())

Размер тренировочной выборки: (35743638, 22)
Размер валидационной выборки: (5986318, 22)
Размер тестовой выборки: (6005441, 22)


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,date,month,year,event_name_1,event_type_1,event_name_2,...,snap_TX,snap_WI,day_of_week,is_weekend,log_sell_price,event_flag,event_type_National,event_type_Cultural,event_type_Religious,event_type_Sporting
0,CA_1,FOODS_1_001,11101,2.0,2011-01-29,1,2011,NoEvent,NoEvent,NoEvent,...,0,0,5,1,1.098612,0,0,0,0,0
1,CA_1,FOODS_1_001,11101,2.0,2011-01-30,1,2011,NoEvent,NoEvent,NoEvent,...,0,0,6,1,1.098612,0,0,0,0,0
2,CA_1,FOODS_1_001,11101,2.0,2011-01-31,1,2011,NoEvent,NoEvent,NoEvent,...,0,0,0,0,1.098612,0,0,0,0,0
3,CA_1,FOODS_1_001,11101,2.0,2011-02-01,2,2011,NoEvent,NoEvent,NoEvent,...,1,0,1,0,1.098612,0,0,0,0,0
4,CA_1,FOODS_1_001,11101,2.0,2011-02-02,2,2011,NoEvent,NoEvent,NoEvent,...,0,1,2,0,1.098612,0,0,0,0,0


In [5]:
# Колонки для удаления (убираем лишние признаки и сохраняем таргеты отдельно)
cols_to_drop = ['date', 'wm_yr_wk', 'sell_price', 'log_sell_price']

# Извлечение таргета (цена и логарифм цены) отдельно
y_train = train_df[['sell_price', 'log_sell_price']]
y_val = val_df[['sell_price', 'log_sell_price']]
y_test = test_df[['sell_price', 'log_sell_price']]

# Удаление ненужных признаков из датасетов
X_train = train_df.drop(columns=cols_to_drop)
X_val = val_df.drop(columns=cols_to_drop)
X_test = test_df.drop(columns=cols_to_drop)

# Проверим итоговый состав признаков
print("Итоговый список признаков после финального обновления:")
for col in X_train.columns:
    print("-", col)

# Проверим итоговые датасеты
display(X_train.head())

Итоговый список признаков после финального обновления:
- store_id
- item_id
- month
- year
- event_name_1
- event_type_1
- event_name_2
- event_type_2
- snap_CA
- snap_TX
- snap_WI
- day_of_week
- is_weekend
- event_flag
- event_type_National
- event_type_Cultural
- event_type_Religious
- event_type_Sporting


Unnamed: 0,store_id,item_id,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,day_of_week,is_weekend,event_flag,event_type_National,event_type_Cultural,event_type_Religious,event_type_Sporting
0,CA_1,FOODS_1_001,1,2011,NoEvent,NoEvent,NoEvent,NoEvent,0,0,0,5,1,0,0,0,0,0
1,CA_1,FOODS_1_001,1,2011,NoEvent,NoEvent,NoEvent,NoEvent,0,0,0,6,1,0,0,0,0,0
2,CA_1,FOODS_1_001,1,2011,NoEvent,NoEvent,NoEvent,NoEvent,0,0,0,0,0,0,0,0,0,0
3,CA_1,FOODS_1_001,2,2011,NoEvent,NoEvent,NoEvent,NoEvent,1,1,0,1,0,0,0,0,0,0
4,CA_1,FOODS_1_001,2,2011,NoEvent,NoEvent,NoEvent,NoEvent,1,0,1,2,0,0,0,0,0,0


In [6]:
# Кодирование категориальных признаков с помощью LabelEncoder
cat_features = ['store_id', 'item_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_val[col] = le.transform(X_val[col])
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

# Проверим итог кодирования
display(X_train.head())

Unnamed: 0,store_id,item_id,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,day_of_week,is_weekend,event_flag,event_type_National,event_type_Cultural,event_type_Religious,event_type_Sporting
0,0,0,1,2011,19,2,3,1,0,0,0,5,1,0,0,0,0,0
1,0,0,1,2011,19,2,3,1,0,0,0,6,1,0,0,0,0,0
2,0,0,1,2011,19,2,3,1,0,0,0,0,0,0,0,0,0,0
3,0,0,2,2011,19,2,3,1,1,1,0,1,0,0,0,0,0,0
4,0,0,2,2011,19,2,3,1,1,0,1,2,0,0,0,0,0,0


In [11]:
# Создание DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [12]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': ['rmse', 'mae', 'mape'],
    'tree_method': 'hist',
    'device': 'cuda',
    'learning_rate': 0.01,
    'max_depth': 14,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'gamma': 0.1,
    'min_child_weight': 3,
    'seed': 42
}

evals = [(dtrain, 'train'), (dval, 'validation')]
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=4000,
    early_stopping_rounds=150,
    evals=evals,
    verbose_eval=50
)

[0]	train-rmse:2.78962	train-mae:1.86473	train-mape:1.00245	validation-rmse:2.91759	validation-mae:1.89639	validation-mape:0.97977
[50]	train-rmse:2.39428	train-mae:1.48528	train-mape:0.79983	validation-rmse:2.53113	validation-mae:1.52875	validation-mape:0.78885
[100]	train-rmse:2.22646	train-mae:1.30925	train-mape:0.69361	validation-rmse:2.36914	validation-mae:1.36299	validation-mape:0.69088
[150]	train-rmse:2.15495	train-mae:1.23143	train-mape:0.63939	validation-rmse:2.30109	validation-mae:1.29221	validation-mape:0.64239
[200]	train-rmse:2.12732	train-mae:1.20269	train-mape:0.61444	validation-rmse:2.27513	validation-mae:1.26809	validation-mape:0.62154
[250]	train-rmse:2.11572	train-mae:1.19327	train-mape:0.60293	validation-rmse:2.26464	validation-mae:1.26055	validation-mape:0.61221
[300]	train-rmse:2.11084	train-mae:1.19094	train-mape:0.59764	validation-rmse:2.26065	validation-mae:1.25917	validation-mape:0.60826
[350]	train-rmse:2.10874	train-mae:1.19084	train-mape:0.59511	validation

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Предсказание на всех наборах данных
y_pred_train = xgb_model.predict(dtrain)
y_pred_val = xgb_model.predict(dval)
y_pred_test = xgb_model.predict(dtest)

# Расчёт расширенного списка метрик
metrics_dict = {
    # Метрики на тренировочном наборе
    'MAE_train': mean_absolute_error(y_train, y_pred_train),
    'MSE_train': mean_squared_error(y_train, y_pred_train),
    'RMSE_train': mean_squared_error(y_train, y_pred_train)**0.5,
    'MAPE_train': mean_absolute_percentage_error(y_train, y_pred_train),
    'R2_train': r2_score(y_train, y_pred_train),

    # Метрики на валидационном наборе
    'MAE_val': mean_absolute_error(y_val, y_pred_val),
    'MSE_val': mean_squared_error(y_val, y_pred_val),
    'RMSE_val': mean_squared_error(y_val, y_pred_val)**0.5,
    'MAPE_val': mean_absolute_percentage_error(y_val, y_pred_val),
    'R2_val': r2_score(y_val, y_pred_val),

    # Метрики на тестовом наборе
    'MAE_test': mean_absolute_error(y_test, y_pred_test),
    'MSE_test': mean_squared_error(y_test, y_pred_test),
    'RMSE_test': mean_squared_error(y_test, y_pred_test)**0.5,
    'MAPE_test': mean_absolute_percentage_error(y_test, y_pred_test),
    'R2_test': r2_score(y_test, y_pred_test)
}

# Вывод метрик
print("Расширенный набор метрик:")
for key, value in metrics_dict.items():
    print(f"{key}: {value:.4f}")

# Сохранение модели
model_path = MODELS / 'xgboost_model.json'
xgb_model.save_model(model_path)

# Сохранение метрик
metrics_path = MODELS / 'xgboost_model_extended_metrics.pkl'
with open(metrics_path, 'wb') as f:
    pickle.dump(metrics_dict, f)

print(f"Модель успешно сохранена в: {model_path}")
print(f"Расширенные метрики успешно сохранены в: {metrics_path}")

Расширенный набор метрик:
MAE_train: 1.0799
MSE_train: 4.0609
RMSE_train: 2.0152
MAPE_train: 0.5067
R2_train: 0.3052
MAE_val: 1.2595
MSE_val: 5.0519
RMSE_val: 2.2476
MAPE_val: 0.6030
R2_val: 0.2165
MAE_test: 1.2578
MSE_test: 5.0523
RMSE_test: 2.2477
MAPE_test: 0.5974
R2_test: 0.2154
Модель успешно сохранена в: /content/drive/MyDrive/price_forecasting/models/xgboost_model.json
Расширенные метрики успешно сохранены в: /content/drive/MyDrive/price_forecasting/models/xgboost_model_extended_metrics.pkl
