In [42]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import xgboost as xgb
import plotly.graph_objects as go
import logging
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error

In [43]:
warnings.filterwarnings('ignore', category=UserWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [44]:
sales_df = pd.read_excel('initial_data/train_sales.xlsx')
promo_df = pd.read_excel('initial_data/train_promo.xlsx')
promo_2_df = pd.read_excel('initial_data/test_promo.xlsx')
geography_df = pd.read_excel('initial_data/Атрибуты Customer.xlsx')
weights_df = pd.read_excel('initial_data/Веса DFU.xlsx')

Очистим файл будущего промо (только Customer = 1 продает со скидками)

In [45]:
promo_2_df = promo_2_df[promo_2_df['Customer'] == 1]
promo_2_df

Unnamed: 0,Promo,Promo №,Customer,DFU,Promo mechanic,Start Date on shelf,Promo Days on shelf,End Date on shelf,Shipment days to promo start,First Date of shipment,End Date of shipment,"Discount, %"
4,989914,708755,1,Рис длиннозерный 500 гр,395054,2021-09-07,7,2021-09-13,21,2021-08-17,2021-09-02,0.25
7,989925,708751,1,Рис длиннозерный 500 гр,395054,2021-07-27,7,2021-08-02,21,2021-07-06,2021-07-22,0.25
10,989962,708764,1,Рис длиннозерный 500 гр,395054,2021-11-30,7,2021-12-06,21,2021-11-09,2021-11-25,0.25
11,989963,708762,1,Рис длиннозерный 500 гр,395054,2021-10-19,7,2021-10-25,21,2021-09-28,2021-10-14,0.25
12,989964,708755,1,Рис длиннозерный 500 гр,395054,2021-09-07,7,2021-09-13,21,2021-08-17,2021-09-02,0.20
...,...,...,...,...,...,...,...,...,...,...,...,...
1497,991409,708804,1,Рис длиннозерный 486 гр,395054,2022-03-22,7,2022-03-28,21,2022-03-01,2022-03-17,0.25
1564,991954,708806,1,Рис длиннозерный 486 гр,395054,2022-04-19,7,2022-04-25,21,2022-03-29,2022-04-14,0.25
1608,990425,709776,1,Киноа 300 гр,395054,2021-10-31,30,2021-11-29,21,2021-10-10,2021-11-30,0.25
1609,990426,708756,1,Киноа 300 гр,395054,2021-09-21,7,2021-09-27,21,2021-08-31,2021-09-16,0.25


Очистка продаж

In [46]:
bpv_mean = sales_df["BPV"].mean()
bpv_std = sales_df["BPV"].std()
lower_bound = bpv_mean - 2 * bpv_std
upper_bound = bpv_mean + 2 * bpv_std

In [47]:
sales_df_cleaned = sales_df[(sales_df["BPV"] >= lower_bound) & (sales_df["BPV"] <= upper_bound)]
sales_df_cleaned = sales_df_cleaned[sales_df_cleaned["BPV"] >= 0]

Отсечение периода

In [48]:
sales_df_cleaned = sales_df_cleaned[sales_df_cleaned["Period"] >= np.datetime64("2018-04-23")]

Замена 500 гр на 486 гр (слова заказчика)

In [49]:
sales_df_cleaned = sales_df_cleaned.copy()
sales_df_cleaned['DFU'] = sales_df_cleaned['DFU'].replace('Рис длиннозерный 500 гр', 'Рис длиннозерный 486 гр')

Добавим 0 в пропусках

In [50]:
min_date = sales_df_cleaned['Period'].min()
max_date = sales_df_cleaned['Period'].max()

all_weeks = pd.date_range(start=min_date, end=max_date, freq='W-MON')

full_data = []

for (dfu, customer), group in sales_df_cleaned.groupby(['DFU', 'Customer']):
    existing_periods = set(group['Period'])

    for week in all_weeks:
        if week not in existing_periods:
            # Добавляем пропущенную строку с BPV и Total Sell-in = 0
            full_data.append({
                'DFU': dfu,
                'Customer': customer,
                'Period': week,
                'BPV': 0,
                'Total Sell-in': 0
            })

sales_df_cleaned = pd.concat([sales_df_cleaned, pd.DataFrame(full_data)], ignore_index=True)
sales_df_cleaned = sales_df_cleaned.sort_values(by=['DFU', 'Customer', 'Period']).reset_index(drop=True)

Добавим пустые записи до 2023-01-01

In [51]:
sales_df_cleaned['Period'] = pd.to_datetime(sales_df_cleaned['Period'])

# Создание пустого списка для новых строк
new_rows = []

# Группировка по Customer и DFU
grouped = sales_df_cleaned.groupby(['Customer', 'DFU'])

for (customer, dfu), group in grouped:
    max_date = group['Period'].max()
    next_monday = max_date + pd.DateOffset(weeks=1)

    # Генерация дат до 2023-01-01 (не включая)
    while next_monday < pd.Timestamp('2023-01-01'):
        new_rows.append({
            'Customer': customer,
            'DFU': dfu,
            'Period': next_monday,
            'BPV': np.nan,
            'Total Sell-in': np.nan
        })
        next_monday += pd.DateOffset(weeks=1)

# Создание датафрейма из новых строк
new_rows_df = pd.DataFrame(new_rows)

# Объединение с исходным датафреймом
sales_df_cleaned = pd.concat([sales_df_cleaned, new_rows_df], ignore_index=True)



In [52]:
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Autumn"


sales_df_cleaned["Season"] = sales_df_cleaned["Period"].apply(get_season)

In [53]:
sales_df_cleaned = pd.merge(sales_df_cleaned, geography_df, left_on="Customer", right_on="Клиент", how="left")
sales_df_cleaned.rename(columns={"Тип": "Type", "География": "Geography"}, inplace=True)

In [54]:
sales_df_cleaned.drop(columns=["Клиент"], inplace=True)
sales_df_cleaned

Unnamed: 0,DFU,Customer,Period,BPV,Total Sell-in,Season,Type,Geography
0,Булгур 300 гр,1,2018-04-23,0.0,0.0,Spring,Сеть,Москва
1,Булгур 300 гр,1,2018-04-30,0.0,0.0,Spring,Сеть,Москва
2,Булгур 300 гр,1,2018-05-07,0.0,0.0,Spring,Сеть,Москва
3,Булгур 300 гр,1,2018-05-14,0.0,0.0,Spring,Сеть,Москва
4,Булгур 300 гр,1,2018-05-21,0.0,0.0,Spring,Сеть,Москва
...,...,...,...,...,...,...,...,...
3184,Рис басмати 500 гр,34,2022-11-28,,,Autumn,Дистрибутор,СНГ
3185,Рис басмати 500 гр,34,2022-12-05,,,Winter,Дистрибутор,СНГ
3186,Рис басмати 500 гр,34,2022-12-12,,,Winter,Дистрибутор,СНГ
3187,Рис басмати 500 гр,34,2022-12-19,,,Winter,Дистрибутор,СНГ


In [55]:
sales_df_cleaned['End of Period'] = sales_df_cleaned['Period'] + pd.Timedelta(days=6)

In [56]:
sales_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3189 entries, 0 to 3188
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DFU            3189 non-null   object        
 1   Customer       3189 non-null   int64         
 2   Period         3189 non-null   datetime64[ns]
 3   BPV            2162 non-null   float64       
 4   Total Sell-in  2162 non-null   float64       
 5   Season         3189 non-null   object        
 6   Type           3189 non-null   object        
 7   Geography      3189 non-null   object        
 8   End of Period  3189 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(1), object(4)
memory usage: 224.4+ KB


Удаление выведенных из продажи товаров

In [57]:
dfu_to_remove = ["Йогурт Постный 180 гр", "Рис круглозерный 1000 гр", "Булгур 300 гр"]
sales_df_cleaned = sales_df_cleaned[~sales_df_cleaned['DFU'].isin(dfu_to_remove)]

In [58]:
sales_df_cleaned = sales_df_cleaned[~((sales_df_cleaned["Customer"] == 1) &
                                      (sales_df_cleaned["DFU"].isin(["Рис Жасмин 500 гр", "Рис басмати 500 гр"])))]

Посчитаем промо нагрузку каждого продукта

In [59]:
promo_load = (
    sales_df_cleaned.dropna(subset=["BPV", "Total Sell-in"])  # удаляем строки с пропущенными значениями
    .groupby(["Customer", "DFU"])
    .apply(lambda x: (x["BPV"] != x["Total Sell-in"]).mean()*100)
    .reset_index()
    .rename(columns={0: "Promo_Load"})
)

print(promo_load)

   Customer                      DFU  Promo_Load
0         1  Рис длиннозерный 486 гр   64.117647
1         1     Рис для плова 500 гр   69.277108
2         1  Рис круглозерный 500 гр   57.228916
3         2       Рис басмати 500 гр    0.000000
4        14       Рис басмати 500 гр    0.000000
5        18       Рис басмати 500 гр    0.000000
6        29       Рис басмати 500 гр    0.000000
7        34       Рис басмати 500 гр    0.000000


Обнуляем BPV, которое составляет менее 9% от Total Sell-in

In [60]:
#sales_df_cleaned.loc[sales_df_cleaned['BPV'] < 0.09 * sales_df_cleaned['Total Sell-in'], 'BPV'] = 0

Приравниваем BPV к Total Sell-in в случаях когда BPV / Total Sell-in >= 92%

In [61]:
#sales_df_cleaned.loc[sales_df_cleaned['BPV']/sales_df_cleaned['Total Sell-in'] >= 0.92, 'BPV'] = sales_df_cleaned['Total Sell-in']

In [62]:
sales_df_cleaned.to_excel('sales_df_cleaned.xlsx')

In [63]:
for (dfu, customer), group in sales_df_cleaned.groupby(['DFU', 'Customer']):
    fig = px.line(
        group.melt(id_vars=['Period'], value_vars=['BPV', 'Total Sell-in'], var_name='Metric', value_name='Value'),
        x='Period', y='Value', color='Metric',
        title=f'DFU: {dfu} | Customer: {customer}',
        labels={'Value': 'Sales', 'Period': 'Date'}
    )

    fig.update_traces(mode='lines+markers')  # Добавляем точки на линии
    fig.update_xaxes(title_text='Дата', tickformat='%Y-%m-%d')  # Форматируем ось X
    fig.update_yaxes(title_text='Значение')  # Подпись оси Y
    fig.show()

In [64]:
sales_df_cleaned.loc[sales_df_cleaned['BPV'] == sales_df_cleaned['Total Sell-in'], 'BPV_sale_period'] = 6
sales_df_cleaned.loc[sales_df_cleaned['BPV'] == sales_df_cleaned['Total Sell-in'], 'SoD_sale_period'] = 0
sales_df_cleaned.loc[(sales_df_cleaned['BPV'] != sales_df_cleaned['Total Sell-in']) &
                     (sales_df_cleaned['BPV'] == 0), 'SoD_sale_period'] = 6
sales_df_cleaned.loc[(sales_df_cleaned['BPV'] != sales_df_cleaned['Total Sell-in']) &
                     (sales_df_cleaned['BPV'] == 0), 'BPV_sale_period'] = 0


In [65]:
sales_df_cleaned

Unnamed: 0,DFU,Customer,Period,BPV,Total Sell-in,Season,Type,Geography,End of Period,BPV_sale_period,SoD_sale_period
664,Рис басмати 500 гр,2,2018-04-23,11.208,11.208,Spring,Дистрибутор,ЦФО,2018-04-29,6.0,0.0
665,Рис басмати 500 гр,2,2018-04-30,0.903,0.903,Spring,Дистрибутор,ЦФО,2018-05-06,6.0,0.0
666,Рис басмати 500 гр,2,2018-05-07,24.828,24.828,Spring,Дистрибутор,ЦФО,2018-05-13,6.0,0.0
667,Рис басмати 500 гр,2,2018-05-14,9.102,9.102,Spring,Дистрибутор,ЦФО,2018-05-20,6.0,0.0
668,Рис басмати 500 гр,2,2018-05-21,25.203,25.203,Spring,Дистрибутор,ЦФО,2018-05-27,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3184,Рис басмати 500 гр,34,2022-11-28,,,Autumn,Дистрибутор,СНГ,2022-12-04,,
3185,Рис басмати 500 гр,34,2022-12-05,,,Winter,Дистрибутор,СНГ,2022-12-11,,
3186,Рис басмати 500 гр,34,2022-12-12,,,Winter,Дистрибутор,СНГ,2022-12-18,,
3187,Рис басмати 500 гр,34,2022-12-19,,,Winter,Дистрибутор,СНГ,2022-12-25,,


In [66]:
sales_df_cleaned.to_excel('sales_df_cleaned.xlsx')

In [67]:
sales_with_values = sales_df_cleaned.dropna(subset=["BPV", "Total Sell-in"])
sales_with_values

Unnamed: 0,DFU,Customer,Period,BPV,Total Sell-in,Season,Type,Geography,End of Period,BPV_sale_period,SoD_sale_period
664,Рис басмати 500 гр,2,2018-04-23,11.2080,11.2080,Spring,Дистрибутор,ЦФО,2018-04-29,6.0,0.0
665,Рис басмати 500 гр,2,2018-04-30,0.9030,0.9030,Spring,Дистрибутор,ЦФО,2018-05-06,6.0,0.0
666,Рис басмати 500 гр,2,2018-05-07,24.8280,24.8280,Spring,Дистрибутор,ЦФО,2018-05-13,6.0,0.0
667,Рис басмати 500 гр,2,2018-05-14,9.1020,9.1020,Spring,Дистрибутор,ЦФО,2018-05-20,6.0,0.0
668,Рис басмати 500 гр,2,2018-05-21,25.2030,25.2030,Spring,Дистрибутор,ЦФО,2018-05-27,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2157,Рис круглозерный 500 гр,1,2021-05-24,23.7168,23.7168,Spring,Сеть,Москва,2021-05-30,6.0,0.0
2158,Рис круглозерный 500 гр,1,2021-05-31,20.6955,41.2695,Spring,Сеть,Москва,2021-06-06,,
2159,Рис круглозерный 500 гр,1,2021-06-07,0.0000,25.6446,Summer,Сеть,Москва,2021-06-13,0.0,6.0
2160,Рис круглозерный 500 гр,1,2021-06-14,0.0000,24.9237,Summer,Сеть,Москва,2021-06-20,0.0,6.0


In [68]:
sales_missing_values = sales_df_cleaned[
    sales_df_cleaned[["BPV", "Total Sell-in"]].isna().any(axis=1)
]
sales_missing_values

Unnamed: 0,DFU,Customer,Period,BPV,Total Sell-in,Season,Type,Geography,End of Period,BPV_sale_period,SoD_sale_period
2478,Рис длиннозерный 486 гр,1,2021-06-28,,,Summer,Сеть,Москва,2021-07-04,,
2479,Рис длиннозерный 486 гр,1,2021-07-05,,,Summer,Сеть,Москва,2021-07-11,,
2480,Рис длиннозерный 486 гр,1,2021-07-12,,,Summer,Сеть,Москва,2021-07-18,,
2481,Рис длиннозерный 486 гр,1,2021-07-19,,,Summer,Сеть,Москва,2021-07-25,,
2482,Рис длиннозерный 486 гр,1,2021-07-26,,,Summer,Сеть,Москва,2021-08-01,,
...,...,...,...,...,...,...,...,...,...,...,...
3184,Рис басмати 500 гр,34,2022-11-28,,,Autumn,Дистрибутор,СНГ,2022-12-04,,
3185,Рис басмати 500 гр,34,2022-12-05,,,Winter,Дистрибутор,СНГ,2022-12-11,,
3186,Рис басмати 500 гр,34,2022-12-12,,,Winter,Дистрибутор,СНГ,2022-12-18,,
3187,Рис басмати 500 гр,34,2022-12-19,,,Winter,Дистрибутор,СНГ,2022-12-25,,


v1

In [69]:
# Рассчитываем промо нагрузку
promo_load = (sales_with_values.groupby(["Customer", "DFU"])
              .apply(lambda x: (x["BPV"] != x["Total Sell-in"]).mean())
              .reset_index()
              .rename(columns={0: "Promo_Load"}))

# Объединяем с основными данными
sales_with_values = sales_with_values.merge(promo_load, on=["Customer", "DFU"], how="left")


def prepare_and_predict_sales(sales_with_values):
    df = sales_with_values.copy()

    df['Period'] = pd.to_datetime(df['Period'])

    # Кодируем категориальные переменные
    le_dfu = LabelEncoder()
    le_customer = LabelEncoder()
    le_season = LabelEncoder()
    le_type = LabelEncoder()
    le_geography = LabelEncoder()

    df['DFU_encoded'] = le_dfu.fit_transform(df['DFU'])
    df['Customer_encoded'] = le_customer.fit_transform(df['Customer'])
    df['Season_encoded'] = le_season.fit_transform(df['Season'])
    df['Type_encoded'] = le_type.fit_transform(df['Type'])
    df['Geography_encoded'] = le_geography.fit_transform(df['Geography'])

    # Создаем столбец для предсказанных значений
    df['Total Sell-in predicted'] = df['Total Sell-in']

    # Находим строки, где нужно предсказать значения
    condition = (df['BPV'] != df['Total Sell-in']) & (df['BPV'] != 0)
    predict_periods = df[condition]['Period'].unique()

    # Функция для создания признаков из прошлых данных
    def create_features(group, current_period):
        # Берем только прошлые периоды
        past_data = group[group['Period'] < current_period]

        if len(past_data) == 0:
            return pd.Series({
                'lag_1': np.nan,
                'lag_2': np.nan,
                'rolling_mean': np.nan
            })

        # Создаем лаги и скользящее среднее
        lag_1 = past_data['Total Sell-in'].iloc[-1] if len(past_data) >= 1 else np.nan
        lag_2 = past_data['Total Sell-in'].iloc[-2] if len(past_data) >= 2 else np.nan
        rolling_mean = past_data['Total Sell-in'].mean()

        return pd.Series({
            'lag_1': lag_1,
            'lag_2': lag_2,
            'rolling_mean': rolling_mean
        })

    for period in predict_periods:
        train_data = df[df['BPV'] == df['Total Sell-in']].copy()

        predict_data = df[(df['Period'] == period) & condition].copy()

        if len(train_data) == 0 or len(predict_data) == 0:
            continue

        train_features = train_data.groupby(['DFU', 'Customer']).apply(
            lambda x: create_features(x, period)
        ).reset_index()

        predict_features = predict_data.groupby(['DFU', 'Customer']).apply(
            lambda x: create_features(x, period)
        ).reset_index()

        categorical_features = ['Season_encoded', 'Type_encoded', 'Geography_encoded']
        train_cat = train_data.groupby(['DFU', 'Customer'])[categorical_features].first().reset_index()
        predict_cat = predict_data.groupby(['DFU', 'Customer'])[categorical_features].first().reset_index()

        train_features = train_features.merge(train_cat, on=['DFU', 'Customer'], how='left')
        predict_features = predict_features.merge(predict_cat, on=['DFU', 'Customer'], how='left')

        train_features = train_features.dropna()
        if len(train_features) == 0:
            continue

        y_train = train_data.groupby(['DFU', 'Customer'])['Total Sell-in'].first().reset_index()
        train_features = train_features.merge(y_train, on=['DFU', 'Customer'], how='left')

        features = ['lag_1', 'lag_2', 'rolling_mean'] + categorical_features
        X_train = train_features[features]
        y_train = train_features['Total Sell-in']

        X_predict = predict_features[features]

        # Обучаем модель
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        predictions = model.predict(X_predict)

        # Применяем ограничение 0.09 < BPV / Total Sell-in predicted < 0.92
        bpv_values = predict_data.groupby(['DFU', 'Customer'])['BPV'].first().reset_index()
        predict_features = predict_features.merge(bpv_values, on=['DFU', 'Customer'], how='left')

        # Вычисляем минимальное и максимальное значение Total Sell-in predicted
        max_pred = predict_features['BPV'] / 0.915  # Для верхней границы < 0.92
        min_pred = predict_features['BPV'] / 0.09  # Для нижней границы > 0.09

        # Корректируем предсказания, чтобы они попадали в диапазон
        predictions = np.maximum(predictions, max_pred + 0.001)  # > min_pred
        predictions = np.minimum(predictions, min_pred - 0.001)  # < max_pred

        # Обновляем предсказанные значения в основном датафрейме
        for idx, (dfu, customer) in enumerate(predict_features[['DFU', 'Customer']].values):
            mask = (df['Period'] == period) & (df['DFU'] == dfu) & (df['Customer'] == customer)
            df.loc[mask, 'Total Sell-in predicted'] = predictions[idx]

    df = df.drop(columns=['DFU_encoded', 'Customer_encoded', 'Season_encoded',
                          'Type_encoded', 'Geography_encoded'])

    return df


# Использование функции
result_df = prepare_and_predict_sales(sales_with_values)
result_df.to_excel('result_df.xlsx')

In [70]:
result_df.loc[(result_df['BPV'] != result_df['Total Sell-in']) &
              (result_df['BPV'] != 0), 'BPV_sale_period'] = round(
    result_df['BPV'] / result_df['Total Sell-in predicted'] * 6, 0)
result_df.loc[(result_df['BPV'] != result_df['Total Sell-in']) &
              (result_df['BPV'] != 0), 'SoD_sale_period'] = 6 - round(
    result_df['BPV'] / result_df['Total Sell-in predicted'] * 6, 0)
result_df.to_excel('result_df.xlsx')

In [71]:
promo_2_df = promo_2_df[["Customer", "DFU", "First Date of shipment", "End Date of shipment"]]
promo_2_df['DFU'] = promo_2_df['DFU'].replace('Рис длиннозерный 500 гр', 'Рис длиннозерный 486 гр')
promo_2_df = promo_2_df.drop_duplicates()
promo_2_df['First Date of shipment'] = pd.to_datetime(promo_2_df['First Date of shipment'])
promo_2_df['End Date of shipment'] = pd.to_datetime(promo_2_df['End Date of shipment'])

# Период начала и конца
start_date = pd.Timestamp('2021-06-28')
end_date = pd.Timestamp('2023-01-01')

# Создаём список всех недельных периодов
weeks = pd.date_range(start=start_date, end=end_date, freq='W-MON')  # Понедельники
periods = pd.DataFrame({'Period': weeks})
periods['End of Period'] = periods['Period'] + pd.Timedelta(days=6)

# Картезианское произведение Customer/DFU и недель
promo_2_expanded = promo_2_df.merge(periods, how='cross')

# Функция для расчёта SoD_sale_period и BPV_sale_period
def compute_periods(row):
    start = row['Period']
    end = row['End of Period']
    first = row['First Date of shipment']
    last = row['End Date of shipment']

    # Инициализируем
    sod_days = 0
    bpv_days = 6

    if first <= end and last >= start:
        # Есть пересечение
        if first >= start and first <= end:
            bpv_days = (first - start).days
            sod_days = (end - first).days
        elif last >= start and last <= end:
            sod_days = (last - start).days
            bpv_days = (end - last).days
        elif first <= start and last >= end:
            sod_days = 6
            bpv_days = 0
    else:
        # Нет пересечения
        sod_days = 0
        bpv_days = 6

    return pd.Series({'BPV_sale_period': bpv_days, 'SoD_sale_period': sod_days})

# Применяем функцию к строкам
promo_2_expanded[['BPV_sale_period', 'SoD_sale_period']] = promo_2_expanded.apply(compute_periods, axis=1)

# Функция определения "приоритетности" строки
def priority_flag(row):
    return not ((row['BPV_sale_period'] == 6 and row['SoD_sale_period'] == 0) or
                (row['BPV_sale_period'] == 0 and row['SoD_sale_period'] == 6))

# Добавим флаг приоритетности
promo_2_expanded['priority'] = promo_2_expanded.apply(priority_flag, axis=1)

# Сортируем, чтобы приоритетные строки шли первыми
promo_2_expanded_sorted = promo_2_expanded.sort_values(by='priority', ascending=False)

# Удаляем дубликаты по Customer, DFU и Period, оставляя приоритетную запись
promo_2_deduplicated = promo_2_expanded_sorted.drop_duplicates(subset=['Customer', 'DFU', 'Period'])

promo_2_deduplicated = promo_2_deduplicated.drop(columns=['priority','First Date of shipment' , 'End Date of shipment'])

In [72]:
promo_2_deduplicated.to_excel('promo_2_deduplicated.xlsx')

In [73]:
sales_with_promo = pd.merge(
    sales_missing_values,
    promo_2_deduplicated,
    on=['Customer', 'DFU', 'Period', 'End of Period'],
    how='left'
)
sales_with_promo = sales_with_promo.drop(columns=['BPV_sale_period_x', 'SoD_sale_period_x'])
sales_with_promo = sales_with_promo.rename(columns={'BPV_sale_period_y': 'BPV_sale_period', 'SoD_sale_period_y': 'SoD_sale_period'})
sales_with_promo.loc[sales_with_promo['Customer'] != 1, ['SoD_sale_period', 'BPV_sale_period']] = [0, 6]
sales_with_promo.to_excel('sales_with_promo.xlsx')

In [74]:
final_df = pd.concat([sales_with_promo, result_df], ignore_index=True)
final_df.to_excel('final_df.xlsx')

Модели

V1

V2

In [75]:
# Функция подготовки признаков
def prepare_features(df):
    df = df.copy()
    df['Period'] = pd.to_datetime(df['Period'])
    df = df.sort_values(['Period'])

    # Добавление лагов и скользящих средних
    for lag in [1, 2, 4, 8]:
        df[f'BPV_lag_{lag}'] = df['BPV'].shift(lag)
    df['BPV_rolling_mean_4'] = df['BPV'].rolling(4, min_periods=1).mean()

    # Разбиение даты на компоненты
    df['Year'] = df['Period'].dt.year
    df['Month'] = df['Period'].dt.month
    df['Week'] = df['Period'].dt.isocalendar().week

    # One-hot кодирование категориальных признаков
    df = pd.get_dummies(df, columns=['Season', 'Type', 'Geography'], drop_first=True)

    df = df.dropna()
    return df


# Функция обучения модели и предсказания
def train_and_predict(group):
    group = prepare_features(group)

    features = [col for col in group.columns if col not in ['BPV', 'Period', 'Customer', 'DFU', 'End of Period']]
    X = group[features]
    y = group['BPV']

    if len(X) < 10:
        print(f"Недостаточно данных для Customer: {group['Customer'].iloc[0]}, DFU: {group['DFU'].iloc[0]}")
        return pd.DataFrame()

    # Логарифмическое преобразование целевой переменной
    y_log = np.log1p(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, shuffle=False)

    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, max_depth=6)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')

    model.fit(X_train, y_train)
    y_pred_log = model.predict(X_test)

    # Обратное преобразование предсказаний
    y_pred = np.expm1(y_pred_log)
    y_test_actual = np.expm1(y_test)

    wape = (np.sum(np.abs(y_test_actual - y_pred)) / np.sum(y_test_actual)) * 100

    print(
        f'Customer: {group["Customer"].iloc[0]}, DFU: {group["DFU"].iloc[0]}, Cross-validation MAE: {-np.mean(scores):.4f}, WAPE on test set: {wape:.2f}%')

    group.loc[X_test.index, 'Predicted_BPV'] = y_pred
    return group


# Группировка по уникальным комбинациям Customer и DFU и последовательная обработка
results = []
grouped = sales_df_cleaned.groupby(['Customer', 'DFU'])
for name, group in grouped:
    result = train_and_predict(group)
    if not result.empty:
        results.append(result)

# Объединение результатов
final_df = pd.concat(results)

Customer: 1, DFU: Рис длиннозерный 486 гр, Cross-validation MAE: 0.2573, WAPE on test set: 9.74%
Customer: 1, DFU: Рис для плова 500 гр, Cross-validation MAE: 0.2020, WAPE on test set: 7.19%
Customer: 1, DFU: Рис круглозерный 500 гр, Cross-validation MAE: 0.4030, WAPE on test set: 5.83%
Customer: 2, DFU: Рис басмати 500 гр, Cross-validation MAE: 0.0750, WAPE on test set: 4.15%
Customer: 14, DFU: Рис басмати 500 гр, Cross-validation MAE: 0.0833, WAPE on test set: 1.83%
Customer: 18, DFU: Рис басмати 500 гр, Cross-validation MAE: 0.0403, WAPE on test set: 0.06%
Customer: 29, DFU: Рис басмати 500 гр, Cross-validation MAE: 0.0718, WAPE on test set: 4.53%
Customer: 34, DFU: Рис басмати 500 гр, Cross-validation MAE: 0.0766, WAPE on test set: 1.63%


In [76]:
final_df.to_excel("result_3.xlsx")

In [77]:
sales_df_cleaned.head(10)

Unnamed: 0,DFU,Customer,Period,BPV,Total Sell-in,Season,Type,Geography,End of Period,BPV_sale_period,SoD_sale_period
664,Рис басмати 500 гр,2,2018-04-23,11.208,11.208,Spring,Дистрибутор,ЦФО,2018-04-29,6.0,0.0
665,Рис басмати 500 гр,2,2018-04-30,0.903,0.903,Spring,Дистрибутор,ЦФО,2018-05-06,6.0,0.0
666,Рис басмати 500 гр,2,2018-05-07,24.828,24.828,Spring,Дистрибутор,ЦФО,2018-05-13,6.0,0.0
667,Рис басмати 500 гр,2,2018-05-14,9.102,9.102,Spring,Дистрибутор,ЦФО,2018-05-20,6.0,0.0
668,Рис басмати 500 гр,2,2018-05-21,25.203,25.203,Spring,Дистрибутор,ЦФО,2018-05-27,6.0,0.0
669,Рис басмати 500 гр,2,2018-05-28,27.516,27.516,Spring,Дистрибутор,ЦФО,2018-06-03,6.0,0.0
670,Рис басмати 500 гр,2,2018-06-04,9.855,9.855,Summer,Дистрибутор,ЦФО,2018-06-10,6.0,0.0
671,Рис басмати 500 гр,2,2018-06-11,0.0,0.0,Summer,Дистрибутор,ЦФО,2018-06-17,6.0,0.0
672,Рис басмати 500 гр,2,2018-06-18,13.767,13.767,Summer,Дистрибутор,ЦФО,2018-06-24,6.0,0.0
673,Рис басмати 500 гр,2,2018-06-25,27.009,27.009,Summer,Дистрибутор,ЦФО,2018-07-01,6.0,0.0


In [78]:
sales_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1964 entries, 664 to 3188
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   DFU              1964 non-null   object        
 1   Customer         1964 non-null   int64         
 2   Period           1964 non-null   datetime64[ns]
 3   BPV              1332 non-null   float64       
 4   Total Sell-in    1332 non-null   float64       
 5   Season           1964 non-null   object        
 6   Type             1964 non-null   object        
 7   Geography        1964 non-null   object        
 8   End of Period    1964 non-null   datetime64[ns]
 9   BPV_sale_period  1138 non-null   float64       
 10  SoD_sale_period  1138 non-null   float64       
dtypes: datetime64[ns](2), float64(4), int64(1), object(4)
memory usage: 184.1+ KB


In [79]:
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

In [80]:
sales_df_cleaned

Unnamed: 0,DFU,Customer,Period,BPV,Total Sell-in,Season,Type,Geography,End of Period,BPV_sale_period,SoD_sale_period
664,Рис басмати 500 гр,2,2018-04-23,11.208,11.208,Spring,Дистрибутор,ЦФО,2018-04-29,6.0,0.0
665,Рис басмати 500 гр,2,2018-04-30,0.903,0.903,Spring,Дистрибутор,ЦФО,2018-05-06,6.0,0.0
666,Рис басмати 500 гр,2,2018-05-07,24.828,24.828,Spring,Дистрибутор,ЦФО,2018-05-13,6.0,0.0
667,Рис басмати 500 гр,2,2018-05-14,9.102,9.102,Spring,Дистрибутор,ЦФО,2018-05-20,6.0,0.0
668,Рис басмати 500 гр,2,2018-05-21,25.203,25.203,Spring,Дистрибутор,ЦФО,2018-05-27,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3184,Рис басмати 500 гр,34,2022-11-28,,,Autumn,Дистрибутор,СНГ,2022-12-04,,
3185,Рис басмати 500 гр,34,2022-12-05,,,Winter,Дистрибутор,СНГ,2022-12-11,,
3186,Рис басмати 500 гр,34,2022-12-12,,,Winter,Дистрибутор,СНГ,2022-12-18,,
3187,Рис басмати 500 гр,34,2022-12-19,,,Winter,Дистрибутор,СНГ,2022-12-25,,


In [81]:
sales_df_cleaned.groupby(['Customer', 'DFU']).value_counts()

Customer  DFU                      Period      BPV      Total Sell-in  Season  Type         Geography  End of Period  BPV_sale_period  SoD_sale_period
1         Рис длиннозерный 486 гр  2018-04-23  58.9530  58.9530        Spring  Сеть         Москва     2018-04-29     6.0              0.0                1
                                   2018-04-30  0.0000   0.0000         Spring  Сеть         Москва     2018-05-06     6.0              0.0                1
                                   2018-05-07  65.7930  65.7930        Spring  Сеть         Москва     2018-05-13     6.0              0.0                1
                                   2018-05-14  25.5030  25.5030        Spring  Сеть         Москва     2018-05-20     6.0              0.0                1
                                   2018-06-11  26.7540  26.7540        Summer  Сеть         Москва     2018-06-17     6.0              0.0                1
                                                                     

In [89]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

df_cus_1_rice_486g = sales_df_cleaned[
    (sales_df_cleaned['Customer'] == 1) & (sales_df_cleaned['DFU'] == 'Рис длиннозерный 486 гр')]


def create_features(df):
    df['Month'] = df['Period'].dt.month
    df['Quarter'] = df['Period'].dt.quarter
    df['Year'] = df['Period'].dt.year
    df['Weekday'] = df['Period'].dt.weekday

    df = pd.get_dummies(df.drop(columns=['DFU']), drop_first=True)

    df['BPV_lag1'] = df['BPV'].shift(1)
    df['BPV_lag2'] = df['BPV'].shift(2)
    df['BPV_lag3'] = df['BPV'].shift(3)
    df['BPV_lag4'] = df['BPV'].shift(4)

    df['Rolling_Mean_BPV'] = df['BPV'].rolling(window=4).mean()

    df = df.dropna(subset=['BPV_lag1', 'BPV_lag2', 'BPV_lag3', 'BPV_lag4'])

    return df


df_cus_1_rice_486g = create_features(df_cus_1_rice_486g)

features = [col for col in df_cus_1_rice_486g.columns if
            col not in ['DFU', 'Customer', 'Period', 'BPV', 'Total Sell-in', 'End of Period']]
X = df_cus_1_rice_486g[features]
y = df_cus_1_rice_486g['BPV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Проверяем на наличие NaN или Infinity в y_train
if pd.isna(y_train).any() or np.isinf(y_train).any():
    y_train = np.nan_to_num(y_train, nan=np.mean(y_train), posinf=np.max(y_train), neginf=np.min(y_train))


model_cus_1_rice_486g = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model_cus_1_rice_486g.fit(X_train_scaled, y_train)

y_pred = model_cus_1_rice_486g.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
logger.info(f"MAE для Customer=1, DFU=Рис длиннозерный 486 гр: {mae:.4f}")


def calculate_wape(actual, predicted):
    return np.sum(np.abs(actual - predicted)) / np.sum(actual) * 100


wape = calculate_wape(y_test, y_pred)
logger.info(f"WAPE для Customer=1, DFU=Рис длиннозерный 486 гр: {wape:.2f}%")


def predict_bpv_for_group(model, scaler, last_data, start_date, weeks=78):
    forecast = []
    dates = pd.date_range(start=start_date, periods=weeks, freq='W')
    current_input = last_data.values.reshape(1, -1)
    current_input_scaled = scaler.transform(current_input)

    for _ in tqdm(range(weeks), desc="Прогнозирование"):
        next_bpv = model.predict(current_input_scaled)[0]
        forecast.append(next_bpv)

        current_input = np.roll(current_input, -1, axis=1)
        current_input[0, -1] = next_bpv
        current_input_scaled = scaler.transform(current_input)

    return forecast, dates


start_date = '2021-06-28'

forecast_cus_1_rice_486g, forecast_dates = predict_bpv_for_group(model_cus_1_rice_486g, scaler,
                                                                 df_cus_1_rice_486g.iloc[-1:][features], start_date)

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_cus_1_rice_486g['Period'],
    y=df_cus_1_rice_486g['BPV'],
    mode='lines',
    name='Исторические данные',
    line=dict(color='green')
))

fig.add_trace(go.Scatter(
    x=forecast_dates,
    y=forecast_cus_1_rice_486g,
    mode='lines',
    name='Прогноз на 78 недель',
    line=dict(color='blue', dash='dash')
))

fig.update_layout(
    title='Прогноз на 78 недель для Customer=1, DFU=Рис длиннозерный 486 гр',
    xaxis_title='Дата',
    yaxis_title='BPV',
    legend=dict(x=0.7, y=0.1),
    template='plotly_dark',
    xaxis=dict(tickformat='%d-%m-%Y'),
    showlegend=True
)

fig.show()

forecast_df = pd.DataFrame({
    'Date': forecast_dates,
    'Forecast': forecast_cus_1_rice_486g
})
forecast_df.to_csv('forecast_bpv_cus_1_rice_486g.csv', index=False)
logger.info("Прогноз сохранен в 'forecast_bpv_cus_1_rice_486g.csv'")

2025-05-06 18:31:47,370 - INFO - MAE для Customer=1, DFU=Рис длиннозерный 486 гр: 11.2582
2025-05-06 18:31:47,370 - INFO - WAPE для Customer=1, DFU=Рис длиннозерный 486 гр: 37.14%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 3901.40it/s]


2025-05-06 18:31:47,431 - INFO - Прогноз сохранен в 'forecast_bpv_cus_1_rice_486g.csv'


In [34]:
!pip install -U kaleido



In [115]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Функции для обработки данных
def create_features(df):
    df['Month'] = df['Period'].dt.month
    df['Quarter'] = df['Period'].dt.quarter
    df['Year'] = df['Period'].dt.year
    df['Weekday'] = df['Period'].dt.weekday

    df = pd.get_dummies(df.drop(columns=['DFU']), drop_first=True)

    df['BPV_lag1'] = df['BPV'].shift(1)
    df['BPV_lag2'] = df['BPV'].shift(2)
    df['BPV_lag3'] = df['BPV'].shift(3)
    df['BPV_lag4'] = df['BPV'].shift(4)

    df['Rolling_Mean_BPV'] = df['BPV'].rolling(window=4).mean()

    df = df.dropna(subset=['BPV_lag1', 'BPV_lag2', 'BPV_lag3', 'BPV_lag4'])

    return df

def calculate_wape(actual, predicted):
    return np.sum(np.abs(actual - predicted)) / np.sum(actual) * 100

def predict_bpv_for_group(model, scaler, last_data, start_date, weeks=78):
    forecast = []
    dates = pd.date_range(start=start_date, periods=weeks, freq='W')
    current_input = last_data.values.reshape(1, -1)
    current_input_scaled = scaler.transform(current_input)

    for _ in tqdm(range(weeks), desc="Прогнозирование"):
        next_bpv = model.predict(current_input_scaled)[0]
        forecast.append(next_bpv)

        current_input = np.roll(current_input, -1, axis=1)
        current_input[0, -1] = next_bpv
        current_input_scaled = scaler.transform(current_input)

    return forecast, dates


# Список групп для обработки
groups = [
    (1, 'Рис длиннозерный 486 гр'),
    (1, 'Рис для плова 500 гр'),
    (1, 'Рис круглозерный 500 гр'),
    (2, 'Рис басмати 500 гр'),
    (14, 'Рис басмати 500 гр'),
    (18, 'Рис басмати 500 гр'),
    (29, 'Рис басмати 500 гр'),
    (34, 'Рис басмати 500 гр')
]

for customer, dfu in groups:
    logger.info(f"Обработка данных для Customer={customer}, DFU={dfu}")
    
    # Фильтрация данных
    df_group = sales_df_cleaned[
        (sales_df_cleaned['Customer'] == customer) & (sales_df_cleaned['DFU'] == dfu)
    ]
    
    # Применяем создание признаков
    df_group = create_features(df_group)

    # Определяем признаки и целевую переменную
    features = [col for col in df_group.columns if
                col not in ['DFU', 'Customer', 'Period', 'BPV', 'Total Sell-in', 'End of Period']]
    X = df_group[features]
    y = df_group['BPV']

    # Разделение данных на обучающую и тестовую выборки
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if pd.isna(y_train).any() or np.isinf(y_train).any():
        y_train = np.nan_to_num(y_train, nan=np.mean(y_train), posinf=np.max(y_train), neginf=np.min(y_train))

    # Масштабирование признаков
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Обучение модели XGBoost
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Прогнозирование и вычисление MAE и WAPE
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    wape = calculate_wape(y_test, y_pred)
    
    logger.info(f"MAE для Customer={customer}, DFU={dfu}: {mae:.4f}")
    logger.info(f"WAPE для Customer={customer}, DFU={dfu}: {wape:.2f}%")
    
    # Прогнозирование на 78 недель
    start_date = '2021-06-28'
    forecast, forecast_dates = predict_bpv_for_group(model, scaler, df_group.iloc[-1:][features], start_date)

    # Построение графика
    fig = go.Figure()

    # Исторические данные
    fig.add_trace(go.Scatter(
        x=df_group['Period'],
        y=df_group['BPV'],
        mode='lines',
        name=f'Исторические данные {customer}-{dfu}',
        line=dict(color='green')
    ))

    # Прогноз на 78 недель
    fig.add_trace(go.Scatter(
        x=forecast_dates,
        y=forecast,
        mode='lines',
        name=f'Прогноз на 78 недель {customer}-{dfu}',
        line=dict(color='blue', dash='dash')
    ))

    # Настройка внешнего вида графика
    fig.update_layout(
        title=f'Прогноз на 78 недель для Customer={customer}, DFU={dfu}',
        xaxis_title='Дата',
        yaxis_title='BPV',
        legend=dict(x=0.01, y=0.99, traceorder='normal', orientation='h', font=dict(size=10), bgcolor='rgba(255, 255, 255, 0.7)', bordercolor='black', borderwidth=1),
        template='plotly_white',  # Белый фон
        xaxis=dict(tickformat='%d-%m-%Y'),
        showlegend=True,
        plot_bgcolor='white',  # Белый фон для графика
        paper_bgcolor='white',  # Белый фон всей области
        xaxis_showgrid=True,  # Включаем сетку
        xaxis_gridcolor='lightgray',  # Легкая серая сетка
        yaxis_showgrid=True,
        yaxis_gridcolor='lightgray',
    )

    fig.show()
    fig.write_image(f"forecast_bpv_customer_{customer}_dfu_{dfu}.png")
    logger.info(f"График для Customer={customer}, DFU={dfu} сохранен в 'forecast_bpv_customer_{customer}_dfu_{dfu}.png'")

    forecast_df = pd.DataFrame({
        'Date': forecast_dates,
        'Forecast': forecast
    })
    forecast_df.to_csv(f'forecast_bpv_customer_{customer}_dfu_{dfu}.csv', index=False)
    logger.info(f"Прогноз для Customer={customer}, DFU={dfu} сохранен в 'forecast_bpv_customer_{customer}_dfu_{dfu}.csv'")

2025-05-06 19:04:33,000 - INFO - Обработка данных для Customer=1, DFU=Рис длиннозерный 486 гр
2025-05-06 19:04:33,101 - INFO - MAE для Customer=1, DFU=Рис длиннозерный 486 гр: 11.2582
2025-05-06 19:04:33,102 - INFO - WAPE для Customer=1, DFU=Рис длиннозерный 486 гр: 37.14%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 5318.65it/s]


2025-05-06 19:04:33,187 - INFO - График для Customer=1, DFU=Рис длиннозерный 486 гр сохранен в 'forecast_bpv_customer_1_dfu_Рис длиннозерный 486 гр.png'
2025-05-06 19:04:33,188 - INFO - Прогноз для Customer=1, DFU=Рис длиннозерный 486 гр сохранен в 'forecast_bpv_customer_1_dfu_Рис длиннозерный 486 гр.csv'
2025-05-06 19:04:33,189 - INFO - Обработка данных для Customer=1, DFU=Рис для плова 500 гр
2025-05-06 19:04:33,267 - INFO - MAE для Customer=1, DFU=Рис для плова 500 гр: 2.9995
2025-05-06 19:04:33,267 - INFO - WAPE для Customer=1, DFU=Рис для плова 500 гр: 41.01%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 3992.19it/s]


2025-05-06 19:04:33,346 - INFO - График для Customer=1, DFU=Рис для плова 500 гр сохранен в 'forecast_bpv_customer_1_dfu_Рис для плова 500 гр.png'
2025-05-06 19:04:33,348 - INFO - Прогноз для Customer=1, DFU=Рис для плова 500 гр сохранен в 'forecast_bpv_customer_1_dfu_Рис для плова 500 гр.csv'
2025-05-06 19:04:33,348 - INFO - Обработка данных для Customer=1, DFU=Рис круглозерный 500 гр
2025-05-06 19:04:33,444 - INFO - MAE для Customer=1, DFU=Рис круглозерный 500 гр: 6.7050
2025-05-06 19:04:33,444 - INFO - WAPE для Customer=1, DFU=Рис круглозерный 500 гр: 38.23%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 4277.44it/s]


2025-05-06 19:04:33,521 - INFO - График для Customer=1, DFU=Рис круглозерный 500 гр сохранен в 'forecast_bpv_customer_1_dfu_Рис круглозерный 500 гр.png'
2025-05-06 19:04:33,522 - INFO - Прогноз для Customer=1, DFU=Рис круглозерный 500 гр сохранен в 'forecast_bpv_customer_1_dfu_Рис круглозерный 500 гр.csv'
2025-05-06 19:04:33,523 - INFO - Обработка данных для Customer=2, DFU=Рис басмати 500 гр
2025-05-06 19:04:33,651 - INFO - MAE для Customer=2, DFU=Рис басмати 500 гр: 4.4431
2025-05-06 19:04:33,655 - INFO - WAPE для Customer=2, DFU=Рис басмати 500 гр: 60.09%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 4087.40it/s]


2025-05-06 19:04:33,735 - INFO - График для Customer=2, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_2_dfu_Рис басмати 500 гр.png'
2025-05-06 19:04:33,737 - INFO - Прогноз для Customer=2, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_2_dfu_Рис басмати 500 гр.csv'
2025-05-06 19:04:33,737 - INFO - Обработка данных для Customer=14, DFU=Рис басмати 500 гр
2025-05-06 19:04:33,871 - INFO - MAE для Customer=14, DFU=Рис басмати 500 гр: 4.0683
2025-05-06 19:04:33,872 - INFO - WAPE для Customer=14, DFU=Рис басмати 500 гр: 52.45%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 5205.59it/s]


2025-05-06 19:04:33,938 - INFO - График для Customer=14, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_14_dfu_Рис басмати 500 гр.png'
2025-05-06 19:04:33,940 - INFO - Прогноз для Customer=14, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_14_dfu_Рис басмати 500 гр.csv'
2025-05-06 19:04:33,941 - INFO - Обработка данных для Customer=18, DFU=Рис басмати 500 гр
2025-05-06 19:04:34,078 - INFO - MAE для Customer=18, DFU=Рис басмати 500 гр: 0.2570
2025-05-06 19:04:34,079 - INFO - WAPE для Customer=18, DFU=Рис басмати 500 гр: 187.88%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 3401.07it/s]


2025-05-06 19:04:34,178 - INFO - График для Customer=18, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_18_dfu_Рис басмати 500 гр.png'
2025-05-06 19:04:34,196 - INFO - Прогноз для Customer=18, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_18_dfu_Рис басмати 500 гр.csv'
2025-05-06 19:04:34,197 - INFO - Обработка данных для Customer=29, DFU=Рис басмати 500 гр
2025-05-06 19:04:34,387 - INFO - MAE для Customer=29, DFU=Рис басмати 500 гр: 5.8362
2025-05-06 19:04:34,387 - INFO - WAPE для Customer=29, DFU=Рис басмати 500 гр: 58.24%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 2581.50it/s]


2025-05-06 19:04:34,476 - INFO - График для Customer=29, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_29_dfu_Рис басмати 500 гр.png'
2025-05-06 19:04:34,478 - INFO - Прогноз для Customer=29, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_29_dfu_Рис басмати 500 гр.csv'
2025-05-06 19:04:34,479 - INFO - Обработка данных для Customer=34, DFU=Рис басмати 500 гр
2025-05-06 19:04:34,616 - INFO - MAE для Customer=34, DFU=Рис басмати 500 гр: 4.8421
2025-05-06 19:04:34,616 - INFO - WAPE для Customer=34, DFU=Рис басмати 500 гр: 71.66%
Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 4025.39it/s]


2025-05-06 19:04:34,683 - INFO - График для Customer=34, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_34_dfu_Рис басмати 500 гр.png'
2025-05-06 19:04:34,685 - INFO - Прогноз для Customer=34, DFU=Рис басмати 500 гр сохранен в 'forecast_bpv_customer_34_dfu_Рис басмати 500 гр.csv'


## 1 группа:


In [142]:
def predict_bpv_for_group_(model, scaler, last_data, start_date, weeks=78):
    forecast = []
    dates = pd.date_range(start=start_date, periods=weeks, freq='W')
    current_input = last_data.values.reshape(1, -1)
    current_input_scaled = scaler.transform(current_input)

    for _ in tqdm(range(weeks), desc="Прогнозирование"):
        # Прогнозирование следующего BPV
        next_bpv = model.predict(current_input_scaled)[0]

        # Ограничение: если предсказание меньше 0, заменяем на 0
        if next_bpv < 0:
            next_bpv = 0

        forecast.append(next_bpv)

        # Обновляем входные данные для следующего прогноза
        current_input = np.roll(current_input, -1, axis=1)
        current_input[0, -1] = next_bpv
        current_input_scaled = scaler.transform(current_input)

    return forecast, dates

In [143]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, RobustScaler
import numpy as np
import pandas as pd
import logging

# Инициализация логгера
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)

# Параметры
customer = 1
dfu = 'Рис длиннозерный 486 гр'

# Фильтрация данных
df_group = sales_df_cleaned[
    (sales_df_cleaned['Customer'] == customer) & (sales_df_cleaned['DFU'] == dfu)
].copy()

logger.info(f"Выбрана группа: Customer={customer}, DFU='{dfu}', количество записей: {len(df_group)}")

# Создание признаков
df_group = create_features(df_group)

# Формирование признаков и целевой переменной
features = [col for col in df_group.columns if
            col not in ['DFU', 'Customer', 'Period', 'BPV', 'Total Sell-in', 'End of Period']]

X = df_group[features]
y = df_group['BPV']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обработка пропусков и выбросов в целевой переменной
if pd.isna(y_train).any() or np.isinf(y_train).any():
    y_train = np.nan_to_num(y_train, nan=np.mean(y_train), posinf=np.max(y_train), neginf=np.min(y_train))

# Масштабирование признаков с использованием RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [200, 500],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0, 0.1],
}

# Генерация всех комбинаций
param_combinations = list(ParameterGrid(param_grid))
logger.info(f"Всего комбинаций для XGBoost: {len(param_combinations)}")

best_score = float('inf')
best_params = None

# Обход всех параметров с прогресс-баром и кросс-валидацией
for params in tqdm(param_combinations, desc="Grid Search XGBoost"):
    model = XGBRegressor(**params, random_state=42, verbosity=0)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
    mean_score = -np.mean(scores)

    logger.info(f"Параметры: {params}, MAE: {mean_score:.4f}")

    if mean_score < best_score:
        best_score = mean_score
        best_params = params

logger.info(f"Лучшие параметры: {best_params}")
logger.info(f"Лучшая MAE: {best_score:.4f}")

# Обучение модели XGBoost с лучшими параметрами
model_xgb = XGBRegressor(**best_params, random_state=42)
model_xgb.fit(X_train_scaled, y_train)

# Модели для ансамбля
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_catboost = CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=0)

# Ансамбль моделей
ensemble_model = VotingRegressor(estimators=[
    ('xgb', model_xgb),
    ('rf', model_rf),
    ('catboost', model_catboost)
])

ensemble_model.fit(X_train_scaled, y_train)

# Оценка ансамбля
y_pred_ensemble = ensemble_model.predict(X_test_scaled)
mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
wape_ensemble = calculate_wape(y_test, y_pred_ensemble)

logger.info(f"[{dfu}] Ensemble MAE: {mae_ensemble:.4f}")
logger.info(f"[{dfu}] Ensemble WAPE: {wape_ensemble:.2f}%")

2025-05-06 19:14:18,521 - INFO - Выбрана группа: Customer=1, DFU='Рис длиннозерный 486 гр', количество записей: 249
2025-05-06 19:14:18,533 - INFO - Всего комбинаций для XGBoost: 24
Grid Search XGBoost:   0%|          | 0/24 [00:00<?, ?it/s]2025-05-06 19:14:19,406 - INFO - Параметры: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}, MAE: 10.2475
Grid Search XGBoost:   4%|▍         | 1/24 [00:00<00:20,  1.15it/s]2025-05-06 19:14:19,827 - INFO - Параметры: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.8}, MAE: 9.9290
Grid Search XGBoost:   8%|▊         | 2/24 [00:01<00:13,  1.65it/s]2025-05-06 19:14:19,869 - INFO - Параметры: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}, MAE: 10.6774
2025-05-06 19:14:20,285 - INFO - Параметры: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05,

In [144]:
from xgboost import XGBRegressor
import numpy as np

# Лучшие параметры
best_params = {
    'colsample_bytree': 1.0,
    'gamma': 0.1,
    'learning_rate': 0.05,
    'max_depth': 3,
    'n_estimators': 1000,
    'subsample': 0.7
}

# Обучение модели
model_xgb = XGBRegressor(**best_params, random_state=42)
model_xgb.fit(X_train_scaled, y_train)

# Предсказание
y_pred_xgb = model_xgb.predict(X_test_scaled)

# Обработка отрицательных предсказаний
y_pred_xgb = np.maximum(y_pred_xgb, 0)

# Оценка качества
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
wape_xgb = calculate_wape(y_test, y_pred_xgb)

logger.info(f"[{dfu}] XGBoost MAE (best params): {mae_xgb:.4f}")
logger.info(f"[{dfu}] XGBoost WAPE (best params): {wape_xgb:.2f}%")

2025-05-06 19:14:25,902 - INFO - [Рис длиннозерный 486 гр] XGBoost MAE (best params): 7.2488
2025-05-06 19:14:25,903 - INFO - [Рис длиннозерный 486 гр] XGBoost WAPE (best params): 23.92%


In [145]:
start_date = '2021-06-28'
last_input = df_group.iloc[-1:][features]
forecast, forecast_dates = predict_bpv_for_group_(model_xgb, scaler, last_input, start_date)

Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 3076.07it/s]


In [146]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_group['Period'],
    y=df_group['BPV'],
    mode='lines',
    name='Исторические данные',
    line=dict(color='green')
))

fig.add_trace(go.Scatter(
    x=forecast_dates,
    y=forecast,
    mode='lines',
    name='Прогноз на 78 недель',
    line=dict(color='blue', dash='dash')
))

fig.update_layout(
    title=f'Прогноз для: {dfu}',
    xaxis_title='Дата',
    yaxis_title='BPV',
    template='plotly_white'
)
fig.show()

In [147]:
forecast, forecast_dates = predict_bpv_for_group_(model_xgb, scaler, last_input, start_date)

forecast_df = pd.DataFrame({
    'Date': forecast_dates,
    'Predicted BPV': forecast
})

forecast_df.to_csv('1 Рис длиннозерный 486 гр 23.92%.csv', index=False)

print(forecast_df.head())

Прогнозирование: 100%|██████████| 78/78 [00:00<00:00, 3069.06it/s]

        Date  Predicted BPV
0 2021-07-04      23.082033
1 2021-07-11      38.059097
2 2021-07-18      40.445766
3 2021-07-25      24.138475
4 2021-08-01       0.000000





## 2 группа

In [174]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
import logging

logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)

customer = 1
dfu = 'Рис для плова 500 гр'

df_group = sales_df_cleaned[
    (sales_df_cleaned['Customer'] == customer) & (sales_df_cleaned['DFU'] == dfu)
].copy()

logger.info(f"Выбрана группа: Customer={customer}, DFU='{dfu}', количество записей: {len(df_group)}")

2025-05-06 20:07:26,464 - INFO - Выбрана группа: Customer=1, DFU='Рис для плова 500 гр', количество записей: 245


In [175]:
df_group.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, 1664 to 2635
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   DFU              245 non-null    object        
 1   Customer         245 non-null    int64         
 2   Period           245 non-null    datetime64[ns]
 3   BPV              166 non-null    float64       
 4   Total Sell-in    166 non-null    float64       
 5   Season           245 non-null    object        
 6   Type             245 non-null    object        
 7   Geography        245 non-null    object        
 8   End of Period    245 non-null    datetime64[ns]
 9   BPV_sale_period  97 non-null     float64       
 10  SoD_sale_period  97 non-null     float64       
dtypes: datetime64[ns](2), float64(4), int64(1), object(4)
memory usage: 23.0+ KB


In [177]:
print(df_group.describe())

       Customer               Period         BPV  Total Sell-in  \
count     245.0                  245  166.000000     166.000000   
mean        1.0  2020-08-24 00:00:00    7.748250      28.113493   
min         1.0  2018-04-23 00:00:00    0.000000       0.000000   
25%         1.0  2019-06-24 00:00:00    0.000000      11.011497   
50%         1.0  2020-08-24 00:00:00    6.510000      17.306634   
75%         1.0  2021-10-25 00:00:00   12.581223      32.490600   
max         1.0  2022-12-26 00:00:00   74.349000     134.315400   
std         0.0                  NaN    8.986214      27.035945   

             End of Period  BPV_sale_period  SoD_sale_period  
count                  245        97.000000        97.000000  
mean   2020-08-30 00:00:00         3.154639         2.845361  
min    2018-04-29 00:00:00         0.000000         0.000000  
25%    2019-06-30 00:00:00         0.000000         0.000000  
50%    2020-08-30 00:00:00         6.000000         0.000000  
75%    2021-10-31 

In [182]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

df = df_group.copy()

# Предположим, что df уже загружен в переменную df
# Для начала создадим лаги для предсказания BPV
for lag in range(1, 8):  # лаги с 1 по 7 недель назад
    df[f'BPV_lag_{lag}'] = df.groupby('DFU')['BPV'].shift(lag)

# Добавим временные признаки
df['weekofyear'] = df['Period'].dt.isocalendar().week
df['month'] = df['Period'].dt.month
df['year'] = df['Period'].dt.year
df['quarter'] = df['Period'].dt.quarter

# Заполняем пропуски
df.fillna(method='ffill', inplace=True)

# Разделим данные на train и test
train = df[df['Period'] < '2021-06-28']
test = df[df['Period'] >= '2021-06-28']

# Убираем лишние столбцы из данных
features = [col for col in df.columns if col not in ['BPV', 'Period', 'End of Period', 'DFU']]

# Обучаем модель LightGBM
model = LGBMRegressor(n_estimators=500, learning_rate=0.05)
model.fit(train[features], train['BPV'])

# Предсказания на тестовой выборке
y_pred = model.predict(test[features])

# Расчитаем WAPE
wape = np.sum(np.abs(test['BPV'] - y_pred)) / np.sum(test['BPV']) * 100
print(f'WAPE: {wape:.2f}%')

# Печать MAE
mae = mean_absolute_error(test['BPV'], y_pred)
print(f'MAE: {mae:.2f}')

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: Season: object, Type: object, Geography: object