# Workplace for: Customer: 1; DFU: Рис для плова 500 гр

In [31]:
import plotly.express as px
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from lightgbm import LGBMRegressor

In [32]:
# Read the train and test CSVs for this Customer/DFU group in one pass,
# then show both frames as the cell output.
df, sales_2_df_t = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data_grouped/train_group_1_Рис для плова 500 гр.csv",
        "data_test_grouped/test_group_1_Рис для плова 500 гр.csv",
    )
)
df, sales_2_df_t

(                      DFU  Customer      Period  BPV  Total Sell-in  Season  \
 0    Рис для плова 500 гр         1  2022-01-24  NaN            NaN  Winter   
 1    Рис для плова 500 гр         1  2022-01-31  NaN            NaN  Winter   
 2    Рис для плова 500 гр         1  2022-02-07  NaN            NaN  Winter   
 3    Рис для плова 500 гр         1  2022-02-14  NaN            NaN  Winter   
 4    Рис для плова 500 гр         1  2022-02-21  NaN            NaN  Winter   
 ..                    ...       ...         ...  ...            ...     ...   
 240  Рис для плова 500 гр         1  2021-12-20  NaN            NaN  Winter   
 241  Рис для плова 500 гр         1  2021-12-27  NaN            NaN  Winter   
 242  Рис для плова 500 гр         1  2022-01-03  NaN            NaN  Winter   
 243  Рис для плова 500 гр         1  2022-01-10  NaN            NaN  Winter   
 244  Рис для плова 500 гр         1  2022-01-17  NaN            NaN  Winter   
 
      Type Geography End of Period  BP

In [33]:
# Replace test-set BPV outliers with the column mean.
# A value is an outlier when it lies more than two standard deviations from the mean;
# both statistics are computed on the column before any replacement.
mu, sigma = sales_2_df_t["BPV"].mean(), sales_2_df_t["BPV"].std()

# NaN deviations compare as False, so missing values are never flagged.
mask = sales_2_df_t["BPV"].sub(mu).abs().gt(2 * sigma)

# NOTE: this overwrites sales_2_df_t in place, so re-running the cell works on
# the already-clipped values.
sales_2_df_t.loc[mask, "BPV"] = mu

print(mu)
print(f"Заменено выбросов: {mask.sum()} из {len(sales_2_df_t)} строк")

6.275024999999999
Заменено выбросов: 1 из 30 строк


## Visualization

In [34]:
# Colours for the horizontal statistic lines, keyed by "<metric> <statistic>".
# Only the Mean/Median entries are referenced in this cell; the CI entries are not used here.
unique_stat_colors = {
    'BPV Mean': '#1f77b4',
    'BPV Median': '#ff7f0e',
    'BPV CI Low (95%)': '#2ca02c',
    'BPV CI Up (95%)': '#d62728',
    'Total Sell-in Mean': '#9467bd',
    'Total Sell-in Median': '#8c564b',
    'Total Sell-in CI Low (95%)': '#e377c2',
    'Total Sell-in CI Up (95%)': '#7f7f7f',
}

# One figure per (DFU, Customer) group: raw series, mean/median lines, linear trends,
# a BPV vs Total Sell-in correlation note, and the test-period series as an overlay.
for (dfu, customer), group in df.groupby(['DFU', 'Customer']):
    # Line traces connect points in row order and the CSV is not stored chronologically,
    # so order the rows by date first ("YYYY-MM-DD" strings sort chronologically).
    group = group.sort_values('Period')

    melted_group = group.melt(id_vars=['Period'], value_vars=['BPV', 'Total Sell-in'],
                              var_name='Metric', value_name='Value')

    # (metric, legend label, y value, colour) for every horizontal statistic line.
    stat_lines = []
    for metric in ['BPV', 'Total Sell-in']:
        data = group[metric].dropna()
        if data.empty:
            continue
        mean_val = data.mean()
        median_val = data.median()
        stat_lines.extend([
            (metric, f'{metric} Mean: {mean_val:.2f}', mean_val, unique_stat_colors[f'{metric} Mean']),
            (metric, f'{metric} Median: {median_val:.2f}', median_val, unique_stat_colors[f'{metric} Median'])
        ])

    fig = px.line(
        melted_group,
        x='Period', y='Value', color='Metric',
        title=f'DFU: {dfu} | Customer: {customer}',
        labels={'Value': 'Sales', 'Period': 'Date'}
    )

    # Horizontal mean/median lines spanning the whole date range of the group.
    for metric, label, value, color in stat_lines:
        fig.add_trace(go.Scatter(
            x=[group['Period'].min(), group['Period'].max()],
            y=[value, value],
            mode='lines',
            name=label,
            line=dict(dash='dot', color=color, width=1.5),
            showlegend=True
        ))

    # Linear trend per metric, fitted on the date ordinal of the non-missing points.
    for metric in ['BPV', 'Total Sell-in']:
        metric_df = group[['Period', metric]].dropna()
        if metric_df.empty:
            continue
        metric_df = metric_df.copy()
        metric_df['Period_ordinal'] = pd.to_datetime(metric_df['Period']).map(pd.Timestamp.toordinal)
        X = metric_df[['Period_ordinal']]
        y = metric_df[metric]
        model = LinearRegression().fit(X, y)
        y_pred = model.predict(X)

        fig.add_trace(go.Scatter(
            x=metric_df['Period'],
            y=y_pred,
            mode='lines',
            name=f'Trend - {metric}',
            line=dict(dash='dash', color='orange')
        ))

    # Correlation note, shown only when at least two rows have both metrics present.
    if group[['BPV', 'Total Sell-in']].dropna().shape[0] > 1:
        corr = group['BPV'].corr(group['Total Sell-in'])
        fig.add_annotation(
            text=f"Корреляция BPV и Total Sell-in: {corr:.2f}",
            xref="paper", yref="paper",
            x=0.99, y=1.05, showarrow=False,
            font=dict(size=12, color="white"),
            align="right",
            bordercolor="white", borderwidth=1
        )

    # Overlay the test-period rows of the same DFU/Customer, also in chronological order.
    filtered_sales_2 = sales_2_df_t[
        (sales_2_df_t['DFU'] == dfu) & (sales_2_df_t['Customer'] == customer)
    ].sort_values('Period')
    if not filtered_sales_2.empty:
        melted_sales_2 = filtered_sales_2.melt(id_vars=['Period'], value_vars=['BPV', 'Total Sell-in'],
                                               var_name='Metric', value_name='Value')
        for metric in melted_sales_2['Metric'].unique():
            metric_data = melted_sales_2[melted_sales_2['Metric'] == metric]
            color = 'black' if metric == 'BPV' else 'gray'
            fig.add_trace(go.Scatter(
                x=metric_data['Period'],
                y=metric_data['Value'],
                mode='lines+markers',
                name=f'test_sales - {metric}',
                line=dict(color=color, width=2, dash='dot')
            ))

    fig.update_xaxes(title_text='Дата', tickformat='%Y-%m-%d')
    fig.update_yaxes(title_text='Значение')
    fig.show()


## Machine Learning

In [35]:
# Train LightGBM on the weeks with a known BPV, predict the weeks whose BPV is missing
# in the train file, apply the rule-based corrections and score the result (WAPE)
# against the BPV values from the test file.

# --- Configuration ------------------------------------------------------------
train_path = "data_grouped/train_group_1_Рис для плова 500 гр.csv"
test_path = "data_test_grouped/test_group_1_Рис для плова 500 гр.csv"

# Inclusive window of train rows used for fitting and prediction.
TRAIN_START = pd.Timestamp("2020-01-03")
TRAIN_END = pd.Timestamp("2022-01-17")
# Multiplier for forecasts dated 15 Dec - 5 Jan.
HOLIDAY_UPLIFT = 1.439
# Multiplier for forecasts whose two preceding weeks both have BPV_sale_period == 0.
TWO_WEEKS_UPLIFT = 1.995

# --- Load ---------------------------------------------------------------------
train_full = pd.read_csv(train_path)
# Parse dates before filtering so the window is compared as timestamps, not as strings.
train_full["Period"] = pd.to_datetime(train_full["Period"])
train_full = train_full[
    (train_full["Period"] >= TRAIN_START) &
    (train_full["Period"] <= TRAIN_END)
    ].copy()

test = pd.read_csv(test_path)
test = test.rename(columns={"BPV": "BPV_true"})
test["Period"] = pd.to_datetime(test["Period"])

# --- Date-derived columns -----------------------------------------------------
# The model only consumes the train-side columns; the test frame gets the same
# columns but is used below solely as the source of BPV_true.
for frame in (train_full, test):
    frame["Period_ord"] = frame["Period"].map(pd.Timestamp.toordinal)
    frame["Period_last_year"] = frame["Period"] - pd.DateOffset(weeks=52)

# Lookup: date -> Total Sell-in, re-keyed so it joins on "the date 52 weeks earlier".
sellin_map = train_full[["Period", "Total Sell-in"]].copy()
sellin_map = sellin_map.rename(columns={
    "Period": "Period_last_year",
    "Total Sell-in": "sellin_last_year"
})

train_full = train_full.merge(
    sellin_map, on="Period_last_year", how="left"
)
# No row 52 weeks earlier (or a missing value there) is encoded as 0.
train_full["sellin_last_year"] = train_full["sellin_last_year"].fillna(0)

test = test.merge(
    sellin_map, on="Period_last_year", how="left"
)
test["sellin_last_year"] = test["sellin_last_year"].fillna(0)

# --- Feature engineering ------------------------------------------------------
train_full["bpv_vs_sod_flag"] = (
        train_full["BPV_sale_period"] > train_full["SoD_sale_period"]
).astype(int)
train_full["bpv_minus_sod"] = (
        train_full["BPV_sale_period"] - train_full["SoD_sale_period"]
)
# The small epsilon avoids division by zero.
train_full["bpv_div_sod"] = (
        train_full["BPV_sale_period"] / (train_full["SoD_sale_period"] + 1e-9)
)
# isocalendar().week is a nullable UInt32 column; cast to a plain integer so the
# feature matrix contains only NumPy dtypes.
train_full["weekofyear"] = train_full["Period"].dt.isocalendar().week.astype(int)
train_full["month"] = train_full["Period"].dt.month

# Keep the date-indexed BPV_sale_period and the prediction dates before "Period" is dropped.
sale_period_map = train_full.set_index("Period")["BPV_sale_period"]
pred_dates = train_full.loc[train_full["BPV"].isna(), "Period"]
# Positional index, aligned row-for-row with pred_df / y_pred_full below.
pred_dates_idx = pred_dates.reset_index(drop=True)

drop_cols = ["Customer", "DFU", "Period", "Period_last_year"]
train_full = train_full.drop(columns=[c for c in drop_cols if c in train_full.columns])

feature_cols = [
    "Period_ord",
    "BPV_sale_period",
    "SoD_sale_period",
    "sellin_last_year",
    "bpv_vs_sod_flag",
    "bpv_minus_sod",
    "bpv_div_sod",
    "weekofyear",
    "month"
]

# --- Fit / predict ------------------------------------------------------------
# Rows with a known BPV are the training set; rows with a missing BPV are predicted.
train_df = train_full[train_full["BPV"].notna()].reset_index(drop=True)
X_train = train_df[feature_cols]
y_train = train_df["BPV"].values

pred_df = train_full[train_full["BPV"].isna()].reset_index(drop=True)
X_pred = pred_df[feature_cols]
pred_bpv_sale = pred_df["BPV_sale_period"]

model = LGBMRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred_full = model.predict(X_pred)

# --- Rule-based corrections ---------------------------------------------------
# Force the forecast to 0 where BPV_sale_period is 0, and clip negatives to 0.
y_pred_full = np.where(pred_bpv_sale.to_numpy() == 0, 0, y_pred_full)
y_pred_full = np.maximum(y_pred_full, 0)

mask_holiday = (
        ((pred_dates_idx.dt.month == 12) & (pred_dates_idx.dt.day >= 15)) |
        ((pred_dates_idx.dt.month == 1) & (pred_dates_idx.dt.day <= 5))
).to_numpy()
y_pred_full[mask_holiday] = y_pred_full[mask_holiday] * HOLIDAY_UPLIFT


def _sale_period_was_zero(day):
    """Return True when `day` is present in sale_period_map and its value equals 0."""
    return day in sale_period_map.index and sale_period_map[day] == 0


# True for prediction dates whose two preceding weeks both had BPV_sale_period == 0.
mask_two_weeks = np.array(
    [
        bool(
            _sale_period_was_zero(dt - pd.Timedelta(weeks=1))
            and _sale_period_was_zero(dt - pd.Timedelta(weeks=2))
        )
        for dt in pred_dates_idx
    ],
    dtype=bool,
)
y_pred_full[mask_two_weeks] = y_pred_full[mask_two_weeks] * TWO_WEEKS_UPLIFT

# --- Evaluation ---------------------------------------------------------------
merged = pd.DataFrame({
    "Period": pred_dates_idx,
    "BPV_pred": y_pred_full
})

merged = merged.merge(
    test[["Period", "BPV_true"]],
    on="Period",
    how="left"
)

# Prediction dates with no value in the test file are scored against 0.
merged["BPV_true"] = merged["BPV_true"].fillna(0)

# WAPE = sum(|true - pred|) / sum(true) * 100; undefined (NaN) when sum(true) is 0.
numerator = np.abs(merged["BPV_true"] - merged["BPV_pred"]).sum()
denominator = merged["BPV_true"].sum()
wape = numerator / denominator * 100 if denominator != 0 else np.nan

print(f"WAPE (LightGBM с sellin_last_year и праздничной корректировкой) = {wape:.2f}%")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 77, number of used features: 9
[LightGBM] [Info] Start training from score 6.200595
WAPE (LightGBM с sellin_last_year и праздничной корректировкой) = 31.05%


In [36]:
# Plot actual vs predicted BPV over the prediction period.
fig = go.Figure()

# (column in `merged`, line style); the column name doubles as the legend label.
for column, line_style in (
    ("BPV_true", dict(color="blue")),
    ("BPV_pred", dict(color="red", dash="dot")),
):
    fig.add_trace(
        go.Scatter(
            x=merged["Period"],
            y=merged[column],
            mode="markers+lines",
            name=column,
            line=line_style,
            marker=dict(size=6),
        )
    )

fig.update_layout(
    title="Сравнение BPV_true и BPV_pred (с обновленным набором признаков)",
    xaxis_title="Period",
    yaxis_title="BPV",
    legend_title="Легенда",
)
fig.show()

## Revision

1. Оформил ноутбук группы (Макух Д. В.)
2. 2021-06-28, 2021-08-23, 2021-11-22 — отрицательные значения в данных (заменил на 0 и пересчитал в train SoD_sale_period и BPV_sale_period) (Макух Д. В.)

            Результаты:
                WAPE: 54.68 -> 37.48 после изменения данных
3. В периоды январских закупок (по статистике были определены с 15.12 по 5.01) увеличиваем прогноз в 1.439 раза. В периоды, когда у нас 2 недели подряд было промо, увеличиваем прогноз в 1.995 раза, т. к. остаётся эффект промо-нагрузки. (Макух Д. В.)