In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import tqdm
import re
from sklearn.multioutput import MultiOutputRegressor
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import simplefilter

import matplotlib.pyplot as plt
import lightgbm as lgb

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os

In [3]:
# Set Matplotlib defaults
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)
%config InlineBackend.figure_format = 'retina'


def plot_multistep(y, every=1, ax=None, palette_kwargs=None):
    palette_kwargs_ = dict(palette='husl', n_colors=16, desat=None)
    if palette_kwargs is not None:
        palette_kwargs_.update(palette_kwargs)
    palette = sns.color_palette(**palette_kwargs_)
    if ax is None:
        fig, ax = plt.subplots()
    ax.set_prop_cycle(plt.cycler('color', palette))
    for date, preds in y[::every].iterrows():
        preds.index = pd.period_range(start=date, periods=len(preds))
        preds.plot(ax=ax)
    return ax

In [4]:
train_data = pd.read_csv('../data/sales_train_evaluation.csv')
calendar = pd.read_csv('../data/calendar.csv')

In [5]:
train_data

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [6]:
melt_df = pd.melt(train_data, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], 
        value_vars=[f'd_{i}' for i in range(1,1942)], value_name='sales')
melt_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0


In [7]:
# target encoding
melt_df['item_mean'] = melt_df.groupby('id')['sales'].rolling(14).mean().reset_index(level=0, drop=True).astype(np.float16)
melt_df['item_std'] = melt_df.groupby('id')['sales'].rolling(14).std().reset_index(level=0, drop=True).astype(np.float16)
melt_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,sales,item_mean,item_std
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
...,...,...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,0.643066,0.841797
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,0.142822,0.363037
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,1.000000,1.109375
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,1.428711,1.650391


In [8]:
day_columns = [column for column in train_data.columns if 'd_' in column]
#selected_items = melt_df[(melt_df['dept_id']=='FOODS_3') & (melt_df['store_id']=='TX_2')]
selected_items = melt_df

In [9]:
selected_items

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,sales,item_mean,item_std
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,,
...,...,...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,0.643066,0.841797
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,0.142822,0.363037
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,1.000000,1.109375
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,1.428711,1.650391


In [10]:
merged_item_sales = selected_items.merge(calendar, how='left', left_on='variable', right_on='d')[['date', 'sales', 'item_mean', 'item_std', 'variable', 'id']]

merged_item_sales['date'] = pd.to_datetime(merged_item_sales['date'])
merged_item_sales.set_index('date', inplace=True)
merged_item_sales.index = merged_item_sales.index.to_period('D')
merged_item_sales

Unnamed: 0_level_0,sales,item_mean,item_std,variable,id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-01-29,0,,,d_1,HOBBIES_1_001_CA_1_evaluation
2011-01-29,0,,,d_1,HOBBIES_1_002_CA_1_evaluation
2011-01-29,0,,,d_1,HOBBIES_1_003_CA_1_evaluation
2011-01-29,0,,,d_1,HOBBIES_1_004_CA_1_evaluation
2011-01-29,0,,,d_1,HOBBIES_1_005_CA_1_evaluation
...,...,...,...,...,...
2016-05-22,1,0.643066,0.841797,d_1941,FOODS_3_823_WI_3_evaluation
2016-05-22,0,0.142822,0.363037,d_1941,FOODS_3_824_WI_3_evaluation
2016-05-22,2,1.000000,1.109375,d_1941,FOODS_3_825_WI_3_evaluation
2016-05-22,0,1.428711,1.650391,d_1941,FOODS_3_826_WI_3_evaluation


In [11]:
len(merged_item_sales['id'].unique())

30490

In [12]:
fourier = CalendarFourier(freq='M', order=4)

dp = DeterministicProcess(
    index=merged_item_sales.index,
    order=1,
    constant=True,
    seasonal=True,
    drop=True,   
    additional_terms=[fourier]
    )

time_features = dp.in_sample()

In [13]:
def make_lags(df, lags):
    return pd.DataFrame(
        {
            f'y_lag_{i}': df.groupby('id').sales.shift(i) for i in range(1, lags+1)
        }
    )

lag_features = make_lags(merged_item_sales, lags=14)
lag_features = lag_features.fillna(0.0)

In [14]:
def make_rolling_mean(df, lags, window):
        return pd.DataFrame(
        {
            f'y_rolling_mean_{i}': df.groupby('id').sales.shift(i).rolling(window).mean() for i in range(1, lags+1)
        }
    )

def make_rolling_std(df, lags, window):
        return pd.DataFrame(
        {
            f'y_rolling_std_{i}': df.groupby('id').sales.shift(i).rolling(window).std() for i in range(1, lags+1)
        }
    )

    
rolling_mean_features = make_rolling_mean(merged_item_sales, lags=14, window=7)
rolling_mean_features = rolling_mean_features.fillna(0.0)

rolling_std_features = make_rolling_std(merged_item_sales, lags=14, window=7)
rolling_std_features = rolling_std_features.fillna(0.0)

In [None]:
all_data = pd.concat([time_features, lag_features, rolling_mean_features, rolling_std_features], axis=1)
all_data['sales'] = merged_item_sales['sales']
all_data['item_mean'] = merged_item_sales['item_mean']
all_data['item_std'] = merged_item_sales['item_std']
all_data['id'] = merged_item_sales['id']
all_data['trend'] = merged_item_sales.variable.apply(lambda x: x.replace('d_', '')).astype(np.float16)
all_data = all_data.fillna(0)
all_data.rename({name: re.sub(r'[^a-zA-Z0-9_]', '_', name) for name in all_data.columns}, axis=1, inplace=True)
all_data

In [None]:
def make_multistep_target(df, steps):
    return pd.concat(
        {f'F{i + 1}': df.groupby('id').sales.shift(-i)
         for i in range(steps)},
        axis=1)

y = make_multistep_target(all_data, steps=28).dropna()
y

In [None]:
all_data.to_csv('../data/features.csv', index=False)
y.to_csv('../data/target.csv', index=False)

In [None]:
X_columns = all_data.columns.drop(['sales','id'])
y_column = 'sales'

n_splits = 5  # 指定分割次数
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=90*len(merged_item_sales['id'].unique()))

y_fore = pd.Series()

lgb_params = {
    #"boosting_type": "goss",
    "n_estimators":1000,
    "boosting_type": "gbdt",
    "objective": "tweedie",
    "tweedie_variance_power": 1.1,
    "metric": "rmse",
    "learning_rate": 0.03,
    #"num_leaves": 2 ** 11 - 1,
    #"min_data_in_leaf": 2 ** 12 - 1,
    "feature_fraction": 0.5,
    #"max_bin": 100,
    "boost_from_average": False,
    #"num_boost_round": 1400,
    "verbose": -1,
    "num_threads": os.cpu_count(),
    "force_row_wise": True,
    "seed": 42
}

for fold, (train_index, test_index) in enumerate(tscv.split(all_data)):
    print(f"Fold {fold + 1}")
    #print('Train index: %s' % train_index)
    #print('Test index: %s' % test_index)
    X_train, X_test = all_data[X_columns].iloc[train_index], all_data[X_columns].iloc[test_index]
    y_train, y_test = all_data[y_column].iloc[train_index], all_data[y_column].iloc[test_index]
    
    train_ds = lgb.Dataset(X_train, label=y_train)
    valid_ds = lgb.Dataset(X_test, label=y_test)

    
    # 在这里训练模型并评估表现
    #model = lgb.LGBMRegressor(lgb_params)
    model = lgb.train(lgb_params, train_ds, valid_sets=[valid_ds], callbacks=[lgb.log_evaluation(period=100,show_stdv=False)])  # 定义模型
    #model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    #y_pred = pd.Series(model.predict(X_train), index=y_train.index)
    y_fore = pd.concat([y_fore, pd.Series(model.predict(X_test), index=y_test.index)])

In [None]:
y_fore

In [None]:
#y_fore = y_fore.rename({0: 'sales_fore'}, axis=1)
test_data = all_data[-len(y_fore):].copy()
test_data['fore'] = y_fore

In [None]:
item_id = 'FOODS_3_586_TX_2_evaluation'

In [None]:
ax = test_data[test_data['id']==item_id]['sales'][-28:].plot(color='0.25', style='.', title="Sales - Seasonal Forecast")
ax = test_data[test_data['id']==item_id]['fore'][-28:].plot(ax=ax, label="Seasonal Forecast", color='C3')
_ = ax.legend()

In [None]:
y_residual = y_fore - all_data['sales'][-len(y_fore):]
X = all_data[X_columns][-len(y_fore):]
y_residual

In [None]:
all_data['sales'][-len(y_fore):]

In [None]:
ax = all_data['sales'][-len(y_fore):].plot(color='0.25', style='.', title="Sales - Seasonal Forecast")
ax = y_residual.plot(ax=ax, label="Seasonal Forecast Residual", color='C3')
_ = ax.legend()

In [None]:
X_train, X_test = X.iloc[:-180], X.iloc[-180:]
y_train, y_test = y_residual.iloc[:-180], y_residual.iloc[-180:]
    
# 在这里训练模型并评估表现
model = XGBRegressor()  # 定义模型
model.fit(X_train, y_train)
y_residual_fore = pd.Series(model.predict(X_test), index=y_test.index)

In [None]:
print('source residuals : %s' % np.mean(y_test**2))

In [None]:
print('after xgboost residuals : %s' % mean_squared_error(y_fore[-180:] + y_residual_fore, all_data['sales'][-180:]))

In [None]:
mean_squared_error(y_fore[-180:] + y_residual_fore, all_data['sales'][-180:])/np.mean(y_test**2)

* 使用xgboost预测error，并没有减少residual

### MultiOutput Regressor

In [None]:
def make_multistep_target(df, steps):
    return pd.concat(
        {f'F{i + 1}': df.groupby('id').sales.shift(-i)
         for i in range(steps)},
        axis=1)

In [None]:
y = make_multistep_target(all_data, steps=28).dropna()
y

In [None]:
#y, X = y.align(all_data[X_columns], join='inner', axis=0)
X = all_data[all_data.trend<=1914][X_columns]
X

In [None]:
all_data.to_csv('../data/features.csv', index=False)
y.to_csv('../data/target.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

lgb_params = {
    #"boosting_type": "goss",
    "n_estimators": 1000,
    "boosting_type": "gbdt",
    "objective": "tweedie",
    "tweedie_variance_power": 1.1,
    "metric": "rmse",
    "learning_rate": 0.01,
    "num_leaves": 2 ** 5 - 1,
    #"min_data_in_leaf": 2 ** 12 - 1,
    "feature_fraction": 0.5,
    #"max_bin": 100,
    "boost_from_average": False,
    #"num_boost_round": 1400,
    "verbose": -1,
    #"num_threads": os.cpu_count(),
    "force_row_wise": True,
    "seed": 42
}

model = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))
model.fit(X_train, y_train)

y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y.columns)
y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)

In [None]:
train_rmse = mean_squared_error(y_train, y_fit, squared=False)
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
print((f"Train RMSE: {train_rmse:.2f}\n" f"Test RMSE: {test_rmse:.2f}"))

'''
palette = dict(palette='husl', n_colors=64)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6))
ax1 = all_data.sales[y_fit.index].plot(**plot_params, ax=ax1)
ax1 = plot_multistep(y_fit, ax=ax1, palette_kwargs=palette)
_ = ax1.legend(['Sales (train)', 'Forecast'])
ax2 = all_data.sales[y_pred.index].plot(**plot_params, ax=ax2)
ax2 = plot_multistep(y_pred, ax=ax2, palette_kwargs=palette)
_ = ax2.legend(['Sales (test)', 'Forecast'])
'''

### Predict

In [None]:
#valid_data = all_data[all_data.trend>1913].sales
#valid_data

valid_data = all_data[(all_data.trend>1913) & (all_data.id == 'FOODS_3_827_WI_3_evaluation')].sales
valid_data

In [None]:
fore_data = y_pred[-1:].T
fore_data.index = valid_data.index
fore_data

In [None]:
ax = valid_data.plot(color='0.25', style='.', title="Sales - Seasonal True")
ax = fore_data.plot(ax=ax, label="Seasonal Forecast", color='C3')
_ = ax.legend()

In [None]:
fore_data.T

### Predict All Items

In [None]:
y_fore = pd.DataFrame(model.predict(all_data[all_data.trend==1941][X_columns]), columns=y.columns)
y_fore['id'] = all_data[all_data.trend==1941].reset_index().id
y_fore

In [None]:
y_fore.to_csv('../result/lgb_all_evaluation_submission.csv', index=False)

In [None]:
%%time
day_columns = [column for column in train_data.columns if 'd_' in column]
y_valids = pd.DataFrame(columns=['id']+[f'F{i}' for i in range(1,29)])
for item_id in train_data['id'].unique():
    selected_item = train_data[train_data['id']==item_id]
    item_sales = selected_item[day_columns].sum()
    item_sales.reset_index()
    item_sales = pd.DataFrame(item_sales, columns=['sales'])
    item_sales = item_sales.reset_index().rename({'index':'d'},axis=1)

    merged_item_sales = item_sales.merge(calendar, how='left', on='d')[['date', 'sales']]

    merged_item_sales['date'] = pd.to_datetime(merged_item_sales['date'])
    merged_item_sales.set_index('date', inplace=True)
    merged_item_sales.index = merged_item_sales.index.to_period('D')


    y = make_multistep_target(merged_item_sales['sales'], steps=28).dropna()
    y_valid = y[-1:].copy()
    y_valid['id'] = item_id.replace('_evaluation', '_validation')

    y_valids = pd.concat([y_valids, y_valid], axis=0)

In [None]:
y_valids.to_csv('../result/validation_submission.csv', index=False)

In [None]:
all_sub = pd.concat([y_valids, y_fore], axis=0)

In [None]:
all_sub.to_csv('../result/lgb_all_submission.csv', index=False)