In [2]:
import os
import gc
import datetime
import pandas as pd
import numpy as np
import warnings
import pickle
from sklearn.model_selection import train_test_split,cross_val_score
import chinese_calendar
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
import lightgbm as lgb
import catboost
from sklearn import metrics

warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [3]:
def get_data_path(filename:str) -> str:
    return str(os.path.dirname(os.getcwd()))+'\\data\\'+filename+'.csv'


def get_time_list(starttime:datetime.datetime,endtime:datetime.datetime):
    time_list = []
    while starttime<=endtime:
        time_list.append(starttime)
        starttime += datetime.timedelta(days=1)
    return time_list


def str_to_datetime(x:str) -> datetime.datetime:
    return datetime.datetime.strptime(x, "%Y-%m-%d")


def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if col == 'date':
            continue   
        # elif col=='store_id' or col=='sku_id':
        #     df[col] = df[col].astype(int)
        # elif col=='online_y' or col=='offline_y':
        #     df[col] = df[col].astype(float)
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    # df[col] = df[col].astype(np.float16)
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # df[col] = df[col].astype('category')
            pass
    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
# 根据历史订单销售数据判定sku类型
sku_sales_df = pd.read_csv(get_data_path('sku_sales'), sep=',', encoding='utf8')
sku_sales_df['order_time'] = sku_sales_df.apply(func=lambda x: x['order_time'][:10], axis=1)
sku_sales_df = sku_sales_df.rename(columns={'order_time':'date'})
sku_sales_df['date'] = sku_sales_df['date'].apply(str_to_datetime)

# 零售品
not_complete_list = sku_sales_df.loc[(sku_sales_df.quantity%1!=0),'sku_id'].unique().tolist()
print("零售品列表如下：",not_complete_list)
print(f"零售品数量为{len(not_complete_list)}")

# 预测期每个店铺的停售品
not_sale_store_list = []
for i in range(1,13):
    print(f"第{i}号门店的停售品为：")
    all_sku_set = set([i for i in range(1,1001)])
    condition1 = (sku_sales_df.date>=datetime.datetime(2022,8,15))
    condition2 = (sku_sales_df.date<=datetime.datetime(2022,9,30)) 
    condition3 = (sku_sales_df.date>=datetime.datetime(2023,8,10))
    condition4 = (sku_sales_df.store_id==i)
    all_sku_set = set([_ for _ in range(1,1001)])
    sale_set = set(sku_sales_df.loc[((condition1&condition2) | condition3) & (condition4),'sku_id'].unique().tolist())
    not_sale_list = list(all_sku_set-sale_set)
    print(not_sale_list)
    print(f"停售种类数量为{len(not_sale_list)}")
    not_sale_store_list.append(not_sale_list)

# 新品（历史前一个月无销售记录）
new_sale_list = []
history_sale_set = set(sku_sales_df.loc[(sku_sales_df.date<datetime.datetime(2023,8,1)),'sku_id'].unique().tolist())
new_set = set(sku_sales_df.loc[(sku_sales_df.date>=datetime.datetime(2023,8,1)),'sku_id'].unique().tolist())
new_sale_list = [_ for  _ in new_set if _ not in history_sale_set]
print("新品列表如下：",new_sale_list)
print(f"新品数量为{len(new_sale_list)}")


# sku销量规模0-2,3-7,8-15,16-inf
# 只看历史同期数据
condition1 = (sku_sales_df.date>=datetime.datetime(2022,8,25))
condition2 = (sku_sales_df.date<=datetime.datetime(2022,9,20)) 
condition3 = (sku_sales_df.date>=datetime.datetime(2023,8,15))
sku_sales_df = sku_sales_df[(condition1 & condition2) | condition3]
online_sku_first_sale_list = []
online_sku_second_sale_list = []
online_sku_third_sale_list = []
online_sku_forth_sale_list = []
online_sku_sales_df = sku_sales_df[sku_sales_df.channel==2]
online_sku_sales_df = online_sku_sales_df.groupby(['store_id','sku_id','date'])[['order_id','quantity']].agg({'order_id':'count','quantity':'sum'}).reset_index()
online_sku_sales_df = online_sku_sales_df.groupby('sku_id')['quantity'].median().reset_index()
online_sku_sales_df = online_sku_sales_df.sort_values(by='sku_id')
tmp_list = online_sku_sales_df.sku_id.tolist()
for i in tmp_list:
    if online_sku_sales_df.loc[(online_sku_sales_df['sku_id']==i),'quantity'].values<=2:
        online_sku_first_sale_list.append(i)
    elif online_sku_sales_df.loc[(online_sku_sales_df['sku_id']==i),'quantity'].values<=7:
        online_sku_second_sale_list.append(i)
    elif online_sku_sales_df.loc[(online_sku_sales_df['sku_id']==i),'quantity'].values<=15:
        online_sku_third_sale_list.append(i)
    else:
        online_sku_forth_sale_list.append(i)
print(f"线上订单销量中位数小于2的sku种类数量为{len(online_sku_first_sale_list)}")
print(f"线上订单销量中位数小于7的sku种类数量为{len(online_sku_second_sale_list)}")
print(f"线上订单销量中位数小于15的sku种类数量为{len(online_sku_third_sale_list)}")
print(f"线上订单销量中位数大于15的sku种类数量为{len(online_sku_forth_sale_list)}")
offline_sku_first_sale_list = []
offline_sku_second_sale_list = []
offline_sku_third_sale_list = []
offline_sku_forth_sale_list = []
offline_sku_sales_df = sku_sales_df[sku_sales_df.channel==1]
offline_sku_sales_df = offline_sku_sales_df.groupby(['store_id','sku_id','date'])[['order_id','quantity']].agg({'order_id':'count','quantity':'sum'}).reset_index()
offline_sku_sales_df = offline_sku_sales_df.groupby('sku_id')['quantity'].median().reset_index()
offline_sku_sales_df = offline_sku_sales_df.sort_values(by='sku_id')
tmp_list = offline_sku_sales_df.sku_id.tolist()
for i in tmp_list:
        if offline_sku_sales_df.loc[(offline_sku_sales_df['sku_id']==i),'quantity'].values<=2:
            offline_sku_first_sale_list.append(i)
        elif offline_sku_sales_df.loc[(offline_sku_sales_df.sku_id==i),'quantity'].values<=7:
            offline_sku_second_sale_list.append(i)
        elif offline_sku_sales_df.loc[(offline_sku_sales_df.sku_id==i),'quantity'].values<=15:
            offline_sku_third_sale_list.append(i)
        else:
            offline_sku_forth_sale_list.append(i)
print(f"线下订单销量中位数小于2的sku种类数量为{len(offline_sku_first_sale_list)}")
print(f"线下订单销量中位数小于7的sku种类数量为{len(offline_sku_second_sale_list)}")
print(f"线下订单销量中位数小于15的sku种类数量为{len(offline_sku_third_sale_list)}")
print(f"线下订单销量中位数大于15的sku种类数量为{len(offline_sku_forth_sale_list)}")

零售品列表如下： [14, 259, 44, 442, 150, 449, 448, 443]
零售品数量为8
第1号门店的停售品为：
[867, 580, 713, 905, 906, 652, 127, 653, 814, 784, 721, 907, 909, 922, 572, 350, 543]
停售种类数量为17
第2号门店的停售品为：
[865, 866, 451, 580, 8, 713, 586, 936, 652, 653, 693, 601, 350, 543]
停售种类数量为14
第3号门店的停售品为：
[8, 905, 906, 907, 653, 909, 786, 791, 543, 931, 936, 446, 451, 580, 713, 720, 721, 350, 865, 867]
停售种类数量为20
第4号门店的停售品为：
[865, 451, 580, 936, 713, 652, 721, 601, 921, 956, 350, 543]
停售种类数量为12
第5号门店的停售品为：
[865, 866, 995, 580, 8, 713, 586, 905, 906, 907, 909, 956, 569, 127, 350, 543]
停售种类数量为16
第6号门店的停售品为：
[451, 580, 867, 713, 652, 350, 543]
停售种类数量为7
第7号门店的停售品为：
[865, 451, 580, 867, 868, 713, 652, 366, 785, 601, 350, 543]
停售种类数量为12
第8号门店的停售品为：
[865, 451, 580, 867, 931, 713, 586, 652, 653, 785, 958, 309, 601, 795, 956, 350, 543]
停售种类数量为17
第9号门店的停售品为：
[865, 451, 580, 934, 713, 652, 366, 721, 786, 309, 956, 350]
停售种类数量为12
第10号门店的停售品为：
[896, 8, 905, 906, 907, 652, 909, 910, 785, 923, 543, 927, 931, 184, 956, 451, 580, 713, 586, 72

In [5]:
predict_start_date = datetime.datetime(2023,9,1) # 预测开始时间
predict_end_date = datetime.datetime(2023,9,14)  # 预测结束时间
predict_periods = 14 # 预测步长
pred_date_list = [] # 预测日期列表
tmp = predict_start_date
while tmp<=predict_end_date:
    pred_date_list.append(tmp)
    tmp += datetime.timedelta(days=1)
pred_date_list

[datetime.datetime(2023, 9, 1, 0, 0),
 datetime.datetime(2023, 9, 2, 0, 0),
 datetime.datetime(2023, 9, 3, 0, 0),
 datetime.datetime(2023, 9, 4, 0, 0),
 datetime.datetime(2023, 9, 5, 0, 0),
 datetime.datetime(2023, 9, 6, 0, 0),
 datetime.datetime(2023, 9, 7, 0, 0),
 datetime.datetime(2023, 9, 8, 0, 0),
 datetime.datetime(2023, 9, 9, 0, 0),
 datetime.datetime(2023, 9, 10, 0, 0),
 datetime.datetime(2023, 9, 11, 0, 0),
 datetime.datetime(2023, 9, 12, 0, 0),
 datetime.datetime(2023, 9, 13, 0, 0),
 datetime.datetime(2023, 9, 14, 0, 0)]

In [42]:
# 读取数据
# 训练集数据
train_df = pd.read_csv(get_data_path('train_df_V3'))
train_df['date'] = train_df['date'].apply(str_to_datetime)
# 考虑到可能历史3天里面存在填充数据失真的情况，去掉失真数据部分
train_df = train_df[train_df.date>=datetime.datetime(2022,8,21)]
train_df = train_df[(train_df.date>=datetime.datetime(2023,7,21)) | (train_df.date<=datetime.datetime(2022,10,15))]

# 销售数据(用于测试集，下同)
sku_sales_df = pd.read_csv(get_data_path('sku_sales_precess_df_V3'))
sku_sales_df['date'] = sku_sales_df['date'].apply(str_to_datetime)

# 价格和状态信息
sku_price_and_status_df = pd.read_csv(get_data_path('sku_price_and_status'), sep=',', encoding='utf8')
# sku_price_and_status_df = sku_price_and_status_df[(sku_price_and_status_df.salable_status==1) & (sku_price_and_status_df.stock_status==1)]
sku_price_and_status_df['date'] = sku_price_and_status_df['date'].apply(str_to_datetime)
# 成本计算在之前模型训练的时候没加入（是否考虑优化)
sku_price_and_status_df = sku_price_and_status_df.sort_values(by=['store_id','sku_id','date'])
for i in range(0,7):
    sku_price_and_status_df[f'pred_{i}d_price'] = sku_price_and_status_df.groupby(['store_id','sku_id'])['original_price'].shift(i)
sku_price_and_status_df = sku_price_and_status_df.fillna(method='backfill')
sku_price_and_status_df['unit_cost'] = 0
for i in range(7):
    sku_price_and_status_df['unit_cost'] += sku_price_and_status_df[f'pred_{i}d_price']
    del sku_price_and_status_df[f'pred_{i}d_price']
    gc.collect()
sku_price_and_status_df['unit_cost'] = round(sku_price_and_status_df['unit_cost']/14,2)
sku_price_and_status_df['price_cost_diff'] = round(sku_price_and_status_df['original_price']-sku_price_and_status_df['unit_cost'],2)
sku_price_and_status_df['price_cost_cv'] = round(sku_price_and_status_df['price_cost_diff']/sku_price_and_status_df['original_price'],2)

# 在V3版本保存好的训练集少了价格和状态信息的三个特征，修正下
train_df = pd.merge(left=train_df,
                    right=sku_price_and_status_df[['store_id','sku_id','date','unit_cost','price_cost_diff','price_cost_cv']],
                    on=['store_id','sku_id','date'],
                    how='left')


# 加入商品基础信息
sku_info_df = pd.read_csv(get_data_path('sku_info'), sep=',', encoding='utf8')

# 加入门店天气信息
store_weather_df = pd.read_csv(get_data_path('store_weather'), sep=',', encoding='utf8')
store_weather_df['date'] = store_weather_df['date'].apply(str_to_datetime)

# 获取线上和线下数据
online_sku_prom_df = pd.read_csv(get_data_path('online_sku_prom_df'))
online_sku_prom_df['date'] = online_sku_prom_df['date'].apply(str_to_datetime)
offline_sku_prom_df = pd.read_csv(get_data_path('offline_sku_prom_df'))
offline_sku_prom_df['date'] = offline_sku_prom_df['date'].apply(str_to_datetime)

In [43]:
## 初始化相关映射字典以及信息
online_store_sku_all_y_mean_dict = {}
online_store_sku_all_y_std_dict = {}
offline_store_sku_all_y_mean_dict = {}
offline_store_sku_all_y_std_dict = {}
online_pre_1d_y_dict = {}
online_pre_2d_y_dict = {}
online_pre_3d_y_dict = {}
online_pre_4d_y_dict = {}
online_pre_5d_y_dict = {}
online_pre_6d_y_dict = {}
online_pre_7d_y_dict = {}
offline_pre_1d_y_dict = {}
offline_pre_2d_y_dict = {}
offline_pre_3d_y_dict = {}
offline_pre_4d_y_dict = {}
offline_pre_5d_y_dict = {}
offline_pre_6d_y_dict = {}
offline_pre_7d_y_dict = {}


for i in range(1,13):
    for j in range(1,1001):
        part = sku_sales_df[(sku_sales_df.store_id==i) & (sku_sales_df.sku_id==j)]
        part = part.sort_values(by=['date'])
        hash_key = i*10000+j
        if len(part):
            online_store_sku_all_y_mean_dict[hash_key] = part['online_store_sku_all_y_mean'].values[0]
            online_store_sku_all_y_std_dict[hash_key] = part['online_store_sku_all_y_std'].values[0]
            offline_store_sku_all_y_mean_dict[hash_key] = part['offline_store_sku_all_y_mean'].values[0]
            offline_store_sku_all_y_std_dict[hash_key] = part['offline_store_sku_all_y_std'].values[0]
            online_pre_1d_y_dict[hash_key] = part['online_pre_1d_y'].values[-1]
            online_pre_2d_y_dict[hash_key] = part['online_pre_2d_y'].values[-1]
            online_pre_3d_y_dict[hash_key] = part['online_pre_3d_y'].values[-1]
            online_pre_4d_y_dict[hash_key] = part['online_pre_4d_y'].values[-1]
            online_pre_5d_y_dict[hash_key] = part['online_pre_5d_y'].values[-1]
            online_pre_6d_y_dict[hash_key] = part['online_pre_6d_y'].values[-1]
            online_pre_7d_y_dict[hash_key] = part['online_pre_7d_y'].values[-1]
            offline_pre_1d_y_dict[hash_key] = part['offline_pre_1d_y'].values[-1]
            offline_pre_2d_y_dict[hash_key] = part['offline_pre_2d_y'].values[-1]
            offline_pre_3d_y_dict[hash_key] = part['offline_pre_3d_y'].values[-1]
            offline_pre_4d_y_dict[hash_key] = part['offline_pre_4d_y'].values[-1]
            offline_pre_5d_y_dict[hash_key] = part['offline_pre_5d_y'].values[-1]
            offline_pre_6d_y_dict[hash_key] = part['offline_pre_6d_y'].values[-1]
            offline_pre_7d_y_dict[hash_key] = part['offline_pre_7d_y'].values[-1]
        else:
            for d in [online_store_sku_all_y_mean_dict,online_store_sku_all_y_std_dict,
                offline_store_sku_all_y_mean_dict,offline_store_sku_all_y_std_dict,
                online_pre_1d_y_dict,online_pre_2d_y_dict,online_pre_3d_y_dict,
                online_pre_4d_y_dict,online_pre_5d_y_dict,online_pre_6d_y_dict,online_pre_7d_y_dict,
                offline_pre_1d_y_dict,offline_pre_2d_y_dict,offline_pre_3d_y_dict,
                offline_pre_4d_y_dict,offline_pre_5d_y_dict,offline_pre_6d_y_dict,offline_pre_7d_y_dict,]:
                d[hash_key] = 0


mapping_dict = [online_store_sku_all_y_mean_dict,online_store_sku_all_y_std_dict,
                offline_store_sku_all_y_mean_dict,offline_store_sku_all_y_std_dict]
            
mapping_col = ['online_store_sku_all_y_mean','online_store_sku_all_y_std',
                'offline_store_sku_all_y_mean','offline_store_sku_all_y_std']

In [44]:
na_0_list = ['online_store_sku_all_y_std','offline_store_sku_all_y_std',
             'online_curr_day','online_threshold','online_prom_cur_total_rate',
             'offline_curr_day','offline_threshold','offline_prom_cur_total_rate',
             'sku_weather_online_y_std','sku_weather_offline_y_std',
             'online_store_sku_all_y_cv','offline_store_sku_all_y_cv',
             'sku_prom_offline_y_std','sku_prom_online_y_std',
             'sku_workday_online_y_std','sku_workday_offline_y_std',
             'sku_month_online_y_std','sku_month_offline_y_std',

             'unit_cost','price_cost_diff','price_cost_cv'
             ]
na_M_list = ['online_total_days','offline_total_days',]
category_col = ['store_id','sku_id','item_first_cate_cd', 'item_second_cate_cd', 'item_third_cate_cd','brand_code','weather_type',
                'online_promotion_type','offline_promotion_type',
                'if_new_sale_sku','online_sku_sale_scale','offline_sku_sale_scale',
                'month', 'dayofweek','if_workday',]

In [45]:
online_lightgbm_category_col = ['store_id','sku_id','item_first_cate_cd', 'item_second_cate_cd', 'item_third_cate_cd','brand_code','weather_type',
                                'online_promotion_type','month', 'dayofweek','if_workday','if_new_sale_sku','online_sku_sale_scale',]
online_lightgbm_numeric_col = ['online_pre_1d_y', 'online_pre_2d_y', 'online_pre_3d_y','online_pre_3d_y_mean', 'online_pre_3d_y_std',
                      'online_store_sku_all_y_mean', 'online_store_sku_all_y_std','original_price','min_temperature', 'max_temperature',
                      'online_curr_day', 'online_total_days', 'online_threshold', 'online_discount_off',
                      'online_prom_cur_total_rate','online_discount_threshold_rate','dayofmonth','dayofyear', 
                      'sku_online_y_mean', 'sku_online_y_std','sku_online_y_mid','sku_online_y_min', 'sku_online_y_max',
                      'original_price_diff', 'original_price_cv','sku_online_y_cv','online_store_sku_all_y_cv',
                      'sku_month_online_y_mean','sku_month_online_y_std','sku_month_online_y_mid','sku_month_online_y_max','sku_month_online_y_min',
                      'sku_weather_online_y_mean','sku_weather_online_y_std','sku_weather_online_y_mid','sku_weather_online_y_max','sku_weather_online_y_min',
                      'sku_prom_online_y_mean','sku_prom_online_y_std','sku_prom_online_y_mid','sku_prom_online_y_max','sku_prom_online_y_min',
                      'sku_workday_online_y_mean','sku_workday_online_y_std','sku_workday_online_y_mid','sku_workday_online_y_max','sku_workday_online_y_min',
                      'unit_cost','price_cost_diff','price_cost_cv']
online_lightgbm_target = ['online_y']
online_lightgbm_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'online_model_lightgbm_3.model','rb').read()  #注意此处model是rb
online_lightgbm_model = pickle.loads(online_lightgbm_f)
offline_lightgbm_category_col = ['store_id','sku_id','item_first_cate_cd', 'item_second_cate_cd', 'item_third_cate_cd','brand_code','weather_type',
                                'offline_promotion_type','month', 'dayofweek','if_workday','if_new_sale_sku','offline_sku_sale_scale',]
offline_lightgbm_numeric_col = ['offline_pre_1d_y', 'offline_pre_2d_y', 'offline_pre_3d_y','offline_pre_3d_y_mean', 'offline_pre_3d_y_std',
                       'offline_store_sku_all_y_mean', 'offline_store_sku_all_y_std','original_price','min_temperature', 'max_temperature',
                       'offline_curr_day','offline_total_days', 'offline_threshold','offline_discount_off',
                       'offline_prom_cur_total_rate','offline_discount_threshold_rate', 'dayofmonth','dayofyear', 
                       'sku_offline_y_mean','sku_offline_y_std', 'sku_offline_y_mid','sku_offline_y_min', 'sku_offline_y_max',
                       'original_price_diff', 'original_price_cv','sku_offline_y_cv','offline_store_sku_all_y_cv',
                       'sku_month_offline_y_mean','sku_month_offline_y_std','sku_month_offline_y_mid','sku_month_offline_y_max','sku_month_offline_y_min',
                       'sku_weather_offline_y_mean','sku_weather_offline_y_std','sku_weather_offline_y_mid','sku_weather_offline_y_max','sku_weather_offline_y_min',
                       'sku_prom_offline_y_mean','sku_prom_offline_y_std','sku_prom_offline_y_mid','sku_prom_offline_y_max','sku_prom_offline_y_min',
                       'sku_workday_offline_y_mean','sku_workday_offline_y_std','sku_workday_offline_y_mid','sku_workday_offline_y_max','sku_workday_offline_y_min',
                       'unit_cost','price_cost_diff','price_cost_cv']
offline_lightgbm_target = ['offline_y']
offline_lightgbm_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'offline_model_lightgbm_3.model','rb').read()  #注意此处model是rb
offline_lightgbm_model = pickle.loads(offline_lightgbm_f)

In [46]:
online_xgboost_category_col = ['store_id','sku_id','item_first_cate_cd', 'item_second_cate_cd', 'item_third_cate_cd','brand_code','weather_type',
                                'online_promotion_type','month', 'dayofweek','if_workday','if_new_sale_sku','online_sku_sale_scale',]
online_xgboost_numeric_col = ['online_pre_1d_y', 'online_pre_2d_y', 'online_pre_3d_y','online_pre_3d_y_mean', 'online_pre_3d_y_std',
                      'online_store_sku_all_y_mean', 'online_store_sku_all_y_std','original_price','min_temperature', 'max_temperature',
                      'online_curr_day', 'online_total_days', 'online_threshold', 'online_discount_off',
                      'online_prom_cur_total_rate','online_discount_threshold_rate','dayofmonth','dayofyear', 
                      'sku_online_y_mean', 'sku_online_y_std','sku_online_y_mid','sku_online_y_min', 'sku_online_y_max',
                      'original_price_diff', 'original_price_cv','sku_online_y_cv','online_store_sku_all_y_cv',
                      'sku_month_online_y_mean','sku_month_online_y_std','sku_month_online_y_mid','sku_month_online_y_max','sku_month_online_y_min',
                      'sku_weather_online_y_mean','sku_weather_online_y_std','sku_weather_online_y_mid','sku_weather_online_y_max','sku_weather_online_y_min',
                      'sku_prom_online_y_mean','sku_prom_online_y_std','sku_prom_online_y_mid','sku_prom_online_y_max','sku_prom_online_y_min',
                      'sku_workday_online_y_mean','sku_workday_online_y_std','sku_workday_online_y_mid','sku_workday_online_y_max','sku_workday_online_y_min',
                      'unit_cost','price_cost_diff','price_cost_cv']
online_xgboost_target = ['online_y_log']
online_xgboost_enc = None #后续处理
online_xgboost_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'online_model_xgboost_3.model','rb').read()  #注意此处model是rb
online_xgboost_model = pickle.loads(online_xgboost_f)

offline_xgboost_category_col = ['store_id','sku_id','item_first_cate_cd', 'item_second_cate_cd', 'item_third_cate_cd','brand_code','weather_type',
                                'offline_promotion_type','month', 'dayofweek','if_workday','if_new_sale_sku','offline_sku_sale_scale',]
offline_xgboost_numeric_col = ['offline_pre_1d_y', 'offline_pre_2d_y', 'offline_pre_3d_y','offline_pre_3d_y_mean', 'offline_pre_3d_y_std',
                       'offline_store_sku_all_y_mean', 'offline_store_sku_all_y_std','original_price','min_temperature', 'max_temperature',
                       'offline_curr_day','offline_total_days', 'offline_threshold','offline_discount_off',
                       'offline_prom_cur_total_rate','offline_discount_threshold_rate', 'dayofmonth','dayofyear', 
                       'sku_offline_y_mean','sku_offline_y_std', 'sku_offline_y_mid','sku_offline_y_min', 'sku_offline_y_max',
                       'original_price_diff', 'original_price_cv','sku_offline_y_cv','offline_store_sku_all_y_cv',
                       'sku_month_offline_y_mean','sku_month_offline_y_std','sku_month_offline_y_mid','sku_month_offline_y_max','sku_month_offline_y_min',
                       'sku_weather_offline_y_mean','sku_weather_offline_y_std','sku_weather_offline_y_mid','sku_weather_offline_y_max','sku_weather_offline_y_min',
                       'sku_prom_offline_y_mean','sku_prom_offline_y_std','sku_prom_offline_y_mid','sku_prom_offline_y_max','sku_prom_offline_y_min',
                       'sku_workday_offline_y_mean','sku_workday_offline_y_std','sku_workday_offline_y_mid','sku_workday_offline_y_max','sku_workday_offline_y_min',
                       'unit_cost','price_cost_diff','price_cost_cv']
offline_xgboost_target = ['offline_y_log']
offline_xgboost_enc = None #后续处理
offline_xgboost_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'offline_model_xgboost_3.model','rb').read()  #注意此处model是rb
offline_xgboost_model = pickle.loads(offline_xgboost_f)  


In [47]:
online_catboost_category_col = ['store_id','sku_id','weather_type',
                                'online_promotion_type','month', 'dayofweek','if_workday','if_new_sale_sku','online_sku_sale_scale',]
online_catboost_numeric_col = ['online_pre_1d_y', 'online_pre_2d_y', 'online_pre_3d_y','online_pre_3d_y_mean', 'online_pre_3d_y_std',
                      'original_price','min_temperature', 'max_temperature',
                      'online_curr_day', 'online_total_days', 'online_threshold', 'online_discount_off',
                      'online_prom_cur_total_rate','online_discount_threshold_rate','dayofmonth','dayofyear', 
                      'sku_online_y_mid',
                      'original_price_diff', 'original_price_cv','sku_online_y_cv','online_store_sku_all_y_cv',
                      'sku_month_online_y_mid',
                      'sku_weather_online_y_mid',
                      'sku_prom_online_y_mid',
                      'sku_workday_online_y_mid',
                      'unit_cost','price_cost_diff','price_cost_cv']
online_catboost_target = ['online_y_log']
online_catboost_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'online_model_catboost_3.model','rb').read()  #注意此处model是rb
online_catboost_model = pickle.loads(online_catboost_f)

offline_catboost_category_col = ['store_id','sku_id', 'weather_type',
                                'offline_promotion_type','month', 'dayofweek','if_workday','if_new_sale_sku','offline_sku_sale_scale',]
offline_catboost_numeric_col = ['offline_pre_1d_y', 'offline_pre_2d_y', 'offline_pre_3d_y','offline_pre_3d_y_mean', 'offline_pre_3d_y_std',
                       'original_price','min_temperature', 'max_temperature',
                       'offline_curr_day','offline_total_days', 'offline_threshold','offline_discount_off',
                       'offline_prom_cur_total_rate','offline_discount_threshold_rate', 'dayofmonth','dayofyear', 
                       'sku_offline_y_mid',
                       'original_price_diff', 'original_price_cv','sku_offline_y_cv','offline_store_sku_all_y_cv',
                       'sku_month_offline_y_mid',
                       'sku_weather_offline_y_mid',
                       'sku_prom_offline_y_mid',
                       'sku_workday_offline_y_mid',
                       'unit_cost','price_cost_diff','price_cost_cv']
offline_catboost_target = ['offline_y_log']
offline_catboost_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'offline_model_catboost_3.model','rb').read()  #注意此处model是rb
offline_catboost_model = pickle.loads(offline_catboost_f)

In [48]:
online_stacking_category_col = ['item_third_cate_cd','weather_type',
                                'dayofweek','online_sku_sale_scale',]
online_stacking_numeric_col = ['online_pre_1d_y', 'online_pre_2d_y', 'online_pre_3d_y', 'online_pre_4d_y', 'online_pre_5d_y',
                      'original_price','min_temperature', 'max_temperature',
                      'online_curr_day', 'online_discount_off',
                      'online_prom_cur_total_rate',
                      'sku_online_y_mean','sku_online_y_mid', 'sku_online_y_max',
                      'original_price_diff', 'original_price_cv','sku_online_y_cv',
                      'sku_month_online_y_mid','sku_month_online_y_max',
                      'sku_weather_online_y_mid','sku_weather_online_y_max',
                      'sku_prom_online_y_mid','sku_prom_online_y_max','sku_prom_online_y_min',
                      'sku_workday_online_y_mid',
                      'online_y_catboost_log','online_y_lightgbm_log','online_y_xgboost_log',
                      'online_y_min_log','online_y_diff_log',
                      'online_y_min','online_y_diff',
                      'online_y_log_pred','online_y_pred'
                      ]
online_stacking_target = ['online_y_label']
online_clf_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'online_clf_model_3.model','rb').read()  #注意此处model是rb
online_clf_model = pickle.loads(online_clf_f)

offline_stacking_category_col = ['item_third_cate_cd','weather_type',
                                'dayofweek','offline_sku_sale_scale',]
offline_stacking_numeric_col = ['offline_pre_1d_y', 'offline_pre_2d_y','offline_pre_3d_y','offline_pre_4d_y','offline_pre_5d_y',
                       'original_price','min_temperature', 'max_temperature',
                       'offline_curr_day','offline_discount_off',
                       'offline_prom_cur_total_rate', 
                       'sku_offline_y_mean', 'sku_offline_y_mid','sku_offline_y_max',
                       'original_price_diff', 'original_price_cv','sku_offline_y_cv',
                       'sku_month_offline_y_mid','sku_month_offline_y_max',
                       'sku_weather_offline_y_mid','sku_weather_offline_y_max',
                       'sku_prom_offline_y_mid','sku_prom_offline_y_max','sku_prom_offline_y_min',
                       'sku_workday_offline_y_mid',
                       'offline_y_catboost_log','offline_y_lightgbm_log','offline_y_xgboost_log',
                       'offline_y_min_log','offline_y_diff_log',
                       'offline_y_min','offline_y_diff',
                       'offline_y_log_pred','offline_y_pred'
                       ]
offline_stacking_target = ['offline_y_label']
offline_clf_f = open(str(os.path.dirname(os.getcwd()))+'\\model\\'+'offline_clf_model_3.model','rb').read()  #注意此处model是rb
offline_clf_model = pickle.loads(offline_clf_f)

In [49]:
test_df_list = []
for dt,d in enumerate(pred_date_list):
    print(f"目前正在预测{pred_date_list[dt]}的数据.....")
    # 滚动更新前7天的销量数据
    online_pre_7d_y_dict = online_pre_6d_y_dict
    online_pre_6d_y_dict = online_pre_5d_y_dict
    online_pre_5d_y_dict = online_pre_4d_y_dict
    online_pre_4d_y_dict = online_pre_3d_y_dict
    online_pre_3d_y_dict = online_pre_2d_y_dict
    online_pre_2d_y_dict = online_pre_1d_y_dict
    online_pre_1d_y_dict = {}
    offline_pre_7d_y_dict = offline_pre_6d_y_dict
    offline_pre_6d_y_dict = offline_pre_5d_y_dict
    offline_pre_5d_y_dict = offline_pre_4d_y_dict
    offline_pre_4d_y_dict = offline_pre_3d_y_dict
    offline_pre_3d_y_dict = offline_pre_2d_y_dict
    offline_pre_2d_y_dict = offline_pre_1d_y_dict
    offline_pre_1d_y_dict = {}
    pre_1d_sku_sales_df = sku_sales_df[sku_sales_df.date==pred_date_list[dt]+datetime.timedelta(-1)]
    for i in range(1,13):
        for j in range(1,1001):
            part = pre_1d_sku_sales_df[(pre_1d_sku_sales_df.store_id==i) & (pre_1d_sku_sales_df.sku_id==j)]
            part = part.sort_values(by=['date'])
            hash_key = i*10000+j
            if len(part)>0:
                online_pre_1d_y_dict[hash_key] = part['online_y'].values[-1]
                offline_pre_1d_y_dict[hash_key] = part['offline_y'].values[-1]
            else:
                online_pre_1d_y_dict[hash_key] = 0
                offline_pre_1d_y_dict[hash_key] = 0
    update_dict_list = [online_pre_1d_y_dict,online_pre_2d_y_dict,online_pre_3d_y_dict,
                        online_pre_4d_y_dict,online_pre_5d_y_dict,online_pre_6d_y_dict,online_pre_7d_y_dict,
                        offline_pre_1d_y_dict,offline_pre_2d_y_dict,offline_pre_3d_y_dict,
                        offline_pre_4d_y_dict,offline_pre_5d_y_dict,offline_pre_6d_y_dict,offline_pre_7d_y_dict,]
    update_dict_col_list = ['online_pre_1d_y','online_pre_2d_y','online_pre_3d_y',
                            'online_pre_4d_y','online_pre_5d_y','online_pre_6d_y','online_pre_7d_y',
                            'offline_pre_1d_y','offline_pre_2d_y','offline_pre_3d_y',
                            'offline_pre_4d_y','offline_pre_5d_y','offline_pre_6d_y','offline_pre_7d_y',]

    # 初始化当天的测试数据
    dt_sales_df = pd.DataFrame(data=None,columns = sku_sales_df.columns)
    dt_sales_df['store_id'] = [i for i in range(1,13) for _ in range(1,1001)]
    dt_sales_df['sku_id'] = [i for _ in range(1,13) for i in range(1,1001)]
    dt_sales_df['date'] = pred_date_list[dt]
    for idx,col in enumerate(mapping_col):
        dt_sales_df[col] = dt_sales_df['store_id']*10000 + dt_sales_df['sku_id']
        dt_sales_df[col] = dt_sales_df[col].map(mapping_dict[idx])
        dt_sales_df[col] = dt_sales_df[col].astype(float)
    for idx,col in enumerate(update_dict_col_list):
        dt_sales_df[col] = 10000*dt_sales_df['store_id']+dt_sales_df['sku_id']
        dt_sales_df[col] = dt_sales_df[col].map(update_dict_list[idx])
        dt_sales_df[col] = dt_sales_df[col].astype(float) # 出现一个奇怪的问题是online_pre_1d_y会变成object类型，因此这里做类型转换

    dt_sales_df['online_pre_3d_y_mean'] = round(dt_sales_df[[f'online_pre_{k}d_y' for k in range(1,4)]].mean(axis=1),2)
    dt_sales_df['online_pre_3d_y_std'] = round(dt_sales_df[[f'online_pre_{k}d_y' for k in range(1,4)]].std(axis=1),2)
    dt_sales_df['offline_pre_3d_y_mean'] = round(dt_sales_df[[f'offline_pre_{k}d_y' for k in range(1,4)]].mean(axis=1),2)
    dt_sales_df['offline_pre_3d_y_std'] = round(dt_sales_df[[f'offline_pre_{k}d_y' for k in range(1,4)]].std(axis=1),2)
    dt_sales_df['online_pre_7d_y_mean'] = round(dt_sales_df[[f'online_pre_{k}d_y' for k in range(1,8)]].mean(axis=1),2)
    dt_sales_df['online_pre_7d_y_std'] = round(dt_sales_df[[f'online_pre_{k}d_y' for k in range(1,8)]].std(axis=1),2)
    dt_sales_df['offline_pre_7d_y_mean'] = round(dt_sales_df[[f'offline_pre_{k}d_y' for k in range(1,8)]].mean(axis=1),2)
    dt_sales_df['offline_pre_7d_y_std'] = round(dt_sales_df[[f'offline_pre_{k}d_y' for k in range(1,8)]].std(axis=1),2)

    # 合并其他维度特征数据
    t1 = sku_price_and_status_df[sku_price_and_status_df.date==pred_date_list[dt]]
    t2 = pd.merge(left=dt_sales_df,right=t1,on=['date','store_id','sku_id'],how='left')
    t2 = t2[~t2.original_price.isna()] # 过滤无价格字段,最后填充该行数据销量为0
    t3 = pd.merge(left=t2,right=sku_info_df,on=['sku_id'],how='left')
    t4 = pd.merge(left=t3,right=store_weather_df,on=['store_id','date'],how='left')
    t5 = pd.merge(left=t4, right=online_sku_prom_df,on=['store_id','sku_id','date'],how='left')
    t5 = pd.merge(left=t5,right=offline_sku_prom_df,on=['store_id','sku_id','date'],how='left')
    

    # 有意义的缺失值填充
    for col in ['offline_discount_off','online_discount_off','offline_discount_threshold_rate','online_discount_threshold_rate']:
        t5[col] = t5[col].fillna(0)
    for col in ['offline_promotion_type','online_promotion_type']:
        t5[col] = t5[col].fillna(-1)


    # sku业务特征
    t5['if_new_sale_sku'] = 0
    t5.loc[t5['sku_id'].isin(new_sale_list),'if_new_sale_sku'] = 1
    t5['online_sku_sale_scale'] = 1
    t5.loc[t5['sku_id'].isin(online_sku_second_sale_list),'online_sku_sale_scale'] = 2
    t5.loc[t5['sku_id'].isin(online_sku_third_sale_list),'online_sku_sale_scale'] = 3
    t5.loc[t5['sku_id'].isin(online_sku_forth_sale_list),'online_sku_sale_scale'] = 4
    t5['offline_sku_sale_scale'] = 1
    t5.loc[t5['sku_id'].isin(offline_sku_second_sale_list),'offline_sku_sale_scale'] = 2
    t5.loc[t5['sku_id'].isin(offline_sku_third_sale_list),'offline_sku_sale_scale'] = 3
    t5.loc[t5['sku_id'].isin(offline_sku_forth_sale_list),'offline_sku_sale_scale'] = 4

    # 时间特征
    t5['month'] = t5['date'].dt.month
    t5['dayofweek'] = t5['date'].dt.day_of_week
    t5['dayofmonth'] = t5['date'].dt.day
    t5['dayofyear'] = t5['date'].dt.day_of_year
    t5['if_workday'] = t5['date'].apply(func=lambda x:1 if chinese_calendar.is_workday(x) else 0)

    # onehot编码
    if online_xgboost_enc is None or offline_xgboost_enc is None:
        train_df['month'] = train_df['date'].dt.month
        train_df['dayofweek'] = train_df['date'].dt.day_of_week
        train_df['if_workday'] = train_df['date'].apply(func=lambda x:1 if chinese_calendar.is_workday(x) else 0)
        train_df['if_new_sale_sku'] = 0
        train_df.loc[train_df['sku_id'].isin(new_sale_list),'if_new_sale_sku'] = 1
        train_df['online_sku_sale_scale'] = 1
        train_df.loc[train_df['sku_id'].isin(online_sku_second_sale_list),'online_sku_sale_scale'] = 2
        train_df.loc[train_df['sku_id'].isin(online_sku_third_sale_list),'online_sku_sale_scale'] = 3
        train_df.loc[train_df['sku_id'].isin(online_sku_forth_sale_list),'online_sku_sale_scale'] = 4
        train_df['offline_sku_sale_scale'] = 1
        train_df.loc[train_df['sku_id'].isin(offline_sku_second_sale_list),'offline_sku_sale_scale'] = 2
        train_df.loc[train_df['sku_id'].isin(offline_sku_third_sale_list),'offline_sku_sale_scale'] = 3
        train_df.loc[train_df['sku_id'].isin(offline_sku_forth_sale_list),'offline_sku_sale_scale'] = 4
        train_df['offline_y_log'] = np.log(train_df['offline_y']+1)
        train_df['online_y_log'] = np.log(train_df['online_y']+1)
        online_xgboost_enc = OrdinalEncoder()
        online_xgboost_enc.fit_transform(X=train_df.loc[:,online_xgboost_category_col], 
                                         y=train_df.loc[:,online_xgboost_target])
        offline_xgboost_enc = OrdinalEncoder()
        offline_xgboost_enc.fit_transform(X=train_df.loc[:,offline_xgboost_category_col], 
                                         y=train_df.loc[:,offline_xgboost_target])
    
    # 空间特征
    tmp = train_df.groupby('sku_id')['online_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_online_y_mean',
                                                                                                            'std':'sku_online_y_std',
                                                                                                            'median':'sku_online_y_mid',
                                                                                                            'min':'sku_online_y_min',
                                                                                                            'max':'sku_online_y_max',}).round(2)
    tmp1 = train_df.groupby('sku_id')['offline_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_offline_y_mean',
                                                                                                            'std':'sku_offline_y_std',
                                                                                                            'median':'sku_offline_y_mid',
                                                                                                            'min':'sku_offline_y_min',
                                                                                                            'max':'sku_offline_y_max',}).round(2)
    tmp2 = train_df.groupby('sku_id')['original_price'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_original_price_mean',
                                                                                                            'std':'sku_original_price_std',
                                                                                                            'median':'sku_original_price_mid',
                                                                                                            'min':'sku_original_price_min',
                                                                                                            'max':'sku_original_price_max',}).round(2)
    train_df['month'] = train_df['date'].dt.month
    tmp3 = train_df.groupby(['sku_id','month'])['online_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_month_online_y_mean',
                                                                                                            'std':'sku_month_online_y_std',
                                                                                                                'median':'sku_month_online_y_mid',
                                                                                                            'min':'sku_month_online_y_min',
                                                                                                            'max':'sku_month_online_y_max',}).round(2)
    tmp4 = train_df.groupby(['sku_id','month'])['offline_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_month_offline_y_mean',
                                                                                                            'std':'sku_month_offline_y_std',
                                                                                                            'median':'sku_month_offline_y_mid',
                                                                                                            'min':'sku_month_offline_y_min',
                                                                                                            'max':'sku_month_offline_y_max',}).round(2)
    del train_df['month']
    tmp5 = train_df.groupby(['sku_id','weather_type'])['online_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_weather_online_y_mean',
                                                                                                            'std':'sku_weather_online_y_std',
                                                                                                            'median':'sku_weather_online_y_mid',
                                                                                                            'min':'sku_weather_online_y_min',
                                                                                                            'max':'sku_weather_online_y_max',}).round(2)
    tmp6 = train_df.groupby(['sku_id','weather_type'])['offline_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_weather_offline_y_mean',
                                                                                                            'std':'sku_weather_offline_y_std',
                                                                                                            'median':'sku_weather_offline_y_mid',
                                                                                                            'min':'sku_weather_offline_y_min',
                                                                                                            'max':'sku_weather_offline_y_max',}).round(2)
    tmp7 = train_df.groupby(['sku_id','offline_promotion_type'])['offline_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_prom_offline_y_mean',
                                                                                                            'std':'sku_prom_offline_y_std',
                                                                                                            'median':'sku_prom_offline_y_mid',
                                                                                                            'min':'sku_prom_offline_y_min',
                                                                                                            'max':'sku_prom_offline_y_max',}).round(2)
    tmp8 = train_df.groupby(['sku_id','online_promotion_type'])['online_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_prom_online_y_mean',
                                                                                                            'std':'sku_prom_online_y_std',
                                                                                                            'median':'sku_prom_online_y_mid',
                                                                                                            'min':'sku_prom_online_y_min',
                                                                                                            'max':'sku_prom_online_y_max',}).round(2)
    train_df['if_workday'] = train_df['date'].apply(func=lambda x:1 if chinese_calendar.is_workday(x) else 0)
    tmp9 = train_df.groupby(['sku_id','if_workday'])['offline_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_workday_offline_y_mean',
                                                                                                            'std':'sku_workday_offline_y_std',
                                                                                                            'median':'sku_workday_offline_y_mid',
                                                                                                            'min':'sku_workday_offline_y_min',
                                                                                                            'max':'sku_workday_offline_y_max',}).round(2)
    tmp10 = train_df.groupby(['sku_id','if_workday'])['online_y'].agg(['mean','std','median','min','max']).reset_index().rename(columns={'mean':'sku_workday_online_y_mean',
                                                                                                            'std':'sku_workday_online_y_std',
                                                                                                            'median':'sku_workday_online_y_mid',
                                                                                                            'min':'sku_workday_online_y_min',
                                                                                                            'max':'sku_workday_online_y_max',}).round(2)
    del train_df['if_workday']

    all_tmp = pd.merge(left=tmp,right=tmp1,how='inner',on=['sku_id'])
    all_tmp = pd.merge(left=all_tmp,right=tmp2,how='left',on=['sku_id'])
    test_df = pd.merge(left=t5,right=all_tmp,how='left',on=['sku_id'])
    test_df = pd.merge(left=test_df,right=tmp3,how='left',on=['sku_id','month'])
    test_df = pd.merge(left=test_df,right=tmp4,how='left',on=['sku_id','month'])
    test_df = pd.merge(left=test_df,right=tmp5,how='left',on=['sku_id','weather_type'])
    test_df = pd.merge(left=test_df,right=tmp6,how='left',on=['sku_id','weather_type'])
    test_df = pd.merge(left=test_df,right=tmp7,how='left',on=['sku_id','offline_promotion_type'])
    test_df = pd.merge(left=test_df,right=tmp8,how='left',on=['sku_id','online_promotion_type'])
    test_df = pd.merge(left=test_df,right=tmp9,how='left',on=['sku_id','if_workday'])
    test_df = pd.merge(left=test_df,right=tmp10,how='left',on=['sku_id','if_workday'])

    # 业务特征
    test_df['original_price_diff'] = test_df['original_price'] - test_df['sku_original_price_mid']
    test_df['original_price_cv'] = round(test_df['original_price_diff']/test_df['sku_original_price_mid'],2)
    test_df['sku_online_y_cv'] = round(test_df['sku_online_y_std']/test_df['sku_online_y_mid'],2)
    test_df['sku_offline_y_cv'] = round(test_df['sku_offline_y_std']/test_df['sku_offline_y_mid'],2)
    test_df['online_store_sku_all_y_cv'] = round(test_df['online_store_sku_all_y_std']/test_df['online_store_sku_all_y_mean'],2)
    test_df['offline_store_sku_all_y_cv'] = round(test_df['offline_store_sku_all_y_std']/test_df['offline_store_sku_all_y_mean'],2)

    # 缺失值填充
    for col in na_0_list:
        test_df[col] = test_df[col].fillna(0)
    for col in na_M_list:
        test_df[col] = test_df[col].fillna(999)
    
    # 数据内存压缩
    test_df = reduce_mem_usage(test_df)

    # 类别特征转换
    for col in category_col:
        test_df[col] = test_df[col].astype(int) #可能在填充缺失值时出现了0.0,1.0的情况
        test_df[col] = test_df[col].astype('category')

    # 模型预测
    # 线上订单部分
    online_catboost_x = test_df[online_catboost_category_col+online_catboost_numeric_col] 
    online_catboost_y = np.exp(online_catboost_model.predict(online_catboost_x))-1
    online_lightgbm_x = test_df[online_lightgbm_category_col+online_lightgbm_numeric_col] 
    online_lightgbm_y = online_lightgbm_model.predict(online_lightgbm_x,num_iteration=online_lightgbm_model.best_iteration_)
    online_xgboost_x = test_df[online_xgboost_category_col+online_xgboost_numeric_col]
    online_xgboost_x.loc[:,online_xgboost_category_col] = online_xgboost_enc._transform(online_xgboost_x.loc[:,online_xgboost_category_col],
                                                                                        handle_unknown='ignore')[0]
    # online_xgboost_x.replace([np.inf, -np.inf], 999, inplace=True) 
    online_xgboost_y = np.exp(online_xgboost_model.predict(xgb.DMatrix(online_xgboost_x,enable_categorical=True),
                                                           ntree_limit=online_xgboost_model.best_ntree_limit))-1 

    test_df['online_y_3d'] = round(0.5*test_df['online_pre_1d_y']+0.3*test_df['online_pre_2d_y']+0.2*test_df['online_pre_3d_y'],3)
    test_df['online_y_7d'] = round((0.2*test_df['online_pre_1d_y']+0.15*test_df['online_pre_2d_y']+0.15*test_df['online_pre_3d_y']+
                                  0.1*test_df['online_pre_4d_y']+0.1*test_df['online_pre_5d_y']+0.1*test_df['online_pre_6d_y']+
                                  0.2*test_df['online_pre_7d_y']),3)
    test_df['online_y_catboost'] = online_catboost_y.round(3)
    test_df['online_y_lightgbm'] = online_lightgbm_y.round(3)
    test_df['online_y_xgboost'] = online_xgboost_y.round(3)
    # # 线上订单模型融合方式1（所有模型预测值取最大）
    # test_df['online_y'] = test_df.apply(func=lambda x:max([x['online_y_3d'],x['online_y_7d'],
    #                                                        x['online_y_catboost'],
    #                                                        x['online_y_lightgbm'],
    #                                                        x['online_y_xgboost']]),axis=1)
    # # 线上订单模型融合方式2
    # test_df['online_y'] = test_df['online_y_lightgbm'].round(1)
    # 线上订单模型融合方式3
    # 基于预测值构建新特征和最终预测值,用于预测分类
    test_df['online_y_catboost_log'] = np.log(test_df['online_y_catboost']+1)
    test_df['online_y_catboost_log'] = test_df['online_y_catboost_log'].fillna(0)
    test_df['online_y_lightgbm_log'] = np.log(test_df['online_y_lightgbm']+1)
    test_df['online_y_lightgbm_log'] = test_df['online_y_lightgbm_log'].fillna(0)
    test_df['online_y_xgboost_log'] = np.log(test_df['online_y_xgboost']+1)
    test_df['online_y_xgboost_log'] = test_df['online_y_xgboost_log'].fillna(0)
    test_df['online_y_min_log'] = test_df[['online_y_catboost_log','online_y_lightgbm_log','online_y_xgboost_log']].min(axis=1)
    test_df['online_y_max_log'] = test_df[['online_y_catboost_log','online_y_lightgbm_log','online_y_xgboost_log']].max(axis=1)
    test_df['online_y_diff_log'] = test_df['online_y_max_log']-test_df['online_y_min_log']
    test_df['online_y_min'] = test_df[['online_y_3d','online_y_7d','online_y_catboost','online_y_lightgbm','online_y_xgboost']].min(axis=1)
    test_df['online_y_max'] = test_df[['online_y_3d','online_y_7d','online_y_catboost','online_y_lightgbm','online_y_xgboost']].max(axis=1)
    test_df['online_y_diff'] = test_df['online_y_max']-test_df['online_y_min']
    test_df['online_y_log_pred'] = test_df[['online_y_catboost_log','online_y_lightgbm_log','online_y_xgboost_log']].mean(axis=1)
    test_df['online_y_pred'] = np.exp(test_df['online_y_log_pred'].values)-1
    # # 取mean的值效果不理想，直接取max值
    # test_df['online_y_log_pred'] = test_df['online_y_max_log']
    # test_df['online_y_pred'] = test_df['online_y_max']

    online_clf_x = test_df[online_stacking_category_col+online_stacking_numeric_col] 
    online_y_label_pred = np.where(online_clf_model.predict(online_clf_x,num_iteration=online_clf_model.best_iteration)>=0.5,1,0)
    test_df['online_y_label_pred'] = online_y_label_pred

    # 根据分类标签结果进行修正（暂不考虑根据模型修正）
    test_df['online_y_pred_fix'] = test_df['online_y_pred']
    test_df.loc[(test_df.online_y_label_pred==1) & (test_df.online_sku_sale_scale==1),'online_y_pred_fix'] += 0.3
    test_df.loc[(test_df.online_y_label_pred==1) & (test_df.online_sku_sale_scale==2),'online_y_pred_fix'] += 0.5
    test_df.loc[(test_df.online_y_label_pred==1) & (test_df.online_sku_sale_scale==3),'online_y_pred_fix'] += 1
    test_df.loc[(test_df.online_y_label_pred==1) & (test_df.online_sku_sale_scale==4),'online_y_pred_fix'] *= 1.05
    # 根据日期进行修正
    # 第一个周五/周六
    if dt in (0,1):
        test_df['online_y_pred_fix'] = test_df[['online_y_pred_fix','online_pre_1d_y','online_pre_7d_y']].max(axis=1)
    # 第一个周日,周中
    if dt in (2,3,4,5,6):
        test_df['online_y_pred_fix'] = test_df[['online_y_pred_fix','online_pre_7d_y']].max(axis=1)
    # 第二个周五/周六
    if dt in (7,8):
        test_df['online_y_pred_fix_backup'] = 0.95*test_df['online_pre_7d_y']
        test_df['online_y_pred_fix'] = test_df[['online_y_pred_fix','online_pre_1d_y','online_y_pred_fix_backup']].max(axis=1)
        del test_df['online_y_pred_fix_backup']
    # 第二个周日,周中
    if dt in (9,10,11,12,13):
        test_df['online_y_pred_fix_backup'] = 0.95*test_df['online_pre_7d_y']
        test_df['online_y_pred_fix'] = test_df[['online_y_pred_fix','online_y_pred_fix_backup']].max(axis=1)
        del test_df['online_y_pred_fix_backup']

    test_df['online_y'] = test_df['online_y_pred_fix'].round(1)
    test_df['online_y'] = test_df.apply(func=lambda x:x['online_y'] if x['online_y']>0 else 0,axis=1)

    # 零售品业务处理
    test_df['online_y'] = test_df.apply(func=lambda x:x['online_y'] if x['sku_id'] in not_complete_list else round(x['online_y']), axis=1)
    
   

    # 线下订单部分
    offline_catboost_x = test_df[offline_catboost_category_col+offline_catboost_numeric_col] 
    offline_catboost_y = np.exp(offline_catboost_model.predict(offline_catboost_x))-1
    offline_lightgbm_x = test_df[offline_lightgbm_category_col+offline_lightgbm_numeric_col] 
    offline_lightgbm_y = offline_lightgbm_model.predict(offline_lightgbm_x,num_iteration=offline_lightgbm_model.best_iteration_)
    offline_xgboost_x = test_df[offline_xgboost_category_col+offline_xgboost_numeric_col] 
    offline_xgboost_x.loc[:,offline_xgboost_category_col] = offline_xgboost_enc._transform(offline_xgboost_x.loc[:,offline_xgboost_category_col],
                                                                                        handle_unknown='ignore')[0]
    # offline_xgboost_x.replace([np.inf, -np.inf], 999, inplace=True)
    offline_xgboost_y = np.exp(offline_xgboost_model.predict(xgb.DMatrix(offline_xgboost_x,enable_categorical=True),
                                                             ntree_limit=offline_xgboost_model.best_ntree_limit))-1 
    test_df['offline_y_3d'] = round(0.5*test_df['offline_pre_1d_y']+0.3*test_df['offline_pre_2d_y']+0.2*test_df['offline_pre_3d_y'],3)
    test_df['offline_y_7d'] = round((0.2*test_df['offline_pre_1d_y']+0.15*test_df['offline_pre_2d_y']+0.15*test_df['offline_pre_3d_y']+
                                  0.1*test_df['offline_pre_4d_y']+0.1*test_df['offline_pre_5d_y']+0.1*test_df['offline_pre_6d_y']+
                                  0.2*test_df['offline_pre_7d_y']),3)
    test_df['offline_y_catboost'] = offline_catboost_y.round(3)
    test_df['offline_y_lightgbm'] = offline_lightgbm_y.round(3)
    test_df['offline_y_xgboost'] = offline_xgboost_y.round(3)
    # # 线下订单模型融合方式1（所有模型预测值取最大）
    # # test_df['offline_y'] = test_df.apply(func=lambda x:max([x['offline_y_3d'],x['offline_y_7d'],
    # #                                                        x['offline_y_catboost'],
    # #                                                        x['offline_y_lightgbm'],
    # #                                                        x['offline_y_xgboost']]),axis=1)
    # # 线下订单模型融合方式2
    # test_df['offline_y'] = test_df['offline_y_lightgbm']
    # 线下订单模型融合方式3
    # 用于构建分类模型的线下特征
    test_df['offline_y_catboost_log'] = np.log(test_df['offline_y_catboost']+1)
    test_df['offline_y_catboost_log'] = test_df['offline_y_catboost_log'].fillna(0)
    test_df['offline_y_lightgbm_log'] = np.log(test_df['offline_y_lightgbm']+1)
    test_df['offline_y_lightgbm_log'] = test_df['offline_y_lightgbm_log'].fillna(0)
    test_df['offline_y_xgboost_log'] = np.log(test_df['offline_y_xgboost']+1)
    test_df['offline_y_xgboost_log'] = test_df['offline_y_xgboost_log'].fillna(0)
    test_df['offline_y_min_log'] = test_df[['offline_y_catboost_log','offline_y_lightgbm_log','offline_y_xgboost_log']].min(axis=1)
    test_df['offline_y_max_log'] = test_df[['offline_y_catboost_log','offline_y_lightgbm_log','offline_y_xgboost_log']].max(axis=1)
    test_df['offline_y_diff_log'] = test_df['offline_y_max_log']-test_df['offline_y_min_log']
    test_df['offline_y_min'] = test_df[['offline_y_3d','offline_y_7d','offline_y_catboost','offline_y_lightgbm','offline_y_xgboost']].min(axis=1)
    test_df['offline_y_max'] = test_df[['offline_y_3d','offline_y_7d','offline_y_catboost','offline_y_lightgbm','offline_y_xgboost']].max(axis=1)
    test_df['offline_y_diff'] = test_df['offline_y_max']-test_df['offline_y_min']
    test_df['offline_y_log_pred'] = test_df[['offline_y_catboost_log','offline_y_lightgbm_log','offline_y_xgboost_log']].mean(axis=1)
    test_df['offline_y_pred'] = np.exp(test_df['offline_y_log_pred'].values)-1
    # # 取mean的值效果不理想，直接取max值
    # test_df['offline_y_log_pred'] = test_df['offline_y_max_log']
    # test_df['offline_y_pred'] = test_df['offline_y_max']

    offline_clf_x = test_df[offline_stacking_category_col+offline_stacking_numeric_col] 
    offline_y_label_pred = np.where(offline_clf_model.predict(offline_clf_x,num_iteration=offline_clf_model.best_iteration)>=0.5,1,0)
    test_df['offline_y_label_pred'] = offline_y_label_pred

    # 根据结果进行修正（暂不考虑根据模型修正）
    test_df['offline_y_pred_fix'] = test_df['offline_y_pred']
    test_df.loc[(test_df.offline_y_label_pred==1) & (test_df.offline_sku_sale_scale==1),'offline_y_pred_fix'] += 0.3
    test_df.loc[(test_df.offline_y_label_pred==1) & (test_df.offline_sku_sale_scale==2),'offline_y_pred_fix'] += 0.5
    test_df.loc[(test_df.offline_y_label_pred==1) & (test_df.offline_sku_sale_scale==3),'offline_y_pred_fix'] += 1
    test_df.loc[(test_df.offline_y_label_pred==1) & (test_df.offline_sku_sale_scale==4),'offline_y_pred_fix'] *= 1.05
    # 根据日期进行修正
    # 第一个周五/周六
    if dt in (0,1):
        test_df['offline_y_pred_fix'] = test_df[['offline_y_pred_fix','offline_pre_1d_y','offline_pre_7d_y']].max(axis=1)
    # 第一个周日,周中
    if dt in (2,3,4,5,6):
        test_df['offline_y_pred_fix'] = test_df[['offline_y_pred_fix','offline_pre_7d_y']].max(axis=1)
    # 第二个周五/周六
    if dt in (7,8):
        test_df['offline_y_pred_fix_backup'] = 0.95*test_df['offline_pre_7d_y']
        test_df['offline_y_pred_fix'] = test_df[['offline_y_pred_fix','offline_pre_1d_y','offline_y_pred_fix_backup']].max(axis=1)
        del test_df['offline_y_pred_fix_backup']
    # 第二个周日,周中
    if dt in (9,10,11,12,13):
        test_df['offline_y_pred_fix_backup'] = 0.95*test_df['offline_pre_7d_y']
        test_df['offline_y_pred_fix'] = test_df[['offline_y_pred_fix','offline_y_pred_fix_backup']].max(axis=1)
        del test_df['offline_y_pred_fix_backup']

    test_df['offline_y'] = test_df['offline_y_pred_fix'].round(1)
    test_df['offline_y'] = test_df.apply(func=lambda x:x['offline_y'] if x['offline_y']>0 else 0,axis=1)

    # 零售品业务处理
    test_df['offline_y'] = test_df.apply(func=lambda x:x['offline_y'] if x['sku_id'] in not_complete_list else round(x['offline_y']), axis=1)

    # 返回结果
    test_df['all_y'] = test_df['online_y']+test_df['offline_y']
    test_df_list.append(test_df)

    del dt_sales_df['online_y'],dt_sales_df['offline_y'],dt_sales_df['all_y']
    gc.collect()
    final_dt_sales_df = pd.merge(left=dt_sales_df,
                                right=test_df[['store_id','sku_id','date','online_y','offline_y','all_y']],
                                on=['store_id','sku_id','date'],
                                how='left')
    # 停售品业务处理
    for i in range(len(not_sale_store_list)):
        for j in not_sale_store_list[i]:
            for col in ['online_y','offline_y','all_y']:
                final_dt_sales_df.loc[((final_dt_sales_df.store_id==i+1) & (final_dt_sales_df.sku_id==j)),col] = 0
    # 非售品业务处理
    final_dt_sales_df['online_y'] = final_dt_sales_df['online_y'].fillna(0).astype(float)
    final_dt_sales_df['offline_y'] = final_dt_sales_df['offline_y'].fillna(0).astype(float)
    final_dt_sales_df['all_y'] = final_dt_sales_df['all_y'].fillna(0).astype(float)
    final_dt_sales_df = final_dt_sales_df[list(final_dt_sales_df.columns[:3])+list(final_dt_sales_df.columns[-3:])+list(final_dt_sales_df.columns[3:-3])]

    # 将预测的结果合并到sku_sale_df,周期滚动训练
    sku_sales_df = pd.concat([sku_sales_df,final_dt_sales_df],ignore_index=True)

目前正在预测2023-09-01 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105455.00 MB
Decreased by 50.7%
目前正在预测2023-09-02 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105455.00 MB
Decreased by 50.7%
目前正在预测2023-09-03 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105455.00 MB
Decreased by 50.7%
目前正在预测2023-09-04 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105455.00 MB
Decreased by 50.7%
目前正在预测2023-09-05 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105455.00 MB
Decreased by 50.7%
目前正在预测2023-09-06 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105455.00 MB
Decreased by 50.7%
目前正在预测2023-09-07 00:00:00的数据.....
Memory usage of dataframe is 12378840.00 MB
Memory usage after optimization is: 6105

In [50]:
predict_df = sku_sales_df[sku_sales_df.date>=datetime.datetime(2023,9,1)]
predict_df = predict_df[['date','online_y','offline_y','all_y','store_id','sku_id']].rename(columns={'online_y':'y_online',
                                                                                                     'offline_y':'y_offline',
                                                                                                     'all_y':'y_all'})
predict_df['y_online'] = predict_df.apply(func=lambda x:x['y_online'] if x['y_online']>0 else 0,axis=1)
predict_df['y_offline'] = predict_df.apply(func=lambda x:x['y_offline'] if x['y_offline']>0 else 0,axis=1)
predict_df['y_all'] = predict_df['y_online']+predict_df['y_offline']
predict_df.to_csv(get_data_path('Base_stacking3_predict_df_clf_4'),index=False)

In [51]:
predict_df

Unnamed: 0,date,y_online,y_offline,y_all,store_id,sku_id
582306,2023-09-01,0.0,2.0,2.0,1,1
582307,2023-09-01,1.0,3.0,4.0,1,2
582308,2023-09-01,3.0,4.0,7.0,1,3
582309,2023-09-01,2.0,0.0,2.0,1,4
582310,2023-09-01,7.0,30.0,37.0,1,5
...,...,...,...,...,...,...
750301,2023-09-14,2.0,3.0,5.0,12,996
750302,2023-09-14,3.0,2.0,5.0,12,997
750303,2023-09-14,2.0,1.0,3.0,12,998
750304,2023-09-14,2.0,2.0,4.0,12,999


In [54]:
print(predict_df.y_offline.max(),predict_df.y_online.max(),predict_df.y_offline.min(),predict_df.y_online.min())

412.0 482.6 0.0 0.0


In [41]:
# 上一个版本
predict_df

Unnamed: 0,date,y_online,y_offline,y_all,store_id,sku_id
582306,2023-09-01,0.0,1.0,1.0,1,1
582307,2023-09-01,1.0,3.0,4.0,1,2
582308,2023-09-01,3.0,4.0,7.0,1,3
582309,2023-09-01,2.0,0.0,2.0,1,4
582310,2023-09-01,7.0,30.0,37.0,1,5
...,...,...,...,...,...,...
750301,2023-09-14,1.0,1.0,2.0,12,996
750302,2023-09-14,1.0,1.0,2.0,12,997
750303,2023-09-14,2.0,1.0,3.0,12,998
750304,2023-09-14,2.0,2.0,4.0,12,999
