In [110]:
import os
import gc
import datetime
import pandas as pd
import numpy as np
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
import warnings
from typing import Callable
from joblib import Parallel, delayed
import pickle

In [3]:
class LocalParallel(object):
    """
    本地并行算法库
    """

    class ParallelUnit(object):
        """
        并行计算单元
        """

        def __init__(self, seq: int, call_func: Callable, **kwargs):
            self.seq = seq  # 输入参数序号，求解顺序
            self.call_func = call_func
            self.call_func_input: dict = {}
            for k, v in kwargs.items():
                self.call_func_input.update({k: v})
            self.call_func_output = None

    def __init__(self):
        self.parallel_unit_list = []  # 并行计算单元

    def parallel_run(self, n_jobs: int = 1) -> list:
        '''
        并行调度
        Args:
            parallel_unit_list: 并行集合
            n_jobs: 并行单元

        Returns: 结果列表

        '''
        # 求解
        ret_list = Parallel(n_jobs=n_jobs, backend='multiprocessing')(
            delayed(LocalParallel.parallel_search)(self.parallel_unit_list, i) for i in
            range(len(self.parallel_unit_list)))

        # 合并
        merge_ret_list = []
        ret_list.sort(key=lambda x: x.seq)  # 保持顺序一致
        for parallel_unit in ret_list:
            merge_ret_list.extend(parallel_unit.call_func_output)

        # 返回
        return merge_ret_list

    def series_run(self) -> list:
        '''
        串行调度
        Args:
            parallel_unit_list: 并行集合
            n_jobs: 并行单元

        Returns: 结果列表

        '''
        # 求解
        ret_list = []
        for parallel_unit in self.parallel_unit_list:
            call_func = parallel_unit.call_func
            parallel_unit.call_func_output = call_func(**parallel_unit.call_func_input)
            ret_list.append(parallel_unit)

        # 合并
        merge_ret_list = []
        ret_list.sort(key=lambda x: x.seq)  # 保持顺序一致
        for parallel_unit in ret_list:
            merge_ret_list.extend(parallel_unit.call_func_output)

        # 返回
        return merge_ret_list

    @staticmethod
    def parallel_search(parallel_unit_list, i):
        """
        并行搜索单元
        Args:
            parallel_unit_list: 并行列表
            i: 并行单元序号
        Returns: 并行单元

        """
        parallel_unit = parallel_unit_list[i]
        call_func = parallel_unit.call_func
        parallel_unit.call_func_output = call_func(**parallel_unit.call_func_input)
        return parallel_unit

def get_data_path(filename:str) -> str:
    return str(os.path.dirname(os.getcwd()))+'\\data\\'+filename+'.csv'


def get_time_list(starttime:datetime.datetime,endtime:datetime.datetime):
    time_list = []
    while starttime<=endtime:
        time_list.append(starttime)
        starttime += datetime.timedelta(days=1)
    return time_list

def str_to_datetime(x:str) -> datetime.datetime:
    return datetime.datetime.strptime(x, "%Y-%m-%d")

In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if col == 'date':
            continue   
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    # df[col] = df[col].astype(np.float16)
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # df[col] = df[col].astype('category')
            pass
    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# 获取训练集数据

In [4]:
# 获取销量数据
sku_sales_df = pd.read_csv(get_data_path('sku_sales'), sep=',', encoding='utf8')
sku_sales_df['order_time'] = sku_sales_df.apply(func=lambda x: x['order_time'][:10], axis=1)
sku_sales_df = sku_sales_df.rename(columns={'order_time':'date'})
sku_sales_df['date'] = sku_sales_df['date'].apply(str_to_datetime)


'''
注意这里对时间取舍了,V2版本取的全部数据
'''
condition1 = (sku_sales_df.date>=datetime.datetime(2023,7,15))
condition2 = (sku_sales_df.date>=datetime.datetime(2022,8,15))
condition3 = (sku_sales_df.date<=datetime.datetime(2022,10,15))
sku_sales_df = sku_sales_df[(condition1) | (condition2 & condition3)]




sku_sales_df = sku_sales_df.groupby(['store_id','sku_id','date','channel'])['quantity'].agg(['sum']).reset_index()
# 按说明书的线上线下定义
online_sku_sales_df = sku_sales_df[sku_sales_df.channel==2]
del online_sku_sales_df['channel']
gc.collect()
online_sku_sales_df = online_sku_sales_df.rename(columns={'sum':'online_y'})
online_sku_sales_df = online_sku_sales_df.reset_index(drop=True)

offline_sku_sales_df = sku_sales_df[sku_sales_df.channel==1]
del offline_sku_sales_df['channel']
gc.collect()
offline_sku_sales_df = offline_sku_sales_df.rename(columns={'sum':'offline_y'})
offline_sku_sales_df = offline_sku_sales_df.reset_index(drop=True)

sku_sales_df = pd.merge(left=online_sku_sales_df,right=offline_sku_sales_df,on=['date','store_id','sku_id'],how='outer')
sku_sales_df = sku_sales_df.fillna(0)
sku_sales_df['all_y'] = sku_sales_df['online_y']+sku_sales_df['offline_y']
sku_sales_df

Unnamed: 0,store_id,sku_id,date,online_y,offline_y,all_y
0,1,1,2022-08-15,1.0,0.0,1.0
1,1,1,2022-08-25,1.0,0.0,1.0
2,1,1,2022-09-03,1.0,1.0,2.0
3,1,1,2022-09-07,3.0,0.0,3.0
4,1,1,2022-09-08,1.0,0.0,1.0
...,...,...,...,...,...,...
582301,12,999,2023-08-26,0.0,1.0,1.0
582302,12,999,2023-08-28,0.0,2.0,2.0
582303,12,999,2023-08-29,0.0,2.0,2.0
582304,12,1000,2023-08-25,0.0,1.0,1.0


In [5]:
sku_sales_df['online_pre_1d_y'] = 0
sku_sales_df['online_pre_2d_y'] = 0
sku_sales_df['online_pre_3d_y'] = 0
sku_sales_df['online_pre_4d_y'] = 0
sku_sales_df['online_pre_5d_y'] = 0
sku_sales_df['online_pre_6d_y'] = 0
sku_sales_df['online_pre_7d_y'] = 0
sku_sales_df['online_pre_3d_y_mean'] = 0
sku_sales_df['online_pre_3d_y_std'] = 0
sku_sales_df['online_pre_7d_y_mean'] = 0
sku_sales_df['online_pre_7d_y_std'] = 0
sku_sales_df['online_store_sku_all_y_mean'] = 0
sku_sales_df['online_store_sku_all_y_std'] = 0

sku_sales_df['offline_pre_1d_y'] = 0
sku_sales_df['offline_pre_2d_y'] = 0
sku_sales_df['offline_pre_3d_y'] = 0
sku_sales_df['offline_pre_4d_y'] = 0
sku_sales_df['offline_pre_5d_y'] = 0
sku_sales_df['offline_pre_6d_y'] = 0
sku_sales_df['offline_pre_7d_y'] = 0
sku_sales_df['offline_pre_3d_y_mean'] = 0
sku_sales_df['offline_pre_3d_y_std'] = 0
sku_sales_df['online_pre_7d_y_mean'] = 0
sku_sales_df['online_pre_7d_y_std'] = 0
sku_sales_df['offline_store_sku_all_y_mean'] = 0
sku_sales_df['offline_store_sku_all_y_std'] = 0

store_sku_sku_sales_df_list = []
for i in range(1,13):
    for j in range(1,1001):
        print(f"正在获取{i}商铺{j}商品的数据")
        tmp_df = sku_sales_df[(sku_sales_df.store_id==i) & (sku_sales_df.sku_id==j)]
        if len(tmp_df)>0:
            tmp_df = tmp_df.reset_index(drop=True)
            tmp_df['online_store_sku_all_y_mean'] = round(tmp_df['online_y'].median(),2)
            tmp_df['online_store_sku_all_y_std'] = round(tmp_df['online_y'].std(),2)
            tmp_df['offline_store_sku_all_y_mean'] = round(tmp_df['offline_y'].median(),2)
            tmp_df['offline_store_sku_all_y_std'] = round(tmp_df['offline_y'].std(),2)
            store_sku_sku_sales_df_list.append(tmp_df)

正在获取1商铺1商品的数据
正在获取1商铺2商品的数据
正在获取1商铺3商品的数据
正在获取1商铺4商品的数据
正在获取1商铺5商品的数据
正在获取1商铺6商品的数据
正在获取1商铺7商品的数据
正在获取1商铺8商品的数据
正在获取1商铺9商品的数据
正在获取1商铺10商品的数据
正在获取1商铺11商品的数据
正在获取1商铺12商品的数据
正在获取1商铺13商品的数据
正在获取1商铺14商品的数据
正在获取1商铺15商品的数据
正在获取1商铺16商品的数据
正在获取1商铺17商品的数据
正在获取1商铺18商品的数据
正在获取1商铺19商品的数据
正在获取1商铺20商品的数据
正在获取1商铺21商品的数据
正在获取1商铺22商品的数据
正在获取1商铺23商品的数据
正在获取1商铺24商品的数据
正在获取1商铺25商品的数据
正在获取1商铺26商品的数据
正在获取1商铺27商品的数据
正在获取1商铺28商品的数据
正在获取1商铺29商品的数据
正在获取1商铺30商品的数据
正在获取1商铺31商品的数据
正在获取1商铺32商品的数据
正在获取1商铺33商品的数据
正在获取1商铺34商品的数据
正在获取1商铺35商品的数据
正在获取1商铺36商品的数据
正在获取1商铺37商品的数据
正在获取1商铺38商品的数据
正在获取1商铺39商品的数据
正在获取1商铺40商品的数据
正在获取1商铺41商品的数据
正在获取1商铺42商品的数据
正在获取1商铺43商品的数据
正在获取1商铺44商品的数据
正在获取1商铺45商品的数据
正在获取1商铺46商品的数据
正在获取1商铺47商品的数据
正在获取1商铺48商品的数据
正在获取1商铺49商品的数据
正在获取1商铺50商品的数据
正在获取1商铺51商品的数据
正在获取1商铺52商品的数据
正在获取1商铺53商品的数据
正在获取1商铺54商品的数据
正在获取1商铺55商品的数据
正在获取1商铺56商品的数据
正在获取1商铺57商品的数据
正在获取1商铺58商品的数据
正在获取1商铺59商品的数据
正在获取1商铺60商品的数据
正在获取1商铺61商品的数据
正在获取1商铺62商品的数据
正在获取1商铺63商品的数据
正在获取1商铺64商品的数据
正在获取1商铺65商品的数据
正在获取1商铺66商品的数据
正在获取1商铺67商品的数据
正在获取

In [6]:
store_sku_kpi_list_list = []
for i,store_sku_sku_sales_df in enumerate(store_sku_sku_sales_df_list):
    print(f"进度为{round(100*i/len(store_sku_sku_sales_df_list),2)}%")
    for delta in range(1,8):
        store_sku_sku_sales_df[f'online_pre_{delta}d_y'] = store_sku_sku_sales_df['online_y'].shift(delta)
        store_sku_sku_sales_df[f'online_pre_{delta}d_y_date'] = store_sku_sku_sales_df['date'].shift(delta)
        store_sku_sku_sales_df.loc[store_sku_sku_sales_df[f'online_pre_{delta}d_y_date']!=store_sku_sku_sales_df['date']+datetime.timedelta(-delta),
                                f'online_pre_{delta}d_y'] = 0
        del store_sku_sku_sales_df[f'online_pre_{delta}d_y_date']
        gc.collect()
        store_sku_sku_sales_df[f'offline_pre_{delta}d_y'] = store_sku_sku_sales_df['offline_y'].shift(delta)
        store_sku_sku_sales_df[f'offline_pre_{delta}d_y_date'] = store_sku_sku_sales_df['date'].shift(delta)
        store_sku_sku_sales_df.loc[store_sku_sku_sales_df[f'offline_pre_{delta}d_y_date']!=store_sku_sku_sales_df['date']+datetime.timedelta(-delta),
                                f'offline_pre_{delta}d_y'] = 0
        del store_sku_sku_sales_df[f'offline_pre_{delta}d_y_date']
        gc.collect()
    store_sku_sku_sales_df['online_pre_3d_y_mean'] = round(store_sku_sku_sales_df[[f'online_pre_{k}d_y' for k in range(1,4)]].mean(axis=1),2)
    store_sku_sku_sales_df['online_pre_3d_y_std'] = round(store_sku_sku_sales_df[[f'online_pre_{k}d_y' for k in range(1,4)]].std(axis=1),2)
    store_sku_sku_sales_df['online_pre_7d_y_mean'] = round(store_sku_sku_sales_df[[f'online_pre_{k}d_y' for k in range(1,8)]].mean(axis=1),2)
    store_sku_sku_sales_df['online_pre_7d_y_std'] = round(store_sku_sku_sales_df[[f'online_pre_{k}d_y' for k in range(1,8)]].std(axis=1),2)
    store_sku_sku_sales_df['offline_pre_3d_y_mean'] = round(store_sku_sku_sales_df[[f'offline_pre_{k}d_y' for k in range(1,4)]].mean(axis=1),2)
    store_sku_sku_sales_df['offline_pre_3d_y_std'] = round(store_sku_sku_sales_df[[f'offline_pre_{k}d_y' for k in range(1,4)]].std(axis=1),2)
    store_sku_sku_sales_df['offline_pre_7d_y_mean'] = round(store_sku_sku_sales_df[[f'offline_pre_{k}d_y' for k in range(1,8)]].mean(axis=1),2)
    store_sku_sku_sales_df['offline_pre_7d_y_std'] = round(store_sku_sku_sales_df[[f'offline_pre_{k}d_y' for k in range(1,8)]].std(axis=1),2)
    store_sku_kpi_list_list.append(store_sku_sku_sales_df)
kpi_df = pd.concat(store_sku_kpi_list_list,ignore_index=True)
kpi_df

进度为0.0%
进度为0.01%
进度为0.02%
进度为0.03%
进度为0.03%
进度为0.04%
进度为0.05%
进度为0.06%
进度为0.07%
进度为0.08%
进度为0.08%
进度为0.09%
进度为0.1%
进度为0.11%
进度为0.12%
进度为0.13%
进度为0.13%
进度为0.14%
进度为0.15%
进度为0.16%
进度为0.17%
进度为0.18%
进度为0.19%
进度为0.19%
进度为0.2%
进度为0.21%
进度为0.22%
进度为0.23%
进度为0.24%
进度为0.24%
进度为0.25%
进度为0.26%
进度为0.27%
进度为0.28%
进度为0.29%
进度为0.29%
进度为0.3%
进度为0.31%
进度为0.32%
进度为0.33%
进度为0.34%
进度为0.34%
进度为0.35%
进度为0.36%
进度为0.37%
进度为0.38%
进度为0.39%
进度为0.4%
进度为0.4%
进度为0.41%
进度为0.42%
进度为0.43%
进度为0.44%
进度为0.45%
进度为0.45%
进度为0.46%
进度为0.47%
进度为0.48%
进度为0.49%
进度为0.5%
进度为0.5%
进度为0.51%
进度为0.52%
进度为0.53%
进度为0.54%
进度为0.55%
进度为0.56%
进度为0.56%
进度为0.57%
进度为0.58%
进度为0.59%
进度为0.6%
进度为0.61%
进度为0.61%
进度为0.62%
进度为0.63%
进度为0.64%
进度为0.65%
进度为0.66%
进度为0.66%
进度为0.67%
进度为0.68%
进度为0.69%
进度为0.7%
进度为0.71%
进度为0.71%
进度为0.72%
进度为0.73%
进度为0.74%
进度为0.75%
进度为0.76%
进度为0.77%
进度为0.77%
进度为0.78%
进度为0.79%
进度为0.8%
进度为0.81%
进度为0.82%
进度为0.82%
进度为0.83%
进度为0.84%
进度为0.85%
进度为0.86%
进度为0.87%
进度为0.87%
进度为0.88%
进度为0.89%
进度为0.9%
进度为0.91%
进度为0.92%
进度为0.93%
进度为0.93%
进度为0

Unnamed: 0,store_id,sku_id,date,online_y,offline_y,all_y,online_pre_1d_y,online_pre_2d_y,online_pre_3d_y,online_pre_4d_y,...,offline_pre_4d_y,offline_pre_5d_y,offline_pre_6d_y,offline_pre_7d_y,offline_pre_3d_y_mean,offline_pre_3d_y_std,offline_store_sku_all_y_mean,offline_store_sku_all_y_std,offline_pre_7d_y_mean,offline_pre_7d_y_std
0,1,1,2022-08-15,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,1.41,0.00,0.00
1,1,1,2022-08-25,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,1.41,0.00,0.00
2,1,1,2022-09-03,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,1.41,0.00,0.00
3,1,1,2022-09-07,3.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,1.41,0.00,0.00
4,1,1,2022-09-08,1.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,1.41,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582301,12,1000,2023-08-28,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.33,0.58,1.0,0.52,0.14,0.38
582302,12,1000,2023-08-29,2.0,0.0,2.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.33,0.58,1.0,0.52,0.14,0.38
582303,12,1000,2023-08-31,2.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,0.52,0.00,0.00
582304,12,1000,2023-08-25,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,1.0,0.52,0.00,0.00


In [7]:
kpi_df.to_csv(get_data_path('sku_sales_precess_df_V3'),index=False)

In [None]:
kpi_df = pd.read_csv(get_data_path('sku_sales_precess_df_V3'))
kpi_df['date'] = kpi_df['date'].apply(str_to_datetime)

In [8]:
# 在训练集中考虑过滤掉销量可能不准确的数据
sku_price_and_status_df = pd.read_csv(get_data_path('sku_price_and_status'), sep=',', encoding='utf8')
sku_price_and_status_df['date'] = sku_price_and_status_df['date'].apply(str_to_datetime)

# 加入商品基础信息
sku_info_df = pd.read_csv(get_data_path('sku_info'), sep=',', encoding='utf8')

# 加入门店天气信息
store_weather_df = pd.read_csv(get_data_path('store_weather'), sep=',', encoding='utf8')
store_weather_df['date'] = store_weather_df['date'].apply(str_to_datetime)

# 获取线上和线下数据
online_sku_prom_df = pd.read_csv(get_data_path('online_sku_prom_df'))
online_sku_prom_df['date'] = online_sku_prom_df['date'].apply(str_to_datetime)
offline_sku_prom_df = pd.read_csv(get_data_path('offline_sku_prom_df'))
offline_sku_prom_df['date'] = offline_sku_prom_df['date'].apply(str_to_datetime)

In [9]:
train_df_list = []
for i in range(1,13):
    for j in range(1,1001):
        print(f"正在处理{i}号店铺{j}商品的数据......")
        sku_sales_df_i_j = kpi_df[(kpi_df.store_id==i) & (kpi_df.sku_id==j)]

        sku_price_and_status_df_i_j = sku_price_and_status_df[(sku_price_and_status_df.store_id==i) & (sku_price_and_status_df.sku_id==j)]

        sku_info_df_j = sku_info_df[sku_info_df.sku_id==j]

        store_weather_df_i = store_weather_df[store_weather_df.store_id==i]

        online_sku_prom_df_i_j = online_sku_prom_df[(online_sku_prom_df.store_id==i) & (online_sku_prom_df.sku_id==j)]
        offline_sku_prom_df_i_j = offline_sku_prom_df[(offline_sku_prom_df.store_id==i) & (offline_sku_prom_df.sku_id==j)]
        
        store_sku_df_i_j = pd.merge(left=sku_sales_df_i_j,right=sku_price_and_status_df_i_j,on=['date','store_id','sku_id'],how='inner')
        store_sku_df_i_j = store_sku_df_i_j[(store_sku_df_i_j.salable_status==1) & (store_sku_df_i_j.stock_status==1)]
        store_sku_df_i_j = pd.merge(left=store_sku_df_i_j,right=sku_info_df_j,on=['sku_id'],how='left')
        store_sku_df_i_j = pd.merge(left=store_sku_df_i_j,right=store_weather_df_i,on=['store_id','date'],how='left')
        store_sku_df_i_j = pd.merge(left=store_sku_df_i_j,right=online_sku_prom_df_i_j,on=['store_id','sku_id','date'],how='left')
        store_sku_df_i_j = pd.merge(left=store_sku_df_i_j,right=offline_sku_prom_df_i_j,on=['store_id','sku_id','date'],how='left')

        # 有意义的缺失值填充
        for col in ['offline_discount_off','online_discount_off','offline_discount_threshold_rate','online_discount_threshold_rate']:
            store_sku_df_i_j[col] = store_sku_df_i_j[col].fillna(0)
        for col in ['offline_promotion_type','online_promotion_type']:
            store_sku_df_i_j[col] = store_sku_df_i_j[col].fillna(-1)
        train_df_list.append(store_sku_df_i_j)
train_df = pd.concat(train_df_list,ignore_index=True)

正在处理1号店铺1商品的数据......
正在处理1号店铺2商品的数据......
正在处理1号店铺3商品的数据......
正在处理1号店铺4商品的数据......
正在处理1号店铺5商品的数据......
正在处理1号店铺6商品的数据......
正在处理1号店铺7商品的数据......
正在处理1号店铺8商品的数据......
正在处理1号店铺9商品的数据......
正在处理1号店铺10商品的数据......
正在处理1号店铺11商品的数据......
正在处理1号店铺12商品的数据......
正在处理1号店铺13商品的数据......
正在处理1号店铺14商品的数据......
正在处理1号店铺15商品的数据......
正在处理1号店铺16商品的数据......
正在处理1号店铺17商品的数据......
正在处理1号店铺18商品的数据......
正在处理1号店铺19商品的数据......
正在处理1号店铺20商品的数据......
正在处理1号店铺21商品的数据......
正在处理1号店铺22商品的数据......
正在处理1号店铺23商品的数据......
正在处理1号店铺24商品的数据......
正在处理1号店铺25商品的数据......
正在处理1号店铺26商品的数据......
正在处理1号店铺27商品的数据......
正在处理1号店铺28商品的数据......
正在处理1号店铺29商品的数据......
正在处理1号店铺30商品的数据......
正在处理1号店铺31商品的数据......
正在处理1号店铺32商品的数据......
正在处理1号店铺33商品的数据......
正在处理1号店铺34商品的数据......
正在处理1号店铺35商品的数据......
正在处理1号店铺36商品的数据......
正在处理1号店铺37商品的数据......
正在处理1号店铺38商品的数据......
正在处理1号店铺39商品的数据......
正在处理1号店铺40商品的数据......
正在处理1号店铺41商品的数据......
正在处理1号店铺42商品的数据......
正在处理1号店铺43商品的数据......
正在处理1号店铺44商品的数据......
正在处理1号店铺45商品的数据......
正在处理1号店铺46商品的数据....

In [10]:
train_df.to_csv(get_data_path('train_df_V3'),index=False)

### train_df再补上成本和差价特征，在预测时加入这2个特征，主键为['store_id','sku_id','date']

In [4]:
# 读取数据
# 训练集数据
train_df = pd.read_csv(get_data_path('train_df_V3'))
train_df['date'] = train_df['date'].apply(str_to_datetime)
# 考虑到可能历史3天里面存在填充数据失真的情况，去掉失真数据部分
train_df = train_df[train_df.date>=datetime.datetime(2022,8,21)]
train_df = train_df[(train_df.date>=datetime.datetime(2023,7,21)) | (train_df.date<=datetime.datetime(2022,10,15))]

In [6]:
train_df.columns

Index(['store_id', 'sku_id', 'date', 'online_y', 'offline_y', 'all_y',
       'online_pre_1d_y', 'online_pre_2d_y', 'online_pre_3d_y',
       'online_pre_4d_y', 'online_pre_5d_y', 'online_pre_6d_y',
       'online_pre_7d_y', 'online_pre_3d_y_mean', 'online_pre_3d_y_std',
       'online_pre_7d_y_mean', 'online_pre_7d_y_std',
       'online_store_sku_all_y_mean', 'online_store_sku_all_y_std',
       'offline_pre_1d_y', 'offline_pre_2d_y', 'offline_pre_3d_y',
       'offline_pre_4d_y', 'offline_pre_5d_y', 'offline_pre_6d_y',
       'offline_pre_7d_y', 'offline_pre_3d_y_mean', 'offline_pre_3d_y_std',
       'offline_store_sku_all_y_mean', 'offline_store_sku_all_y_std',
       'offline_pre_7d_y_mean', 'offline_pre_7d_y_std', 'salable_status',
       'stock_status', 'original_price', 'item_first_cate_cd',
       'item_second_cate_cd', 'item_third_cate_cd', 'brand_code',
       'weather_type', 'min_temperature', 'max_temperature', 'online_curr_day',
       'online_total_days', 'online_promoti

In [12]:
train_df.isna().any()[40:60]

min_temperature                    False
max_temperature                    False
online_curr_day                     True
online_total_days                   True
online_promotion_type              False
online_threshold                    True
online_discount_off                False
online_prom_cur_total_rate          True
online_discount_threshold_rate     False
offline_curr_day                    True
offline_total_days                  True
offline_promotion_type             False
offline_threshold                   True
offline_discount_off               False
offline_prom_cur_total_rate         True
offline_discount_threshold_rate    False
dtype: bool

In [13]:
# 价格和状态信息
sku_price_and_status_df = pd.read_csv(get_data_path('sku_price_and_status'), sep=',', encoding='utf8')
sku_price_and_status_df['date'] = sku_price_and_status_df['date'].apply(str_to_datetime)

In [15]:
# 成本计算在之前模型训练的时候没加入（优化)
sku_price_and_status_df = sku_price_and_status_df.sort_values(by=['store_id','sku_id','date'])
for i in range(0,7):
    sku_price_and_status_df[f'pred_{i}d_price'] = sku_price_and_status_df.groupby(['store_id','sku_id'])['original_price'].shift(i)
sku_price_and_status_df = sku_price_and_status_df.fillna(method='backfill')
sku_price_and_status_df['unit_cost'] = 0
for i in range(7):
    sku_price_and_status_df['unit_cost'] += sku_price_and_status_df[f'pred_{i}d_price']
    del sku_price_and_status_df[f'pred_{i}d_price']
    gc.collect()
sku_price_and_status_df['unit_cost'] = round(sku_price_and_status_df['unit_cost']/14,2)
sku_price_and_status_df['price_cost_diff'] = round(sku_price_and_status_df['original_price']-sku_price_and_status_df['unit_cost'],2)
sku_price_and_status_df['price_cost_cv'] = round(sku_price_and_status_df['price_cost_diff']/sku_price_and_status_df['original_price'],2)
sku_price_and_status_df

Unnamed: 0,store_id,sku_id,date,salable_status,stock_status,original_price,unit_cost,price_cost_diff,price_cost_cv
5576,1,1,2021-08-31,0,1,25.48,12.74,12.74,0.5
5564,1,1,2021-09-01,0,1,25.48,12.74,12.74,0.5
5603,1,1,2021-09-02,0,1,25.48,12.74,12.74,0.5
5565,1,1,2021-09-03,0,1,25.48,12.74,12.74,0.5
5367,1,1,2021-09-04,0,1,25.48,12.74,12.74,0.5
...,...,...,...,...,...,...,...,...,...
4941665,12,1000,2023-09-10,-1,-1,27.38,13.69,13.69,0.5
4941687,12,1000,2023-09-11,-1,-1,27.38,13.69,13.69,0.5
4941671,12,1000,2023-09-12,-1,-1,27.38,13.69,13.69,0.5
4941666,12,1000,2023-09-13,-1,-1,27.38,13.69,13.69,0.5


In [20]:
sku_price_and_status_df.price_cost_diff.min()

-8.97

# 获取促销活动数据

In [11]:
# 线上和线下活动基本保持一致

# 获取apridf关联数据(V3版本)

In [5]:
# 订单数据
sku_sales_df = pd.read_csv(get_data_path('sku_sales'), sep=',', encoding='utf8')
sku_sales_df['order_time'] = sku_sales_df.apply(func=lambda x: x['order_time'][:10], axis=1)
sku_sales_df = sku_sales_df.rename(columns={'order_time':'date'})
sku_sales_df['date'] = sku_sales_df['date'].apply(str_to_datetime)

# 取最近15天的数据
sku_sales_df = sku_sales_df[(sku_sales_df.date>=datetime.datetime(2023,8,15))]

In [30]:
high_support_sku_list = []
high_social_sku_list = []

for i in range(1,13):
    print(f"正在获取{i}号店铺的关联信息......")
    sku_sales_df_i = sku_sales_df[(sku_sales_df.store_id==i)]
    
    order_id_list = sku_sales_df_i.order_id.unique().tolist()
    basket = np.zeros(shape=(len(order_id_list),1001))
    for idx,id in enumerate(order_id_list):
        sku_list = sku_sales_df_i.loc[sku_sales_df_i.order_id==id,'sku_id'].tolist()
        basket[idx,sku_list] = 1
    basket = basket[:,1:]
    basket_df = pd.DataFrame(data=basket,columns=[i for i in range(1,1001)])
    basket_df = reduce_mem_usage(basket_df)
    # 使用Apriori算法找到频繁项集
    frequent_itemsets = apriori(basket_df, min_support=0.002, use_colnames=True)

    high_support_sku = np.zeros(1001)
    high_social_sku = np.zeros(1001)

    itemsets_list = frequent_itemsets.itemsets.tolist()
    for itemsets in itemsets_list:
        tmp_idx_list = list(itemsets)
        high_support_sku[tmp_idx_list] = 1
        if len(tmp_idx_list)>1:
            high_social_sku[tmp_idx_list] += 1

    high_support_sku = high_support_sku[1:]
    high_support_sku_list.extend(list(high_support_sku))
    print(f"高支持度的sku数量为{high_support_sku.sum()}")
    high_social_sku = high_social_sku[1:]
    high_social_sku_list.extend(list(high_social_sku))
    print(f"高关联度的总数量为{high_social_sku.sum()}")

# 初始化关联数据
apriori_df1 = pd.DataFrame(data=None,columns = ['store_id','sku_id','if_selected_possible','if_high_value'])
apriori_df1['store_id'] = [i for i in range(1,13) for _ in range(1,1001)]
apriori_df1['sku_id'] = [i for _ in range(1,13) for i in range(1,1001)]
apriori_df1['if_selected_possible'] = high_support_sku_list
apriori_df1['if_high_value'] = high_support_sku_list

正在获取1号店铺的关联信息......
Memory usage of dataframe is 141160128.00 MB
Memory usage after optimization is: 70580128.00 MB
Decreased by 50.0%
高支持度的sku数量为300.0
高关联度的总数量为120.0
正在获取2号店铺的关联信息......
Memory usage of dataframe is 133808128.00 MB
Memory usage after optimization is: 66904128.00 MB
Decreased by 50.0%
高支持度的sku数量为351.0
高关联度的总数量为333.0
正在获取3号店铺的关联信息......
Memory usage of dataframe is 129592128.00 MB
Memory usage after optimization is: 64796128.00 MB
Decreased by 50.0%
高支持度的sku数量为311.0
高关联度的总数量为166.0
正在获取4号店铺的关联信息......
Memory usage of dataframe is 190728128.00 MB
Memory usage after optimization is: 95364128.00 MB
Decreased by 50.0%
高支持度的sku数量为341.0
高关联度的总数量为270.0
正在获取5号店铺的关联信息......
Memory usage of dataframe is 88072128.00 MB
Memory usage after optimization is: 44036128.00 MB
Decreased by 50.0%
高支持度的sku数量为320.0
高关联度的总数量为142.0
正在获取6号店铺的关联信息......
Memory usage of dataframe is 321816128.00 MB
Memory usage after optimization is: 160908128.00 MB
Decreased by 50.0%
高支持度的sku数量为308.0
高关联度的总数量为206.

In [31]:
# 获取每天每笔订单sku销量中位数值
tmp = sku_sales_df.groupby(['store_id','sku_id','order_id','date'])['quantity'].agg(sum).reset_index()
tmp1 = tmp.groupby(['store_id','sku_id'])['quantity'].median().reset_index()

apriori_df1 = pd.merge(left=apriori_df1,right=tmp1,how='left',on=['store_id','sku_id'])
apriori_df1

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value,quantity
0,1,1,0.0,0.0,1.0
1,1,2,1.0,1.0,1.0
2,1,3,1.0,1.0,1.0
3,1,4,0.0,0.0,1.0
4,1,5,1.0,1.0,1.0
...,...,...,...,...,...
11995,12,996,0.0,0.0,1.0
11996,12,997,0.0,0.0,1.0
11997,12,998,0.0,0.0,1.0
11998,12,999,0.0,0.0,1.0


In [32]:
apriori_df1 = apriori_df1.rename(columns={'quantity':'avg_quantity'})
apriori_df1

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value,avg_quantity
0,1,1,0.0,0.0,1.0
1,1,2,1.0,1.0,1.0
2,1,3,1.0,1.0,1.0
3,1,4,0.0,0.0,1.0
4,1,5,1.0,1.0,1.0
...,...,...,...,...,...
11995,12,996,0.0,0.0,1.0
11996,12,997,0.0,0.0,1.0
11997,12,998,0.0,0.0,1.0
11998,12,999,0.0,0.0,1.0


In [37]:
apriori_df1['avg_quantity'] = apriori_df1['avg_quantity'].fillna(0)
apriori_df1.avg_quantity.describe()

count    12000.000000
mean         1.076346
std          0.378904
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          5.300000
Name: avg_quantity, dtype: float64

In [38]:
apriori_df1.to_csv(get_data_path('apriori_df_3'),index=False)

# 看看线上订单量和线下订单量的比例分配

In [29]:
# 读取关联数据
apriori_df = pd.read_csv(get_data_path('apriori_df_3'))
apriori_df

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value,avg_quantity
0,1,1,0.0,0.0,1.0
1,1,2,1.0,1.0,1.0
2,1,3,1.0,1.0,1.0
3,1,4,0.0,0.0,1.0
4,1,5,1.0,1.0,1.0
...,...,...,...,...,...
11995,12,996,0.0,0.0,1.0
11996,12,997,0.0,0.0,1.0
11997,12,998,0.0,0.0,1.0
11998,12,999,0.0,0.0,1.0


In [31]:
tmp2 = sku_sales_df.groupby(['store_id','sku_id','date','channel'])['order_id'].agg('count').reset_index().rename(columns={'order_id':'order_id_num'})
online_tmp2 = tmp2[tmp2.channel==2]
online_tmp2['order_id_num_backup'] = online_tmp2['order_id_num']
online_tmp2 = online_tmp2.groupby(['store_id','sku_id'])['date','order_id_num','order_id_num_backup'].agg({'date':'count',
                                                                                             'order_id_num':'sum',
                                                                                             'order_id_num_backup':'median'}).reset_index().rename(columns={'date':'online_sale_days',
                                                                                                                            'order_id_num':'online_order_num',
                                                                                                                            'order_id_num_backup':'online_order_mid_quantity'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online_tmp2['order_id_num_backup'] = online_tmp2['order_id_num']
  online_tmp2 = online_tmp2.groupby(['store_id','sku_id'])['date','order_id_num','order_id_num_backup'].agg({'date':'count',


In [33]:
tmp2 = sku_sales_df.groupby(['store_id','sku_id','date','channel'])['order_id'].agg('count').reset_index().rename(columns={'order_id':'order_id_num'})
offline_tmp2 = tmp2[tmp2.channel==1]
offline_tmp2['order_id_num_backup'] = offline_tmp2['order_id_num']
offline_tmp2 = offline_tmp2.groupby(['store_id','sku_id'])['date','order_id_num','order_id_num_backup'].agg({'date':'count',
                                                                                             'order_id_num':'sum',
                                                                                             'order_id_num_backup':'median'}).reset_index().rename(columns={'date':'offline_sale_days',
                                                                                                                            'order_id_num':'offline_order_num',
                                                                                                                            'order_id_num_backup':'offline_order_mid_quantity'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offline_tmp2['order_id_num_backup'] = offline_tmp2['order_id_num']
  offline_tmp2 = offline_tmp2.groupby(['store_id','sku_id'])['date','order_id_num','order_id_num_backup'].agg({'date':'count',


In [35]:
apriori_df = pd.merge(left=apriori_df,right=online_tmp2,how='left',on=['store_id','sku_id'])
apriori_df = pd.merge(left=apriori_df,right=offline_tmp2,how='left',on=['store_id','sku_id'])
apriori_df = apriori_df.fillna(0)
apriori_df

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value,avg_quantity,online_sale_days,online_order_num,online_order_mid_quantity,offline_sale_days,offline_order_num,offline_order_mid_quantity
0,1,1,0.0,0.0,1.0,5.0,5.0,1.0,10.0,24.0,2.0
1,1,2,1.0,1.0,1.0,11.0,14.0,1.0,15.0,34.0,2.0
2,1,3,1.0,1.0,1.0,16.0,36.0,2.0,15.0,46.0,3.0
3,1,4,0.0,0.0,1.0,6.0,8.0,1.0,7.0,11.0,2.0
4,1,5,1.0,1.0,1.0,9.0,15.0,2.0,17.0,213.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...
11995,12,996,0.0,0.0,1.0,2.0,3.0,1.5,4.0,6.0,1.5
11996,12,997,0.0,0.0,1.0,6.0,8.0,1.0,8.0,8.0,1.0
11997,12,998,0.0,0.0,1.0,5.0,8.0,1.0,5.0,9.0,2.0
11998,12,999,0.0,0.0,1.0,7.0,13.0,2.0,12.0,25.0,2.0


第一点，后场备货选择范围问题，可以发现大多数线下订单的单笔订单中位数销量大于线上订单，而后场备货量主要应该看线上单笔订单的支持度和关联度，因此原先获取是否选择和是否关联的数据应只选择线上订单，避免线下订单的影响。<br>
第二点，安全库存问题，应该区别看待，同时线上和线下区分。在每个单位的安全库存量对应每笔订单的中位数，注意根据零散品取整。

# 获取关联数据（V4版本）

In [46]:
# 订单数据
sku_sales_df = pd.read_csv(get_data_path('sku_sales'), sep=',', encoding='utf8')
sku_sales_df['order_time'] = sku_sales_df.apply(func=lambda x: x['order_time'][:10], axis=1)
sku_sales_df = sku_sales_df.rename(columns={'order_time':'date'})
sku_sales_df['date'] = sku_sales_df['date'].apply(str_to_datetime)

# 取最近17天的数据
sku_sales_df = sku_sales_df[(sku_sales_df.date>=datetime.datetime(2023,8,15))]

In [101]:
# 取最近7天的数据
sku_sales_df_latest = sku_sales_df[(sku_sales_df.date>=datetime.datetime(2023,8,25))]

In [53]:
high_support_sku_list = []
high_social_sku_list = []

for i in range(1,13):
    print(f"正在获取{i}号店铺的线上关联信息......")
    sku_sales_df_i = sku_sales_df[(sku_sales_df.store_id==i) & (sku_sales_df.channel==2)]
    
    order_id_list = sku_sales_df_i.order_id.unique().tolist()
    basket = np.zeros(shape=(len(order_id_list),1001))
    for idx,id in enumerate(order_id_list):
        sku_list = sku_sales_df_i.loc[sku_sales_df_i.order_id==id,'sku_id'].tolist()
        basket[idx,sku_list] = 1
    basket = basket[:,1:]
    basket_df = pd.DataFrame(data=basket,columns=[i for i in range(1,1001)])
    basket_df = reduce_mem_usage(basket_df)
    # 使用Apriori算法找到频繁项集
    frequent_itemsets = apriori(basket_df, min_support=0.002, use_colnames=True)

    high_support_sku = np.zeros(1001)
    high_social_sku = np.zeros(1001)

    itemsets_list = frequent_itemsets.itemsets.tolist()
    for itemsets in itemsets_list:
        tmp_idx_list = list(itemsets)
        high_support_sku[tmp_idx_list] = 1
        if len(tmp_idx_list)>1:
            high_social_sku[tmp_idx_list] = 1

    high_support_sku = high_support_sku[1:]
    high_support_sku_list.extend(list(high_support_sku))
    print(f"高支持度的sku数量为{high_support_sku.sum()}")
    high_social_sku = high_social_sku[1:]
    high_social_sku_list.extend(list(high_social_sku))
    print(f"高关联度的sku数量为{high_social_sku.sum()}")

# 初始化关联数据
apriori_df = pd.DataFrame(data=None,columns = ['store_id','sku_id','if_selected_possible','if_high_value'])
apriori_df['store_id'] = [i for i in range(1,13) for _ in range(1,1001)]
apriori_df['sku_id'] = [i for _ in range(1,13) for i in range(1,1001)]
apriori_df['if_selected_possible'] = high_support_sku_list
apriori_df['if_high_value'] = high_support_sku_list

正在获取1号店铺的线上关联信息......
Memory usage of dataframe is 55008128.00 MB
Memory usage after optimization is: 27504128.00 MB
Decreased by 50.0%
高支持度的sku数量为295.0
高关联度的sku数量为58.0
正在获取2号店铺的线上关联信息......
Memory usage of dataframe is 54168128.00 MB
Memory usage after optimization is: 27084128.00 MB
Decreased by 50.0%
高支持度的sku数量为344.0
高关联度的sku数量为79.0
正在获取3号店铺的线上关联信息......
Memory usage of dataframe is 64560128.00 MB
Memory usage after optimization is: 32280128.00 MB
Decreased by 50.0%
高支持度的sku数量为298.0
高关联度的sku数量为60.0
正在获取4号店铺的线上关联信息......
Memory usage of dataframe is 98736128.00 MB
Memory usage after optimization is: 49368128.00 MB
Decreased by 50.0%
高支持度的sku数量为297.0
高关联度的sku数量为71.0
正在获取5号店铺的线上关联信息......
Memory usage of dataframe is 37808128.00 MB
Memory usage after optimization is: 18904128.00 MB
Decreased by 50.0%
高支持度的sku数量为273.0
高关联度的sku数量为60.0
正在获取6号店铺的线上关联信息......
Memory usage of dataframe is 156696128.00 MB
Memory usage after optimization is: 78348128.00 MB
Decreased by 50.0%
高支持度的sku数量为298.0
高

In [133]:
apriori_df

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value
0,1,1,0.0,0.0
1,1,2,1.0,1.0
2,1,3,1.0,1.0
3,1,4,0.0,0.0
4,1,5,1.0,1.0
...,...,...,...,...
11995,12,996,0.0,0.0
11996,12,997,0.0,0.0
11997,12,998,0.0,0.0
11998,12,999,0.0,0.0


In [180]:
# 14d销售数据
tmp2 = sku_sales_df.groupby(['store_id','sku_id','channel'])['date','order_id','quantity'].agg({'date':'nunique',
                                                                                         'order_id':'nunique',
                                                                                         'quantity':'median'}).reset_index().rename(columns={'date':'17d_sale_days',
                                                                                                                                             'order_id':'order_num',
                                                                                                                                             'quantity':'order_mid_quantity'})
# 非零售品中位数处理
not_complete_list = [14, 259, 44, 442, 150, 449, 448, 443]
tmp2['order_mid_quantity'] = tmp2['order_mid_quantity'].round(1)
tmp2['order_mid_quantity'] = tmp2.apply(func=lambda x:x['order_mid_quantity'] if x['sku_id'] in not_complete_list else round(x['order_mid_quantity']), axis=1)


  tmp2 = sku_sales_df.groupby(['store_id','sku_id','channel'])['date','order_id','quantity'].agg({'date':'nunique',


In [181]:
# 7d销售数据
tmp3 = sku_sales_df_latest.groupby(['store_id','sku_id','channel'])['date'].agg('nunique').reset_index().rename(columns={'date':'7d_sale_days'})

In [182]:
# 17d线上销售数据
online_tmp2 = tmp2[tmp2.channel==2]
del online_tmp2['channel']
online_tmp2 = online_tmp2.rename(columns={'17d_sale_days':'online_17d_sale_days',
                                          'order_num':'online_order_num',
                                          'order_mid_quantity':'online_order_mid_quantity'})
online_tmp2['online_17d_avg_order_num'] = round(online_tmp2['online_order_num']/online_tmp2['online_17d_sale_days'],2)
apriori_df1 = pd.merge(left=apriori_df,right=online_tmp2,how='left',on=['store_id','sku_id'])
apriori_df1 = apriori_df1.fillna(0)

# 7d线上销售数据
online_tmp3 = tmp3[tmp3.channel==2]
del online_tmp3['channel']
online_tmp3 = online_tmp3.rename(columns={'7d_sale_days':'online_7d_sale_days'})
apriori_df2 = pd.merge(left=apriori_df1,right=online_tmp3,how='left',on=['store_id','sku_id'])
apriori_df2 = apriori_df2.fillna(0)



In [183]:
apriori_df2

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value,online_17d_sale_days,online_order_num,online_order_mid_quantity,online_17d_avg_order_num,online_7d_sale_days
0,1,1,0.0,0.0,5.0,5.0,1.0,1.00,3.0
1,1,2,1.0,1.0,11.0,14.0,2.0,1.27,4.0
2,1,3,1.0,1.0,16.0,36.0,1.0,2.25,6.0
3,1,4,0.0,0.0,6.0,8.0,2.0,1.33,1.0
4,1,5,1.0,1.0,9.0,15.0,2.0,1.67,6.0
...,...,...,...,...,...,...,...,...,...
11995,12,996,0.0,0.0,2.0,3.0,1.0,1.50,1.0
11996,12,997,0.0,0.0,6.0,8.0,1.0,1.33,3.0
11997,12,998,0.0,0.0,5.0,8.0,1.0,1.60,3.0
11998,12,999,0.0,0.0,7.0,13.0,1.0,1.86,3.0


In [184]:
apriori_df2['online_17d_avg_order_num'].describe(percentiles=[0.6,0.7,0.8,0.9,0.95,0.99,0.995,0.999])

count    12000.000000
mean         2.817700
std          5.098336
min          0.000000
50%          1.380000
60%          1.620000
70%          2.000000
80%          2.920000
90%          5.473000
95%         10.240000
99%         28.240000
99.5%       35.653500
99.9%       53.000880
max         86.590000
Name: online_17d_avg_order_num, dtype: float64

In [185]:
# 设置线上安全库存登记和安全库存补充单位
apriori_df2['online_inventory_level'] = 0
apriori_df2.loc[((apriori_df2.online_17d_avg_order_num>=10) & (apriori_df2.online_7d_sale_days>=3)),'online_inventory_level'] = 1
apriori_df2.loc[((apriori_df2.online_17d_avg_order_num>=5) & (apriori_df2.online_7d_sale_days>=4)),'online_inventory_level'] = 1
apriori_df2.loc[(apriori_df2.online_7d_sale_days>=5),'online_inventory_level'] = 1
apriori_df2['online_inventory_plenty_unit'] = 0.2*apriori_df2['online_order_num']/apriori_df2['online_17d_sale_days']
apriori_df2['online_inventory_plenty_unit'] = apriori_df2['online_inventory_plenty_unit'].fillna(0)
apriori_df2['online_inventory_plenty_unit_ceil'] = np.ceil(apriori_df2['online_inventory_plenty_unit'])
apriori_df2['online_inventory_plenty_unit_floor'] = np.floor(apriori_df2['online_inventory_plenty_unit'])
del apriori_df2['online_order_num'],apriori_df2['online_17d_sale_days'],apriori_df2['online_inventory_plenty_unit']

In [186]:
# 17d线下销售数据
offline_tmp2 = tmp2[tmp2.channel==1]
del offline_tmp2['channel']
offline_tmp2 = offline_tmp2.rename(columns={'17d_sale_days':'offline_17d_sale_days',
                                          'order_num':'offline_order_num',
                                          'order_mid_quantity':'offline_order_mid_quantity'})
offline_tmp2['offline_17d_avg_order_num'] = round(offline_tmp2['offline_order_num']/offline_tmp2['offline_17d_sale_days'],2)
apriori_df3 = pd.merge(left=apriori_df2,right=offline_tmp2,how='left',on=['store_id','sku_id'])
apriori_df3 = apriori_df3.fillna(0)

# 7d线下销售数据
offline_tmp3 = tmp3[tmp3.channel==1]
del offline_tmp3['channel']
offline_tmp3 = offline_tmp3.rename(columns={'7d_sale_days':'offline_7d_sale_days'})
apriori_df4 = pd.merge(left=apriori_df3,right=offline_tmp3,how='left',on=['store_id','sku_id'])
apriori_df4 = apriori_df4.fillna(0)

# 设置线下安全库存登记和安全库存补充单位
apriori_df4['offline_inventory_level'] = 0
apriori_df4.loc[((apriori_df4.offline_17d_avg_order_num>=10) & (apriori_df4.offline_7d_sale_days>=3)),'offline_inventory_level'] = 1
apriori_df4.loc[((apriori_df4.offline_17d_avg_order_num>=5) & (apriori_df4.offline_7d_sale_days>=4)),'offline_inventory_level'] = 1
apriori_df4.loc[(apriori_df4.offline_7d_sale_days>=5),'offline_inventory_level'] = 1
apriori_df4['offline_inventory_plenty_unit'] = round(0.2*apriori_df4['offline_order_num']/apriori_df4['offline_17d_sale_days'],2)
apriori_df4['offline_inventory_plenty_unit'] = apriori_df4['offline_inventory_plenty_unit'].fillna(0)
apriori_df4['offline_inventory_plenty_unit_ceil'] = np.ceil(apriori_df4['offline_inventory_plenty_unit'])
apriori_df4['offline_inventory_plenty_unit_floor'] = np.floor(apriori_df4['offline_inventory_plenty_unit'])
del apriori_df4['offline_order_num'],apriori_df4['offline_17d_sale_days'],apriori_df4['offline_inventory_plenty_unit'] 

In [187]:
apriori_df4

Unnamed: 0,store_id,sku_id,if_selected_possible,if_high_value,online_order_mid_quantity,online_17d_avg_order_num,online_7d_sale_days,online_inventory_level,online_inventory_plenty_unit_ceil,online_inventory_plenty_unit_floor,offline_order_mid_quantity,offline_17d_avg_order_num,offline_7d_sale_days,offline_inventory_level,offline_inventory_plenty_unit_ceil,offline_inventory_plenty_unit_floor
0,1,1,0.0,0.0,1.0,1.00,3.0,0,1.0,0.0,1.0,2.40,4.0,0,1.0,0.0
1,1,2,1.0,1.0,2.0,1.27,4.0,0,1.0,0.0,1.0,2.27,6.0,1,1.0,0.0
2,1,3,1.0,1.0,1.0,2.25,6.0,1,1.0,0.0,1.0,3.07,6.0,1,1.0,0.0
3,1,4,0.0,0.0,2.0,1.33,1.0,0,1.0,0.0,1.0,1.57,3.0,0,1.0,0.0
4,1,5,1.0,1.0,2.0,1.67,6.0,1,1.0,0.0,1.0,12.53,7.0,1,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,12,996,0.0,0.0,1.0,1.50,1.0,0,1.0,0.0,1.0,1.50,2.0,0,1.0,0.0
11996,12,997,0.0,0.0,1.0,1.33,3.0,0,1.0,0.0,1.0,1.00,6.0,1,1.0,0.0
11997,12,998,0.0,0.0,1.0,1.60,3.0,0,1.0,0.0,1.0,1.80,2.0,0,1.0,0.0
11998,12,999,0.0,0.0,1.0,1.86,3.0,0,1.0,0.0,1.0,2.08,7.0,1,1.0,0.0


In [188]:
apriori_df4.isna().sum()

store_id                               0
sku_id                                 0
if_selected_possible                   0
if_high_value                          0
online_order_mid_quantity              0
online_17d_avg_order_num               0
online_7d_sale_days                    0
online_inventory_level                 0
online_inventory_plenty_unit_ceil      0
online_inventory_plenty_unit_floor     0
offline_order_mid_quantity             0
offline_17d_avg_order_num              0
offline_7d_sale_days                   0
offline_inventory_level                0
offline_inventory_plenty_unit_ceil     0
offline_inventory_plenty_unit_floor    0
dtype: int64

In [189]:
apriori_df4.to_csv(get_data_path('apriori_df_4'),index=False)