In [1]:
%matplotlib inline
from datetime import date, timedelta, datetime
import gc
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import xgboost as xgb

In [2]:
import os
rootpath = r"/home/bingo/桌面/sales_forecasting/steel_storage_throughput_prediction"
os.chdir(rootpath)

### selecting data
- 加载表格，库存数据集、储户费用数据集、出入库训练集
- 预测任务
    - 1.按照两大类货品类型（冷卷、热卷），分别预测未来4个周钢铁的周入库量和周出库量（重量）；
    - 2.按照两大类货品类型（冷卷、热卷），分别预测未来i天的日入库量和日出库量（重量）。
- 只有产品名称是热卷和圆钢为热卷类型，其余均为冷卷类型
- 整理目标：仅筛选出包含冷卷和热卷两种货品的数据集，添加新列表示冷卷或热卷

### loading data
- 加载表格：加载提取后的出入库数据集
- 整理目标
    - 转换为储户id-产品id-日期格式三层索引，做为训练集格式
    - 无记录时间记出入库重量为0
    - 补齐中断的缺失时间，时间范围为(2014-02-24,2018-01-28)

### data extraction
- 特征提取
- 滑窗选取数据集
- 有大量0数值，稀疏数据
- 以4周数据，预测下一周i天数据
- 时间点
    - 测试集：2018-01-28
    - 验证集：2018-01-21
    - 训练集：2018-01-14

In [3]:
# 合并含有大量0值的行
def merge_zeros_cols(df, merge_end=500, merge_step=50):
    freq = df.columns.freqstr
    # 添加一列统计每行0值，用于合并
    notzeros = np.count_nonzero(df, axis=1)
    merge_list = np.arange(0,merge_end+1,merge_step)
    df = df.reset_index()
    # 不用合并的行
    df_ = df[notzeros>=merge_end].set_index("type_id")
    for i, value in enumerate(merge_list[:-1]):
        # 待合并行的布尔索引
        ind = (notzeros>=value)&(notzeros<merge_list[i+1])
        tmp = df.iloc[ind,:]
        tmp = tmp.groupby("type_id").sum(axis=0)
        tmp["store_id"] = merge_list[i+1]
        df_ = pd.concat([df_,tmp],axis=0)
#     df_ = df_.drop("notzeros",axis=0)
    df_ = df_.reset_index().set_index(["store_id","type_id"])
    df_.columns = pd.DatetimeIndex(df_.columns,freq=freq)
    return df_

In [4]:
def cnt_zeros(df):
    notzeros = np.count_nonzero(df,axis=1)
    zeros = df.shape[1]-notzeros
    zeros_ratio = zeros/df.shape[1]
    pd.Series(zeros_ratio).hist()

#### 原始数据

In [5]:
# 导入数据集，保持稀疏性
# converters={"unit_sales":lambda x:np.log1p(float(x)) if float(x)>0 else 0}
span=30
df_train_in = pd.read_csv("data/train_in_all.csv").set_index(["store_id","type_id"])/1000
df_train_in.columns = pd.to_datetime(df_train_in.columns)
df_train_in.columns.name = "date"
# 窗口平滑
df_train_in = df_train_in.ewm(span=span, axis=1).mean()

df_train_ext = pd.read_csv("data/train_ext_all.csv").set_index(["store_id","type_id"])/1000
df_train_ext.columns = pd.to_datetime(df_train_ext.columns)
df_train_ext.columns.name = "date"
# 窗口平滑
df_train_ext = df_train_ext.ewm(span=span, axis=1).mean()

print("df_train_in shape: ", df_train_in.shape)
print("df_train_ext_shape: ", df_train_ext.shape)

df_train_in shape:  (1219, 1463)
df_train_ext_shape:  (1213, 1463)


In [6]:
# 取对数后重采样, 若先重采样再取对数则会预测值过大exp溢出
# 150-50,(128,155); 200-100,(87,122);500-100,(37,57)
# 有差分：合并->取对数->重采样->差分, 独立做一遍
span = 20
tmp_in = merge_zeros_cols(df_train_in, 500, 100).transpose()
# tmp_in = tmp_in.ewm(span=span,axis=1).mean()
df_train_in_transpose = np.log1p(tmp_in)

tmp_ext = merge_zeros_cols(df_train_ext, 500, 100).transpose()
# tmp_ext = tmp_ext.ewm(span=span, axis=1).mean()
df_train_ext_transpose = np.log1p(tmp_ext)

# # 对于周数据，无差分：合并->取对数->重采样取均值，置于无差分天数据操作之后
# df_train_in_transpose = df_train_in.transpose()
# df_train_ext_transpose = df_train_ext.transpose()

# 按周重采样
df_train_in_W = df_train_in_transpose.resample("W",closed="right",label="right").mean().transpose()
df_train_in_2W = df_train_in_transpose.resample("2W",closed="right",label="right").mean().transpose()
df_train_in_3W = df_train_in_transpose.resample("3W",closed="right",label="right").mean().transpose()
df_train_in_4W = df_train_in_transpose.resample("4W",closed="right",label="right").mean().transpose()

df_train_ext_W = df_train_ext_transpose.resample("W",closed="right",label="right").mean().transpose()
df_train_ext_2W = df_train_ext_transpose.resample("2W",closed="right",label="right").mean().transpose()
df_train_ext_3W = df_train_ext_transpose.resample("3W",closed="right",label="right").mean().transpose()
df_train_ext_4W = df_train_ext_transpose.resample("4W",closed="right",label="right").mean().transpose()

# # 差分，去除趋势性，保留初始值
# init_in_byWeek = [df_train_in_W.iloc[:,0], df_train_in_2W.iloc[:,0],\
#                   df_train_in_3W.iloc[:,0], df_train_in_4W.iloc[:,0]]
# init_ext_byWeek = [df_train_ext_W.iloc[:,0], df_train_ext_2W.iloc[:,0],\
#                   df_train_ext_3W.iloc[:,0], df_train_ext_4W.iloc[:,0]]

# df_train_in_W = df_train_in_W.diff(axis=1).dropna(axis=1)
# df_train_ext_W = df_train_ext_W.diff(axis=1).dropna(axis=1)
# df_train_in_2W = df_train_in_2W.diff(axis=1).dropna(axis=1)
# df_train_ext_2W = df_train_ext_2W.diff(axis=1).dropna(axis=1)
# df_train_in_3W = df_train_in_3W.diff(axis=1).dropna(axis=1)
# df_train_ext_3W = df_train_ext_3W.diff(axis=1).dropna(axis=1)
# df_train_in_4W = df_train_in_4W.diff(axis=1).dropna(axis=1)
# df_train_ext_4W = df_train_ext_4W.diff(axis=1).dropna(axis=1)
print("df_train_in_W shape: ",df_train_in_W.shape)
print("df_train_ext_W shape: ",df_train_ext_W.shape)

df_train_in_W shape:  (645, 209)
df_train_ext_W shape:  (631, 209)


In [7]:
# 对于天数据，合并->取对数->差分
# 先合并数据行
span = 180
df_train_in = merge_zeros_cols(df_train_in, merge_end=500, merge_step=50*2)#/1000 #100
df_train_ext = merge_zeros_cols(df_train_ext, merge_end=500, merge_step=50*2)#/1000 #500,100
# df_train_in = df_train_in.ewm(span=span, axis=1).mean()
# df_train_ext = df_train_ext.ewm(span=span, axis=1).mean()

# 再直接取对数
df_train_in = np.log1p(df_train_in)
df_train_ext = np.log1p(df_train_ext)

# # 最后差分，去除趋势性，保留初始值
# init_byDay = [df_train_in.iloc[:,0], df_train_ext.iloc[:,0]]
# df_train_in = df_train_in.diff(axis=1).dropna(axis=1)
# df_train_ext = df_train_ext.diff(axis=1).dropna(axis=1)
print("df_train_in shape: ", df_train_in.shape)
print("df_train_ext_shape: ", df_train_ext.shape)

df_train_in shape:  (645, 1463)
df_train_ext_shape:  (631, 1463)


#### 数据子集抽取_byDay

In [8]:
def get_span(df, dt, minus, periods, freq="D"):
    return df[pd.date_range(dt-timedelta(days=int(minus)), periods=periods, freq=freq)]

In [9]:
def prepare_dataset_byDay(df, dt, addType=False, is_train=True):
    x = {}
    n = 11
    #1.计算dt时间点一段时间内的出入库情况，总量+均量, 40个特征
    for i in np.arange(2,21,2): #np.arange(2,21,2)
        tmp = get_span(df, dt, i*7-1, i*7)
        x["sales_%s_sum"%(i*7)] = tmp.sum(axis=1).values
        x["sales_%s_mean"%(i*7)] = tmp.mean(axis=1).values
#         x["sum_mean_%s_ratio"%(i*7)] = x["sales_%s_sum"%(i*7)]/x["sales_%s_mean"%(i*7)]
        x["sales_%s_sum_decay"%(i*7)] = \
        (tmp*np.power(0.9,np.arange(i*7)[::-1])).sum(axis=1).values
        x["sales_%s_mean_decay"%(i*7)] = \
        (tmp*np.power(0.9,np.arange(i*7)[::-1])).mean(axis=1).values
#         x["sum_mean_%s_ratio_decay"%(i*7)] = x["sales_%s_sum_decay"%(i*7)]/x["sales_%s_mean_decay"%(i*7)]
    
    #2.计算dt时间点一段时间内的出入库次数，有货物天数+无货物天数，20个特征
    for i in np.arange(1,n):#np.arange(1,11)
        tmp = get_span(df, dt, i*7-1, i*7)
        x["sales_%s_count"%(i*7)] = np.count_nonzero(tmp.values, axis=1) 
        x["no_sales_%s_count"%(i*7)] = i*7 - x["sales_%s_count"%(i*7)]
        
    #3.计算dt时间点一段时间内产品出入库情况，差值均值、中值、最大值、最小值、标准差，50个特征
    for i in np.arange(1,n):#np.arange(1,11)
        tmp = get_span(df, dt, i*7-1, i*7)
        x["diff_%s_mean"%(i*7)] = tmp.diff(axis=1).mean(axis=1).values
#         x["diff_%s_mean_decay"%(i*7)] = \
#         (tmp.diff(axis=1) * np.power(0.9, np.arange(i*7)[::-1])).mean(axis=1).values #新添的
        x["median_%s"%(i*7)] = tmp.median(axis=1).values
#         x["max_%s"%(i*7)] = tmp.max(axis=1).values
#         x["min_%s"%(i*7)] = tmp.min(axis=1).values
#         x["std_%s"%(i*7)] = tmp.std(axis=1).values
        
    #4.dt时间点2周前,差值均值、中值、最大值、最小值、标准差、均值，70个特征
    for i in np.arange(1,n):#np.arange(1,11)
        tmp = get_span(df, dt+timedelta(days=-14), i*7-1, i*7)
        x['diff_%s_mean_2' %(i*7)] = tmp.diff(axis=1).mean(axis=1).values
        x['mean_%s_decay_2' %(i*7)] = \
        (tmp * np.power(0.9, np.arange(i*7)[::-1])).mean(axis=1).values #sum
        x['mean_%s_2' %(i*7)] = tmp.mean(axis=1).values
#         x['median_%s_2' %(i*7)] = tmp.median(axis=1).values
#         x['min_%s_2' %(i*7)] = tmp.min(axis=1).values
#         x['max_%s_2' %(i*7)] = tmp.max(axis=1).values
#         x['std_%s_2' %(i*7)] = tmp.std(axis=1).values
    
    #5.前一段时间内，第一次出入库距现在天数，最后一次距现在天书， 20个特征
    for i in np.arange(1,n):#np.arange(1,11)
        tmp = get_span(df, dt, i*7-1, i*7)
        x['has_sales_days_in_last_%s'%(i*7)] = (tmp > 0).sum(axis=1).values
        x["last_sales_days_in_last_%s"%(i*7)] =\
        i*7 - ((tmp>0)*np.arange(i*7)).max(axis=1).values
        x["first_sales_days_in_last_%s"%(i*7)] =\
        ((tmp>0)*np.arange(i*7,0,-1)).max(axis=1).values
        
    #6.过去n周内同一天的均值、总量, 35个特征
    for i in np.arange(7):
        x["mean_4_dow%s"%(i)] = get_span(df, dt, 28-i, 4, freq="7D").mean(axis=1).values
        x["mean_8_dow%s"%(i)] = get_span(df, dt, 56-i, 8, freq="7D").mean(axis=1).values
        x["mean_12_dow%s"%(i)] = get_span(df, dt, 84-i, 12, freq="7D").mean(axis=1).values
        x["mean_16_dow%s"%(i)] = get_span(df, dt, 16*7-i, 16, freq="7D").mean(axis=1).values
        x["mean_20_dow%s"%(i)] = get_span(df, dt, 140-i, 20, freq="7D").mean(axis=1).values
#         x["mean_24_dow%s"%(i)] = get_span(df, dt, 24*7-i, 24, freq="7D").mean(axis=1).values
        
    #7.提取前30天出入库情况，31个特征
    for i in np.arange(0,31):#np.arange(0,31)(n-1)*7
        x["day_%s"%(i)] = get_span(df, dt, i, 1).values.ravel()
    
    if addType:
        x = pd.DataFrame(x)
        x["type"] = df.reset_index()["type_id"]
    else:
        x = pd.DataFrame(x)
    
    if is_train:
        y = df[pd.date_range(dt+timedelta(1),periods=7)].values
        return x,y
    return x

In [10]:
print("Preparing dataset...")
t2018 = date(2018,1,14)
num_days = 20*5
x_in, y_in = [], []
x_ext, y_ext = [], []
addType = True

for i in range(num_days):
    delta = timedelta(days=i)
    
    x_tmp_in, y_tmp_in = prepare_dataset_byDay(df_train_in, t2018-delta, addType=addType)
    x_tmp_ext, y_tmp_ext = prepare_dataset_byDay(df_train_ext, t2018-delta, addType=addType)
    
    x_in.append(x_tmp_in)
    y_in.append(y_tmp_in)
    x_ext.append(x_tmp_ext)
    y_ext.append(y_tmp_ext)

x_train_in = pd.concat(x_in, axis=0)
y_train_in = np.concatenate(y_in, axis=0)
x_train_ext = pd.concat(x_ext, axis=0)
y_train_ext = np.concatenate(y_ext, axis=0)

print("x_train_in : ",x_train_in.shape)
print("y_train_in : ", y_train_in.shape)
print("x_train_ext : ", x_train_ext.shape)
print("y_train_ext : ", y_train_ext.shape)

Preparing dataset...
x_train_in :  (64500, 207)
y_train_in :  (64500, 7)
x_train_ext :  (63100, 207)
y_train_ext :  (63100, 7)


In [11]:
# 验证集 2018-1-21
x_val_in, y_val_in = prepare_dataset_byDay(df_train_in, date(2018,1,21),addType=addType)
x_val_ext, y_val_ext = prepare_dataset_byDay(df_train_ext, date(2018,1,21), addType=addType)

# 测试集 2018-1-28
x_test_in = prepare_dataset_byDay(df_train_in, date(2018,1,28), is_train=False, addType=addType)
x_test_ext = prepare_dataset_byDay(df_train_ext, date(2018,1,28), is_train=False, addType=addType)

#### 数据子集抽取_byWeek

In [12]:
def get_span(df, dt, minus, periods, freq="D"):
    return df[pd.date_range(dt-timedelta(days=int(minus)), periods=periods, freq=freq)]

In [13]:
def prepare_dataset_byWeek(df, dt, step=7, addType=False, is_train=True):
    x = {}
    n = 11
    freq = df.columns.freqstr
    #1.计算dt时间点一段时间内的出入库情况，总量+均量, 40个特征
    for i in np.arange(2,21,2):#np.arange(2,21,2)
        tmp = get_span(df, dt, i*step, i, freq=freq)
        x["sales_%s_sum"%(i)] = tmp.sum(axis=1).values
        x["sales_%s_mean"%(i)] = tmp.mean(axis=1).values
#         x["sum_mean_%s_ratio"%(i)] = x["sales_%s_sum"%(i)]/x["sales_%s_mean"%(i)]
        x["sales_%s_sum_decay"%(i)] = \
        (tmp*np.power(0.9,np.arange(i)[::-1])).sum(axis=1).values
        x["sales_%s_mean_decay"%(i)] = \
        (tmp*np.power(0.9,np.arange(i)[::-1])).mean(axis=1).values
#         x["sum_mean_%s_ratio_decay"%(i)] = x["sales_%s_sum_decay"%(i)]/x["sales_%s_mean_decay"%(i)]
    
    #2.计算dt时间点一段时间内的出入库次数，有货物天数+无货物天数，20个特征
    for i in np.arange(1,11): #np.arange(1,11)
        tmp = get_span(df, dt, i*step, i, freq=freq)
        x["sales_%s_count"%(i)] = np.count_nonzero(tmp.values, axis=1) 
        x["no_sales_%s_count"%(i)] = i - x["sales_%s_count"%(i)]
        
    #3.计算dt时间点一段时间内产品出入库情况，差值均值、中值、最大值、最小值、标准差，50个特征
    for i in np.arange(1,11):#np.arange(1,11)
        tmp = get_span(df, dt, i*step, i, freq=freq)
        x["diff_%s_mean"%(i)] = tmp.diff(axis=1).mean(axis=1).values
#         x["diff_%s_mean_decay"%(i)] = \
#         (tmp.diff(axis=1) * np.power(0.9, np.arange(i)[::-1])).mean(axis=1).values #新添的
        x["median_%s"%(i)] = tmp.median(axis=1).values
#         x["max_%s"%(i)] = tmp.max(axis=1).values
#         x["min_%s"%(i)] = tmp.min(axis=1).values
#         x["std_%s"%(i)] = tmp.std(axis=1).values
        
    #4.dt时间点2周前,差值均值、中值、最大值、最小值、标准差、均值，i0个特征
    for i in np.arange(1,11):#np.arange(1,11)
        n = step if step%14 or 14%step else 14
        tmp = get_span(df, dt+timedelta(days=-n), i*step, i, freq=freq)
        x['diff_%s_mean_2' %(i)] = tmp.diff(axis=1).mean(axis=1).values
        x['mean_%s_decay_2' %(i)] = \
        (tmp * np.power(0.9, np.arange(i)[::-1])).mean(axis=1).values #sum
        x['mean_%s_2' %(i)] = tmp.mean(axis=1).values
#         x['median_%s_2' %(i)] = tmp.median(axis=1).values
#         x['min_%s_2' %(i)] = tmp.min(axis=1).values
#         x['max_%s_2' %(i)] = tmp.max(axis=1).values
#         x['std_%s_2' %(i)] = tmp.std(axis=1).values
    
    #5.前一段时间内，第一次出入库距现在天数，最后一次距现在天书， 20个特征
    for i in np.arange(1,11): #np.arange(1,11)
        tmp = get_span(df, dt, i*step, i, freq=freq)
        x['has_sales_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        x["last_sales_days_in_last_%s"%(i)] =\
        i - ((tmp>0)*np.arange(i)).max(axis=1).values
        x["first_sales_days_in_last_%s"%(i)] =\
        ((tmp>0)*np.arange(i,0,-1)).max(axis=1).values
        
    #7.提取前30天出入库情况，31个特征。这各地方不能包括自身
    for i in np.arange(1,11):#np.arange(1,11)
        x["day_%s"%(i)] = get_span(df, dt, i*step, 1, freq=freq).values.ravel()
    
    if addType:
        x = pd.DataFrame(x)
        x["type"] = df.reset_index()["type_id"]
    else:
        x = pd.DataFrame(x)
    
    if is_train:
        y = df[pd.date_range(dt,periods=1,freq=freq)].values
        return x,y
    return x

In [14]:
def getDatas(df_in, df_ext, dt="2018-01-21", step=7, num_days=int(20), is_train=True, addType=True):
    print("Preparing dataset...step",step)
    dt = datetime.strptime(dt,"%Y-%m-%d")
    num_days = num_days
    x_in, y_in = [], []
    x_ext, y_ext = [], []
    for i in range(num_days):
        delta = timedelta(days=i*step)
        if is_train:
            x_tmp_in, y_tmp_in = prepare_dataset_byWeek(df_in, dt-delta, step=step, addType=addType, is_train=is_train)
            x_tmp_ext, y_tmp_ext = prepare_dataset_byWeek(df_ext, dt-delta, step=step, addType=addType,is_train=is_train)
        else:
            x_tmp_in = prepare_dataset_byWeek(df_in, dt-delta, step=step, addType=addType, is_train=is_train)
            x_tmp_ext = prepare_dataset_byWeek(df_ext, dt-delta, step=step, addType=addType,is_train=is_train)
            y_tmp_in = []
            y_tmp_ext = []

        x_in.append(x_tmp_in)
        y_in.append(y_tmp_in)
        x_ext.append(x_tmp_ext)
        y_ext.append(y_tmp_ext)

    x_train_in = pd.concat(x_in, axis=0)
    y_train_in = np.concatenate(y_in, axis=0)
    x_train_ext = pd.concat(x_ext, axis=0)
    y_train_ext = np.concatenate(y_ext, axis=0)
    return x_train_in, y_train_in, x_train_ext, y_train_ext

In [None]:
# train_date = ['2018-01-21','2018-01-14', '2018-01-07', '2017-12-31']
# steps = [7,14,21,28]
# in_list = [df_train_in_W, df_train_in_2W, df_train_in_3W, df_train_in_4W]
# ext_list = [df_train_ext_W, df_train_ext_2W, df_train_ext_3W, df_train_ext_4W]

# 训练集
x_train_in_W, y_train_in_W, x_train_ext_W, y_train_ext_W = getDatas(
    df_train_in_W, df_train_ext_W, "2018-01-21", 7)
x_train_in_2W, y_train_in_2W, x_train_ext_2W, y_train_ext_2W = getDatas(
    df_train_in_2W, df_train_ext_2W, "2018-01-14", 14)
x_train_in_3W, y_train_in_3W, x_train_ext_3W, y_train_ext_3W = getDatas(
    df_train_in_3W, df_train_ext_3W, "2018-01-07", 21)
x_train_in_4W, y_train_in_4W, x_train_ext_4W, y_train_ext_4W = getDatas(
    df_train_in_4W, df_train_ext_4W, "2017-12-31", 28)
print("x_train_in_W shape: ", x_train_in_W.shape)
print("x_train_in_2W shape: ", x_train_in_2W.shape)

In [None]:
# 验证集
# steps = [7,14,21,28]
# in_list = [df_train_in_W, df_train_in_2W, df_train_in_3W, df_train_in_4W]
# ext_list = [df_train_ext_W, df_train_ext_2W, df_train_ext_3W, df_train_ext_4W]
# # val_date = ["2018-01-07","2017-12-17","2017-11-26","2017-11-05"]
# val_date = ['2018-01-28', '2018-01-28', '2018-01-28', '2018-01-28']

x_val_in_W, y_val_in_W, x_val_ext_W, y_val_ext_W = getDatas(
    df_train_in_W, df_train_ext_W, "2018-01-28", 7, 1)
x_val_in_2W, y_val_in_2W, x_val_ext_2W, y_val_ext_2W = getDatas(
    df_train_in_2W, df_train_ext_2W, "2018-01-28", 14, 1)
x_val_in_3W, y_val_in_3W, x_val_ext_3W, y_val_ext_3W = getDatas(
    df_train_in_3W, df_train_ext_3W, "2018-01-28", 21, 1)
x_val_in_4W, y_val_in_4W, x_val_ext_4W, y_val_ext_4W = getDatas(
    df_train_in_4W, df_train_ext_4W, "2018-01-28", 28, 1)
print("x_val_in_W shape: ", x_val_in_W.shape)
print("x_val_ext_2W shape: ", y_val_ext_2W.shape)

In [None]:
# 测试集
# steps = [7,14,21,28]
# in_list = [df_train_in_W, df_train_in_2W, df_train_in_3W, df_train_in_4W]
# ext_list = [df_train_ext_W, df_train_ext_2W, df_train_ext_3W, df_train_ext_4W]
# test_date = ["2018-01-29"]*4

x_test_in_W, y_test_in_W, x_test_ext_W, y_test_ext_W = getDatas(
    df_train_in_W, df_train_ext_W, "2018-02-03", 7, 1, False)
x_test_in_2W, y_test_in_2W, x_test_ext_2W, y_test_ext_2W = getDatas(
    df_train_in_2W, df_train_ext_2W, "2018-02-10", 14, 1, False)
x_test_in_3W, y_test_in_3W, x_test_ext_3W, y_test_ext_3W = getDatas(
    df_train_in_3W, df_train_ext_3W, "2018-02-17", 21, 1, False)
x_test_in_4W, y_test_in_4W, x_test_ext_4W, y_test_ext_4W = getDatas(
    df_train_in_4W, df_train_ext_4W, "2018-02-24", 28, 1, False)
print("x_test_in_W shape: ", x_test_in_W.shape)
print("x_test_in_2W shape: ", x_test_ext_2W.shape)

In [None]:
x_train_in_Wall = [x_train_in_W, x_train_in_2W, x_train_in_3W, x_train_in_4W]
y_train_in_Wall = np.concatenate(
    [y_train_in_W, y_train_in_2W, y_train_in_3W, y_train_in_4W], axis=1)
# y_train_in_Wall = [y_train_in_W, y_train_in_2W, y_train_in_3W, y_train_in_4W]
x_train_ext_Wall = [x_train_ext_W, x_train_ext_2W,
                    x_train_ext_3W, x_train_ext_4W]
y_train_ext_Wall = np.concatenate(
    [y_train_ext_W, y_train_ext_2W, y_train_ext_3W, y_train_ext_4W], axis=1)
# y_train_ext_Wall = [y_train_ext_W, y_train_ext_2W, y_train_ext_3W, y_train_ext_4W]

x_val_in_Wall = [x_val_in_W, x_val_in_2W, x_val_in_3W, x_val_in_4W]
y_val_in_Wall = np.concatenate(
    [y_val_in_W, y_val_in_2W, y_val_in_3W, y_val_in_4W], axis=1)
# y_val_in_Wall = [y_val_in_W, y_val_in_2W, y_val_in_3W, y_val_in_4W]
x_val_ext_Wall = [x_val_ext_W, x_val_ext_2W, x_val_ext_3W, x_val_ext_4W]
y_val_ext_Wall = np.concatenate(
    [y_val_ext_W, y_val_ext_2W, y_val_ext_3W, y_val_ext_4W], axis=1)
# y_val_ext_Wall = [y_val_ext_W, y_val_ext_2W, y_val_ext_3W, y_val_ext_4W]

x_test_in_Wall = [x_test_in_W, x_test_in_2W, x_test_in_3W, x_test_in_4W]
x_test_ext_Wall = [x_test_ext_W, x_test_ext_2W, x_test_ext_3W, x_test_ext_4W]

### 算法建模_byDay
- 细粒度预测(按天预测)，不选择使用传统算法
    - xgboost
    - cnn+dnn
    - lstm
- 加大数据规模，重新训练模型 2018-12-06, num_days=200

#### 细粒度-1天1个模型

In [76]:
params1 = {
    "booster":"gblinear",
    "eta":0.02,
    "alpha":0,
    "lambda":1,
    "max_depth":10, #10
    "colsample_bytree":0.8,
    "subsample":0.8,
    "max_leaves":15, #10
    "gamma":0.1, # 0.2
    "min_child_weight":10,
    "nthread":4,
    "objective":"reg:linear",
    "eval_metric":"rmse",
    "silent":1,
}

In [77]:
# 一天一个模型
boosting_round = [500*2,1000,0,0]#[500*2,1000*2,2000,4000]
MAX_ROUNDS = np.sum(boosting_round)
etas = [0.01, 0.005, 0.001, 0.0001]
eta_ = np.ones(MAX_ROUNDS)
eta_list = [np.tile(eta, rounds) for rounds, eta in zip(boosting_round, etas)]
eta_list = np.concatenate(eta_list, axis=0)

dtest = xgb.DMatrix(x_test_ext.values)
val_pred = []
test_pred = []
if not os.path.exists(r"./model/model1"):
    os.mkdir(r"./model/model1")
        
for i in range(7):
    print("="*50)
    print("step %d"%(i+1))
    print(datetime.now())
    print("="*50)
    dtrain = xgb.DMatrix(
        data=np.concatenate([x_train_in.values,x_train_ext.values],axis=0),
        label=np.concatenate([y_train_in[:,i], y_train_ext[:,i]], axis=0)
    )
    
    dval = xgb.DMatrix(
        data=np.concatenate([x_val_in.values,x_val_ext.values], axis=0),
        label=np.concatenate([y_val_in[:,i], y_val_ext[:,i]], axis=0)
    )
    
    bst = xgb.train(
        params1,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        evals=[(dtrain,"dtrain"),(dval,"dval")],
        early_stopping_rounds=300,
        verbose_eval=500,
        callbacks=[xgb.callback.reset_learning_rate(list(eta_list))]
    )

    bst.save_model(r"./model/model1/bst_byday"+str(i+1)+".model")
    
    val_pred.append(bst.predict(dval))
    test_pred.append(bst.predict(dtest))

step 1
2018-12-15 15:16:21.315090
[0]	dtrain-rmse:0.34807	dval-rmse:0.347507
Multiple eval metrics have been passed: 'dval-rmse' will be used for early stopping.

Will train until dval-rmse hasn't improved in 300 rounds.
Stopping. Best iteration:
[26]	dtrain-rmse:0.024772	dval-rmse:0.019027

step 2
2018-12-15 15:16:52.379954
[0]	dtrain-rmse:0.348057	dval-rmse:0.347317
Multiple eval metrics have been passed: 'dval-rmse' will be used for early stopping.

Will train until dval-rmse hasn't improved in 300 rounds.
Stopping. Best iteration:
[26]	dtrain-rmse:0.025091	dval-rmse:0.018866

step 3
2018-12-15 15:17:20.685680
[0]	dtrain-rmse:0.348005	dval-rmse:0.34717
Multiple eval metrics have been passed: 'dval-rmse' will be used for early stopping.

Will train until dval-rmse hasn't improved in 300 rounds.
Stopping. Best iteration:
[27]	dtrain-rmse:0.025303	dval-rmse:0.018552

step 4
2018-12-15 15:17:49.332013
[0]	dtrain-rmse:0.347996	dval-rmse:0.347291
Multiple eval metrics have been passed: 'd

### 算法建模_byWeek
- 粗粒度预测，以周为单位
- 4周4个模型，预测第一周可收敛，预测第二、三、四周完全不收敛。2018-12-04
- 换种思路：一个模型直接预测2周，3周，4周，做差分得到每周的数目。2018-12-05
    - 收敛，但是预测结果，特别大，10^20量级都有。醉了。
    - 可能是后期包含多周得样本占多数，所以预测结果偏大
- 换种思路2.0：4个模型分别预测前1周，前2周，前3周，前4周。2018-12-05
    - 累积预测后，计算差分为每周预测值
    - 数据总体先求对数，再以每周均值为回归目标
    - 若以总量为回归目标，不收敛
- 思路2.0可以，现增加数据量重新训练，num_days=30 2018-12-06

#### 粗粒度-1周1个模型

#### 粗粒度-4个累积模型

In [78]:
print("Training and Predicting models...")
params_w4 = {
    "booster":"gblinear",
    "eta":0.02,
    "alpha":0,
    "lambda":1,
    "max_depth":8, #5
    "colsample_bytree":0.8,
    "subsample":0.8,
    "max_leaves":15, #10
    "gamma":0.1, #0.2
    "min_child_weight":10,
    "nthread":4,
    "objective":"reg:linear",
    "eval_metric":"rmse",
    "silent":1
}

Training and Predicting models...


In [79]:
boosting_round = [500*2,1000,0,0] #[500*2,1000*2,2000*2,4000*2]
MAX_ROUNDS = np.sum(boosting_round)
etas = [0.01, 0.005, 0.001, 0.0001] #[0.02, 0.01, 0.005, 0.001]
eta_ = np.ones(MAX_ROUNDS)
eta_list = [np.tile(eta, rounds) for rounds, eta in zip(boosting_round, etas)]
eta_list = np.concatenate(eta_list, axis=0)

# MAX_ROUNDS = 2000
val_pred_w4 = []
test_pred_w4 = []

if not os.path.exists(r"./model/model1"):
        os.mkdir(r"./model/model1")
        
for i in range(4):
    print("="*50)
    print("step %d"%(i+1))
    print(datetime.now())
    print("="*50)
    dtrain_w4 = xgb.DMatrix(
        data=np.concatenate([x_train_in_Wall[i].values, x_train_ext_Wall[i].values],axis=0),
        label=np.concatenate([y_train_in_Wall[:,i], y_train_ext_Wall[:,i]], axis=0)
    )
    
    dval_w4 = xgb.DMatrix(
        data=np.concatenate([x_val_in_Wall[i].values,x_val_ext_Wall[i].values], axis=0),
        label=np.concatenate([y_val_in_Wall[:,i], y_val_ext_Wall[:,i]], axis=0)
    )
    
    dtest_w4 = xgb.DMatrix(x_test_ext_Wall[i].values)
    
    bst_w4 = xgb.train(
        params_w4,
        dtrain_w4,
        num_boost_round=MAX_ROUNDS,
        evals=[(dtrain_w4,"dtrain"),(dval_w4,"dval")],
        early_stopping_rounds=300,
        verbose_eval=1000,
        callbacks=[xgb.callback.reset_learning_rate(list(eta_list))]
    )
    
    bst_w4.save_model(r"./model/model1/bst_byweek"+str(i+1)+".model")
    
    
    val_pred_w4.append(bst_w4.predict(dval_w4))
#     test_pred_w4.append(bst_w4.predict(dtest_w4))

step 1
2018-12-15 15:23:51.303434
[0]	dtrain-rmse:0.352924	dval-rmse:0.352285
Multiple eval metrics have been passed: 'dval-rmse' will be used for early stopping.

Will train until dval-rmse hasn't improved in 300 rounds.
Stopping. Best iteration:
[119]	dtrain-rmse:0.027026	dval-rmse:0.023671

step 2
2018-12-15 15:23:55.688128
[0]	dtrain-rmse:0.353118	dval-rmse:0.352598
Multiple eval metrics have been passed: 'dval-rmse' will be used for early stopping.

Will train until dval-rmse hasn't improved in 300 rounds.
Stopping. Best iteration:
[115]	dtrain-rmse:0.028519	dval-rmse:0.03132

step 3
2018-12-15 15:23:59.379998
[0]	dtrain-rmse:0.353732	dval-rmse:0.352607
Multiple eval metrics have been passed: 'dval-rmse' will be used for early stopping.

Will train until dval-rmse hasn't improved in 300 rounds.
Stopping. Best iteration:
[148]	dtrain-rmse:0.029582	dval-rmse:0.035809

step 4
2018-12-15 15:24:02.984326
[0]	dtrain-rmse:0.357327	dval-rmse:0.352951
Multiple eval metrics have been passed

### 算法评估标准

In [80]:
def getPrediction(preds, inds, steps=[4,4,7,7]):
    # preds=[week_ext, week_in, day_ext, day_in]
    stack = []
    for ind,pred,step in zip(inds, preds, steps):
        pred = np.array(pred).transpose()
        tmp = pd.DataFrame(np.expm1(pred), index=ind)
        tmp = tmp.reset_index().drop("store_id",axis=1)
        tmp = tmp.groupby("type_id").sum().values[::-1]
        tmp = tmp.reshape(2*step)
        stack.append(tmp)
    stack_pred = np.concatenate(stack, axis=0)
    return stack_pred

In [81]:
def getSMAPE(label, pred, inds):
    M = 800
    t = getPrediction(label, inds)*10000
    p = getPrediction(pred, inds)*10000
    print((t-p)/t)
    smape = np.mean(np.abs((t - p)/(np.maximum((t+p)/2,M))))
    mape = np.mean(np.abs((t-p)/t))
#     mse = np.mean(np.power(t-p,2))
#     mae = np.mean(np.abs(t-p))
    score = (2-smape)*50
    print("get MAPE %.4f"%(mape))
#     print("get MSE %.4f"%(mse))
#     print("get MAE %.4f"%(mae))
    print("get SMAPW %.4f"%(smape))
    print("get score %.4f"%(score))
    return smape,score

In [82]:
def getLastW(data):
    d1 = np.array(data).squeeze().transpose()
    # 先差分
    d2 = np.expm1(d1)*np.array([7,14,21,28]) #d2 = np.expm1(d1)*np.array([7,14,21,28])
    d3 = np.diff(d2, axis=1)
#     d3[d3<0] = 0
    d2[:,1:]= d3
    d2 = np.clip(d2,0,None)
    res = np.log1p(d2).transpose()
    return res

In [83]:
# 无标准化
loc1 = df_train_in.shape[0]
loc2 = df_train_in_W.shape[0]
pred_val_in, pred_val_ext = np.split(np.array(val_pred),np.array([loc1]),axis=1) #1196
pred_val_in_W, pred_val_ext_W = np.split(np.array(val_pred_w4),np.array([loc2]),axis=1) # 1196

pred_val_in_W = getLastW(pred_val_in_W)
pred_val_ext_W = getLastW(pred_val_ext_W)

pred = [pred_val_ext_W, pred_val_in_W, pred_val_ext, pred_val_in]
label = [y_val_ext_Wall.transpose(), y_val_in_Wall.transpose(), y_val_ext.transpose(), y_val_in.transpose()]

In [84]:
inds = [df_train_ext_W.index, df_train_in_W.index, df_train_ext.index, df_train_in.index]
getSMAPE(label, pred, inds)

[-6.70972362 -6.37014042 -4.32293423 -5.52153552 -5.96404952 -5.74667175
 -4.30010267 -5.57621819 -6.95309891 -6.84031797 -4.86114917 -5.92072765
 -6.1356374  -5.78095804 -4.29968574 -5.45108868 -0.0830624  -0.08981472
 -0.07431601 -0.11210022 -0.11046372 -0.17022486 -0.24004695 -0.08074063
 -0.0514349  -0.03244491 -0.0306132   0.01456307  0.01112593 -0.03980805
 -0.07274648 -0.09329705 -0.11296971 -0.15452906 -0.17610106 -0.22529993
 -0.30463069 -0.13220416 -0.11144526 -0.09522382 -0.08854851 -0.01803336
 -0.00964606 -0.06971257]
get MAPE 2.1263
get SMAPW 0.5947
get score 70.2673


(0.59465436279175787, 70.267281860412112)

### 预测提交

In [85]:
def prediction(test, path="./model/model1",mode="wd"):
    # test=[test_ext_w, test_in_w, test_ext_day, test_in_day]
    # test=[W, W2, W3, W4]
    if mode=="wd":
        week_ext, week_in, day_ext, day_in = [], [], [], []
        results = [week_ext, week_in, day_ext, day_in]
        bases = ["bst_byweek%s.model","bst_byweek%s.model","bst_byday%s.model","bst_byday%s.model"]
        steps = [0,0,7,7]
        for step, res, base, data in zip(steps, results, bases, test):
            dtest = xgb.DMatrix(data.values)
            for i in range(step):
                model = os.path.join(path, base%(i+1))
                print(model)
                bst = xgb.Booster()
                bst.load_model(model)
                res.append(bst.predict(dtest))
    else:
        W, W2, W3, W4 = [], [], [], []
        results = [W, W2, W3, W4]
        bases = ["bst_byweek%s.model"]*4
        steps = range(4)
        for step, res, base, data in zip(steps, results, bases, test):
            dtest = xgb.DMatrix(data.values)
            model = os.path.join(path, base%(step+1))
            print(model)
            bst = xgb.Booster()
            bst.load_model(model)
            res.append(bst.predict(dtest))
    return results

In [86]:
def makeSubmission(preds, inds, steps=[4,4,7,7], filename="./submission/result.csv"):
    # preds=[week_ext, week_in, day_ext, day_in]
    stack = []
    for ind,pred,step in zip(inds, preds, steps):
        pred = np.array(pred).transpose()
        tmp = pd.DataFrame(np.expm1(pred), index=ind)
        tmp = tmp.reset_index().drop("store_id",axis=1)
        tmp = tmp.groupby("type_id").sum().values[::-1]
        tmp = tmp.reshape(2*step)
        stack.append(tmp)
    stack_pred = np.concatenate(stack, axis=0)
    stack_pred = stack_pred*1000
    result = pd.read_csv("./submission/result_org.csv")
    result["VALUE"] = stack_pred
    result.to_csv(filename,float_format="%.4f",index=False)
    print("done!")
    return stack_pred

In [87]:
def invDiff(initValues, diffValues, predValues):
    n = len(initValues)
    res = []
    for i in range(n):
        initv = initValues[i].values.reshape(-1,1)
        diffv = diffValues[i].values
        predv = np.array(predValues[i]).transpose()
        length = predv.shape[1]
        tmp = np.concatenate([initv, diffv, predv],axis=1)
        # 先求和再exp
        step1 = np.cumsum(tmp, axis=1)
        step2 = step1[:,-length:]
        res.append(step2.transpose())
    return res

#### 1周/天1个模型

#### 4周累积

In [88]:
x_test_in_Wall = [x_test_in_W, x_test_in_2W, x_test_in_3W, x_test_in_4W]
x_test_ext_Wall = [x_test_ext_W, x_test_ext_2W, x_test_ext_3W, x_test_ext_4W]
test = [x_test_ext_W, x_test_in_W, x_test_ext, x_test_in]

mpath = './model/model1'
res_ext_W = prediction(x_test_ext_Wall,path=mpath,mode="w")
res_in_W = prediction(x_test_in_Wall,path=mpath, mode="w")
res = prediction(test,path=mpath)

# # -----------------------有差分时使用-----------------------------
# # 周数据还原，去差分->求各周->取exp
# # init_in_byWeek, init_ext_byWeek
# diff_in_byWeek = [df_train_in_W, df_train_in_2W, df_train_in_3W, df_train_in_4W]
# diff_ext_byWeek = [df_train_ext_W, df_train_ext_2W, df_train_ext_3W, df_train_ext_4W]
# diff_byDay = [df_train_in, df_train_ext]
# res_ext_W = invDiff(init_ext_byWeek, diff_ext_byWeek, res_ext_W)
# res_in_W = invDiff(init_in_byWeek, diff_in_byWeek, res_in_W)
# res = invDiff(init_byDay, diff_byDay, [res[3], res[2]])

# # 求各周
# res1 = getLastW(res_ext_W)
# res2 = getLastW(res_in_W)
# results = [res1,res2,res[1],res[0]]

# inds = [df_train_ext_W.index, df_train_in_W.index, df_train_ext.index, df_train_in.index]

# # 取exp
# pred1 = makeSubmission(results, inds)

# # 标准化缩放
# res[3], res[2] = getInvScaler(scaler_in, scaler_ext, res[3], res[2])
# s_in = [scaler_in_W, scaler_in_2W, scaler_in_3W, scaler_in_4W]
# s_ext = [scaler_ext_W, scaler_ext_2W, scaler_ext_3W, scaler_ext_4W]
# for i in range(4):
#     res_in_W[i], res_ext_W[i] = getInvScaler(s_in[i], s_ext[i], res_in_W[i], res_ext_W[i])

# -----------------------无差分时使用-------------------------------
# 求各周
res1 = getLastW(res_ext_W)
res2 = getLastW(res_in_W)
results = [res1,res2,res[2],res[3]]

inds = [df_train_ext_W.index, df_train_in_W.index, df_train_ext.index, df_train_in.index]

# 取exp
pred1 = makeSubmission(results, inds)

./model/model1/bst_byweek1.model
./model/model1/bst_byweek2.model
./model/model1/bst_byweek3.model
./model/model1/bst_byweek4.model
./model/model1/bst_byweek1.model
./model/model1/bst_byweek2.model
./model/model1/bst_byweek3.model
./model/model1/bst_byweek4.model
./model/model1/bst_byday1.model
./model/model1/bst_byday2.model
./model/model1/bst_byday3.model
./model/model1/bst_byday4.model
./model/model1/bst_byday5.model
./model/model1/bst_byday6.model
./model/model1/bst_byday7.model
./model/model1/bst_byday1.model
./model/model1/bst_byday2.model
./model/model1/bst_byday3.model
./model/model1/bst_byday4.model
./model/model1/bst_byday5.model
./model/model1/bst_byday6.model
./model/model1/bst_byday7.model
done!


In [89]:
resbest = pd.read_csv(r"./submission/result7781.csv",index_col=[0])

In [90]:
resbest.values.reshape(-1)-pred1

array([-4973.98741806, -2206.91515395,  2013.0555274 ,  2413.57478776,
       -3762.43909355, -2740.60012825,  4449.80139139,  1729.50725537,
       -4868.71065402, -3016.48252795,  2634.27910608,   563.0655478 ,
       -3480.23898354, -2416.70749963,  5138.73314029,   134.97734347,
       -1161.65157805,  -789.49597207,  -897.57275254,  -928.04450859,
       -1017.14130179, -1676.76851503, -1809.7297691 ,  -990.78509912,
        -592.99847942,  -703.48027135,  -639.78293851,  -525.38199845,
       -1258.21499231, -1751.93502377, -1182.10677028,  -998.54803184,
       -1080.77361972, -1055.94542234, -1092.96080389, -1443.81547574,
       -1489.74848915, -1104.634283  ,  -815.1598202 ,  -918.13759143,
        -886.91042432,  -752.23018569, -1244.91312875, -1678.33854747])

In [91]:
print(np.sum(pred1[-28:-21]))
print(np.sum(pred1[-21:-14]))
print(np.sum(pred1[-14:-7]))
print(np.sum(pred1[-7:]))

32163.5231972
60373.7030029
30105.588913
60987.6403809


In [92]:
pred1

array([ 31938.57211806,  31174.74065395,  23335.5192726 ,  29030.76351224,
        58341.77179355,  54254.44402825,  43760.27340861,  54205.57694463,
        30071.97965402,  31068.20432795,  23694.97329392,  29188.2863522 ,
        58407.86408354,  54379.64529963,  44377.29135971,  54946.19315653,
         4583.28437805,   4593.09387207,   4599.13635254,   4602.96630859,
         4590.02590179,   4590.67821503,   4604.3381691 ,   8633.20159912,
         8647.78327942,   8653.63407135,   8657.10353851,   8567.41809845,
         8563.95149231,   8650.61092377,   4284.31797028,   4294.00873184,
         4300.35161972,   4304.56352234,   4306.54430389,   4307.93857574,
         4307.86418915,   8716.837883  ,   8731.6160202 ,   8736.21559143,
         8739.23492432,   8669.09408569,   8664.03102875,   8730.61084747])

In [137]:
pred1

array([ 31515.88092395,  31491.97439733,  23545.43853016,  28696.15215692,
        56512.09549839,  55691.98399363,  43935.04051561,  53045.28383655,
        29800.7306885 ,  31581.57944586,  23904.49425578,  28897.48649206,
        56331.8435424 ,  55727.65622614,  44621.87614432,  53795.45261944,
         4662.03403473,   4668.65444183,   4673.36845398,   4676.78642273,
         4679.8210144 ,   4681.73456192,   4682.18040466,   8585.28137207,
         8597.00584412,   8605.26847839,   8610.99529266,   8616.36447906,
         8619.5640564 ,   8620.23258209,   4357.69128799,   4364.48669434,
         4369.53306198,   4373.45981598,   4376.99556351,   4379.50801849,
         4380.57947159,   8644.66667175,   8656.30531311,   8664.42012787,
         8669.92473602,   8675.10318756,   8678.05480957,   8678.4734726 ])

### 提交结果记录
- 对于原始数据，无特殊说明都为取对数操作
- 对于周预测模型，没有特殊指明的都为累计模型
- 对于预测时得训练数据，周预测为20,天预测为100,最佳
- 对于合并稀疏样本，无特殊说明，合并阈值为500，步长50
- 对于特征数目，无特殊说明，则都为(2,21,2) (1,11) (1,31)
- 73分记录，天预测特征266。，周预测特征210, sum
- 77.19记录，新增类别特征，天预测特征267。，周预测特征211, mean，数值缩小1000

#### 2018-12-06

- socre-52, 12/255, 周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- socre-50, 12/255, 周预测num_days=30, 天预测num_days=200, 回归模型，无正则化项
- socre-41, 15/255, 周预测num_days=30, 天预测num_days=200, 回归模型，有正则化项alpha=5,lambda=10

#### 2018-12-07

- score-50, 12/256, 周预测num_days=15, 天预测num_days=50, 回归模型，无正则化项
- score-39, 15/256, 周预测使用1周1个预测模型，num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-53, 13/258, 周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-42, 17/259, 数据标准化处理，周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-40, 17/259, 数据标准化处理，周预测num_days=30, 天预测num_days=200, 回归模型，无正则化项
- score-67, 12/261, 合并稀疏数据样本，周预测num_days=30, 天预测num_days=100, 回归模型，无正则化项

#### 2018-12-08
- score-66, 12/262, 合并稀疏数据样本，周预测num_days=30, 天预测num_days=100, 树模型，无正则化项
- score-63, 12/262, 合并稀疏数据样本，周预测num_days=30, 天预测num_days=200, 回归模型，无正则化项
- score-64, 12/262, 合并稀疏数据样本<=300样本、步长30，周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-68, 12/262, 合并稀疏数据样本，周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-62, 13/262, 合并稀疏数据样本<=500、步长25，周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-73, 12/264, 合并稀疏数据样本<=500、步长100，周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项
- score-71, 13/264, 合并稀疏数据样本<=500、步长100,周预测num_days=20, 天预测num_days=100,树模型max_depth=7,5/gamma=0.1,0.1
- score-72, 13/264, 合并稀疏数据样本<=500、步长100，特征数目减少8/6, 周预测num_days=20, 天预测num_days=100, 回归模型，无正则化项

#### 2018-12-09

- score-67, 13/264, 天预测合并稀疏数据样本<=500、步长100，天预测num_days=100;周预测合并稀疏数据样本<=150、步长50，周预测num_days=20 回归模型，无正则化项
- score-73, 12/264, 天预测合并稀疏数据样本<=500、步长100，天预测num_days=100;周预测合并稀疏数据样本<=150、步长50，周预测num_days=20,周预测结果*1.8 回归模型，无正则化项
- score-71, 13/264, 合并稀疏数据样本<=500、步长100,周预测num_days=20, 天预测num_days=100,树模型max_depth=10,8/gamma=0.1,0.1,周预测*1.3
- score-70, 14/264, 天预测合并稀疏数据样本<=500、步长100，天预测num_days=100;周预测合并稀疏数据样本<=200、步长100，周预测num_days=20, 回归模型，无正则化项.周预测*1.3， 73
- score-69, 14/264, 天预测合并稀疏数据样本<=500、步长50，天预测num_days=100;周预测合并稀疏数据样本<=400、步长200，周预测num_days=20, 回归模型，无正则化项.样本数减少，预测值会提高。
- score-69, 14/264, 天预测合并稀疏数据样本<=500、步长50，天预测num_days=100;周预测合并稀疏数据样本<=400、步长200，周预测num_days=20, 回归模型，无正则化项.样本数减少，预测值会提高。 去掉无用特征，例如求和特征，提高衰减比例0.8/0.85
- score-72, 12/264, 天预测合并稀疏数据样本，天预测num_days=100，周预测num_days=20, 回归模型，无正则化项。样本数减少，预测值会提高。 去掉无用特征，例如求和特征，提高衰减比例0.8/0.85
- score-72, 12/264, 天预测合并稀疏数据样本，天预测num_days=100，周预测num_days=20, 回归模型，无正则化项。样本数减少，预测值会提高。 去掉无用特征，例如求和特征，减小衰减比例0.95
- score-69, 12/264, 天预测合并稀疏数据样本<=500、步长50，天预测num_days=100;周预测合并稀疏数据样本<=500、步长100，周预测num_days=20。 去掉无用特征，例如求和特征 回归模型，无正则化项
- score-72.54, 14/264, 合并稀疏数据样本，天预测num_days=100,周预测num_days=20。 去掉无用特征，例如求和特征 回归模型，无正则化项
- score-72.16, 14/264, 合并稀疏数据样本，天预测num_days=100,周预测num_days=20。 回归模型，无正则化项.特征保留，部分sum替换未mean.使用全部数据训练

#### 2018-12-10

- score-70.65, 15/266, 合并稀疏样本数据，取差分，预测结果周>天，天预测值略有减小。周数据先求和再取对数
- score-71.88, 14/265, 增加特征
- score-73.32, 12/265, 增加产品类型一个特征。
- score-68.886, 16/265, 增加产品类型一个特征+差分预测，效果下降
- score-77.19, 11/265, 增加产品类型一个特征，所有数值/1000预测，预测完成后×1000,关键在于数值浮动不能太大
- score-77.14, 11/265, 无增加特征，所有数值/1000预测，预测完成后×1000,关键在于数值浮动不能太大
- score-77.71, 10/266, 无增加特征，所有数值/1000预测，预测完成后×1000,合并步长800-100
- score-77.81, 10/266, 去掉一堆无效特征,周预测剩余140，天预测剩余196, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长800-100
- score-76.46, 13/266, 周预测剩余特征60,天预测剩余特征116

#### 2018-12-11
- ‘first_sales_days_in_last’和‘sale_count’两个特征很重要

- score-77.79, 10/266, 增加产品类型一个特征,去掉无效特征70个,周预测剩余140，天预测剩余196, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长800-200
- score-66.76, 10/266, 增加产品类型一个特征,新增特征20个，去掉无效特征50个,周预测剩余111，天预测剩余167, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长800-200
- score-77.84, 10/266, 去掉无效特征70个,周预测剩余140，天预测剩余196, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-77.81, 10/266, 去掉无效特征70个,周预测剩余140，天预测剩余196, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长1000-100

#### 2018-12-13

- score-77.25, 13/279, 对原始数据做滑动平滑原span=3,此时数据样本未630条不是37条; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-76.56, 16/279, 对合并后数据做滑动平滑span=2,此时数据样本37条; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-76.56, 16/279, 对合并后数据做滑动平滑天数据span=4; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-76.44, 16/279, 对合并后数据做对数再取滑动平滑天数据span=4; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-77.48, 16/279, 对合并后数据做取滑动平滑天数据span=10; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-77.58, 16/279, 对合并后数据做取滑动平滑天数据span=30; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100
- score-76.98, 16/279, 对合并后数据做取滑动平滑天数据span=180; 去掉无效特征70个,周预测剩余140+1，天预测剩余196+1, median/std/max/min,所有数值/1000预测，预测完成后×1000,合并步长500-100

- score-78.14, 11/279, 对原始数据操作顺序：ewm(span=60)->merge(500,100)->/1000->log1p.去掉无效特征70个,周预测剩余140+1，天预测剩余196+1
- score-74.44, 14/279, 对原始数据操作顺序：ewm(span=7)->merge(1000,100)->/1000->log1p.去掉无效特征70个,周预测剩余140+1，天预测剩余196+1

#### 2018-12-14

- score-72.47, 19/280, 对原始数据操作顺序：ewm(span=7)->merge(500,250)->/1000->log1p,特征数目(614,608).
- score-77.43, 13/281, 对原始数据操作顺序：ewm(span=120)->merge(500,250)->/1000->log1p,特征数目(614,608).
- score-77.36, 13/281, 对原始数据操作顺序：ewm(span=120)->merge(500,100)->/1000->log1p,特征数目(614,608).
- score-78.06, 10/281, 对原始数据操作顺序：ewm(span=60)->merge(500,100)->/1000->log1p,特征数目(620,614).
- score-78.20, 10/281, 对原始数据操作顺序：ewm(span=30)->merge(500,100)->/1000->log1p,特征数目(620,614).
- score-78.06, 10/281, 对原始数据操作顺序：ewm(span=15)->merge(500,100)->/1000->log1p,特征数目(620,614).
span = 37是个好的尝试
- score-77.87, 10/285, 尝试选取最好得三次结果取平均

#### 2018-12-15

- score-78.107, 10/291. 对原始数据操作顺序：ewm(span=37)->merge(500,100)->/1000->log1p,特征数目(620,614)
- score-78.113, 10/291. 对原始数据操作顺序：ewm(span=30)->merge(500,100)->/1000->log1p,特征数目(620,614)

**观察储量趋势线，发现储量存在上升趋势，xgboost没有很好捕捉上升趋势，下一步采用神经网络模型。**