## 导入所需的包

In [1]:
!git log

[33mcommit f814661261860e419b5af1b5a6c8ecadb19ea70e[m[33m ([m[1;36mHEAD -> [m[1;32mmaster[m[33m, [m[1;31morigin/master[m[33m, [m[1;31morigin/HEAD[m[33m)[m
Author: poteman <946691288@qq.com>
Date:   Mon Nov 15 10:35:34 2021 +0800

    debug: 时序xgb模型

[33mcommit 240b35118c573b6810db4b05558c0034d524c294[m
Author: poteman <946691288@qq.com>
Date:   Sun Nov 14 19:59:02 2021 +0800

    时序模型: 时间单位(间隔)为week.

[33mcommit 9a272d89994102b64db956a6777d1f8a7a2f8c61[m
Author: poteman <946691288@qq.com>
Date:   Sun Nov 14 19:41:22 2021 +0800

    时序lag和ewm特征优化.

[33mcommit 73578947ba07a64408dd0e13e9b43b9b49ba44bd[m
Author: poteman <946691288@qq.com>
Date:   Sun Nov 14 19:26:57 2021 +0800

    时序数据: 周期为week的数据集特征优化.

[33mcommit 85c96e3f5053aaa959bfce339779d8c15c27f372[m
Author: poteman <946691288@qq.com>
Date:   Fri Nov 12 17:46:09 2021 +0800

    debug: 拼表.

[33mcommit 7a30d642b74eb164a5189826bef3011a414b96c2[m
Author: poteman <946691288@qq.c

In [2]:
from autox import AutoX
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## 配置数据信息

In [3]:
# Select the dataset; `path` points at its directory under ../data.
data_name = 'walmart_recruiting'
path = '../data/{}'.format(data_name)

In [4]:
# Feature type of every column, per data table.
# The tags used in this notebook are "cat", "num" and "datetime".
feature_type = {
    "stores.csv": {"Store": "cat", "Type": "cat", "Size": "num"},
    "train.csv": {
        "Store": "cat",
        "Dept": "cat",
        "Date": "datetime",
        "Weekly_Sales": "num",
        "IsHoliday": "num",
    },
    "features.csv": {
        "Store": "cat",
        "Date": "datetime",
        "Temperature": "num",
        "Fuel_Price": "num",
        # MarkDown1 .. MarkDown5 are all numeric columns
        **{f"MarkDown{idx}": "num" for idx in range(1, 6)},
        "CPI": "num",
        "Unemployment": "num",
        "IsHoliday": "num",
    },
    "test.csv": {
        "Store": "cat",
        "Dept": "cat",
        "Date": "datetime",
        "IsHoliday": "num",
    },
}

# Join specification between tables. Both train.csv and test.csv are joined
# to stores.csv on Store and to features.csv on (Store, Date).
# Keys of each entry:
#   related_to_main_table - whether the relation involves the main table
#   left_entity / left_on   - left table name and its join keys
#   right_entity / right_on - right table name and its join keys
#   type                    - cardinality of the left-to-right relation
relations = [
    {
        "related_to_main_table": "true",
        "left_entity": main_table,
        "left_on": list(join_keys),
        "right_entity": side_table,
        "right_on": list(join_keys),
        "type": "1-1",
    }
    for side_table, join_keys in (
        ("stores.csv", ["Store"]),
        ("features.csv", ["Store", "Date"]),
    )
    for main_table in ("train.csv", "test.csv")
]


In [5]:
# Build the AutoX object for the Weekly_Sales time-series task.
# Series are identified by (Store, Dept) and ordered by the Date column;
# ts_unit='W' is presumably a weekly time step - confirm against the AutoX docs.
# As the log output of this cell shows, construction reads the data tables.
autox = AutoX(
    target='Weekly_Sales',
    train_name='train.csv',
    test_name='test.csv',
    id=['Store', 'Dept'],
    path=path,
    time_series=True,
    ts_unit='W',
    time_col='Date',
    feature_type=feature_type,
    relations=relations,
)

   INFO ->  [+] read stores.csv
   INFO ->  Memory usage of dataframe is 0.00 MB
   INFO ->  Memory usage after optimization is: 0.00 MB
   INFO ->  Decreased by 56.1%
   INFO ->  table = stores.csv, shape = (45, 3)
   INFO ->  [+] read train.csv
   INFO ->  Memory usage of dataframe is 13.27 MB
   INFO ->  Memory usage after optimization is: 4.03 MB
   INFO ->  Decreased by 69.7%
   INFO ->  table = train.csv, shape = (421570, 5)
   INFO ->  [+] read features.csv
   INFO ->  Memory usage of dataframe is 0.70 MB
   INFO ->  Memory usage after optimization is: 0.26 MB
   INFO ->  Decreased by 62.1%
   INFO ->  table = features.csv, shape = (8190, 12)
   INFO ->  [+] read test.csv
   INFO ->  Memory usage of dataframe is 2.74 MB
   INFO ->  Memory usage after optimization is: 0.55 MB
   INFO ->  Decreased by 79.9%
   INFO ->  table = test.csv, shape = (115064, 4)
   INFO ->  [+] read sampleSubmission.csv
   INFO ->  Memory usage of dataframe is 1.76 MB
   INFO ->  Memory usage after opti

In [6]:
# Run the time-series pipeline and keep the resulting predictions in `sub`.
# Per the log recorded for this cell, the call performs feature engineering
# and model training, and the recorded run finished in about 51 minutes.
sub = autox.get_submit_ts()

   INFO ->  start feature engineer
   INFO ->  feature engineer: one2M
   INFO ->  featureOne2M ops: {}
   INFO ->  ignore featureOne2M
   INFO ->  feature engineer: time
   INFO ->  featureTime ops: ['Date']
100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
   INFO ->  feature engineer: ShiftTS
   INFO ->  featureShiftTS ops: ['Weekly_Sales', 'IsHoliday', 'stores.csv__Size', 'features.csv__Temperature', 'features.csv__Fuel_Price', 'features.csv__MarkDown1', 'features.csv__MarkDown2', 'features.csv__MarkDown3', 'features.csv__MarkDown4', 'features.csv__MarkDown5', 'features.csv__CPI', 'features.csv__Unemployment', 'features.csv__IsHoliday']
   INFO ->  featureShiftTS lags: [39, 40, 41, 42]
13it [00:52,  4.02s/it]
   INFO ->  feature engineer: RollingStatTS
   INFO ->  featureRollingStatTS ops: ['Weekly_Sales', 'IsHoliday', 'stores.csv__Size', 'features.csv__Temperature', 'features.csv__Fuel_Price', 'features.csv__MarkDown1', 'features.csv__MarkDown2', 'features.csv__MarkDown3', 'features.c

Training with validation
Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 11486.8	valid_1's rmse: 9337.9
[200]	training's rmse: 8439.96	valid_1's rmse: 5748.24
[300]	training's rmse: 7537.04	valid_1's rmse: 5006.07
[400]	training's rmse: 7170.1	valid_1's rmse: 4824.32
[500]	training's rmse: 6979.62	valid_1's rmse: 4745.19
[600]	training's rmse: 6845.89	valid_1's rmse: 4696.79
[700]	training's rmse: 6760.18	valid_1's rmse: 4723.45
Early stopping, best iteration is:
[628]	training's rmse: 6815.99	valid_1's rmse: 4688.85
MSE: 21985298.147657212
Finished in 0:00:20.895184
ReTraining on all data
[100]	training's rmse: 11070.6
[200]	training's rmse: 7926.8
[300]	training's rmse: 7034.74
[400]	training's rmse: 6660.45
[500]	training's rmse: 6474.24
[600]	training's rmse: 6345.12
[700]	training's rmse: 6262.96
Finished in 0:00:24.454681


   INFO ->  (421570, 322)


Training with validation
[0]	validation_0-rmse:26988.55273
Will train until validation_0-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:11106.44629
[200]	validation_0-rmse:6042.85547
[300]	validation_0-rmse:4652.65088
[400]	validation_0-rmse:4268.49365
[500]	validation_0-rmse:4145.81543
[600]	validation_0-rmse:4099.48242
[700]	validation_0-rmse:4076.93237
[800]	validation_0-rmse:4068.92334
[900]	validation_0-rmse:4064.46729
[1000]	validation_0-rmse:4060.95679
[1100]	validation_0-rmse:4057.40210
[1200]	validation_0-rmse:4056.21289
[1300]	validation_0-rmse:4054.61450
[1400]	validation_0-rmse:4053.05664
[1500]	validation_0-rmse:4051.39356
[1600]	validation_0-rmse:4050.07178
[1700]	validation_0-rmse:4048.80835
[1800]	validation_0-rmse:4047.33569
[1900]	validation_0-rmse:4046.69019
[2000]	validation_0-rmse:4045.33667
[2100]	validation_0-rmse:4044.17334
[2200]	validation_0-rmse:4043.46802
[2300]	validation_0-rmse:4041.12256
[2400]	validation_0-rmse:4040.35596
[2500]	validation_0

   INFO ->  feature importance
   INFO ->                                               feature  feature_importance
0                                          IsHoliday                 181
1                                   stores.csv__Type                  92
2                                   stores.csv__Size                 403
3                          features.csv__Temperature                 196
4                           features.csv__Fuel_Price                  47
..                                               ...                 ...
317  Store__Dept__features.csv__Unemployment__ewm_42                   8
318     Store__Dept__features.csv__IsHoliday__ewm_39                   3
319     Store__Dept__features.csv__IsHoliday__ewm_40                  13
320     Store__Dept__features.csv__IsHoliday__ewm_41                  63
321     Store__Dept__features.csv__IsHoliday__ewm_42                  10

[322 rows x 2 columns]


Finished in 0:51:00.593180


In [11]:
# Attach the competition Ids from the sample submission to the predictions.
# The file is located through `path` so the dataset directory is configured
# in a single place instead of being hard-coded a second time.
submit = pd.read_csv(f'{path}/sampleSubmission.csv')

# The assignment below aligns on the index, so a mismatch would silently
# produce NaN or misplaced Ids; fail loudly instead.
assert len(sub) == len(submit), "prediction and sample submission row counts differ"
sub['Id'] = submit['Id']
assert sub['Id'].notna().all(), "some prediction rows did not receive an Id"

# Keep only the two columns of the submission, in this order.
sub = sub[['Id', 'Weekly_Sales']]

In [12]:
# Preview the submission frame (the rendered output is truncated to the
# first and last rows).
sub

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,25213.679511
1,1_1_2012-11-09,25506.688117
2,1_1_2012-11-16,22432.246294
3,1_1_2012-11-23,39339.997853
4,1_1_2012-11-30,33034.963365
...,...,...
115059,45_98_2013-06-28,681.154183
115060,45_98_2013-07-05,818.599082
115061,45_98_2013-07-12,736.459453
115062,45_98_2013-07-19,768.392045


In [13]:
# Write the submission file without the index column.
# The output directory is created first: DataFrame.to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs("./sub", exist_ok=True)
sub.to_csv("./sub/autox_1114_walmart_recruiting_oneclick.csv", index=False)