In [None]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

import time

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by pandas/numpy
# in later cells are hidden as well - narrow it if those matter.
warnings.filterwarnings('ignore')

In [None]:
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the memory used by the current process, in GiB.

    Reads the first field of psutil's ``memory_info()`` for this PID
    (the resident set size in psutil's documented field order),
    divides the byte count by 2**30 and rounds to two decimals.
    """
    current_process = psutil.Process(os.getpid())
    n_bytes = current_process.memory_info()[0]
    return np.round(n_bytes / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    Args:
        num: Size to format (may be negative).
        suffix: Unit text appended after the prefix, 'B' by default.

    Returns:
        A string such as '1.5KiB': the value is divided by 1024 until its
        magnitude drops below 1024, falling back to the 'Yi' prefix when
        every smaller prefix has been exhausted.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    for prefix in prefixes:
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefix, suffix)
        value = value / 1024.0
    # Anything still >= 1024 after 'Zi' is reported in 'Yi'
    return "%.1f%s%s" % (value, 'Yi', suffix)

In [None]:
########################### Vars
#################################################################################
# Name of the column we are forecasting
TARGET = 'sales'
# Number of the last day in the train set
END_TRAIN = 1941
# Columns that together identify one row (series id + day)
MAIN_INDEX = ['id', 'd']

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive;
# the data-loading and export cells use paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
########################### Load Data / lags demo: 'CA' state only
#################################################################################
print('Load Main Data')

# The sales table alone is enough
# to demonstrate the lags concept
train_df = pd.read_csv('/content/gdrive/My Drive/m5/sales_train_evaluation.csv')

# Keep only the rows of the 'CA' state
# so that every later calculation runs faster
train_df = train_df.loc[train_df['state_id'].eq('CA')]

Load Main Data


In [None]:
########################### Data Representation
#################################################################################

# Let's check our shape: (rows, columns) of the wide, CA-only sales table
print('Shape', train_df.shape)

Shape (12196, 1919)


In [None]:
## Horizontal representation

# Feeding this wide table to a model as it is means that
# the label is the last day column ('d_1913' in the output shown for this cell)
# and every other column is a "feature".

# In lag terminology the earlier day columns (d_1 -> d_1912)
# are our lag features
# (target values from previous time periods).

# Upside: we get a lot of features.
# Downside: we get only 12196 "train rows".
# Note: here and below all numbers refer to the 'CA' state only.
train_df.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1874,d_1875,d_1876,d_1877,d_1878,d_1879,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,3,1,3,1,2,2,0,1,1,1,1,0,0,0,0,0,1,0,4,2,3,0,1,2,0,0,0,1,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,2,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,3,4,2,1,4,1,3,5,0,6,6,0,0,0,0,3,1,2,1,3,1,0,2,5,4,2,0,3,0,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,3,2,2,2,3,1,0,0,0,0,1,0,4,4,0,1,4,0,1,0,1,0,1,1,2,0,1,1,2,1,1,0,1,1,2,2,2,4
5,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,5,0,1,0,0,2,0,3,1,1,2,0,1,3,0,0,0,0,0,0,3,0,1,0,0,1,4,0,0,0,1,0,1,0,0,0,2,0,0
6,HOBBIES_1_007_CA_1_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,1,0,0,1,0,0,0,1,2,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,1,0,0,1,1
7,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,12,15,0,0,0,4,6,5,7,0,13,2,2,20,9,3,0,1,0,0,0,10,4,4,2,6,2,0,0,0,0,0,0,0,...,2,0,17,11,0,2,13,10,2,0,0,0,0,11,5,2,12,38,22,2,0,10,26,8,12,4,5,2,8,4,0,0,1,37,3,4,6,3,2,1
8,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,2,0,7,3,0,2,3,9,0,0,4,3,1,0,4,4,1,2,0,1,1,5,2,2,2,3,0,0,8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,2,0,3,0,0,0,0,0,1,1,0,7,0,0,0,0,1,1,6,0,0,0,0,0
9,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,3,0,0,0,0,0,0,0,0,1,1,1,1,0,3,2,2,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,2,0,2


In [None]:
## Vertical representation

# Alternatively we can treat every d_ column
# as an additional label, which scales the
# training set up to 23330948 rows.

# Upside: the model gets much more
# input to train on.
# Downside: the lags that came for free in the
# horizontal representation are lost, and
# the new data set consumes much more memory.

# NOTE: this cell overwrites train_df with its melted (long) form,
# so it must be run exactly once after the load cell.
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
train_df = train_df.melt(id_vars=index_columns, var_name='d', value_name=TARGET)

train_df[train_df['id'].eq('HOBBIES_1_001_CA_1_validation')].head(10)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
12196,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0
24392,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0
36588,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0
48784,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0
60980,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_6,0
73176,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_7,0
85372,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_8,0
97568,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_9,0
109764,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_10,0


In [None]:
# All rows of one series (a single 'id') in the long format
train_df[train_df['id']=='HOBBIES_1_001_CA_1_validation']

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
12196,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0
24392,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0
36588,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0
48784,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0
...,...,...,...,...,...,...,...,...
23269968,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1909,1
23282164,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1910,3
23294360,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1911,0
23306556,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1912,1


In [None]:
# Only the columns used by the lag examples: series id, day and target
train_df[['id','d',TARGET]]

Unnamed: 0,id,d,sales
0,HOBBIES_1_001_CA_1_validation,d_1,0
1,HOBBIES_1_002_CA_1_validation,d_1,0
2,HOBBIES_1_003_CA_1_validation,d_1,0
3,HOBBIES_1_004_CA_1_validation,d_1,0
4,HOBBIES_1_005_CA_1_validation,d_1,0
...,...,...,...
23330943,FOODS_3_823_CA_4_validation,d_1913,0
23330944,FOODS_3_824_CA_4_validation,d_1913,1
23330945,FOODS_3_825_CA_4_validation,d_1913,0
23330946,FOODS_3_826_CA_4_validation,d_1913,4


In [None]:
########################### Lags creation / shift every id by 1..7 days
#################################################################################

# There are several ways to code this.
# The melted dataset is already sorted by d values,
# so a lag is simply a shift() of the target;
# the shift has to be done on 'id' level so that
# one series never receives values from another one.

# Variant 1: group and shift in a loop
# (explicit copy, because new columns are assigned to this frame in place)
temp_df = train_df[['id','d',TARGET]].copy()

start_time = time.time()
for i in range(1,8):
    print('Shifting:', i)
    # GroupBy.shift moves each series i rows (days) down; it returns the same
    # values as transform(lambda x: x.shift(i)) without a Python call per group
    temp_df['lag_'+str(i)] = temp_df.groupby(['id'])[TARGET].shift(i)

print('%0.2f min: Time for loops' % ((time.time() - start_time) / 60))


# Variant 2: the same in a "compact" manner
# (columns are named '<target>_lag_<n>' here, 'lag_<n>' above)
LAG_DAYS = [col for col in range(1,8)]
temp_df = train_df[['id','d',TARGET]]

start_time = time.time()
temp_df = temp_df.assign(**{
        '{}_lag_{}'.format(col, l): temp_df.groupby(['id'])[col].shift(l)
        for l in LAG_DAYS
        for col in [TARGET]
    })

print('%0.2f min: Time for bulk shift' % ((time.time() - start_time) / 60))

KeyError: ignored

In [None]:
# The result
temp_df[temp_df['id'].eq('HOBBIES_1_001_CA_1_validation')].head(10)

# The many NaN values are expected:
# there is no data for day 0, -1, -2, ...
# (those days are outside of the dataset time period).

# The same applies to the test set,
# so be careful when making lag features:
# for day 1920 there is no data about day 1919 (labels end at 1913).
# If you want to predict day 1915, your
# lag features have to start from 2
# (1915 (predicting day) - 1913 (last day with label in dataset)),
# and so on.

# There are a few options for dealing
# with the NaNs in the train set:
## 1. Drop them: train_df[train_df['d']>MAX_LAG_DAY]
## 1.1 In our case some lines were already dropped by release date,
##     so you have to find d.min() for each id
##     and drop train_df[train_df['d']>(train_df['d_min']+MAX_LAG_DAY)]
## 2. If you want to keep them you can
##    fill with '-1' to be able to convert to int
## 3. Leave them as they are
## 4. Fill with the mean -> not recommended

Unnamed: 0,id,d,sales,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,sales_lag_6,sales_lag_7
0,HOBBIES_1_001_CA_1_validation,d_1,0,,,,,,,
12196,HOBBIES_1_001_CA_1_validation,d_2,0,0.0,,,,,,
24392,HOBBIES_1_001_CA_1_validation,d_3,0,0.0,0.0,,,,,
36588,HOBBIES_1_001_CA_1_validation,d_4,0,0.0,0.0,0.0,,,,
48784,HOBBIES_1_001_CA_1_validation,d_5,0,0.0,0.0,0.0,0.0,,,
60980,HOBBIES_1_001_CA_1_validation,d_6,0,0.0,0.0,0.0,0.0,0.0,,
73176,HOBBIES_1_001_CA_1_validation,d_7,0,0.0,0.0,0.0,0.0,0.0,0.0,
85372,HOBBIES_1_001_CA_1_validation,d_8,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97568,HOBBIES_1_001_CA_1_validation,d_9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109764,HOBBIES_1_001_CA_1_validation,d_10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
########################### Rolling lags
#################################################################################

# We restored some daily sales values from the horizontal representation
# as lag features, but only a few of them (last 7 days or less):
# because of memory limits we can't have many lag features.
# How can we get additional information from the other days?

## Rolling aggregations

# Explicit copy, because new columns are assigned to this frame in place
temp_df = train_df[['id','d',TARGET]].copy()

start_time = time.time()

# The 1-day shift inside each id does not depend on the window size,
# so it is computed once and re-grouped by id for the rolling windows.
# A 1-day shift only serves to predict day 1914;
# for other days you have to shift by PREDICT_DAY-1913.
shifted_by_id = temp_df.groupby(['id'])[TARGET].shift(1).groupby(temp_df['id'])

for i in [14,30,60]:
    print('Rolling period:', i)
    temp_df['rolling_mean_'+str(i)] = shifted_by_id.transform(lambda x: x.rolling(i).mean())
    temp_df['rolling_std_'+str(i)]  = shifted_by_id.transform(lambda x: x.rolling(i).std())

# Such aggregations help us to restore
# at least part of the information for our model:
# instead of 14+30+60->104 lag columns we have just 6
# with valuable information (hopefully sufficient).
# You can also aggregate by max/skew/median etc.
# and try other rolling periods such as 180, 365 etc.
print('%0.2f min: Time for loop' % ((time.time() - start_time) / 60))

Rolling period: 14
Rolling period: 30
Rolling period: 60
2.30 min: Time for loop


In [None]:
# The result
temp_df[temp_df['id'].eq('HOBBIES_1_002_CA_1_validation')].head(20)

# The NaN values are expected here as well:
# there is no data for the periods
# 0*(rolling_period), -1*(rolling_period), -2*(rolling_period)
# that the first windows of every series would have to cover.

Unnamed: 0,id,d,sales,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60
1,HOBBIES_1_002_CA_1_validation,d_1,0,,,,,,
12197,HOBBIES_1_002_CA_1_validation,d_2,0,,,,,,
24393,HOBBIES_1_002_CA_1_validation,d_3,0,,,,,,
36589,HOBBIES_1_002_CA_1_validation,d_4,0,,,,,,
48785,HOBBIES_1_002_CA_1_validation,d_5,0,,,,,,
60981,HOBBIES_1_002_CA_1_validation,d_6,0,,,,,,
73177,HOBBIES_1_002_CA_1_validation,d_7,0,,,,,,
85373,HOBBIES_1_002_CA_1_validation,d_8,0,,,,,,
97569,HOBBIES_1_002_CA_1_validation,d_9,0,,,,,,
109765,HOBBIES_1_002_CA_1_validation,d_10,0,,,,,,


In [None]:
########################### Memory usage
#################################################################################
from scipy import sparse

# Let's check our memory usage
print("{:>20}: {:>8}".format('Original rolling df',sizeof_fmt(temp_df.memory_usage(index=True).sum())))

# Can we minify it?
# 1. If our datasets are aligned by index
#    we don't need the 'id', 'd' and target columns.
#    They are dropped by name rather than by position, so re-running
#    this cell cannot strip feature columns by accident.
temp_df = temp_df.drop(columns=['id','d',TARGET], errors='ignore')
print("{:>20}: {:>8}".format('Values rolling df',sizeof_fmt(temp_df.memory_usage(index=True).sum())))

# Can we make it even smaller?
# Carefully change the dtype and/or
# use a sparse matrix to minify the 0s.
# Also note that lgbm accepts matrices as input,
# which is good for memory reduction.
temp_matrix = sparse.csr_matrix(temp_df)

# Restore to a DataFrame; the columns come back as positions,
# so they are renamed to 'roll_<position>'
temp_matrix_restored = pd.DataFrame(temp_matrix.todense())
restored_cols = ['roll_' + str(i) for i in list(temp_matrix_restored)]
temp_matrix_restored.columns = restored_cols

 Original rolling df:   1.6GiB
   Values rolling df:   1.0GiB


In [None]:
########################### Remove old objects
#################################################################################
# Unbind the demo frames and matrices before the full grid is loaded.
# Only the names are removed here; no explicit gc.collect() is called.
del temp_df, train_df, temp_matrix, temp_matrix_restored

In [None]:
########################### Apply on grid_df
#################################################################################
# Let's read the grid from
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index.
# NOTE(review): unpickling can execute code stored in the file -
# only load pickles that you produced yourself.
grid_df = pd.read_pickle('/content/gdrive/My Drive/m5_new/grid_part_1.pkl')

# We need only 'id', 'd' and the target
# to make lags and rollings
grid_df = grid_df[['id','d',TARGET]]
SHIFT_DAY = 28

# Lags
# with 28 day shift
start_time = time.time()
print('Create lags')

LAG_DAYS = [col for col in range(SHIFT_DAY,SHIFT_DAY+15)]
# GroupBy.shift gives the same values as transform(lambda x: x.shift(l))
# without a Python call per group. Every lag column is minified to float16
# as soon as it is built, so the float64 versions never coexist in memory.
grid_df = grid_df.assign(**{
        '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].shift(l).astype(np.float16)
        for l in LAG_DAYS
        for col in [TARGET]
    })

print('%0.2f min: Lags' % ((time.time() - start_time) / 60))

# Rollings
# with 28 day shift
start_time = time.time()
print('Create rolling aggs')

# The per-id shift does not depend on the window size:
# compute it once and re-group the shifted values by id
shifted_by_id = grid_df.groupby(['id'])[TARGET].shift(SHIFT_DAY).groupby(grid_df['id'])

for i in [7,14,30,60,180]:
    print('Rolling period:', i)
    grid_df['rolling_mean_'+str(i)] = shifted_by_id.transform(lambda x: x.rolling(i).mean()).astype(np.float16)
    grid_df['rolling_std_'+str(i)]  = shifted_by_id.transform(lambda x: x.rolling(i).std()).astype(np.float16)

# Rollings
# with sliding shift
for d_shift in [1,7,14]:
    print('Shifting period:', d_shift)
    # One shift per d_shift, shared by all window sizes below
    shifted_by_id = grid_df.groupby(['id'])[TARGET].shift(d_shift).groupby(grid_df['id'])
    for d_window in [7,14,30,60]:
        col_name = 'rolling_mean_tmp_'+str(d_shift)+'_'+str(d_window)
        grid_df[col_name] = shifted_by_id.transform(lambda x: x.rolling(d_window).mean()).astype(np.float16)


# This timer covers the rolling aggregations (the lags were timed above)
print('%0.2f min: Rollings' % ((time.time() - start_time) / 60))

Create lags
8.34 min: Lags
Create rolling aggs
Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180
Shifting period: 1
Shifting period: 7
Shifting period: 14
18.33 min: Lags


In [None]:
# Preview of the grid with all lag and rolling feature columns attached
grid_df

Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,sales_lag_35,sales_lag_36,sales_lag_37,sales_lag_38,sales_lag_39,sales_lag_40,sales_lag_41,sales_lag_42,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,rolling_mean_180,rolling_std_180,rolling_mean_tmp_1_7,rolling_mean_tmp_1_14,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
0,HOBBIES_1_008_CA_1_validation,1,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46881672,FOODS_3_823_WI_3_validation,1941,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.285645,0.488037,0.428467,7.558594e-01,0.199951,0.550781,0.250000,0.571289,0.616699,0.987305,,,,,,,,,,,,
46881673,FOODS_3_824_WI_3_validation,1941,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.142822,0.377930,0.214233,4.257812e-01,0.300049,0.535156,0.150024,0.404541,0.049988,0.242798,,,,,,,,,,,,
46881674,FOODS_3_825_WI_3_validation,1941,,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0,4.0,2.0,0.571289,0.786621,0.785645,1.188477e+00,0.866699,1.136719,1.033203,1.056641,0.761230,0.941895,,,,,,,,,,,,
46881675,FOODS_3_826_WI_3_validation,1941,,3.0,1.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,2.0,1.142578,1.344727,0.928711,1.207031e+00,1.066406,1.172852,1.016602,1.065430,1.338867,1.550781,,,,,,,,,,,,


In [None]:
########################### Export
#################################################################################
# Pickle the feature frame into the working directory,
# then copy the file into the m5_new folder on the mounted Drive
print('Save lags and rollings')
grid_df.to_pickle('lags_df_28.pkl')
!cp 'lags_df_28.pkl' /content/gdrive/'My Drive'/m5_new

Save lags and rollings
