In [1]:
import pandas as pd
import numpy as np
import time

**Turn dataset into vertical representation**

In [3]:
########################### Vars
#################################################################################
TARGET = 'sales'         # Our main target
END_TRAIN = 1913         # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

In [4]:
df = pd.read_csv('sales_train_validation.csv')

In [5]:
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
train_df = pd.melt(df,
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

train_df[train_df['id']=='HOBBIES_1_001_CA_1_validation'].iloc[:10]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
30490,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0
60980,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0
91470,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0
121960,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0
152450,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_6,0
182940,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_7,0
213430,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_8,0
243920,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_9,0
274410,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_10,0


In [6]:
# group and shift in loop
temp_df = train_df[['id','d',TARGET]]

start_time = time.time()
for i in range(1,8):
    print('Shifting:', i)
    temp_df['lag_'+str(i)] = temp_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(i))
    
print('%0.2f min: Time for loops' % ((time.time() - start_time) / 60))

Shifting: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Shifting: 2
Shifting: 3
Shifting: 4
Shifting: 5
Shifting: 6
Shifting: 7
7.10 min: Time for loops


In [None]:
# # Or same in "compact" manner
# LAG_DAYS = [col for col in range(1,8)]
# temp_df = train_df[['id','d',TARGET]]

# start_time = time.time()
# temp_df = temp_df.assign(**{
#         '{}_lag_{}'.format(col, l): temp_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
#         for l in LAG_DAYS
#         for col in [TARGET]
#     })

# print('%0.2f min: Time for bulk shift' % ((time.time() - start_time) / 60))

In [17]:
temp_df[temp_df['id']=='HOBBIES_1_001_CA_1_validation'].iloc[:10]

Unnamed: 0,id,d,sales,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7
0,HOBBIES_1_001_CA_1_validation,d_1,0,,,,,,,
30490,HOBBIES_1_001_CA_1_validation,d_2,0,0.0,,,,,,
60980,HOBBIES_1_001_CA_1_validation,d_3,0,0.0,0.0,,,,,
91470,HOBBIES_1_001_CA_1_validation,d_4,0,0.0,0.0,0.0,,,,
121960,HOBBIES_1_001_CA_1_validation,d_5,0,0.0,0.0,0.0,0.0,,,
152450,HOBBIES_1_001_CA_1_validation,d_6,0,0.0,0.0,0.0,0.0,0.0,,
182940,HOBBIES_1_001_CA_1_validation,d_7,0,0.0,0.0,0.0,0.0,0.0,0.0,
213430,HOBBIES_1_001_CA_1_validation,d_8,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243920,HOBBIES_1_001_CA_1_validation,d_9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
274410,HOBBIES_1_001_CA_1_validation,d_10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***Here we can clearly see a pattern so that you could easily understand the purpose of creating lag features: it's to get relevant informations of the last 7 days***

### Rolling lags 

*This is to find information more than the most recent 7 days because the memory limit won't allow us to create many lag features*

In [7]:
start_time = time.time()

for i in [14,30,60]:
    print('Rolling period:', i)
    temp_df['rolling_mean_'+str(i)] = temp_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    temp_df['rolling_std_'+str(i)]  = temp_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())

# lambda x: x.shift(1)
# 1 day shift will serve only to predict day 1914
# for other days you have to shift PREDICT_DAY-1913

# Such aggregations will help us to restore
# at least part of the information for our model
# and out of 14+30+60->104 columns we can have just 6
# with valuable information (hope it is sufficient)
# you can also aggregate by max/skew/median etc 
# also you can try other rolling periods 180,365 etc
print('%0.2f min: Time for loop' % ((time.time() - start_time) / 60))

Rolling period: 14


MemoryError: Unable to allocate 3.48 GiB for an array with shape (8, 58327370) and data type float64

In [1]:
temp_df[temp_df['id']=='HOBBIES_1_001_CA_1_validation'].iloc[:-40]

NameError: name 'temp_df' is not defined

In [2]:
lags_roll = pd.read_pickle('lags_df_28.pkl')

In [5]:
lags_roll

Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,...,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
0,HOBBIES_1_008_CA_1_validation,1,12.0,,,,,,,,...,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1,2.0,,,,,,,,...,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1,0.0,,,,,,,,...,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1,0.0,,,,,,,,...,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46881672,FOODS_3_823_WI_3_validation,1941,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
46881673,FOODS_3_824_WI_3_validation,1941,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
46881674,FOODS_3_825_WI_3_validation,1941,,0.0,1.0,0.0,0.0,1.0,0.0,2.0,...,,,,,,,,,,
46881675,FOODS_3_826_WI_3_validation,1941,,3.0,1.0,3.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
