In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_log_error

In [2]:
# Load the training table keyed by its `id` column, keep the five modelling
# columns in a fixed order, and turn the `date` strings into datetimes.
train_df = (
    pd.read_csv('../data/store-sales-time-series/train.csv', index_col='id')
    .loc[:, ['date', 'store_nbr', 'family', 'onpromotion', 'sales']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
train_df

Unnamed: 0_level_0,date,store_nbr,family,onpromotion,sales
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0,0.000
1,2013-01-01,1,BABY CARE,0,0.000
2,2013-01-01,1,BEAUTY,0,0.000
3,2013-01-01,1,BEVERAGES,0,0.000
4,2013-01-01,1,BOOKS,0,0.000
...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,0,438.133
3000884,2017-08-15,9,PREPARED FOODS,1,154.553
3000885,2017-08-15,9,PRODUCE,148,2419.729
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,8,121.000


In [3]:
# `timestamp` is the whole-day offset of each row from the earliest date in
# the training data (so the first day is 0).
min_date = train_df['date'].min()

# Add the offset, put it right after `date`, and order the rows so that every
# (store_nbr, family) series is contiguous and chronological.
train_df = (
    train_df
    .assign(timestamp=lambda frame: (frame['date'] - min_date).dt.days)
    .loc[:, ['date', 'timestamp', 'store_nbr', 'family', 'onpromotion', 'sales']]
    .sort_values(by=['store_nbr', 'family', 'timestamp'])
)
train_df

Unnamed: 0_level_0,date,timestamp,store_nbr,family,onpromotion,sales
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2013-01-01,0,1,AUTOMOTIVE,0,0.0
1782,2013-01-02,1,1,AUTOMOTIVE,0,2.0
3564,2013-01-03,2,1,AUTOMOTIVE,0,3.0
5346,2013-01-04,3,1,AUTOMOTIVE,0,3.0
7128,2013-01-05,4,1,AUTOMOTIVE,0,5.0
...,...,...,...,...,...,...
2993627,2017-08-11,1683,54,SEAFOOD,0,0.0
2995409,2017-08-12,1684,54,SEAFOOD,1,1.0
2997191,2017-08-13,1685,54,SEAFOOD,0,2.0
2998973,2017-08-14,1686,54,SEAFOOD,0,0.0


In [4]:
# `sales_lag_1` is the previous observation of `sales` within the same
# (store_nbr, family) series.
#
# The rows are put in `timestamp` order inside the lag expression itself, so
# the feature is correct whatever order `train_df` happens to be in when this
# cell runs; the shifted values are then aligned back onto the frame through
# the `id` index (assumes `id` is unique -- TODO confirm).
#
# NOTE(review): shift(1) takes the previous *row* of the series, not the
# previous calendar day. If a series has missing dates, the lag silently spans
# the gap -- confirm that is acceptable for the model.
train_df = train_df.assign(
    sales_lag_1=lambda frame: (
        frame
        .sort_values(by='timestamp')
        .groupby(['store_nbr', 'family'])['sales']
        .shift(1)
    )
).loc[:, ['date', 'timestamp', 'store_nbr', 'family', 'onpromotion', 'sales_lag_1', 'sales']]
train_df

Unnamed: 0_level_0,date,timestamp,store_nbr,family,onpromotion,sales_lag_1,sales
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2013-01-01,0,1,AUTOMOTIVE,0,,0.0
1782,2013-01-02,1,1,AUTOMOTIVE,0,0.0,2.0
3564,2013-01-03,2,1,AUTOMOTIVE,0,2.0,3.0
5346,2013-01-04,3,1,AUTOMOTIVE,0,3.0,3.0
7128,2013-01-05,4,1,AUTOMOTIVE,0,3.0,5.0
...,...,...,...,...,...,...,...
2993627,2017-08-11,1683,54,SEAFOOD,0,2.0,0.0
2995409,2017-08-12,1684,54,SEAFOOD,1,0.0,1.0
2997191,2017-08-13,1685,54,SEAFOOD,0,1.0,2.0
2998973,2017-08-14,1686,54,SEAFOOD,0,2.0,0.0


In [5]:
# Load the holidays/events table and parse its `date` column into datetimes.
holidays_events_df = pd.read_csv(
    '../data/store-sales-time-series/holidays_events.csv'
).assign(date=lambda frame: pd.to_datetime(frame['date']))
holidays_events_df

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [8]:
# Distinct values of the `type` column, in order of first appearance.
holiday_types = pd.unique(holidays_events_df['type'])
holiday_types

array(['Holiday', 'Transfer', 'Additional', 'Bridge', 'Work Day', 'Event'],
      dtype=object)

In [6]:
# Load the per-store table (one row per `store_nbr`, as shown in the output).
stores_df = pd.read_csv(
    filepath_or_buffer='../data/store-sales-time-series/stores.csv',
)
stores_df

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [7]:
# Load the oil price series (`dcoilwtico`) and parse its `date` column.
# NOTE(review): the rendered output shows a missing value in the first row and
# a jump between consecutive dates (2013-01-04 -> 2013-01-07), so this series
# presumably needs filling/reindexing before it is joined onto daily sales.
oil_df = pd.read_csv(
    '../data/store-sales-time-series/oil.csv'
).assign(date=lambda frame: pd.to_datetime(frame['date']))
oil_df

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96
