In [2]:
from expt_settings.configs import ExperimentConfig

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import datetime
import gc
import glob
import os
import shutil
import sys

import numpy as np
import pandas as pd
import pyunpack
import wget

In [4]:
os.getcwd()

'c:\\TFT\\TFT_code'

# Unzip

In [6]:
def unzip(zip_path, output_file, data_folder):
  """Unzips files and checks successful completion."""

  print('Unzipping file: {}'.format(zip_path))
  pyunpack.Archive(zip_path).extractall(data_folder)

  # Checks if unzip was successful
  if not os.path.exists(output_file):
    raise ValueError(
        'Error in unzipping process! {} not found.'.format(output_file))

In [11]:
url = 'https://www.kaggle.com/c/favorita-grocery-sales-forecasting/data'

In [7]:
data_folder ='0927_dataset'

In [8]:
# Save manual download to root folder to avoid deleting when re-processing.
zip_file = os.path.join(data_folder,
                          'favorita-grocery-sales-forecasting.zip')

In [9]:
zip_file

'0927_dataset\\favorita-grocery-sales-forecasting.zip'

In [13]:
if not os.path.exists(zip_file):
    raise ValueError(
        'Favorita zip file not found in {}!'.format(zip_file) +
        ' Please manually download data from Kaggle @ {}'.format(url))

In [20]:
# Unpack main zip file
outputs_file = os.path.join(data_folder, 'train.csv.7z')
unzip(zip_file, outputs_file, data_folder)

Unzipping file: 0927_dataset\favorita-grocery-sales-forecasting.zip


In [21]:
# Unpack individually zipped files
for file in glob.glob(os.path.join(data_folder, '*.7z')):
    csv_file = file.replace('.7z', '')
    unzip(file, csv_file, data_folder)
    print('Unzipping complete, commencing data processing...')

Unzipping file: 0927_dataset\holidays_events.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\items.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\oil.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\sample_submission.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\stores.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\test.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\train.csv.7z
Unzipping complete, commencing data processing...
Unzipping file: 0927_dataset\transactions.csv.7z
Unzipping complete, commencing data processing...


In [22]:
print('Unzipping complete, commencing data processing...')

Unzipping complete, commencing data processing...


# Sampling

In [23]:
# Extract only a subset of data to save/process for efficiency
start_date = datetime.datetime(2015, 1, 1)
end_date = datetime.datetime(2016, 6, 1)

print('Regenerating data...')

Regenerating data...


In [24]:
# load temporal data
temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [25]:
temporal.shape

(125497040, 5)

In [26]:
store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0)
oil = pd.read_csv(
      os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0]
holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv'))
items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0)
transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv'))

In [27]:
# Take first 6 months of data
temporal['date'] = pd.to_datetime(temporal['date'])

In [28]:
# Filter dates to reduce storage space requirements
if start_date is not None:
    temporal = temporal[(temporal['date'] >= start_date)]
if end_date is not None:
    temporal = temporal[(temporal['date'] < end_date)]
dates = temporal['date'].unique()

In [33]:
dates

array(['2015-01-01T00:00:00.000000000', '2015-01-02T00:00:00.000000000',
       '2015-01-03T00:00:00.000000000', '2015-01-04T00:00:00.000000000',
       '2015-01-05T00:00:00.000000000', '2015-01-06T00:00:00.000000000',
       '2015-01-07T00:00:00.000000000', '2015-01-08T00:00:00.000000000',
       '2015-01-09T00:00:00.000000000', '2015-01-10T00:00:00.000000000',
       '2015-01-11T00:00:00.000000000', '2015-01-12T00:00:00.000000000',
       '2015-01-13T00:00:00.000000000', '2015-01-14T00:00:00.000000000',
       '2015-01-15T00:00:00.000000000', '2015-01-16T00:00:00.000000000',
       '2015-01-17T00:00:00.000000000', '2015-01-18T00:00:00.000000000',
       '2015-01-19T00:00:00.000000000', '2015-01-20T00:00:00.000000000',
       '2015-01-21T00:00:00.000000000', '2015-01-22T00:00:00.000000000',
       '2015-01-23T00:00:00.000000000', '2015-01-24T00:00:00.000000000',
       '2015-01-25T00:00:00.000000000', '2015-01-26T00:00:00.000000000',
       '2015-01-27T00:00:00.000000000', '2015-01-28

In [29]:
temporal['traj_id'] = temporal['store_nbr'].apply(
      str) + '_' + temporal['item_nbr'].apply(str)
traj_id_list = temporal['traj_id'].head(10).to_list()

In [30]:
traj_id_list

['25_103665',
 '25_105575',
 '25_108634',
 '25_108698',
 '25_108786',
 '25_108831',
 '25_108862',
 '25_108952',
 '25_111397',
 '25_112830']

In [31]:
# Remove all IDs with negative returns
print('Removing returns data')
min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min()
valid_ids = set(min_returns[min_returns >= 0].index)
selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids)
new_temporal = temporal[selector].copy()
del temporal
gc.collect()
temporal = new_temporal
temporal['open'] = 1

Removing returns data


# Export raw dataset

In [None]:
temporal.to_csv('favorita_original.csv',index=False)

# Resampling

In [32]:
# Resampling
print('Resampling to regular grid')
resampled_dfs = []
for traj_id, raw_sub_df in temporal.groupby('traj_id'):
    print('Resampling', traj_id)
    sub_df = raw_sub_df.set_index('date', drop=True).copy()
    sub_df = sub_df.resample('1d').last()
    sub_df['date'] = sub_df.index
    sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \
        = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill')
    sub_df['open'] = sub_df['open'].fillna(
        0)  # flag where sales data is unknown
    sub_df['log_sales'] = np.log(sub_df['unit_sales'])

    resampled_dfs.append(sub_df.reset_index(drop=True))

Resampling to regular grid
Resampling 10_1001305
Resampling 10_1003679
Resampling 10_1004550
Resampling 10_1004551
Resampling 10_1005458
Resampling 10_1005461
Resampling 10_1005463
Resampling 10_1005465
Resampling 10_1009512
Resampling 10_1009539
Resampling 10_1012473
Resampling 10_1012787
Resampling 10_1012788
Resampling 10_1014865
Resampling 10_1016067
Resampling 10_1017349
Resampling 10_1021281
Resampling 10_1022053
Resampling 10_1024975
Resampling 10_1024976
Resampling 10_1028589
Resampling 10_1033395
Resampling 10_103501
Resampling 10_103520
Resampling 10_1036320
Resampling 10_1036689
Resampling 10_1037654
Resampling 10_1037845
Resampling 10_1037846
Resampling 10_1037857
Resampling 10_1040170
Resampling 10_1046272
Resampling 10_1047394
Resampling 10_1047395
Resampling 10_1047396
Resampling 10_1047674
Resampling 10_1047679
Resampling 10_1047680
Resampling 10_1047681
Resampling 10_1047682
Resampling 10_1047685
Resampling 10_1047686
Resampling 10_1047690
Resampling 10_1047695
Resampl

# Add new variables

In [34]:
new_temporal = pd.concat(resampled_dfs, axis=0)
del temporal
gc.collect()
temporal = new_temporal

MemoryError: Unable to allocate 443. MiB for an array with shape (1, 58106641) and data type int64

In [84]:
# Oil (based on 'date')
print('Adding oil')
oil.name = 'oil'
oil.index = pd.to_datetime(oil.index)
temporal = temporal.join( 
      oil.fillna(method='ffill'), on='date', how='left')  # oil.loc[dates].fillna(method='ffill')
temporal['oil'] = temporal['oil'].fillna(-1)

Adding oil


In [85]:
# store (based on 'store_nbr')
print('Adding store info')
temporal = temporal.join(store_info, on='store_nbr', how='left')

Adding store info


In [86]:
print('Adding item info')
temporal = temporal.join(items, on='item_nbr', how='left')

transactions['date'] = pd.to_datetime(transactions['date'])
temporal = temporal.merge(
      transactions,
      left_on=['date', 'store_nbr'],
      right_on=['date', 'store_nbr'],
      how='left')
temporal['transactions'] = temporal['transactions'].fillna(-1)

Adding item info


In [87]:
# Additional date info
temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek
temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day
temporal['month'] = pd.to_datetime(temporal['date'].values).month

In [88]:
# Add holiday info
print('Adding holidays')
holiday_subset = holidays[holidays['transferred'].apply(
      lambda x: not x)].copy()
holiday_subset.columns = [
      s if s != 'type' else 'holiday_type' for s in holiday_subset.columns
  ]
holiday_subset['date'] = pd.to_datetime(holiday_subset['date'])
local_holidays = holiday_subset[holiday_subset['locale'] == 'Local']
regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional']
national_holidays = holiday_subset[holiday_subset['locale'] == 'National']

temporal['national_hol'] = temporal.merge(
      national_holidays, left_on=['date'], right_on=['date'],
      how='left')['description'].fillna('')
temporal['regional_hol'] = temporal.merge(
      regional_holidays,
      left_on=['state', 'date'],
      right_on=['locale_name', 'date'],
      how='left')['description'].fillna('')
temporal['local_hol'] = temporal.merge(
      local_holidays,
      left_on=['city', 'date'],
      right_on=['locale_name', 'date'],
      how='left')['description'].fillna('')

temporal.sort_values('unique_id', inplace=True)

Adding holidays


In [97]:
temporal[temporal['traj_id']=='25_103665'].head()
print(temporal[temporal['traj_id']=='25_103665'].shape)

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,traj_id,unique_id,open,date,log_sales,oil,...,family,class,perishable,transactions,day_of_week,day_of_month,month,national_hol,regional_hol,local_hol
0,1.0,103520.0,2.0,,1_103520,1_103520_2013-01-04 00:00:00,1.0,2013-01-04,0.693147,93.12,...,GROCERY I,1028,0,1863.0,4,4,1,,,
1,1.0,103520.0,3.0,,1_103520,1_103520_2013-01-05 00:00:00,1.0,2013-01-05,1.098612,-1.00,...,GROCERY I,1028,0,1509.0,5,5,1,Recupero puente Navidad,,
3,1.0,103520.0,2.0,,1_103520,1_103520_2013-01-07 00:00:00,1.0,2013-01-07,0.693147,93.20,...,GROCERY I,1028,0,1807.0,0,7,1,,,
4,1.0,103520.0,6.0,,1_103520,1_103520_2013-01-08 00:00:00,1.0,2013-01-08,1.791759,93.21,...,GROCERY I,1028,0,1869.0,1,8,1,,,
5,1.0,103520.0,3.0,,1_103520,1_103520_2013-01-09 00:00:00,1.0,2013-01-09,1.098612,93.08,...,GROCERY I,1028,0,1910.0,2,9,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,1.0,103520.0,2.0,False,1_103520,1_103520_2015-03-25 00:00:00,1.0,2015-03-25,0.693147,48.75,...,GROCERY I,1028,0,1890.0,2,25,3,,,
811,1.0,103520.0,2.0,False,1_103520,1_103520_2015-03-26 00:00:00,1.0,2015-03-26,0.693147,51.41,...,GROCERY I,1028,0,1607.0,3,26,3,,,
813,1.0,103520.0,2.0,False,1_103520,1_103520_2015-03-28 00:00:00,1.0,2015-03-28,0.693147,-1.00,...,GROCERY I,1028,0,1279.0,5,28,3,,,
815,1.0,103520.0,1.0,False,1_103520,1_103520_2015-03-30 00:00:00,1.0,2015-03-30,0.000000,48.66,...,GROCERY I,1028,0,1753.0,0,30,3,,,


In [91]:
temporal['unit_sales'].isna().sum()

1007

# Export Dataset

In [98]:
data_csv_path = '0927_favorita_out.csv'
print('Saving processed file to {}'.format(data_csv_path))
temporal.to_csv(data_csv_path)

Saving processed file to favorita_out.csv


In [100]:
experiment_names = 'favorita'
parser = argparse.ArgumentParser(description='Data download configs')
parser.add_argument(
        'expt_name',
        metavar='e',
        type=str,
        nargs='?',
        choices=experiment_names,
        help='Experiment Name. Default={}'.format(','.join(experiment_names)))
parser.add_argument(
        'output_folder',
        metavar='f',
        type=str,
        nargs='?',
        default='.',
        help='Path to folder for data download')
parser.add_argument(
        'force_download',
        metavar='r',
        type=str,
        nargs='?',
        choices=['yes', 'no'],
        default='no',
        help='Whether to re-run data download')
args = parser.parse_known_args()[0]

In [102]:
root_folder = None if args.output_folder == '.' else args.output_folder

In [104]:
args

Namespace(expt_name=None, output_folder='.', force_download='no')