####  Workflow
1. Loading the data
2. Creating training and test sets of time series
3. Formatting data as JSON files and uploading to S3
4. Instantiating and training a DeepAR estimator
5. Deploying a model and creating a predictor
6. Evaluating the predictor

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [200]:
# Load the prepared data set from the local CSV file and preview the first rows.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which
# look like row indices written into the CSV by earlier exports - confirm.
df = pd.read_csv('prepared_data_all.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Store,DayOfWeek,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,ID,StoreType,Assortment,CompetitionDistance,Promo_2_active,Open_sunday
0,0,1,5,2015-07-31,5263,1,1,1,1,0,2,0,1270,0,0
1,1,2,5,2015-07-31,6064,1,1,1,1,1,0,0,570,1,0
2,2,3,5,2015-07-31,8314,1,1,1,1,2,0,0,14130,1,0
3,3,4,5,2015-07-31,13995,1,1,1,1,3,2,2,620,0,0
4,4,5,5,2015-07-31,4822,1,1,1,1,4,0,0,29910,0,0


In [201]:
# Drop the 'Unnamed: 0' column; none of the visible cells below reads it.
# NOTE(review): presumably a leftover index column from an earlier CSV export - confirm.
df = df.drop(columns=['Unnamed: 0'])

In [202]:
# Reverse the row order. The file is read newest-first (2015-07-31 on top), so
# after the reversal the earliest date (2013-01-01) comes first. Assuming the
# file is sorted newest-first throughout, every store's rows are then in
# chronological order, which the JSON export below relies on.
df = df.iloc[::-1]

In [203]:
# For the DeepAR algorithm, categorical features must be encoded as a 0-based
# sequence of non-negative integers. The store id is the category that
# distinguishes the time series, and in the raw data it starts at 1, so every
# store number is reduced by one.
# NOTE: this cell is not idempotent - re-running it shifts the ids again.
df['Store'] = df['Store'] - 1  # vectorised; no per-row lambda needed
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,ID,StoreType,Assortment,CompetitionDistance,Promo_2_active,Open_sunday
1017208,1114,2,2013-01-01,0,0,0,1,1,1017208,3,2,5350,0,0
1017207,1113,2,2013-01-01,0,0,0,1,1,1017207,0,2,870,0,0
1017206,1112,2,2013-01-01,0,0,0,1,1,1017206,0,2,9260,0,0
1017205,1111,2,2013-01-01,0,0,0,1,1,1017205,2,2,1880,0,0
1017204,1110,2,2013-01-01,0,0,0,1,1,1017204,0,0,1900,0,0


In [204]:
# Split into train/test BEFORE running any further transformation.
#
# Ranges as defined in the proposal (dates written DD.MM.YYYY):
#   train: 07.01.2013 - 07.06.2015  (126 weeks, 94.7% of the data)
#   test : 08.06.2015 - 26.07.2015  (7 weeks,    5.3% of the data)
#
# 'Date' still holds ISO 'YYYY-MM-DD' strings at this point, so plain string
# comparison orders the rows chronologically.
SERIES_START = '2013-01-07'  # 07.01.2013, first day of every series (inclusive);
                             # the former bound '2013-07-01' had day and month swapped
TRAIN_END = '2015-06-08'     # first day after the training range (exclusive)
TEST_END = '2015-07-27'      # first day after the test range (exclusive)

# TRAIN: SERIES_START up to and including 07.06.2015.
# .copy() makes the result an independent frame, so that adding/replacing
# columns later does not write into a slice of `df`.
df_train = df[(df['Date'] >= SERIES_START) & (df['Date'] < TRAIN_END)].copy()

# TEST: for this algorithm the test set contains the COMPLETE range of each
# time series (training range plus the 49 days to forecast). It therefore
# starts at SERIES_START as well and only ends later, on 26.07.2015.
df_test = df[(df['Date'] >= SERIES_START) & (df['Date'] < TEST_END)].copy()

In [205]:
# Parse the ISO date strings into real timestamps. `assign` builds a new frame
# instead of writing a column into a filtered slice of `df`, which is what
# triggers pandas' SettingWithCopyWarning. Column order is unchanged.
df_train = df_train.assign(Date=pd.to_datetime(df_train['Date']))
df_test = df_test.assign(Date=pd.to_datetime(df_test['Date']))

In [206]:
# Use the date as the index; the JSON export reads `index[0]` of each store's
# rows as the start timestamp of that series.
# NOTE: not idempotent - a second run fails because 'Date' is no longer a column.
df_train = df_train.set_index('Date')
df_test = df_test.set_index('Date')

#### Convert to JSON

In [207]:
# All distinct store ids of the training data, in order of first appearance.
# write_json_dataset() and instance_creator() iterate over this notebook-level
# list, so this cell has to run before either of them is called. Both cast each
# id with int() before it is serialised.
store_nr = list(df_train['Store'].unique())

In [208]:
# import json for formatting data and os for saving
import json
import os 

# transforming df

def write_json_dataset(df, filename, stores=None):
    """Write one JSON line per store to `filename` in the DeepAR input format.

    Every line contains the start timestamp of the series ("start"), the sales
    values ("target"), the store id as the only categorical feature ("cat")
    and one list per feature column ("dynamic_feat").

    :param df: frame indexed by date that has a 'Store' column, a 'Sales'
               column and the feature columns listed in `feature_columns`.
               The rows of each store must already be in chronological order,
               because the first index value is used as "start".
    :param filename: path of the JSON Lines file to create (overwritten if it
                     exists).
    :param stores: iterable of store ids to export. Defaults to the
                   notebook-level `store_nr` list (the previous behaviour).
    :return: None; the file is written and a confirmation is printed.
    """
    if stores is None:
        stores = store_nr  # backward compatible: fall back to the notebook global

    # order matters: it fixes the position of each feature inside "dynamic_feat"
    feature_columns = ('DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
                       'StoreType', 'Assortment', 'CompetitionDistance',
                       'Promo_2_active', 'Open_sunday')

    # NOTE(review): dates missing inside a store's range are not filled in here;
    # confirm every store has exactly one row per day, otherwise values shift
    # relative to "start".
    with open(filename, 'wb') as f:
        # for each of our time series, there is one JSON line
        for store in stores:
            df_store = df.loc[df['Store'] == store]
            obj = {
                "start": str(df_store.index[0]),
                "target": df_store['Sales'].tolist(),
                "cat": [int(store)],  # int(): numpy integers are not JSON serialisable
                "dynamic_feat": [df_store[col].tolist() for col in feature_columns],
            }
            f.write((json.dumps(obj) + '\n').encode('utf-8'))
    print(filename + ' saved.')

In [209]:
# Local directory that receives the JSON files before they are uploaded.
data_dir = 'json_rossmann'

# Create the directory if it does not exist yet. exist_ok=True replaces the
# separate existence check (and removes the gap between checking and creating).
os.makedirs(data_dir, exist_ok=True)

In [210]:
# local target paths of the two JSON Lines files inside data_dir
train_key, test_key = (os.path.join(data_dir, name) for name in ('train.json', 'test.json'))

# serialise the training frame first, then the full-range test frame
write_json_dataset(df=df_train, filename=train_key)
write_json_dataset(df=df_test, filename=test_key)

json_rossmann/train.json saved.
json_rossmann/test.json saved.


#### Store to S3

In [211]:
import boto3
import sagemaker
from sagemaker import get_execution_role

In [212]:
# session, role, bucket
# SageMaker session: used below for the S3 upload and passed to the estimator.
sagemaker_session = sagemaker.Session()
# Execution role as returned by the SageMaker SDK; passed to the estimator.
role = get_execution_role()

# Default bucket of the session; holds the uploaded data and the model output.
bucket = sagemaker_session.default_bucket()

In [213]:
# common S3 key prefix for everything this notebook stores
prefix = 'deepar-rossmann'

# distinct sub-prefixes so train and test data end up under separate keys
train_prefix, test_prefix = ('{}/{}'.format(prefix, part) for part in ('train', 'test'))

# upload the local JSON files; upload_data returns the resulting S3 location
train_path = sagemaker_session.upload_data(
    train_key,
    bucket=bucket,
    key_prefix=train_prefix,
)
test_path = sagemaker_session.upload_data(
    test_key,
    bucket=bucket,
    key_prefix=test_prefix,
)

In [214]:
# check locations
# Print the S3 locations returned by the upload so they can be verified.
print('Training data is stored in: '+ train_path)
print('Test data is stored in: '+ test_path)

Training data is stored in: s3://sagemaker-eu-central-1-395339144106/deepar-rossmann/train/train.json
Test data is stored in: s3://sagemaker-eu-central-1-395339144106/deepar-rossmann/test/test.json


Checking a preview of the JSON data in S3 through the AWS console shows that all the data is in the format described in the AWS documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/deepar.html

## Modelling DeepAR

#### 1. Setup

In [215]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the container image for the 'forecasting-deepar' algorithm in the
# region of the current boto3 session.
image_name = get_image_uri(boto3.Session().region_name, # get the region
                           'forecasting-deepar') # specify image


In [216]:
from sagemaker.estimator import Estimator

# dir to save model artifacts
# (an "output" folder below the notebook's prefix in the session bucket)
s3_output_path = "s3://{}/{}/output".format(bucket, prefix)

# instantiate a DeepAR estimator
# It runs the DeepAR image looked up above on a single ml.p2.xlarge training
# instance and writes its output to s3_output_path.
estimator = Estimator(sagemaker_session=sagemaker_session,
                      image_name=image_name,
                      role=role,
                      train_instance_count=1,
                      train_instance_type='ml.p2.xlarge',
                      output_path=s3_output_path
                      )

In [217]:
# the series are daily
freq = 'D'
# forecast horizon: the number of days in the test range (7 weeks x 7 days)
prediction_length = 49
# history window given to the model; shorter than the training range
# (intended: 126 weeks x 7 days).
# "a model can look further back in the time series than the value specified for context_length"
context_length = 490
# the maximum number of passes over the data when training
epochs = 50
# Further parameter explanation: https://docs.aws.amazon.com/forecast/latest/dg/aws-forecast-recipe-deeparplus.html

# every value is handed over as a string
hyperparameters = dict(
    epochs=str(epochs),
    time_freq=freq,
    prediction_length=str(prediction_length),
    context_length=str(context_length),
    num_cells="50",
    num_layers="3",
    mini_batch_size="128",
    learning_rate="0.001",
    early_stopping_patience="10",
)

In [218]:
# set the hyperparams
# Pass every entry of the `hyperparameters` dict to the estimator as a keyword argument.
estimator.set_hyperparameters(**hyperparameters)

#### 2. Training

In [219]:
%%time
# train and test channels
# Each channel name is mapped to the S3 location of the uploaded JSON file.
data_channels = {
    "train": train_path,
    "test": test_path
}

# fit the estimator
# Starts the training job with both channels; the job log is printed below the cell.
estimator.fit(inputs=data_channels)

2020-04-07 14:27:32 Starting - Starting the training job...
2020-04-07 14:27:34 Starting - Launching requested ML instances...
2020-04-07 14:28:29 Starting - Preparing the instances for training.........
2020-04-07 14:29:50 Downloading - Downloading input data...
2020-04-07 14:30:05 Training - Downloading the training image....[34mArguments: train[0m
[34m[04/07/2020 14:31:07 INFO 140569346963264] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m

[34m[04/07/2020 14:32:40 INFO 140569346963264] Epoch[2] Batch[5] avg_epoch_loss=6.056408[0m
[34m[04/07/2020 14:32:40 INFO 140569346963264] #quality_metric: host=algo-1, epoch=2, batch=5 train loss <loss>=6.05640816689[0m
[34m[04/07/2020 14:32:40 INFO 140569346963264] Epoch[2] Batch [5]#011Speed: 50.39 samples/sec#011loss=6.056408[0m
[34m[04/07/2020 14:32:52 INFO 140569346963264] Epoch[2] Batch[10] avg_epoch_loss=5.991790[0m
[34m[04/07/2020 14:32:52 INFO 140569346963264] #quality_metric: host=algo-1, epoch=2, batch=10 train loss <loss>=5.91424732208[0m
[34m[04/07/2020 14:32:52 INFO 140569346963264] Epoch[2] Batch [10]#011Speed: 53.76 samples/sec#011loss=5.914247[0m
[34m[04/07/2020 14:32:52 INFO 140569346963264] processed a total of 1339 examples[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 28067.676067352295, "sum": 28067.676067352295, "min": 28067.676067352295}}, "EndTime": 1586269972.048274, "Dimensions": {"Host": "algo-1", "Operation": "training", "A

[34m[04/07/2020 14:34:56 INFO 140569346963264] Epoch[7] Batch[5] avg_epoch_loss=6.059514[0m
[34m[04/07/2020 14:34:56 INFO 140569346963264] #quality_metric: host=algo-1, epoch=7, batch=5 train loss <loss>=6.05951364835[0m
[34m[04/07/2020 14:34:56 INFO 140569346963264] Epoch[7] Batch [5]#011Speed: 50.52 samples/sec#011loss=6.059514[0m
[34m[04/07/2020 14:35:06 INFO 140569346963264] processed a total of 1264 examples[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 25645.713090896606, "sum": 25645.713090896606, "min": 25645.713090896606}}, "EndTime": 1586270106.031816, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1586270080.385567}
[0m
[34m[04/07/2020 14:35:06 INFO 140569346963264] #throughput_metric: host=algo-1, train throughput=49.2867882102 records/second[0m
[34m[04/07/2020 14:35:06 INFO 140569346963264] #progress_metric: host=algo-1, completed 16 % of epochs[0m
[34m[04/07/2020 14:35:06 INFO 14056934696

[34m[04/07/2020 14:37:21 INFO 140569346963264] Epoch[12] Batch[10] avg_epoch_loss=6.061360[0m
[34m[04/07/2020 14:37:21 INFO 140569346963264] #quality_metric: host=algo-1, epoch=12, batch=10 train loss <loss>=6.14150800705[0m
[34m[04/07/2020 14:37:21 INFO 140569346963264] Epoch[12] Batch [10]#011Speed: 53.71 samples/sec#011loss=6.141508[0m
[34m[04/07/2020 14:37:21 INFO 140569346963264] processed a total of 1282 examples[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 28060.80198287964, "sum": 28060.80198287964, "min": 28060.80198287964}}, "EndTime": 1586270241.710082, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1586270213.648792}
[0m
[34m[04/07/2020 14:37:21 INFO 140569346963264] #throughput_metric: host=algo-1, train throughput=45.6863220713 records/second[0m
[34m[04/07/2020 14:37:21 INFO 140569346963264] #progress_metric: host=algo-1, completed 26 % of epochs[0m
[34m[04/07/2020 14:37:21 INFO 14056934

[34m[04/07/2020 14:39:28 INFO 140569346963264] Epoch[17] Batch[5] avg_epoch_loss=5.513202[0m
[34m[04/07/2020 14:39:28 INFO 140569346963264] #quality_metric: host=algo-1, epoch=17, batch=5 train loss <loss>=5.5132021904[0m
[34m[04/07/2020 14:39:28 INFO 140569346963264] Epoch[17] Batch [5]#011Speed: 50.41 samples/sec#011loss=5.513202[0m
[34m[04/07/2020 14:39:37 INFO 140569346963264] processed a total of 1252 examples[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 25674.915075302124, "sum": 25674.915075302124, "min": 25674.915075302124}}, "EndTime": 1586270377.554776, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1586270351.879386}
[0m
[34m[04/07/2020 14:39:37 INFO 140569346963264] #throughput_metric: host=algo-1, train throughput=48.76331615 records/second[0m
[34m[04/07/2020 14:39:37 INFO 140569346963264] #progress_metric: host=algo-1, completed 36 % of epochs[0m
[34m[04/07/2020 14:39:37 INFO 14056934696

[34m[04/07/2020 14:41:50 INFO 140569346963264] processed a total of 1262 examples[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 25632.642030715942, "sum": 25632.642030715942, "min": 25632.642030715942}}, "EndTime": 1586270510.83997, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1586270485.206783}
[0m
[34m[04/07/2020 14:41:50 INFO 140569346963264] #throughput_metric: host=algo-1, train throughput=49.2338517052 records/second[0m
[34m[04/07/2020 14:41:50 INFO 140569346963264] #progress_metric: host=algo-1, completed 46 % of epochs[0m
[34m[04/07/2020 14:41:50 INFO 140569346963264] #quality_metric: host=algo-1, epoch=22, train loss <loss>=5.61791858673[0m
[34m[04/07/2020 14:41:50 INFO 140569346963264] loss did not improve[0m
[34m[04/07/2020 14:41:54 INFO 140569346963264] Epoch[23] Batch[0] avg_epoch_loss=5.535528[0m
[34m[04/07/2020 14:41:54 INFO 140569346963264] #quality_metric: host=algo-1, epoch=23, batc

[34m[04/07/2020 14:44:24 INFO 140569346963264] Epoch[28] Batch[5] avg_epoch_loss=5.431944[0m
[34m[04/07/2020 14:44:24 INFO 140569346963264] #quality_metric: host=algo-1, epoch=28, batch=5 train loss <loss>=5.43194381396[0m
[34m[04/07/2020 14:44:24 INFO 140569346963264] Epoch[28] Batch [5]#011Speed: 50.46 samples/sec#011loss=5.431944[0m
[34m[04/07/2020 14:44:34 INFO 140569346963264] processed a total of 1263 examples[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 25617.027044296265, "sum": 25617.027044296265, "min": 25617.027044296265}}, "EndTime": 1586270674.429149, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1586270648.811634}
[0m
[34m[04/07/2020 14:44:34 INFO 140569346963264] #throughput_metric: host=algo-1, train throughput=49.3029335091 records/second[0m
[34m[04/07/2020 14:44:34 INFO 140569346963264] #progress_metric: host=algo-1, completed 58 % of epochs[0m
[34m[04/07/2020 14:44:34 INFO 14056934

#### 3. create predictor (by deploying estimator) 

In [221]:
%%time

# create a predictor
# Deploys the trained estimator on one ml.p2.xlarge instance and keeps the
# returned predictor for the requests sent further below.
# NOTE(review): presumably the endpoint keeps running (and billing) until it is
# deleted explicitly - confirm and clean up when finished.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.p2.xlarge',
    content_type="application/json" # specify that it will accept/produce JSON
)

Using already existing model: forecasting-deepar-2020-04-07-14-27-32-842


-------------!CPU times: user 196 ms, sys: 22.5 ms, total: 219 ms
Wall time: 6min 32s


#### 4. generating predictions

In [248]:
def instance_creator(ts, prediction_length=49, stores=None):
    '''Build the request instances (one dict per store) for a DeepAR prediction.

    For every store the last `prediction_length` sales values are removed from
    the target, because these are the values the model is asked to predict.
    The dynamic features are kept at full length, i.e. they also cover the
    prediction window, as required by the AWS documentation.

    :ts: frame indexed by date with a 'Store' column, a 'Sales' column and the
         feature columns listed in `feature_columns`; each store's rows must be
         in chronological order.
    :prediction_length: number of trailing target values to cut off
                        (default 49, the forecast horizon used in this notebook).
    :stores: iterable of store ids to include. Defaults to the notebook-level
             `store_nr` list (the previous behaviour).
    :return: list of JSON-serialisable dicts with the keys "start", "target",
             "cat" and "dynamic_feat".
    '''
    if stores is None:
        stores = store_nr  # backward compatible: fall back to the notebook global

    # order matters: it fixes the position of each feature inside "dynamic_feat"
    feature_columns = ('DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
                       'StoreType', 'Assortment', 'CompetitionDistance',
                       'Promo_2_active', 'Open_sunday')

    instances = []
    for store in stores:
        df_store = ts.loc[ts['Store'] == store]
        sales = df_store['Sales'].tolist()
        # Keep everything except the final `prediction_length` observations.
        # An explicit end index (instead of a negative slice) also behaves
        # correctly for prediction_length == 0.
        target = sales[:max(len(sales) - prediction_length, 0)]
        instances.append({
            "start": str(df_store.index[0]),
            "target": target,
            "cat": [int(store)],  # int(): numpy integers are not JSON serialisable
            "dynamic_feat": [df_store[col].tolist() for col in feature_columns],
        })
    return instances

In [249]:
def json_predictor_input(instances, num_samples=49, quantiles=['0.1', '0.5', '0.9']):
    '''Wrap prediction instances and an output configuration into one request body.
       :instances: list of JSON-serialisable instance dicts, one per time series.
       :num_samples: value sent as "num_samples" in the configuration.
       :quantiles: list of quantiles (as strings) to return in the predicted output.
       :return: the request as UTF-8 encoded JSON (bytes).
       '''
    # the request consists of the instances plus a configuration that names
    # the desired output types (mean and quantiles) and the quantiles themselves
    payload = {
        "instances": instances,
        "configuration": {
            "num_samples": num_samples,
            "output_types": ["mean", "quantiles"],
            "quantiles": quantiles,
        },
    }
    return json.dumps(payload).encode('utf-8')

In [250]:
# the complete test frame is the input; instance_creator() itself cuts the
# values to be predicted off each target series
input_ts = df_test

# one formatted request instance per store
ts_instances = instance_creator(ts=input_ts)

In [251]:
# Serialise all instances plus the default output configuration into a single
# JSON request body (bytes).
json_input_ts = json_predictor_input(ts_instances)

In [252]:
# get the prediction from the predictor
# NOTE(review): in the recorded run this call failed with ConnectionClosedError.
# The request carries the full series of every store in one payload, which
# presumably exceeds what the endpoint accepts or can answer in time - consider
# sending the instances in smaller batches (to be confirmed against the
# SageMaker endpoint limits).
json_prediction = predictor.predict(json_input_ts)

ConnectionClosedError: Connection was closed before we received a valid response from endpoint URL: "https://runtime.sagemaker.eu-central-1.amazonaws.com/endpoints/forecasting-deepar-2020-04-07-14-27-32-842/invocations".