In [6]:
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import boto3
import sagemaker
from sqlalchemy import create_engine
from sagemaker import get_execution_role
np.random.seed(1)

from redshift import RedshiftConnection,get_secret
import sys



secret = get_secret()
username = secret['username']
password = secret['password']
engine = secret['engine']
host = secret['host']
port = secret['port']
redshift = RedshiftConnection(username,password,engine,host,port)
print("Redshift Connection Established")
#Truncate table with datasource as model
data = RedshiftConnection.read_redshift_data_booking(redshift)
print(data)

Redshift Connection Established
     data_category data_source model_name travel_date  year quarter month  \
0           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
1           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
2           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
3           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
4           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
...            ...         ...        ...         ...   ...     ...   ...   
2399      FORECAST       MODEL         NA   326202212  2022      Q1   MAR   
2400      FORECAST       MODEL         NA   328202218  2022      Q1   MAR   
2401      FORECAST       MODEL         NA   328202218  2022      Q1   MAR   
2402      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   
2403      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   

     week       day  hour  ... source_wind 

In [36]:
data2 = data[['date', 'number_of_booking']]
data2.columns = ['start', 'target']
data2

Unnamed: 0,start,target
0,2021-01-03 06:00:00,25
1,2021-01-03 06:00:00,50
2,2021-01-03 06:00:00,100
3,2021-01-03 06:00:00,292
4,2021-01-05 12:00:00,30
...,...,...
187,2021-03-28 18:00:00,289
188,2021-03-29 00:00:00,28
189,2021-03-29 00:00:00,57
190,2021-03-29 00:00:00,93


In [38]:
data2.set_index('start', inplace=True)
data2

Unnamed: 0_level_0,target
start,Unnamed: 1_level_1
2021-01-03 06:00:00,25
2021-01-03 06:00:00,50
2021-01-03 06:00:00,100
2021-01-03 06:00:00,292
2021-01-05 12:00:00,30
...,...
2021-03-28 18:00:00,289
2021-03-29 00:00:00,28
2021-03-29 00:00:00,57
2021-03-29 00:00:00,93


In [41]:
start_dataset = pd.Timestamp("2021-01-03 06:00:00", freq=freq)
end_training = pd.Timestamp("2021-03-08 00:00:00", freq=freq)

In [35]:
train_data = data2[:len(data2)-50]
test_data = data2[len(data2)-50:]

train_json = train_data.to_json(orient='records', lines=True)
print(train_json)
test_json = test_data.to_json(orient='records', lines=True)
print(test_json)

{"start":1609653600000,"target":25}
{"start":1609653600000,"target":50}
{"start":1609653600000,"target":100}
{"start":1609653600000,"target":292}
{"start":1609848000000,"target":30}
{"start":1609848000000,"target":45}
{"start":1609848000000,"target":150}
{"start":1609848000000,"target":242}
{"start":1610042400000,"target":24}
{"start":1610042400000,"target":58}
{"start":1610042400000,"target":125}
{"start":1610042400000,"target":260}
{"start":1610064000000,"target":25}
{"start":1610064000000,"target":50}
{"start":1610064000000,"target":100}
{"start":1610064000000,"target":292}
{"start":1610258400000,"target":25}
{"start":1610258400000,"target":50}
{"start":1610258400000,"target":100}
{"start":1610258400000,"target":292}
{"start":1610452800000,"target":30}
{"start":1610452800000,"target":45}
{"start":1610452800000,"target":150}
{"start":1610452800000,"target":242}
{"start":1610647200000,"target":26}
{"start":1610647200000,"target":55}
{"start":1610647200000,"target":126}
{"start":161064

In [23]:
from sagemaker.amazon.amazon_estimator import get_image_uri

image_uri = get_image_uri(boto3.Session().region_name, "forecasting-deepar")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [27]:
s3_output_path = "airline-test/output/DeepAR_output/"
import sagemaker

session = sagemaker.Session()

region = session.boto_region_name

estimator = sagemaker.estimator.Estimator(
    sagemaker_session=session,
    image_uri=image_uri,
    image_name=sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest"),
    role=sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='deepar-poc1',
    output_path=f"s3://{s3_output_path}")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [28]:
freq = "H"
prediction_length = 48
context_length = 72
hyperparameters = {
    "time_freq": freq,
    "context_length": str(context_length),
    "prediction_length": str(prediction_length),
    "num_cells": "40",
    "num_layers": "3",
    "likelihood": "gaussian",
    "epochs": "20",
    "mini_batch_size": "32",
    "learning_rate": "0.001",
    "dropout_rate": "0.05",
    "early_stopping_patience": "10",
}

In [29]:
estimator.set_hyperparameters(**hyperparameters)

In [None]:
bucket = "airline-test"
sagemaker_session = sagemaker.Session()
role = get_execution_role()
FILE_TRAIN = "train.json"
s3 = boto3.client("s3")
s3.upload_file(FILE_TRAIN, bucket,"/data/train/" + FILE_TRAIN)

In [None]:
s3.upload_file(FILE_TRAIN, bucket,"/data/train/" + FILE_TRAIN)

In [None]:

s3_data_path = "airline-test/train-data.jsonl"

data_channels = {"train": f"s3://{s3_data_path}/train/", "test": f"s3://{s3_data_path}/test/"}

estimator.fit(inputs=data_channels)

############################################################

*************************First Approach****************************

#####################################################################

In [7]:
data1 = data.set_index('date')
data1 = data1[['year', 'quarter',
       'month', 'week', 'day', 'hour', 'origin', 'destination',
       'flight', 'capacity', 'price_type', 'promotion', 'roundtrip_or_oneway',
       'customer_type', 'product_type', 'location_lifestyle',
       'location_economical_status', 'location_employment_status',
       'location_event', 'source_wind', 'source_humidity',
       'source_precipitation', 'destination_wind', 'destination_humidity',
       'destination_precipitation', 'number_of_booking']]
data1

Unnamed: 0_level_0,year,quarter,month,week,day,hour,origin,destination,flight,capacity,...,location_economical_status,location_employment_status,location_event,source_wind,source_humidity,source_precipitation,destination_wind,destination_humidity,destination_precipitation,number_of_booking
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,10.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,5.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,10.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,20.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-26 12:00:00,2022,Q1,MAR,4,TUESDAY,12,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,
2022-03-28 18:00:00,2022,Q1,MAR,4,THURSDAY,18,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,
2022-03-28 18:00:00,2022,Q1,MAR,4,THURSDAY,18,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,
2022-03-30 00:00:00,2022,Q1,MAR,4,FRIDAY,24,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,


In [11]:
data1.dtypes

year                          object
quarter                       object
month                         object
week                          object
day                           object
hour                           int64
origin                        object
destination                   object
flight                        object
capacity                       int64
price_type                    object
promotion                     object
roundtrip_or_oneway           object
customer_type                 object
product_type                  object
location_lifestyle            object
location_economical_status    object
location_employment_status    object
location_event                object
source_wind                   object
source_humidity               object
source_precipitation          object
destination_wind              object
destination_humidity          object
destination_precipitation     object
number_of_booking              int64
dtype: object

In [None]:
data1['year'] = data1['year'].astype(int)
data1['quarter'] = data1['quarter'].astype('category')
data1['month'] = data1['month'].astype('category')
data1['week'] = data1['week'].astype(int)
data1['day'] = data1['day'].astype('category')
data1['origin'] = data1['origin'].astype('category')
data1['destination'] = data1['destination'].astype('category')
data1['flight'] = data1['flight'].astype('category')
data1['price_type'] = data1['price_type'].astype('category')
data1['promotion'] = data1['promotion'].astype('category')
data1['roundtrip_or_oneway'] = data1['roundtrip_or_oneway'].astype('category')
data1['customer_type'] = data1['customer_type'].astype('category')
data1['product_type'] = data1['product_type'].astype('category')
data1['location_lifestyle'] = data1['location_lifestyle'].astype('category')
data1['location_economical_status'] = data1['location_economical_status'].astype('category')
data1['location_employment_status'] = data1['location_employment_status'].astype('category')
data1['location_event'] = data1['location_event'].astype('category')
data1['day'] = data1['day'].astype('category')
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

In [15]:
data1['source_wind'].str.split()

['0.45', 'MPS']

In [4]:
data1.columns

Index(['data_category', 'data_source', 'model_name', 'year', 'quarter',
       'month', 'week', 'day', 'hour', 'region', 'origin', 'destination',
       'flight', 'capacity', 'price_type', 'promotion', 'roundtrip_or_oneway',
       'customer_type', 'product_type', 'location_lifestyle',
       'location_economical_status', 'location_employment_status',
       'location_event', 'source_wind', 'source_humidity',
       'source_precipitation', 'destination_wind', 'destination_humidity',
       'destination_precipitation', 'number_of_booking'],
      dtype='object')

In [43]:
from __future__ import print_function

%matplotlib inline

import sys
import zipfile
from dateutil.parser import parse
import json
from random import shuffle
import random
import datetime
import os

import boto3
import s3fs
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import IntSlider, FloatSlider, Checkbox

In [44]:
# set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

In [45]:
sagemaker_session = sagemaker.Session()

In [46]:
s3_bucket = sagemaker.Session().default_bucket()  # replace with an existing bucket if needed
s3_prefix = "deepar-demo-notebook"  # prefix used for all data stored within the bucket

role = sagemaker.get_execution_role()  # IAM role to use by SageMaker

In [47]:
region = sagemaker_session.boto_region_name

s3_data_path = "s3://{}/{}/data".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output".format(s3_bucket, s3_prefix)

In [48]:
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


In [None]:
DATA_HOST = "sagemaker-sample-files"
DATA_PATH = "datasets/timeseries/uci_electricity/"
ARCHIVE_NAME = "LD2011_2014.txt.zip"
FILE_NAME = ARCHIVE_NAME[:-4]

In [None]:
s3_client = boto3.client("s3")

if not os.path.isfile(FILE_NAME):
    print("downloading dataset (258MB), can take a few minutes depending on your connection")
    s3_client.download_file(DATA_HOST, DATA_PATH + ARCHIVE_NAME, ARCHIVE_NAME)

    print("\nextracting data archive")
    zip_ref = zipfile.ZipFile(ARCHIVE_NAME, "r")
    zip_ref.extractall("./")
    zip_ref.close()
else:
    print("File found skipping download")

In [1]:
from __future__ import print_function

%matplotlib inline

import sys
import zipfile
from dateutil.parser import parse
import json
from random import shuffle
import random
import datetime
import os

import boto3
import s3fs
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import IntSlider, FloatSlider, Checkbox

In [3]:
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import boto3
import sagemaker
from sqlalchemy import create_engine
from sagemaker import get_execution_role
np.random.seed(1)

from redshift import RedshiftConnection,get_secret
import sys



secret = get_secret()
username = secret['username']
password = secret['password']
engine = secret['engine']
host = secret['host']
port = secret['port']
redshift = RedshiftConnection(username,password,engine,host,port)
print("Redshift Connection Established")
#Truncate table with datasource as model
data = RedshiftConnection.read_redshift_data_booking(redshift)
print(data)

Redshift Connection Established
     data_category data_source model_name travel_date  year quarter month  \
0           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
1           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
2           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
3           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
4           ACTUAL         ETL         NA   103202106  2021      Q1   JAN   
...            ...         ...        ...         ...   ...     ...   ...   
2299      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   
2300      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   
2301      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   
2302      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   
2303      FORECAST       MODEL         NA   329202224  2022      Q1   MAR   

     week     day  hour  ... source_wind so

In [4]:
data.columns

Index(['data_category', 'data_source', 'model_name', 'travel_date', 'year',
       'quarter', 'month', 'week', 'day', 'hour', 'region', 'origin',
       'destination', 'flight', 'capacity', 'price_type', 'promotion',
       'roundtrip_or_oneway', 'customer_type', 'product_type',
       'location_lifestyle', 'location_economical_status',
       'location_employment_status', 'location_event', 'source_wind',
       'source_humidity', 'source_precipitation', 'destination_wind',
       'destination_humidity', 'destination_precipitation',
       'number_of_booking', 'date', 'model_accuracy', 'accuracy_probability'],
      dtype='object')

In [5]:
# set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

In [6]:
sagemaker_session = sagemaker.Session()

In [7]:
s3_bucket = sagemaker.Session().default_bucket()  # replace with an existing bucket if needed
s3_prefix = "deepar-demo-notebook"  # prefix used for all data stored within the bucket

role = sagemaker.get_execution_role()  # IAM role to use by SageMaker

In [8]:
region = sagemaker_session.boto_region_name

s3_data_path = "s3://{}/{}/data".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output".format(s3_bucket, s3_prefix)

In [9]:
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


In [20]:
data.columns

Index(['data_category', 'data_source', 'model_name', 'travel_date', 'year',
       'quarter', 'month', 'week', 'day', 'hour', 'region', 'origin',
       'destination', 'flight', 'capacity', 'price_type', 'promotion',
       'roundtrip_or_oneway', 'customer_type', 'product_type',
       'location_lifestyle', 'location_economical_status',
       'location_employment_status', 'location_event', 'source_wind',
       'source_humidity', 'source_precipitation', 'destination_wind',
       'destination_humidity', 'destination_precipitation',
       'number_of_booking', 'date', 'model_accuracy', 'accuracy_probability'],
      dtype='object')

In [21]:
data1 = data[['date', 'year',
       'quarter', 'month', 'week', 'day', 'hour', 'origin',
       'destination', 'flight', 'capacity', 'price_type', 'promotion',
       'roundtrip_or_oneway', 'customer_type', 'product_type',
       'location_lifestyle', 'location_economical_status',
       'location_employment_status', 'location_event', 'source_wind',
       'source_humidity', 'source_precipitation', 'destination_wind',
       'destination_humidity', 'destination_precipitation',
       'number_of_booking']]
data1.dtypes

date                          datetime64[ns]
year                                  object
quarter                               object
month                                 object
week                                  object
day                                   object
hour                                   int64
origin                                object
destination                           object
flight                                object
capacity                               int64
price_type                            object
promotion                             object
roundtrip_or_oneway                   object
customer_type                         object
product_type                          object
location_lifestyle                    object
location_economical_status            object
location_employment_status            object
location_event                        object
source_wind                           object
source_humidity                       object
source_pre

In [22]:
data1['year'] = data1['year'].astype(int)
data1['quarter'] = data1['quarter'].astype('category')
data1['month'] = data1['month'].astype('category')
data1['week'] = data1['week'].astype(int)
data1['day'] = data1['day'].astype('category')
data1['origin'] = data1['origin'].astype('category')
data1['destination'] = data1['destination'].astype('category')
data1['flight'] = data1['flight'].astype('category')
data1['price_type'] = data1['price_type'].astype('category')
data1['promotion'] = data1['promotion'].astype('category')
data1['roundtrip_or_oneway'] = data1['roundtrip_or_oneway'].astype('category')
data1['customer_type'] = data1['customer_type'].astype('category')
data1['product_type'] = data1['product_type'].astype('category')
data1['location_lifestyle'] = data1['location_lifestyle'].astype('category')
data1['location_economical_status'] = data1['location_economical_status'].astype('category')
data1['location_employment_status'] = data1['location_employment_status'].astype('category')
data1['location_event'] = data1['location_event'].astype('category')
data1['source_wind'] = data1['source_wind'].astype('category')
data1['source_humidity'] = data1['source_humidity'].astype('category')
data1['source_precipitation'] = data1['source_precipitation'].astype('category')
data1['destination_wind'] = data1['destination_wind'].astype('category')
data1['destination_humidity'] = data1['destination_humidity'].astype('category')
data1['destination_precipitation'] = data1['destination_precipitation'].astype('category')
cat_columns = data1.select_dtypes(['category']).columns
data1[cat_columns] = data1[cat_columns].apply(lambda x: x.cat.codes)
data1.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

date                          datetime64[ns]
year                                   int64
quarter                                 int8
month                                   int8
week                                   int64
day                                     int8
hour                                   int64
origin                                  int8
destination                             int8
flight                                  int8
capacity                               int64
price_type                              int8
promotion                               int8
roundtrip_or_oneway                     int8
customer_type                           int8
product_type                            int8
location_lifestyle                      int8
location_economical_status              int8
location_employment_status              int8
location_event                          int8
source_wind                             int8
source_humidity                         int8
source_pre

In [23]:
data1.set_index('date', inplace=True)
data1

Unnamed: 0_level_0,year,quarter,month,week,day,hour,origin,destination,flight,capacity,...,location_economical_status,location_employment_status,location_event,source_wind,source_humidity,source_precipitation,destination_wind,destination_humidity,destination_precipitation,number_of_booking
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 06:00:00,2021,0,1,1,1,6,0,0,0,467,...,0,0,0,0,0,0,0,0,0,10
2021-01-03 06:00:00,2021,0,1,1,1,6,0,0,0,467,...,0,0,0,0,0,0,0,0,0,5
2021-01-03 06:00:00,2021,0,1,1,1,6,0,0,1,467,...,0,0,0,0,0,0,0,0,0,10
2021-01-03 06:00:00,2021,0,1,1,1,6,0,0,1,467,...,0,0,0,0,0,0,0,0,0,20
2021-01-03 06:00:00,2021,0,1,1,1,6,0,0,0,467,...,0,0,0,0,0,0,0,0,0,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-30 00:00:00,2022,0,2,4,0,24,0,0,1,467,...,0,0,0,0,0,0,0,0,0,10
2022-03-30 00:00:00,2022,0,2,4,0,24,0,0,0,467,...,0,0,0,0,0,0,0,0,0,18
2022-03-30 00:00:00,2022,0,2,4,0,24,0,0,0,467,...,0,0,0,0,0,0,0,0,0,19
2022-03-30 00:00:00,2022,0,2,4,0,24,0,0,1,467,...,0,0,0,0,0,0,0,0,0,60


In [25]:
num_timeseries = data1.shape[1]
data_kw = data1.resample("H").sum() / 8
timeseries = []
for i in range(num_timeseries):
    timeseries.append(np.trim_zeros(data_kw.iloc[:, i], trim="f"))

In [11]:
# we use 2 hour frequency for the time series
freq = "H"

# we predict for 7 days
prediction_length = 7 * 24

# we also use 7 days as context length, this is the number of state updates accomplished before making predictions
context_length = 7 * 24

In [15]:
data['date'].unique()

array(['2021-01-03T06:00:00.000000000', '2021-01-05T12:00:00.000000000',
       '2021-01-07T18:00:00.000000000', '2021-01-09T00:00:00.000000000',
       '2021-01-10T06:00:00.000000000', '2021-01-12T12:00:00.000000000',
       '2021-01-14T18:00:00.000000000', '2021-01-16T00:00:00.000000000',
       '2021-01-17T06:00:00.000000000', '2021-01-19T12:00:00.000000000',
       '2021-01-21T18:00:00.000000000', '2021-01-23T00:00:00.000000000',
       '2021-01-24T06:00:00.000000000', '2021-01-26T12:00:00.000000000',
       '2021-01-28T18:00:00.000000000', '2021-01-30T00:00:00.000000000',
       '2021-02-03T06:00:00.000000000', '2021-02-05T12:00:00.000000000',
       '2021-02-07T18:00:00.000000000', '2021-02-09T00:00:00.000000000',
       '2021-02-10T06:00:00.000000000', '2021-02-12T12:00:00.000000000',
       '2021-02-14T18:00:00.000000000', '2021-02-16T00:00:00.000000000',
       '2021-02-17T06:00:00.000000000', '2021-02-19T12:00:00.000000000',
       '2021-02-21T18:00:00.000000000', '2021-02-23

In [16]:
start_dataset = pd.Timestamp("2021-01-03 06:00:00", freq=freq)
end_training = pd.Timestamp("2021-03-30 00:00:00", freq=freq)

In [26]:
training_data = [
    {
        "start": str(start_dataset),
        "target": ts[
            start_dataset : end_training - timedelta(days=1)
        ].tolist(),  # We use -1, because pandas indexing includes the upper bound
    }
    for ts in timeseries
]
print(len(training_data))

26


In [27]:
num_test_windows = 4

test_data = [
    {
        "start": str(start_dataset),
        "target": ts[start_dataset : end_training + timedelta(days=k * prediction_length)].tolist(),
    }
    for k in range(1, num_test_windows + 1)
    for ts in timeseries
]
print(len(test_data))

104


In [28]:
def write_dicts_to_file(path, data1):
    with open(path, "wb") as fp:
        for d in data1:
            fp.write(json.dumps(d).encode("utf-8"))
            fp.write("\n".encode("utf-8"))

In [29]:
%%time
write_dicts_to_file("train.json", training_data)
write_dicts_to_file("test.json", test_data)

CPU times: user 53.7 ms, sys: 6.54 ms, total: 60.2 ms
Wall time: 85.6 ms


In [30]:
s3 = boto3.resource("s3")


def copy_to_s3(local_file, s3_path, override=False):
    assert s3_path.startswith("s3://")
    split = s3_path.split("/")
    bucket = split[2]
    path = "/".join(split[3:])
    buk = s3.Bucket(bucket)

    if len(list(buk.objects.filter(Prefix=path))) > 0:
        if not override:
            print(
                "File s3://{}/{} already exists.\nSet override to upload anyway.\n".format(
                    s3_bucket, s3_path
                )
            )
            return
        else:
            print("Overwriting existing file")
    with open(local_file, "rb") as data1:
        print("Uploading file to {}".format(s3_path))
        buk.put_object(Key=path, Body=data1)

In [31]:
%%time
copy_to_s3("train.json", s3_data_path + "/train/train.json")
copy_to_s3("test.json", s3_data_path + "/test/test.json")

Uploading file to s3://sagemaker-us-east-2-363247502029/deepar-demo-notebook/data/train/train.json
Uploading file to s3://sagemaker-us-east-2-363247502029/deepar-demo-notebook/data/test/test.json
CPU times: user 41.8 ms, sys: 7.34 ms, total: 49.2 ms
Wall time: 221 ms


In [32]:
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(s3_data_path + "/train/train.json", "rb") as fp:
    print(fp.readline().decode("utf-8")[:100] + "...")

{"start": "2021-01-03 06:00:00", "target": [4042.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...


In [33]:
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    sagemaker_session=sagemaker_session,
    role=role,
    train_instance_count=1,
    train_instance_type="ml.c4.2xlarge",
    base_job_name="deepar-electricity-demo",
    output_path=s3_output_path,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [34]:
hyperparameters = {
    "time_freq": freq,
    "epochs": "400",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "context_length": str(context_length),
    "prediction_length": str(prediction_length),
}

In [35]:
estimator.set_hyperparameters(**hyperparameters)

In [36]:
%%time
data_channels = {"train": "{}/train/".format(s3_data_path), "test": "{}/test/".format(s3_data_path)}

estimator.fit(inputs=data_channels, wait=True)

2021-12-30 06:38:03 Starting - Starting the training job...
2021-12-30 06:38:27 Starting - Launching requested ML instancesProfilerReport-1640846283: InProgress
...
2021-12-30 06:39:00 Starting - Preparing the instances for training.........
2021-12-30 06:40:28 Downloading - Downloading input data
2021-12-30 06:40:28 Training - Downloading the training image.....[34mArguments: train[0m
[34m[12/30/2021 06:41:09 INFO 140511108945280] Reading default configuration from /opt/amazon/lib/python3.6/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t', 'mini_batch_size': '128', 'num_cells': '40', 'num_dynamic_feat': 'auto', 'num_eval_samples': '100', 'num_layers': '2', 'test_quantiles': '[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]'}

In [37]:
from sagemaker.serializers import IdentitySerializer

In [38]:
class DeepARPredictor(sagemaker.predictor.Predictor):
    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            # serializer=JSONSerializer(),
            serializer=IdentitySerializer(content_type="application/json"),
            **kwargs,
        )

    def predict(
        self,
        ts,
        cat=None,
        dynamic_feat=None,
        num_samples=100,
        return_samples=False,
        quantiles=["0.1", "0.5", "0.9"],
    ):
        """Requests the prediction of for the time series listed in `ts`, each with the (optional)
        corresponding category listed in `cat`.

        ts -- `pandas.Series` object, the time series to predict
        cat -- integer, the group associated to the time series (default: None)
        num_samples -- integer, number of samples to compute at prediction time (default: 100)
        return_samples -- boolean indicating whether to include samples in the response (default: False)
        quantiles -- list of strings specifying the quantiles to compute (default: ["0.1", "0.5", "0.9"])

        Return value: list of `pandas.DataFrame` objects, each containing the predictions
        """
        prediction_time = ts.index[-1] + ts.index.freq
        quantiles = [str(q) for q in quantiles]
        req = self.__encode_request(ts, cat, dynamic_feat, num_samples, return_samples, quantiles)
        res = super(DeepARPredictor, self).predict(req)
        return self.__decode_response(res, ts.index.freq, prediction_time, return_samples)

    def __encode_request(self, ts, cat, dynamic_feat, num_samples, return_samples, quantiles):
        instance = series_to_dict(
            ts, cat if cat is not None else None, dynamic_feat if dynamic_feat else None
        )

        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles", "samples"] if return_samples else ["quantiles"],
            "quantiles": quantiles,
        }

        http_request_data = {"instances": [instance], "configuration": configuration}

        return json.dumps(http_request_data).encode("utf-8")

    def __decode_response(self, response, freq, prediction_time, return_samples):
        # we only sent one time series so we only receive one in return
        # however, if possible one will pass multiple time series as predictions will then be faster
        predictions = json.loads(response.decode("utf-8"))["predictions"][0]
        prediction_length = len(next(iter(predictions["quantiles"].values())))
        prediction_index = pd.date_range(
            start=prediction_time, freq=freq, periods=prediction_length
        )
        if return_samples:
            dict_of_samples = {"sample_" + str(i): s for i, s in enumerate(predictions["samples"])}
        else:
            dict_of_samples = {}
        return pd.DataFrame(
            data={**predictions["quantiles"], **dict_of_samples}, index=prediction_index
        )

    def set_frequency(self, freq):
        self.freq = freq


def encode_target(ts):
    return [x if np.isfinite(x) else "NaN" for x in ts]


def series_to_dict(ts, cat=None, dynamic_feat=None):
    """Given a pandas.Series object, returns a dictionary encoding the time series.

    ts -- a pands.Series object with the target time series
    cat -- an integer indicating the time series category

    Return value: a dictionary
    """
    obj = {"start": str(ts.index[0]), "target": encode_target(ts)}
    if cat is not None:
        obj["cat"] = cat
    if dynamic_feat is not None:
        obj["dynamic_feat"] = dynamic_feat
    return obj

In [39]:
predictor = estimator.deploy(
    initial_instance_count=1, instance_type="ml.m5.large", predictor_cls=DeepARPredictor
)

-----!

In [51]:
predictor.predict(ts=timeseries[25])

Unnamed: 0,0.1,0.5,0.9
2022-03-30 01:00:00,-0.014229,-0.005282,0.007760
2022-03-30 02:00:00,-0.010172,-0.000415,0.010066
2022-03-30 03:00:00,-0.010866,0.002125,0.014311
2022-03-30 04:00:00,-0.007570,-0.001272,0.005656
2022-03-30 05:00:00,-0.013403,-0.002611,0.006495
...,...,...,...
2022-04-05 20:00:00,-0.014879,-0.005751,0.003340
2022-04-05 21:00:00,-0.014847,-0.005998,0.001633
2022-04-05 22:00:00,-0.013741,-0.006907,0.002622
2022-04-05 23:00:00,-0.016960,-0.007571,0.000027


In [52]:
predictor.delete_endpoint()

In [None]:
def plot(
    predictor,
    target_ts,
    cat=None,
    dynamic_feat=None,
    forecast_date=end_training,
    show_samples=False,
    plot_history=7 * 12,
    confidence=80,
):
    freq = target_ts.index.freq
    print(
        "calling served model to generate predictions starting from {}".format(str(forecast_date))
    )
    assert confidence > 50 and confidence < 100
    low_quantile = 0.5 - confidence * 0.005
    up_quantile = confidence * 0.005 + 0.5

    # we first construct the argument to call our model
    args = {
        "ts": target_ts[:forecast_date],
        "return_samples": show_samples,
        "quantiles": [low_quantile, 0.5, up_quantile],
        "num_samples": 100,
    }

    if dynamic_feat is not None:
        args["dynamic_feat"] = dynamic_feat
        fig = plt.figure(figsize=(20, 6))
        ax = plt.subplot(2, 1, 1)
    else:
        fig = plt.figure(figsize=(20, 3))
        ax = plt.subplot(1, 1, 1)

    if cat is not None:
        args["cat"] = cat
        ax.text(0.9, 0.9, "cat = {}".format(cat), transform=ax.transAxes)

    # call the end point to get the prediction
    prediction = predictor.predict(**args)

    # plot the samples
    if show_samples:
        for key in prediction.keys():
            if "sample" in key:
                prediction[key].plot(color="lightskyblue", alpha=0.2, label="_nolegend_")

    # plot the target
    target_section = target_ts[
        forecast_date - plot_history * freq : forecast_date + prediction_length * freq
    ]
    target_section.plot(color="black", label="target")

    # plot the confidence interval and the median predicted
    ax.fill_between(
        prediction[str(low_quantile)].index,
        prediction[str(low_quantile)].values,
        prediction[str(up_quantile)].values,
        color="b",
        alpha=0.3,
        label="{}% confidence interval".format(confidence),
    )
    prediction["0.5"].plot(color="b", label="P50")
    ax.legend(loc=2)

    # fix the scale as the samples may change it
    ax.set_ylim(target_section.min() * 0.5, target_section.max() * 1.5)

    if dynamic_feat is not None:
        for i, f in enumerate(dynamic_feat, start=1):
            ax = plt.subplot(len(dynamic_feat) * 2, 1, len(dynamic_feat) + i, sharex=ax)
            feat_ts = pd.Series(
                index=pd.date_range(
                    start=target_ts.index[0], freq=target_ts.index.freq, periods=len(f)
                ),
                data=f,
            )
            feat_ts[
                forecast_date - plot_history * freq : forecast_date + prediction_length * freq
            ].plot(ax=ax, color="g")

##############################################################

***************************Second Approach*****************

#######################################################################

In [16]:
data1 = data[['date', 'year',
       'quarter', 'month', 'week', 'day', 'hour', 'origin',
       'destination', 'flight', 'capacity', 'price_type', 'promotion',
       'roundtrip_or_oneway', 'customer_type', 'product_type',
       'location_lifestyle', 'location_economical_status',
       'location_employment_status', 'location_event', 'source_wind',
       'source_humidity', 'source_precipitation', 'destination_wind',
       'destination_humidity', 'destination_precipitation',
       'number_of_booking']]
data1.dtypes

date                          datetime64[ns]
year                                  object
quarter                               object
month                                 object
week                                  object
day                                   object
hour                                   int64
origin                                object
destination                           object
flight                                object
capacity                               int64
price_type                            object
promotion                             object
roundtrip_or_oneway                   object
customer_type                         object
product_type                          object
location_lifestyle                    object
location_economical_status            object
location_employment_status            object
location_event                        object
source_wind                           object
source_humidity                       object
source_pre

In [17]:
data1.set_index('date', inplace=True)
data1

Unnamed: 0_level_0,year,quarter,month,week,day,hour,origin,destination,flight,capacity,...,location_economical_status,location_employment_status,location_event,source_wind,source_humidity,source_precipitation,destination_wind,destination_humidity,destination_precipitation,number_of_booking
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,10.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,5.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,10.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,20.0
2021-01-03 06:00:00,2021,Q1,JAN,1,SUNDAY,6,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-26 12:00:00,2022,Q1,MAR,4,TUESDAY,12,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,
2022-03-28 18:00:00,2022,Q1,MAR,4,THURSDAY,18,MAA,DXB,FD101,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,
2022-03-28 18:00:00,2022,Q1,MAR,4,THURSDAY,18,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,
2022-03-30 00:00:00,2022,Q1,MAR,4,FRIDAY,24,MAA,DXB,FD102,467,...,HIGH,EMPLOYED,CHRISTMAS,0.45 MPS,30%,RAIN,0.52 MPS,20%,DRIZZLE,


In [54]:
#!pip install jsonlines
import pandas as pd
import jsonlines
from sklearn import preprocessing
le = preprocessing.LabelEncoder()


#series = pd.read_csv('test.csv', parse_dates=[0], index_col=0)
#series.sort_index(inplace=True)

target_column = 'number_of_booking'
group_column = 'quarter'

for col in data1.columns:
    if col !=target_column:
        data1[col] = le.fit_transform(data1[col])

if data1[group_column].nunique()==1:
    a = [data1]
else:
    a = [v for k, v in data1.groupby(group_column)]

out = []

for i in range(len(a)):
    dynamic_feat = []
    cat = []
    for col in a[0].columns:
        if col == target_column:
            target = a[0][col].values.tolist()
            start = str(a[0].index[0])

        else:
            if a[0][col].nunique()>=2: #if 2 or more values, add as dynamic feature
                dynamic_feat.append(a[0][col].values.astype(float).tolist())
            elif a[0][col].nunique()==1: #if 1 value, add as category
                cat.append(int(a[0][col][0]))
    out.append({'start':start, 'target':target, 'cat':cat, 'dynamic_feat':dynamic_feat})
    
with jsonlines.open('train-data.jsonl', mode='w') as writer:
    writer.write_all(out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [55]:
s3 = boto3.resource("s3")


def copy_to_s3(local_file, s3_path, override=False):
    assert s3_path.startswith("s3://")
    split = s3_path.split("/")
    bucket = split[2]
    path = "/".join(split[3:])
    buk = s3.Bucket(bucket)

    if len(list(buk.objects.filter(Prefix=path))) > 0:
        if not override:
            print(
                "File s3://{}/{} already exists.\nSet override to upload anyway.\n".format(
                    s3_bucket, s3_path
                )
            )
            return
        else:
            print("Overwriting existing file")
    with open(local_file, "rb") as data:
        print("Uploading file to {}".format(s3_path))
        buk.put_object(Key=path, Body=data)

In [56]:
s3_bucket = sagemaker.Session().default_bucket()  # replace with an existing bucket if needed
s3_prefix = "deepar-demo-notebook1"  # prefix used for all data stored within the bucket
s3_data_path = "s3://{}/{}/data".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output".format(s3_bucket, s3_prefix)
%%time
copy_to_s3("train-data.jsonl", s3_data_path + "/train/train-data.jsonl")
#copy_to_s3("test.json", s3_data_path + "/test/test-data.json")

UsageError: Line magic function `%%time` not found.


In [58]:
import sagemaker

session = sagemaker.Session()

region = session.boto_region_name

estimator = sagemaker.estimator.Estimator(
    sagemaker_session=session,
    image_uri=sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest"),
    role=sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='deepar-poc',
    output_path=s3_output_path)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [59]:
hyperparameters = {
    "time_freq": '1H',
    "epochs": "400",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "context_length": '10',
    "prediction_length": '1'
}

estimator.set_hyperparameters(**hyperparameters)

In [60]:
%%time
data_channels = {"train-data": "{}/train/".format(s3_data_path), "test-data": "{}/test/".format(s3_data_path)}

estimator.fit(inputs=data_channels, wait=True)

ClientError: An error occurred (ValidationException) when calling the CreateTrainingJob operation: No S3 objects found under S3 URL "s3://sagemaker-us-east-2-363247502029/deepar-demo-notebook1/data/train/" given in input data source. Please ensure that the bucket exists in the selected region (us-east-2), that objects exist under that S3 prefix, and that the role "arn:aws:iam::363247502029:role/Sagemaker-Full-Access" has "s3:ListBucket" permissions on bucket "sagemaker-us-east-2-363247502029".