# Import Dataset
* This notebook performs the following steps:
    * Create IAM role
    * Create a dataset group
    * Create a schema for a dataset
    * Create the dataset
    * Attach the dataset to the dataset group
    * Create a dataset import job
    
    
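* **Running this notebook takes a few minutes** (originally estimated at about 3 minutes; the dataset import job alone took about 5 minutes in the recorded run below)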


In [114]:
import boto3
from time import sleep
import os
import pandas as pd
import json
import time
import pprint
import numpy as np

In [None]:
# Restore all variables previously persisted with %store.
# NOTE(review): later cells use data_dir, target_time_series_filename, target_time_series_path and
# validation_stores_sales, none of which are defined in this notebook — presumably they come from
# here; confirm the notebook that stores them has been run first.
%store -r

## Parameters

In [124]:
# Frequency of the time series passed to create_dataset ("D" = daily).
DATASET_FREQUENCY = "D"
# Timestamp pattern passed to create_dataset_import_job.
# NOTE(review): the Forecast documentation spells the hour field "HH"; confirm "hh" parses
# the timestamps in your data as intended.
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"

# Random 5-digit suffix that keeps resource names unique between runs.
# Slicing str(np.random.uniform())[4:9] was fragile: a tiny value prints in scientific
# notation (e.g. "1.2345e-05" -> "45e-0") and a short value yields fewer than 5 characters,
# producing names that are invalid or not unique. A zero-padded integer is always 5 digits.
suffix = "{:05d}".format(np.random.randint(0, 100000))


# Enter a project name
project = 'StoreItemDemand'
target_suffix = '_target'


# Names of the Forecast dataset and dataset group created below.
target_datasetName = project + 'DS' + suffix
target_datasetGroupName = project + 'DSG' + suffix

In [125]:
# Load the notebook's resource metadata file; only the parsed JSON is needed afterwards,
# so the file is closed before the ARN is taken apart.
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)

# The resource ARN is colon-delimited; the field at index 3 is used as the region.
resource_arn = data['ResourceArn']
region = resource_arn.split(':')[3]
print(region)

us-east-2


In [126]:
# One boto3 session pinned to the detected region; both Forecast clients are built from it.
session = boto3.Session(region_name=region)
forecast = session.client('forecast')
forecast_query = session.client('forecastquery')

## Create role
**Make sure that the role attached to the SageMaker notebook instance has the following policies attached: AmazonSageMakerFullAccess, AmazonS3FullAccess, AmazonForecastFullAccess, IAMFullAccess**

In [None]:
iam = boto3.client("iam")

# Put the role name
role_name = "ForecastRolePOC" + suffix

# Trust policy: lets the Amazon Forecast service assume this role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "forecast.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

# Re-running this cell with the same suffix must not fail: if the role already exists,
# fetch it instead. get_role returns the same {"Role": {"Arn": ...}} shape as create_role.
try:
    create_role_response = iam.create_role(
        RoleName = role_name,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    create_role_response = iam.get_role(RoleName = role_name)

# Attach the AWS-managed AmazonForecastFullAccess policy to the role.
policy_arn = "arn:aws:iam::aws:policy/AmazonForecastFullAccess"
iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = policy_arn
)

# Now add S3 support so Forecast can read the uploaded data.
# If you would rather not grant full S3 access, consider attaching a policy that only
# provides read access to your bucket, or the AmazonS3ReadOnlyAccess policy, instead.
iam.attach_role_policy(
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    RoleName=role_name
)
time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

## Create DatasetGroup

# Create the dataset group in the CUSTOM domain and keep its ARN for the calls that follow.
create_dataset_group_response = forecast.create_dataset_group(
    DatasetGroupName=target_datasetGroupName,
    Domain="CUSTOM",
)
target_datasetGroupArn = create_dataset_group_response['DatasetGroupArn']

In [None]:
# Display the description of the dataset group that was just created.
forecast.describe_dataset_group(
    DatasetGroupArn=target_datasetGroupArn,
)

## Create schema

In [127]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
# Each (name, type) pair below becomes one attribute entry, in this order.
schema = {
    "Attributes": [
        {"AttributeName": attribute_name, "AttributeType": attribute_type}
        for attribute_name, attribute_type in (
            ("timestamp", "timestamp"),
            ("item_id", "string"),
            ("store", "string"),
            ("target_value", "float"),
        )
    ]
}

## Create Target Time Series Dataset

In [128]:
# Register the target time series dataset using the frequency and schema defined above.
response = forecast.create_dataset(
    DatasetName=target_datasetName,
    Domain="CUSTOM",
    DatasetType='TARGET_TIME_SERIES',
    DataFrequency=DATASET_FREQUENCY,
    Schema=schema,
)

In [129]:
# Keep the new dataset's ARN, then display the dataset's description.
target_datasetArn = response['DatasetArn']
forecast.describe_dataset(
    DatasetArn=target_datasetArn,
)

{'DatasetArn': 'arn:aws:forecast:us-east-2:057716757052:dataset/StoreItemDemandDS47645',
 'DatasetName': 'StoreItemDemandDS47645',
 'Domain': 'CUSTOM',
 'DatasetType': 'TARGET_TIME_SERIES',
 'DataFrequency': 'D',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'},
   {'AttributeName': 'store', 'AttributeType': 'string'},
   {'AttributeName': 'target_value', 'AttributeType': 'float'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 3, 28, 7, 7, 48, 899000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 3, 28, 7, 7, 48, 899000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '5d96ca18-45d7-4819-82ce-cde38e995208',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Sat, 28 Mar 2020 07:07:49 GMT',
   'x-amzn-requestid': '5d96ca18-45d7-4819-82ce-cde38e995208',
   'content-len

## Attach the target time series dataset to the DatasetGroup

In [None]:
# Attach the Dataset to the Dataset Group:
# Attach the target dataset to the dataset group (the group's dataset list becomes exactly this one ARN).
forecast.update_dataset_group(
    DatasetArns=[target_datasetArn],
    DatasetGroupArn=target_datasetGroupArn,
)

## Create a bucket
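The cell below defines `bucket_name`, which later cells need, and creates the bucket. The code is not commented out; if the bucket already exists, skip or comment out the `create_bucket` calls.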

In [133]:
print(region)
# Pin the S3 client to the detected region so it agrees with the LocationConstraint used below.
s3 = boto3.client('s3', region_name=region)
account_id = boto3.client('sts').get_caller_identity().get('Account')
# Enter bucketname
# S3 bucket names may contain only lowercase letters, digits, dots and hyphens,
# so a hyphen is used as the separator instead of an underscore.
bucket_name = "forecastpoc-" + account_id
print(bucket_name)
try:
    # us-east-1 is the default location and must not be passed as a LocationConstraint.
    if region != "us-east-1":
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})
    else:
        s3.create_bucket(Bucket=bucket_name)
except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell is fine: the bucket from a previous run is reused.
    print("Bucket already exists and is owned by this account; reusing it.")

## Create a dataset import job to import the dataset from S3

In [134]:
# Upload Target File
# Upload the target time series file to the bucket, using the file name as the object key.
(
    boto3.Session()
    .resource('s3')
    .Bucket(bucket_name)
    .Object(target_time_series_filename)
    .upload_file(target_time_series_path)
)
# S3 URI of the uploaded object, used as the import job's data source.
target_s3DataPath = "s3://" + bucket_name + "/" + target_time_series_filename

In [135]:
# Finally we can call import the dataset
# Start the import job that loads the uploaded file from S3 into the target dataset.
# The job reads the object at target_s3DataPath using the IAM role in role_arn.
datasetImportJobName = 'DSIMPORT_JOB_TARGET_WALMART' + suffix
ds_import_job_response = forecast.create_dataset_import_job(
    DatasetImportJobName=datasetImportJobName,
    DatasetArn=target_datasetArn,
    DataSource={
        "S3Config": {
            "Path": target_s3DataPath,
            "RoleArn": role_arn,
        },
    },
    TimestampFormat=TIMESTAMP_FORMAT,
)

In [136]:
# Keep the import job's ARN; it is polled below and stored for later notebooks.
ds_target_import_job_arn = ds_import_job_response['DatasetImportJobArn']
print(ds_target_import_job_arn)

arn:aws:forecast:us-east-2:057716757052:dataset-import-job/StoreItemDemandDS47645/DSIMPORT_JOB_TARGET_WALMART47645


In [137]:
%%time

# Poll the import job every 30 seconds until it reaches a terminal status.
# Besides ACTIVE, any *_FAILED or *_STOPPED status is treated as terminal, so the loop
# cannot spin forever on a status that will never become ACTIVE.
while True:
    import_job_description = forecast.describe_dataset_import_job(
        DatasetImportJobArn=ds_target_import_job_arn
    )
    dataImportStatus = import_job_description['Status']
    print(dataImportStatus)
    if dataImportStatus == 'ACTIVE' or dataImportStatus.endswith(('_FAILED', '_STOPPED')):
        break
    sleep(30)

# Stop the notebook here if the import did not succeed, so later cells do not run against
# a dataset without data; include the service's message when one is returned.
if dataImportStatus != 'ACTIVE':
    raise RuntimeError(
        "Dataset import job {} ended with status {}: {}".format(
            ds_target_import_job_arn,
            dataImportStatus,
            import_job_description.get('Message', 'no message returned'),
        )
    )

CREATE_PENDING
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
ACTIVE
CPU times: user 37.1 ms, sys: 8.08 ms, total: 45.2 ms
Wall time: 5min


In [139]:
# Persist these variables with %store so they can be restored later with `%store -r`.
# NOTE(review): data_dir and validation_stores_sales are not assigned in this notebook —
# presumably they were restored by the `%store -r` cell at the top; confirm.
%store data_dir
%store project
%store suffix
%store target_suffix
%store region
%store ds_target_import_job_arn
%store target_datasetArn
%store bucket_name
%store role_arn
%store validation_stores_sales

Stored 'data_dir' (str)
Stored 'project' (str)
Stored 'suffix' (str)
Stored 'target_suffix' (str)
Stored 'region' (str)
Stored 'ds_target_import_job_arn' (str)
Stored 'target_datasetArn' (str)
Stored 'bucket_name' (str)
Stored 'role_arn' (str)
Stored 'validation_stores_sales' (DataFrame)
