In [1]:
import boto3

# Name of the ECR repository used as the last component of the image URI.
algorithm_name = "pytorch-tft-container-test"

# Look up the AWS account id of the credentials this notebook runs with.
client = boto3.client("sts")
caller_identity = client.get_caller_identity()
account = caller_identity["Account"]

# The region is taken from the default boto3 session.
my_session = boto3.session.Session()
region = my_session.region_name

# Image URI of the form <account>.dkr.ecr.<region>.amazonaws.com/<repository>:latest
ecr_image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(
    account,
    region,
    algorithm_name,
)

# Last expression of the cell, so the URI is displayed.
ecr_image



'551329315830.dkr.ecr.us-east-1.amazonaws.com/pytorch-tft-container-test:latest'

In [2]:
# Fetch an ECR password for us-east-1 and pipe it into `docker login` for the
# registry 763104351884.dkr.ecr.us-east-1.amazonaws.com.
# NOTE(review): both the region and the registry id are hard-coded; the registry
# presumably hosts the base images the training container is built from — confirm.
! aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [3]:
import sagemaker
import uuid

# Session object reused by the upload and download helpers further down.
sagemaker_session = sagemaker.Session()
print('SageMaker version: ' + sagemaker.__version__)

# Default bucket of the session; `prefix` is defined here but not used by the
# cells visible in this notebook.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/DEMO-pytorch-cnn-cifar10'

# IAM role that is handed to the estimator.
role = sagemaker.get_execution_role()

# A fresh 8-character hex suffix gives every notebook run its own checkpoint
# location in the bucket.
checkpoint_suffix = uuid.uuid4().hex[:8]
checkpoint_s3_path = 's3://{}/checkpoints/checkpoint-{}'.format(
    bucket,
    checkpoint_suffix,
)

print('Checkpointing Path: {}'.format(checkpoint_s3_path))

SageMaker version: 2.106.0
Checkpointing Path: s3://sagemaker-us-east-1-551329315830/checkpoints/checkpoint-86bcdfa1


In [6]:
from utils_timeseries import download_process_and_return_raw_data, save_local_and_upload_s3, metadata_json_upload_s3
import sagemaker
import uuid

# Prediction and encoder lengths shared by the metadata below.
max_prediction_length = 6
max_encoder_length = 24

# Indicator columns that are bundled into the single "special_days" variable
# group in the metadata.
special_days = [
    "easter_day",
    "good_friday",
    "new_year",
    "christmas",
    "labor_day",
    "independence_day",
    "revolution_day_memorial",
    "regional_games",
    "fifa_u_17_world_cup",
    "football_gold_cup",
    "beer_capital",
    "music_fest",
]

# Dataset description that is later serialised and uploaded next to the data.
# Built in one literal; key order matches the original assignment order.
training_metadata = {
    'time_idx': "time_idx",
    'target': "volume",
    'group_ids': ["agency", "sku"],
    # keep encoder length long (as it is in the validation set)
    'min_encoder_length': max_encoder_length // 2,
    'max_encoder_length': max_encoder_length,
    'min_prediction_length': 1,
    'max_prediction_length': max_prediction_length,
    'static_categoricals': ["agency", "sku"],
    'static_reals': ["avg_population_2017", "avg_yearly_household_income_2017"],
    'time_varying_known_categoricals': ["special_days", "month"],
    'variable_groups': {"special_days": special_days},
    'time_varying_known_reals': ["time_idx", "price_regular", "discount_in_percent"],
    'time_varying_unknown_categoricals': [],
    'time_varying_unknown_reals': [
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    'target_normalizer': {
        "normalized_groups": ["agency", "sku"],
        "normalization_transformation": 'softplus',
    },
    'add_relative_time_idx': True,
    'add_target_scales': True,
    'add_encoder_length': True,
    'allow_missing_timesteps': True,
}




# Session and default bucket used for the two uploads below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Obtain the raw data, then store it locally and upload it; `inputs` is what
# is later passed to `fit`.
training_data = download_process_and_return_raw_data()
inputs = save_local_and_upload_s3(
    training_data,
    sagemaker_session,
    bucket,
    data_filename="stallion_data",
)

# The cutoff sits `max_prediction_length` steps before the last time index.
last_time_idx = training_data["time_idx"].max()
training_metadata['training_cutoff'] = int(last_time_idx - max_prediction_length)

# Upload the metadata; kept as the last expression of the cell.
metadata_json_upload_s3(
    training_metadata,
    sagemaker_session,
    bucket,
    metadata_filename="stallion_metadata",
)

Checkpointing directory timeseries_data exists
saved raw data to timeseries_data/stallion_data.parquet
Checkpointing directory timeseries_data exists
saved metadata to timeseries_data/stallion_metadata.json


's3://sagemaker-us-east-1-551329315830/data/timeseries_data'

In [7]:
# Toggle for spot training; it also decides whether a wait limit is set below.
use_spot_instances = False

# in seconds, after this, job will be terminated
max_run = 6000

# With spot instances the wait limit is ten times the run limit;
# without them no wait limit is passed at all.
if use_spot_instances:
    max_wait = 10 * max_run
else:
    max_wait = None

local_image_name = 'pytorch-tft-container-test'

In [12]:
from sagemaker.pytorch import PyTorch
from sagemaker.estimator import Estimator
from sagemaker.debugger import TensorBoardOutputConfig


# Hyperparameters are forwarded to the entry point as command-line flags
# (e.g. `--num-epochs 3 --run-mode test`).
hyperparameters = {
        'num-epochs': 3,
        'data-filename': "stallion_data.parquet",
        'metadata-filename': "stallion_metadata.json",
        'run-mode': 'test'
    }

# Write TensorBoard data to the session's default bucket rather than a
# hard-coded, account-specific bucket name, so the notebook also works in other
# accounts and regions.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path='s3://{}/tensorboard'.format(bucket),
    container_local_output_path='/lightning_logs'
)

# Capture only the numeric value that follows each metric name. The progress
# bar prints a metric either in the middle of the list ("val_loss=142.0, ...")
# or as its last entry ("train_loss_epoch=117.0]"); a pattern that captures
# "everything up to the next comma" swallows unrelated progress-bar text in the
# second case, so the captured group is not a number.
_METRIC_VALUE = r'([-+.0-9eE]+)'
metric_definitions = [
    {'Name': 'train:loss', 'Regex': 'train_loss_epoch=' + _METRIC_VALUE},
    {'Name': 'trainstep:loss', 'Regex': 'train_loss_step=' + _METRIC_VALUE},
    {'Name': 'validation:loss', 'Regex': 'val_loss=' + _METRIC_VALUE},
]

# Other instance types that were tried with this estimator:
# 'local', 'ml.p3.2xlarge', 'ml.p2.8xlarge'
spot_estimator  = PyTorch(entry_point='TFT_docker/TFT.py',
                            dependencies=['TFT_docker/requirements.txt'],
                            role=role,
                            framework_version='1.7.1',
                            py_version='py3',
                            instance_count=1,
                            instance_type='ml.p2.xlarge',
                            base_job_name='tft-pytorch-spot-1',
                            hyperparameters=hyperparameters,
                            checkpoint_s3_uri=checkpoint_s3_path,
                            debugger_hook_config=False,
                            input_mode = 'FastFile',
                            use_spot_instances=use_spot_instances,
                            max_run=max_run,
                            max_wait=max_wait,
                            tensorboard_output_config=tensorboard_output_config,
                            metric_definitions=metric_definitions
                           )

# Start the training job on the uploaded data and stream all logs into the cell.
spot_estimator.fit(
                inputs,
                logs = 'All'
            )

2022-09-03 08:56:16 Starting - Starting the training job...ProfilerReport-1662195376: InProgress
...
2022-09-03 08:57:06 Starting - Preparing the instances for training......
2022-09-03 08:58:17 Downloading - Downloading input data......
2022-09-03 08:59:07 Training - Downloading the training image...........................
2022-09-03 09:03:48 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-03 09:03:42,423 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-03 09:03:42,460 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-03 09:03:42,468 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-09-03 09:03:43,038 sagemaker-training-toolkit INFO     Installing dependencies from requir

[34m  Building wheel for fastparquet (setup.py): finished with status 'done'
  Created wheel for fastparquet: filename=fastparquet-0.8.0-cp36-cp36m-linux_x86_64.whl size=1256923 sha256=12b99739af3c5431ab49d23762d118b1015ded65dd6bfe0bf0842d3e7689fd82
  Stored in directory: /root/.cache/pip/wheels/6a/6a/4f/0fd8e8bcbc4b5b751186e363b5b03975d8643eee2975eed2ca
  Building wheel for idna-ssl (setup.py): started[0m
[34m  Building wheel for idna-ssl (setup.py): finished with status 'done'
  Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3161 sha256=48a23731c24fa95de7ebb979e64f542d43ee1f910b09758b644057db9077e55d
  Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13
  Building wheel for pyperclip (setup.py): started
  Building wheel for pyperclip (setup.py): finished with status 'done'
  Created wheel for pyperclip: filename=pyperclip-1.8.2-py3-none-any.whl size=11106 sha256=c229c4e55b93187bef318da131717f1de1cfd224

[34mDistributed training - False[0m
[34mCheckpointing directory /opt/ml/checkpoints exists[0m
[34mDevice Type: cuda[0m
[34mLoad Time Series dataset from S3[0m
[34mcreating dataloader[0m
[34mcreate model trainer[0m
[34mcreate model from dataset[0m
[34mNumber of parameters in network: 3392.2k[0m
[34mtraining model[0m
[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015Validation sanity check:  50%|█████     | 1/2 [00:01<00:01,  1.88s/it]#015                                                                      #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/39 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/39 [00:00<?, ?it/s] #015Epoch 0:   3%|▎         | 1/39 [00:00<00:25,  1.49it/s]#015Epoch 0:   3%|▎         | 1/39 [00:00<00:25,  1.49it/s, loss=11.5, train_loss_step=11.50]#015Epoch 0:   5%|▌         | 2/39 [00:00<00:15,  2.32it/s, loss=11.5, train_loss_step=11.50]#015Epoch 0:  


2022-09-03 09:05:20 Uploading - Uploading generated training model[34m#015                                                         #033[A#015Epoch 1: 100%|██████████| 39/39 [00:09<00:00,  4.33it/s, loss=107, train_loss_step=110.0, val_loss=142.0, train_loss_epoch=117.0]#015Epoch 1:   0%|          | 0/39 [00:00<?, ?it/s, loss=107, train_loss_step=110.0, val_loss=142.0, train_loss_epoch=117.0]         #015Epoch 2:   0%|          | 0/39 [00:00<?, ?it/s, loss=107, train_loss_step=110.0, val_loss=142.0, train_loss_epoch=117.0]#015Epoch 2:   3%|▎         | 1/39 [00:00<00:22,  1.71it/s, loss=120, train_loss_step=369.0, val_loss=142.0, train_loss_epoch=117.0]#015Epoch 2:   5%|▌         | 2/39 [00:00<00:16,  2.26it/s, loss=117, train_loss_step=73.60, val_loss=142.0, train_loss_epoch=117.0]#015Epoch 2:   8%|▊         | 3/39 [00:01<00:14,  2.55it/s, loss=117, train_loss_step=73.60, val_loss=142.0, train_loss_epoch=117.0]#015Epoch 2:   8%|▊         | 3/39 [00:01<00:14,  2.54it/s, loss=119, train


2022-09-03 09:05:49 Failed - Training job failed
ProfilerReport-1662195376: NoIssuesFound


UnexpectedStatusException: Error for Training job tft-pytorch-spot-1-2022-09-03-08-56-15-855: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 TFT.py --data-filename stallion_data.parquet --metadata-filename stallion_metadata.json --num-epochs 3 --run-mode test"
Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 1.3 K 
3  | prescalers                         | ModuleDict                      | 5.1 K 
4  | static_variable_selection          | V

In [None]:
from sagemaker.s3 import S3Downloader
import tarfile
from pytorch_forecasting import TemporalFusionTransformer

def download_data_from_s3(s3_uri, sagemaker_session):
    """Download ``s3_uri`` into ``./trained_model_artifact``.

    Args:
        s3_uri: S3 location handed to ``S3Downloader.download``.
        sagemaker_session: Session object forwarded to the downloader.

    Returns:
        The fixed path ``./trained_model_artifact/model.tar.gz``. The path is
        not derived from what was actually downloaded, so the object at
        ``s3_uri`` is expected to be named ``model.tar.gz``.
    """
    download_kwargs = dict(
        s3_uri=s3_uri,
        local_path="./trained_model_artifact",
        sagemaker_session=sagemaker_session,
    )
    S3Downloader.download(**download_kwargs)
    return "./trained_model_artifact/model.tar.gz"

# Fetch the artifact produced by the training job.
model_path = download_data_from_s3(spot_estimator.model_data, sagemaker_session)

# Open the archive in a context manager so its file handle is released once the
# checkpoint has been read; the checkpoint must be loaded while the archive is
# still open because `extractfile` returns a reader backed by it.
with tarfile.open(model_path, "r:gz") as tar:
    checkpoint_member = tar.getmember(name="model_trainer.ckpt")
    checkpoint_file = tar.extractfile(member=checkpoint_member)
    checkpointed_model = TemporalFusionTransformer.load_from_checkpoint(checkpoint_file)

# Second name for the same model object, used by the plotting cell.
best_tft = checkpointed_model

## Get Dataset again, for testing purposes

In [None]:
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
import torch

# Rebuild the dataset from the same `training_metadata` that was uploaded for
# the training job instead of a hand-copied list of arguments. The copy had
# already drifted: it did not pass `allow_missing_timesteps`, which the
# metadata sets explicitly.
max_prediction_length = training_metadata['max_prediction_length']
max_encoder_length = training_metadata['max_encoder_length']
training_cutoff = training_data["time_idx"].max() - max_prediction_length

normalizer_config = training_metadata['target_normalizer']

training = TimeSeriesDataSet(
    # only rows up to the cutoff are used for the training dataset
    training_data[lambda x: x.time_idx <= training_cutoff],
    time_idx=training_metadata['time_idx'],
    target=training_metadata['target'],
    group_ids=training_metadata['group_ids'],
    min_encoder_length=training_metadata['min_encoder_length'],
    max_encoder_length=max_encoder_length,
    min_prediction_length=training_metadata['min_prediction_length'],
    max_prediction_length=max_prediction_length,
    static_categoricals=training_metadata['static_categoricals'],
    static_reals=training_metadata['static_reals'],
    time_varying_known_categoricals=training_metadata['time_varying_known_categoricals'],
    # group of categorical variables can be treated as one variable
    variable_groups=training_metadata['variable_groups'],
    time_varying_known_reals=training_metadata['time_varying_known_reals'],
    time_varying_unknown_categoricals=training_metadata['time_varying_unknown_categoricals'],
    time_varying_unknown_reals=training_metadata['time_varying_unknown_reals'],
    # use softplus and normalize by group
    target_normalizer=GroupNormalizer(
        groups=normalizer_config['normalized_groups'],
        transformation=normalizer_config['normalization_transformation'],
    ),
    add_relative_time_idx=training_metadata['add_relative_time_idx'],
    add_target_scales=training_metadata['add_target_scales'],
    add_encoder_length=training_metadata['add_encoder_length'],
    allow_missing_timesteps=training_metadata['allow_missing_timesteps'],
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(training, training_data, predict=True, stop_randomization=True)

# create dataloaders for model
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

In [None]:
# Pull one batch from the validation loader and display its second element
# (the `y` part of the `(x, y)` pairs the loader yields).
first_val_batch = next(iter(val_dataloader))
first_val_batch[1]

In [None]:
# Collect the first element of every batch's `y` and join them into one tensor.
target_batches = [targets[0] for _, targets in val_dataloader]
actuals = torch.cat(target_batches)

# Predictions of the loaded checkpoint for the same loader.
predictions = checkpointed_model.predict(val_dataloader)

# Mean absolute error between actuals and predictions (displayed as cell output).
absolute_errors = (actuals - predictions).abs()
absolute_errors.mean()

In [None]:

# In "raw" mode the model returns a dictionary from which all kinds of
# information, including quantiles, can be extracted; `return_x=True` also
# returns the network inputs needed for plotting.
raw_predictions, x = best_tft.predict(val_dataloader, mode="raw", return_x=True)

# plot 10 examples
for sample_idx in range(10):
    best_tft.plot_prediction(x, raw_predictions, idx=sample_idx, add_loss_to_title=True)