In [1]:
import boto3

# Ask STS which AWS account the current credentials belong to.
client = boto3.client('sts')
caller_identity = client.get_caller_identity()
account = caller_identity['Account']

# Region configured for a fresh default boto3 session.
my_session = boto3.session.Session()
region = my_session.region_name

# Compose the ECR image URI "<account>.dkr.ecr.<region>.amazonaws.com/<repository>:latest".
algorithm_name = "pytorch-tft-container-test"
ecr_image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(
    account,
    region,
    algorithm_name,
)

# Bare last expression so the notebook displays the resulting URI.
ecr_image



'551329315830.dkr.ecr.us-east-1.amazonaws.com/pytorch-tft-container-test:latest'

In [2]:
# Log the local Docker client in to the ECR registry 763104351884.dkr.ecr.us-east-1.amazonaws.com
# by piping a temporary ECR password from the AWS CLI into `docker login`.
# NOTE(review): this registry account differs from the caller's own account shown in the
# previous cell's output — presumably it hosts the AWS-provided base images; confirm.
# The region is hard-coded to us-east-1 here rather than taken from `region`.
! aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [3]:
import sagemaker
import uuid

# Create the SageMaker session and report the installed SDK version.
sagemaker_session = sagemaker.Session()
print('SageMaker version: ' + sagemaker.__version__)

# Default S3 bucket associated with the session.
bucket = sagemaker_session.default_bucket()
# NOTE(review): `prefix` still names a CIFAR-10 demo and is not referenced by any cell
# shown in this notebook — looks like a copy-paste leftover; confirm before relying on it.
prefix = 'sagemaker/DEMO-pytorch-cnn-cifar10'

# IAM role later passed to the estimator.
role = sagemaker.get_execution_role()
# The first 8 characters of a random UUID give each run its own checkpoint location.
checkpoint_suffix = str(uuid.uuid4())[:8]
checkpoint_s3_path = 's3://{}/checkpoints/checkpoint-{}'.format(bucket, checkpoint_suffix)

print('Checkpointing Path: {}'.format(checkpoint_s3_path))

SageMaker version: 2.103.0
Checkpointing Path: s3://sagemaker-us-east-1-551329315830/checkpoints/checkpoint-50760748


In [4]:
from utils_timeseries import download_process_and_return_raw_data, save_local_and_upload_s3, metadata_json_upload_s3
import sagemaker
import uuid

# Prediction horizon and maximum encoder window; both are reused in the metadata below.
max_prediction_length = 6
max_encoder_length = 24

# Individual indicator names that are grouped under the single "special_days" variable.
special_days = [
    "easter_day",
    "good_friday",
    "new_year",
    "christmas",
    "labor_day",
    "independence_day",
    "revolution_day_memorial",
    "regional_games",
    "fifa_u_17_world_cup",
    "football_gold_cup",
    "beer_capital",
    "music_fest",
]

# Dataset/model configuration collected in one literal. The key order matches the
# order in which the entries were originally assigned one by one.
training_metadata = {
    'time_idx': "time_idx",
    'target': "volume",
    'group_ids': ["agency", "sku"],
    # keep encoder length long (as it is in the validation set)
    'min_encoder_length': max_encoder_length // 2,
    'max_encoder_length': max_encoder_length,
    'min_prediction_length': 1,
    'max_prediction_length': max_prediction_length,
    'static_categoricals': ["agency", "sku"],
    'static_reals': ["avg_population_2017", "avg_yearly_household_income_2017"],
    'time_varying_known_categoricals': ["special_days", "month"],
    # Refers to the very same list object as `special_days`.
    'variable_groups': {"special_days": special_days},
    'time_varying_known_reals': ["time_idx", "price_regular", "discount_in_percent"],
    'time_varying_unknown_categoricals': [],
    'time_varying_unknown_reals': [
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    'target_normalizer': {
        "normalized_groups": ["agency", "sku"],
        "normalization_transformation": 'softplus',
    },
    'add_relative_time_idx': True,
    'add_target_scales': True,
    'add_encoder_length': True,
}




# Re-create the SageMaker session and look up its default bucket
# (the same two calls already made in the earlier setup cell).
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# obtain & upload training data
# Both helpers come from the project-local `utils_timeseries` module. `inputs` is what is
# later handed to `fit()` — presumably the S3 location of the uploaded data; verify in the helper.
training_data = download_process_and_return_raw_data()
inputs = save_local_and_upload_s3(training_data, sagemaker_session, bucket, data_filename="stallion_data")

# upload metadata
# The cutoff is the largest `time_idx` value minus the prediction horizon, stored as a plain int.
training_metadata['training_cutoff'] = int(training_data["time_idx"].max() - max_prediction_length)
# Last expression of the cell, so its return value is what the notebook displays.
metadata_json_upload_s3(training_metadata, sagemaker_session, bucket, metadata_filename="stallion_metadata")

Checkpointing directory timeseries_data exists
saved raw data to timeseries_data/stallion_data.parquet
Checkpointing directory timeseries_data exists
saved metadata to timeseries_data/stallion_metadata.json


's3://sagemaker-us-east-1-551329315830/data/timeseries_data'

In [5]:
# Runtime limits and capacity type for the training job.
local_image_name = 'pytorch-tft-container-test'
use_spot_instances = False
max_run = 600  # in seconds, after this, job will be terminated

# A wait budget (ten times the run limit) is only set when spot instances are requested.
if use_spot_instances:
    max_wait = 10 * max_run
else:
    max_wait = None

In [6]:
from sagemaker.pytorch import PyTorch
from sagemaker.estimator import Estimator
from sagemaker.debugger import TensorBoardOutputConfig


# Hyperparameters forwarded to the training script (TFT_docker/TFT.py).
# The two file names match the artifacts uploaded by the data-preparation cell.
hyperparameters = {
    'epochs': 5,
    'data-filename': "stallion_data.parquet",
    'metadata-filename': "stallion_metadata.json",
}

# TensorBoard output: files written under `container_local_output_path` inside the
# training container are uploaded to `s3_output_path`.
# The S3 location is derived from the session's default bucket instead of a
# hard-coded, account-specific bucket name, so the notebook also works in other
# accounts and regions.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path='s3://{}/tensorboard'.format(bucket),
    container_local_output_path='/lightning_logs',
)

# PyTorch framework estimator. Spot usage and the run/wait limits come from the
# settings cell; checkpoints are written to `checkpoint_s3_path`.
# Alternative instance types tried previously: 'local' (local mode) and 'ml.p2.xlarge'.
spot_estimator = PyTorch(
    entry_point='TFT_docker/TFT.py',
    dependencies=['TFT_docker/requirements.txt'],
    role=role,
    framework_version='1.7.1',
    py_version='py3',
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    base_job_name='tft-pytorch-spot-1',
    hyperparameters=hyperparameters,
    checkpoint_s3_uri=checkpoint_s3_path,
    debugger_hook_config=False,
    input_mode='FastFile',
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    tensorboard_output_config=tensorboard_output_config,
)

# Launch the training job on the uploaded data and stream all logs into the notebook.
spot_estimator.fit(
    inputs,
    logs='All',
)

2022-08-21 17:28:46 Starting - Starting the training job...ProfilerReport-1661102926: InProgress
...
2022-08-21 17:29:29 Starting - Preparing the instances for training......
2022-08-21 17:30:37 Downloading - Downloading input data...
2022-08-21 17:31:10 Training - Downloading the training image..................
2022-08-21 17:34:14 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-08-21 17:34:16,962 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-08-21 17:34:16,986 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-08-21 17:34:16,994 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-08-21 17:34:17,420 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:

[34m  Building wheel for fastparquet (setup.py): finished with status 'done'
  Created wheel for fastparquet: filename=fastparquet-0.8.0-cp36-cp36m-linux_x86_64.whl size=1256928 sha256=0e0768739f0a286dcf2ecc7c43997999df35df2cbda833b4c2951dfe40927ca2
  Stored in directory: /root/.cache/pip/wheels/6a/6a/4f/0fd8e8bcbc4b5b751186e363b5b03975d8643eee2975eed2ca
  Building wheel for idna-ssl (setup.py): started
  Building wheel for idna-ssl (setup.py): finished with status 'done'
  Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3161 sha256=5e508e1a302ca9569c1c38dca81846c8bf1bd55e80068350582788ea5cbd1282
  Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13
  Building wheel for pyperclip (setup.py): started[0m
[34m  Building wheel for pyperclip (setup.py): finished with status 'done'
  Created wheel for pyperclip: filename=pyperclip-1.8.2-py3-none-any.whl size=11106 sha256=818c93fd502a263d67914cb0d6766779764aad69

[34mDistributed training - False[0m
[34mCheckpointing directory /opt/ml/checkpoints exists[0m
[34mDevice Type: cuda[0m
[34mLoad Time Series dataset from S3[0m
[34mcreating dataloader[0m
[34mget GPU information[0m
[34mGPU count: 1[0m
[34mcreate model trainer[0m
[34mcreate model from dataset[0m
[34mNumber of parameters in network: 29.7k[0m
[34mtraining model[0m
[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]#015Validation sanity check: 100%|██████████| 1/1 [00:02<00:00,  2.59s/it]#015                                                                      #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/31 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/31 [00:00<?, ?it/s] #015Epoch 0:   3%|▎         | 1/31 [00:01<00:31,  1.07s/it]#015Epoch 0:   3%|▎         | 1/31 [00:01<00:31,  1.07s/it, loss=116, v_num=0, train_loss_step=116.0]#015Epoch 0:   6%|▋         | 2/31 [00:01<00:20,  

[34m#015                                                         #033[A#015Epoch 1: 100%|██████████| 31/31 [00:15<00:00,  1.97it/s, loss=76.4, v_num=0, train_loss_step=58.00, val_loss=104.0, train_loss_epoch=78.60]#015Epoch 1:   0%|          | 0/31 [00:00<?, ?it/s, loss=76.4, v_num=0, train_loss_step=58.00, val_loss=104.0, train_loss_epoch=78.60]         #015Epoch 2:   0%|          | 0/31 [00:00<?, ?it/s, loss=76.4, v_num=0, train_loss_step=58.00, val_loss=104.0, train_loss_epoch=78.60]#015Epoch 2:   3%|▎         | 1/31 [00:00<00:23,  1.29it/s, loss=76.4, v_num=0, train_loss_step=58.00, val_loss=104.0, train_loss_epoch=78.60]#015Epoch 2:   3%|▎         | 1/31 [00:00<00:23,  1.29it/s, loss=73.9, v_num=0, train_loss_step=51.00, val_loss=104.0, train_loss_epoch=78.60]#015Epoch 2:   6%|▋         | 2/31 [00:01<00:16,  1.73it/s, loss=73.9, v_num=0, train_loss_step=51.00, val_loss=104.0, train_loss_epoch=78.60]#015Epoch 2:   6%|▋         | 2/31 [00:01<00:16,  1.73it/s, loss=74.3, v_num=0, tr

[34m#015                                                         #033[A#015Epoch 2: 100%|██████████| 31/31 [00:15<00:00,  1.99it/s, loss=75.6, v_num=0, train_loss_step=69.60, val_loss=102.0, train_loss_epoch=73.40]#015Epoch 2:   0%|          | 0/31 [00:00<?, ?it/s, loss=75.6, v_num=0, train_loss_step=69.60, val_loss=102.0, train_loss_epoch=73.40]         #015Epoch 3:   0%|          | 0/31 [00:00<?, ?it/s, loss=75.6, v_num=0, train_loss_step=69.60, val_loss=102.0, train_loss_epoch=73.40]#015Epoch 3:   3%|▎         | 1/31 [00:00<00:26,  1.15it/s, loss=75.6, v_num=0, train_loss_step=69.60, val_loss=102.0, train_loss_epoch=73.40]#015Epoch 3:   3%|▎         | 1/31 [00:00<00:26,  1.15it/s, loss=76.3, v_num=0, train_loss_step=88.70, val_loss=102.0, train_loss_epoch=73.40]#015Epoch 3:   6%|▋         | 2/31 [00:01<00:18,  1.61it/s, loss=76.3, v_num=0, train_loss_step=88.70, val_loss=102.0, train_loss_epoch=73.40]#015Epoch 3:   6%|▋         | 2/31 [00:01<00:18,  1.61it/s, loss=77, v_num=0, trai

[34m#015                                                         #033[A#015Epoch 3: 100%|██████████| 31/31 [00:15<00:00,  1.94it/s, loss=67.2, v_num=0, train_loss_step=55.20, val_loss=91.80, train_loss_epoch=71.40]#015Epoch 3:   0%|          | 0/31 [00:00<?, ?it/s, loss=67.2, v_num=0, train_loss_step=55.20, val_loss=91.80, train_loss_epoch=71.40]         #015Epoch 4:   0%|          | 0/31 [00:00<?, ?it/s, loss=67.2, v_num=0, train_loss_step=55.20, val_loss=91.80, train_loss_epoch=71.40]#015Epoch 4:   3%|▎         | 1/31 [00:00<00:24,  1.23it/s, loss=67.2, v_num=0, train_loss_step=55.20, val_loss=91.80, train_loss_epoch=71.40]#015Epoch 4:   3%|▎         | 1/31 [00:00<00:24,  1.23it/s, loss=67.7, v_num=0, train_loss_step=63.50, val_loss=91.80, train_loss_epoch=71.40]#015Epoch 4:   6%|▋         | 2/31 [00:01<00:17,  1.67it/s, loss=67.7, v_num=0, train_loss_step=63.50, val_loss=91.80, train_loss_epoch=71.40]#015Epoch 4:   6%|▋         | 2/31 [00:01<00:17,  1.67it/s, loss=66.4, v_num=0, tr


2022-08-21 17:36:51 Uploading - Uploading generated training model
2022-08-21 17:37:11 Completed - Training job completed
ProfilerReport-1661102926: NoIssuesFound
Training seconds: 378
Billable seconds: 378


In [None]:
# Display where the most recent training job's TensorBoard artifacts are stored
# (presumably an S3 URI under the configured TensorBoard output path — confirm in the SDK docs).
spot_estimator.latest_job_tensorboard_artifacts_path()

In [None]:
# Deployment is currently commented out.
# NOTE(review): the snippet refers to `estimator` and `instance_type`, neither of which is
# defined in the cells shown in this notebook (the trained estimator is `spot_estimator`);
# fix both names before enabling it.
# # deploy the trained model
# predictor=estimator.deploy(1, instance_type)
# Log directory name interpolated into the `tensorboard --logdir` command echoed in the
# next cell. Despite the variable name, the value is "lightning_logs".
tensorflow_logs_path = "lightning_logs"

In [None]:
# Region of the SageMaker session; exported so that processes started from this kernel
# (e.g. TensorBoard reading from S3) inherit it.
aws_region = sagemaker_session.boto_region_name
# `!AWS_REGION=...` would only set the variable inside a throwaway subshell and have no
# lasting effect; `%env` sets it in the kernel's own environment.
%env AWS_REGION=$aws_region
# Print (not run) the TensorBoard command for the local log directory.
!echo tensorboard --logdir {tensorflow_logs_path}

In [None]:
# Serve TensorBoard directly from the S3 prefix the training job writes its logs to.
# AWS_REGION must be the bucket's region: the previous value "eu-east-1" is not an AWS
# region, so the session's region and default bucket are interpolated instead of
# hard-coding them. This cell blocks until TensorBoard is interrupted.
!AWS_REGION={aws_region} tensorboard --logdir s3://{bucket}/tensorboard/

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.10.0 at http://localhost:6007/ (Press CTRL+C to quit)
