In [2]:
import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [3]:
import sagemaker
import uuid

# Open a SageMaker session and report which SDK version this notebook runs against.
sagemaker_session = sagemaker.Session()
print('SageMaker version: ' + sagemaker.__version__)

# The session's default bucket is used for data uploads and checkpoints below.
bucket = sagemaker_session.default_bucket()
# NOTE(review): `prefix` is not referenced by any other visible cell and its value
# names a different demo; looks like a leftover -- confirm before removing.
prefix = 'sagemaker/DEMO-pytorch-cnn-cifar10'

role = sagemaker.get_execution_role()

# An 8-character random suffix gives every notebook run its own checkpoint location.
checkpoint_suffix = uuid.uuid4().hex[:8]
checkpoint_s3_path = 's3://{}/checkpoints/checkpoint-{}'.format(bucket, checkpoint_suffix)

print('Checkpointing Path: {}'.format(checkpoint_s3_path))

SageMaker version: 2.103.0
Checkpointing Path: s3://sagemaker-us-east-1-551329315830/checkpoints/checkpoint-7b2a7d42


In [4]:
# Load the hourly electricity table, then pin the dtype of every column the
# training metadata refers to (one astype call instead of five column rewrites).
electricity_df = pd.read_csv("tft/outputs/data/electricity/hourly_electricity.csv")

electricity_df = electricity_df.astype({
    'hours_from_start': int,
    'power_usage': float,
    'hour': int,
    'day_of_week': int,
    'categorical_id': str,
})

In [12]:
import os
import json
def save_local_and_upload_s3(data_df, sagemaker_session, bucket, dir_name="timeseries_data", data_filename="data"):
    """Write ``data_df`` to a local parquet file and upload its directory to S3.

    Args:
        data_df: Object exposing ``to_parquet(path)`` (a pandas DataFrame in
            this notebook).
        sagemaker_session: Object exposing
            ``upload_data(path=..., bucket=..., key_prefix=...)``.
        bucket: Name of the destination S3 bucket.
        dir_name: Local directory to write into; created if missing. The same
            string is used for the S3 key prefix ``data/<dir_name>``.
        data_filename: File name without extension; ``.parquet`` is appended.

    Returns:
        Whatever ``sagemaker_session.upload_data`` returns (the recorded
        notebook output shows the S3 URI of the uploaded prefix).

    Note:
        The whole ``dir_name`` directory is uploaded, not only the parquet file.
    """
    # Report the directory state, then create it with exist_ok=True so a directory
    # appearing between the check and the creation cannot raise.
    # The messages say "Checkpointing" although this is the data directory; the
    # wording is kept unchanged so the printed output stays the same.
    if os.path.isdir(dir_name):
        print("Checkpointing directory {} exists".format(dir_name))
    else:
        print("Creating Checkpointing directory {}".format(dir_name))
    os.makedirs(dir_name, exist_ok=True)

    parquet_path = '{}/{}.parquet'.format(dir_name, data_filename)
    data_df.to_parquet(parquet_path)
    print("saved raw data to {}".format(parquet_path))

    return sagemaker_session.upload_data(path=dir_name, bucket=bucket, key_prefix='data/{}'.format(dir_name))



def metadata_json_upload_s3(training_metadata, sagemaker_session, bucket, dir_name="timeseries_data", metadata_filename="data_metadata"):
    """Dump ``training_metadata`` to a local JSON file and upload its directory to S3.

    Args:
        training_metadata: JSON-serialisable object (a dict in this notebook).
        sagemaker_session: Object exposing
            ``upload_data(path=..., bucket=..., key_prefix=...)``.
        bucket: Name of the destination S3 bucket.
        dir_name: Local directory to write into; created if missing. The same
            string is used for the S3 key prefix ``data/<dir_name>``.
        metadata_filename: File name without extension; ``.json`` is appended.

    Returns:
        Whatever ``sagemaker_session.upload_data`` returns (the recorded
        notebook output shows the S3 URI of the uploaded prefix).

    Note:
        The whole ``dir_name`` directory is uploaded, so any other file already
        in it (for example a previously saved parquet file) is uploaded again.
    """
    # Report the directory state, then create it with exist_ok=True so a directory
    # appearing between the check and the creation cannot raise.
    # The messages say "Checkpointing" although this is the data directory; the
    # wording is kept unchanged so the printed output stays the same.
    if os.path.isdir(dir_name):
        print("Checkpointing directory {} exists".format(dir_name))
    else:
        print("Creating Checkpointing directory {}".format(dir_name))
    os.makedirs(dir_name, exist_ok=True)

    json_path = '{}/{}.json'.format(dir_name, metadata_filename)
    with open(json_path, 'w') as fp:
        json.dump(training_metadata, fp)
    # Announce success only after the file has been flushed and closed.
    print("saved metadata to {}".format(json_path))

    return sagemaker_session.upload_data(path=dir_name, bucket=bucket, key_prefix='data/{}'.format(dir_name))


In [13]:
# max_prediction_length = 24
# max_encoder_length = 24 * 7
# num_epochs = 100
# early_stopping_patience = 5
# multiprocessing_workers = 5


# dropout_rate = 0.1
# hidden_layer_size = 160
# learning_rate = 0.001
# minibatch_size = 64
# max_gradient_norm = 0.01
# num_heads = 4
# stack_size =  1

In [14]:
import pandas as pd 

# Persist the typed frame as parquet and push its directory to the session bucket.
# The returned S3 location is what the estimator's fit() receives further down.
inputs = save_local_and_upload_s3(
    electricity_df,
    sagemaker_session,
    bucket,
    dir_name="timeseries_data/electricity",
    data_filename="electricity_training_data",
)
inputs

Creating Checkpointing directory timeseries_data/electricity
saved raw data to timeseries_data/electricity/electricity_training_data.parquet


's3://sagemaker-us-east-1-551329315830/data/timeseries_data/electricity'

In [15]:
# Window sizes in time steps; time_idx is "hours_from_start", so presumably hours.
max_prediction_length = 24
max_encoder_length = 24 * 7

# Dataset description serialised to JSON and uploaded next to the training data.
# NOTE(review): presumably read by the training script to build the dataset --
# the script is not visible here, confirm the expected keys against it.
training_metadata = {
    'time_idx': "hours_from_start",
    'target': "power_usage",
    'group_ids': ["categorical_id"],
    # keep encoder length long (as it is in the validation set)
    'min_encoder_length': max_encoder_length,
    'max_encoder_length': max_encoder_length,
    'min_prediction_length': 1,
    'max_prediction_length': max_prediction_length,
    'static_categoricals': ["categorical_id"],
    'static_reals': [],
    'time_varying_known_categoricals': [],
    'variable_groups': {},
    'time_varying_known_reals': ["hours_from_start", "day_of_week", "hour"],
    'time_varying_unknown_categoricals': [],
    # NOTE(review): the target column is not listed here; confirm that is intended.
    'time_varying_unknown_reals': [],
    'target_normalizer': {
        "normalized_groups": ["categorical_id"],
        "normalization_transformation": 'softplus',
    },
    'add_relative_time_idx': True,
    'add_target_scales': True,
    'add_encoder_length': True,
    'allow_missing_timesteps': True,
}

# Hold out the last `max_prediction_length` steps: the cutoff is the largest time
# index minus the prediction length (cast to int so it is JSON-serialisable).
training_metadata['training_cutoff'] = int(
    electricity_df[training_metadata['time_idx']].max() - max_prediction_length
)
# Alternative without a hold-out window:
# training_metadata['training_cutoff'] = int(electricity_df[training_metadata['time_idx']].max())

# upload metadata
metadata_json_upload_s3(
    training_metadata,
    sagemaker_session,
    bucket,
    dir_name="timeseries_data/electricity",
    metadata_filename="electricity_metadata",
)

Checkpointing directory timeseries_data/electricity exists
saved metadata to timeseries_data/electricity/electricity_metadata.json


's3://sagemaker-us-east-1-551329315830/data/timeseries_data/electricity'

In [16]:
# Hyperparameters passed to the estimator; it receives them via `hyperparameters=`.
hyperparameters = {
    # Names of the files uploaded by the cells above.
    'data-filename': "electricity_training_data.parquet",
    'metadata-filename': "electricity_metadata.json",

    # Window sizes, kept in sync with the metadata cell.
    'max-prediction-length': max_prediction_length,
    'max-encoder-length': max_encoder_length,

    # Training loop settings (use 'num-epochs': 2 for a quick smoke run).
    'num-epochs': 100,
    'early-stopping-patience': 5,
    'multiprocessing-workers': 5,

    # Model and optimiser settings.
    'dropout-rate': 0.1,
    'hidden-layer-size': 160,
    'learning-rate': 0.001,
    'minibatch-size': 64,
    'max-gradient-norm': 0.01,
    'num-heads': 4,
}


In [18]:
# Spot-training settings handed to the estimator below.
use_spot_instances = True
max_run = 36000  # in seconds, after this, job will be terminated

# max_wait is only set for spot training: ten times the run limit, otherwise None.
if use_spot_instances:
    max_wait = max_run * 10
else:
    max_wait = None

# NOTE(review): not referenced by any other visible cell -- confirm it is still needed.
local_image_name = 'pytorch-tft-container-test'

In [19]:
from sagemaker.pytorch import PyTorch
from sagemaker.estimator import Estimator
from sagemaker.debugger import TensorBoardOutputConfig


# TensorBoard logs written to /lightning_logs inside the container are configured
# to go to S3. The destination is derived from the session's default bucket rather
# than a hard-coded, account-specific bucket name, so the notebook runs unchanged
# in other accounts and regions.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path='s3://{}/tensorboard'.format(bucket),
    container_local_output_path='/lightning_logs'
)

# PyTorch estimator for the TFT training script. Spot usage, run limit and wait
# limit come from the configuration cell above; checkpoints go to the per-run
# S3 path created at the top of the notebook.
spot_estimator = PyTorch(
    entry_point='../TFT_docker/TFT.py',
    dependencies=['../TFT_docker/requirements.txt'],
    role=role,
    framework_version='1.7.1',
    py_version='py3',
    instance_count=1,
    # Alternatives tried during development: 'local', 'ml.p2.xlarge'
    instance_type='ml.p3.2xlarge',
    base_job_name='tft-pytorch-spot-1',
    hyperparameters=hyperparameters,
    checkpoint_s3_uri=checkpoint_s3_path,
    debugger_hook_config=False,
    input_mode='File',
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    tensorboard_output_config=tensorboard_output_config,
)

# Start the training job on the uploaded data and stream all logs into the notebook.
spot_estimator.fit(
    inputs,
    logs='All',
)

2022-09-01 04:48:25 Starting - Starting the training job...
2022-09-01 04:48:54 Starting - Preparing the instances for trainingProfilerReport-1662007705: InProgress
.........
2022-09-01 04:50:23 Downloading - Downloading input data...
2022-09-01 04:50:43 Training - Downloading the training image..................
2022-09-01 04:53:49 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-01 04:53:52,441 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-01 04:53:52,472 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-01 04:53:52,481 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-09-01 04:53:53,000 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:

[34m  Building wheel for fastparquet (setup.py): finished with status 'done'
  Created wheel for fastparquet: filename=fastparquet-0.8.0-cp36-cp36m-linux_x86_64.whl size=1256932 sha256=5131ff0851b643c6ee565037b1b69a88607c52dfd70393df2224cd0ab7ee5983
  Stored in directory: /root/.cache/pip/wheels/6a/6a/4f/0fd8e8bcbc4b5b751186e363b5b03975d8643eee2975eed2ca
  Building wheel for idna-ssl (setup.py): started[0m
[34m  Building wheel for idna-ssl (setup.py): finished with status 'done'
  Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3161 sha256=ce1d8f65e34bdfddf988e0276a7130e94114c9ad4a43721c6ea2367e96cbfb23
  Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13
  Building wheel for pyperclip (setup.py): started
  Building wheel for pyperclip (setup.py): finished with status 'done'
  Created wheel for pyperclip: filename=pyperclip-1.8.2-py3-none-any.whl size=11106 sha256=d8370c3c9180018934c9a51d5b41bd24307a1989

[34mDistributed training - False[0m
[34mCheckpointing directory /opt/ml/checkpoints exists[0m
[34mDevice Type: cuda[0m
[34mLoad Time Series dataset from S3[0m
[34mcreating dataloader[0m
[34mcreate model trainer[0m
[34mcreate model from dataset[0m
[34mNumber of parameters in network: 2443.6k[0m
[34mtraining model[0m
[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]#015Validation sanity check: 100%|██████████| 1/1 [00:02<00:00,  2.62s/it]#015                                                                      #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/33371 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/33371 [00:00<?, ?it/s] #015Epoch 0:   0%|          | 1/33371 [00:01<11:55:13,  1.29s/it]#015Epoch 0:   0%|          | 1/33371 [00:01<11:55:38,  1.29s/it, loss=74, v_num=0, train_loss_step=74.00]#015Epoch 0:   0%|          | 2/33371 [00:01<6:46:16,  1.37it/s, loss=74, v_num=0,

[34moch 0:   0%|          | 77/33371 [00:14<1:41:41,  5.46it/s, loss=40.8, v_num=0, train_loss_step=18.70]#015Epoch 0:   0%|          | 78/33371 [00:14<1:41:38,  5.46it/s, loss=40.8, v_num=0, train_loss_step=18.70]#015Epoch 0:   0%|          | 78/33371 [00:14<1:41:38,  5.46it/s, loss=40.4, v_num=0, train_loss_step=31.30]#015Epoch 0:   0%|          | 79/33371 [00:14<1:41:28,  5.47it/s, loss=40.4, v_num=0, train_loss_step=31.30]#015Epoch 0:   0%|          | 79/33371 [00:14<1:41:29,  5.47it/s, loss=41.8, v_num=0, train_loss_step=66.20]#015Epoch 0:   0%|          | 80/33371 [00:14<1:41:19,  5.48it/s, loss=41.8, v_num=0, train_loss_step=66.20]#015Epoch 0:   0%|          | 80/33371 [00:14<1:41:19,  5.48it/s, loss=43.8, v_num=0, train_loss_step=71.90]#015Epoch 0:   0%|          | 81/33371 [00:14<1:41:12,  5.48it/s, loss=43.8, v_num=0, train_loss_step=71.90]#015Epoch 0:   0%|          | 81/33371 [00:14<1:41:13,  5.48it/s, loss=43.4, v_num=0, train_loss_step=20.90]#015Epoch 0:   0%|          |

[34m   | 155/33371 [00:27<1:38:47,  5.60it/s, loss=28.9, v_num=0, train_loss_step=19.90]#015Epoch 0:   0%|          | 155/33371 [00:27<1:38:48,  5.60it/s, loss=32.6, v_num=0, train_loss_step=87.90]#015Epoch 0:   0%|          | 156/33371 [00:27<1:38:45,  5.61it/s, loss=32.6, v_num=0, train_loss_step=87.90]#015Epoch 0:   0%|          | 156/33371 [00:27<1:38:46,  5.60it/s, loss=32.4, v_num=0, train_loss_step=16.70]#015Epoch 0:   0%|          | 157/33371 [00:27<1:38:41,  5.61it/s, loss=32.4, v_num=0, train_loss_step=16.70]#015Epoch 0:   0%|          | 157/33371 [00:27<1:38:41,  5.61it/s, loss=35.7, v_num=0, train_loss_step=88.50]#015Epoch 0:   0%|          | 158/33371 [00:28<1:38:36,  5.61it/s, loss=35.7, v_num=0, train_loss_step=88.50]#015Epoch 0:   0%|          | 158/33371 [00:28<1:38:36,  5.61it/s, loss=31.3, v_num=0, train_loss_step=52.20]#015Epoch 0:   0%|          | 159/33371 [00:28<1:38:34,  5.62it/s, loss=31.3, v_num=0, train_loss_step=52.20]#015Epoch 0:   0%|          | 159/33371

[34mtep=16.60]#015Epoch 0:   1%|          | 232/33371 [00:40<1:36:53,  5.70it/s, loss=29.5, v_num=0, train_loss_step=18.00]#015Epoch 0:   1%|          | 233/33371 [00:40<1:36:49,  5.70it/s, loss=29.5, v_num=0, train_loss_step=18.00]#015Epoch 0:   1%|          | 233/33371 [00:40<1:36:50,  5.70it/s, loss=29.5, v_num=0, train_loss_step=16.50]#015Epoch 0:   1%|          | 234/33371 [00:41<1:36:48,  5.71it/s, loss=29.5, v_num=0, train_loss_step=16.50]#015Epoch 0:   1%|          | 234/33371 [00:41<1:36:48,  5.71it/s, loss=29.6, v_num=0, train_loss_step=18.20]#015Epoch 0:   1%|          | 235/33371 [00:41<1:36:46,  5.71it/s, loss=29.6, v_num=0, train_loss_step=18.20]#015Epoch 0:   1%|          | 235/33371 [00:41<1:36:46,  5.71it/s, loss=34.8, v_num=0, train_loss_step=122.0]#015Epoch 0:   1%|          | 236/33371 [00:41<1:36:44,  5.71it/s, loss=34.8, v_num=0, train_loss_step=122.0]#015Epoch 0:   1%|          | 236/33371 [00:41<1:36:44,  5.71it/s, loss=34.8, v_num=0, train_loss_step=15.50]#015

[34mss=30.9, v_num=0, train_loss_step=14.50]#015Epoch 0:   1%|          | 310/33371 [00:54<1:36:41,  5.70it/s, loss=30.9, v_num=0, train_loss_step=14.50]#015Epoch 0:   1%|          | 310/33371 [00:54<1:36:41,  5.70it/s, loss=28.7, v_num=0, train_loss_step=24.20]#015Epoch 0:   1%|          | 311/33371 [00:54<1:36:41,  5.70it/s, loss=28.7, v_num=0, train_loss_step=24.20]#015Epoch 0:   1%|          | 311/33371 [00:54<1:36:41,  5.70it/s, loss=26.6, v_num=0, train_loss_step=17.60]#015Epoch 0:   1%|          | 312/33371 [00:54<1:36:43,  5.70it/s, loss=26.6, v_num=0, train_loss_step=17.60]#015Epoch 0:   1%|          | 312/33371 [00:54<1:36:44,  5.70it/s, loss=26.6, v_num=0, train_loss_step=14.50]#015Epoch 0:   1%|          | 313/33371 [00:54<1:36:42,  5.70it/s, loss=26.6, v_num=0, train_loss_step=14.50]#015Epoch 0:   1%|          | 313/33371 [00:54<1:36:42,  5.70it/s, loss=24.1, v_num=0, train_loss_step=17.20]#015Epoch 0:   1%|          | 314/33371 [00:55<1:36:42,  5.70it/s, loss=24.1, v_num

[34m [01:07<1:36:02,  5.72it/s, loss=40.5, v_num=0, train_loss_step=14.90]#015Epoch 0:   1%|          | 387/33371 [01:07<1:36:02,  5.72it/s, loss=40.3, v_num=0, train_loss_step=18.00]#015Epoch 0:   1%|          | 388/33371 [01:07<1:36:01,  5.72it/s, loss=40.3, v_num=0, train_loss_step=18.00]#015Epoch 0:   1%|          | 388/33371 [01:07<1:36:02,  5.72it/s, loss=42.3, v_num=0, train_loss_step=75.40]#015Epoch 0:   1%|          | 389/33371 [01:07<1:36:04,  5.72it/s, loss=42.3, v_num=0, train_loss_step=75.40]#015Epoch 0:   1%|          | 389/33371 [01:07<1:36:04,  5.72it/s, loss=42.5, v_num=0, train_loss_step=41.70]#015Epoch 0:   1%|          | 390/33371 [01:08<1:36:04,  5.72it/s, loss=42.5, v_num=0, train_loss_step=41.70]#015Epoch 0:   1%|          | 390/33371 [01:08<1:36:04,  5.72it/s, loss=38.2, v_num=0, train_loss_step=16.40]#015Epoch 0:   1%|          | 391/33371 [01:08<1:36:03,  5.72it/s, loss=38.2, v_num=0, train_loss_step=16.40]#015Epoch 0:   1%|          | 391/33371 [01:08<1:36:0

KeyboardInterrupt: 

In [None]:
# Display the S3 location returned by save_local_and_upload_s3 for the training data.
inputs

In [None]:
# Print column names, dtypes and non-null counts to check the casts applied at load time.
electricity_df.info()

In [None]:
# Number of distinct series identifiers (dropna=False matches len(unique())).
electricity_df['categorical_id'].nunique(dropna=False)

In [None]:
# Count rows where `id` and `categorical_id` differ. The original line was missing
# its closing parenthesis and could not be parsed.
# NOTE(review): `categorical_id` is cast to str at load time; if `id` has another
# dtype every row compares as different -- confirm the dtypes match.
(electricity_df['id'] != electricity_df['categorical_id']).sum()

In [None]:
# NOTE(review): repeats the .info() call from two cells earlier; likely redundant.
electricity_df.info()

In [None]:
# Preview the first five rows of the training frame (five is the default for head()).
electricity_df.head()