In [21]:
# Silence library warnings before the heavy imports below emit them.
import warnings
warnings.filterwarnings('ignore')

# --- standard library ---
import argparse
import ast
import copy
import os
import pickle
from datetime import datetime
from pathlib import Path

# --- third party ---
import torch
import numpy as np
import pandas as pd
import pytorch_lightning as pl  #==1.5.10
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_forecasting #==0.9.2
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, RMSE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
# `Workspace` was previously misspelled as `WOrkspace`, which raises ImportError.
from azureml.core import Dataset, Datastore, Run, Workspace
from dateutil.relativedelta import relativedelta

# Select the 'file_system' strategy for sharing tensors between processes.
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

2022-07-12 19:53:26.767294: E tensorflow/core/lib/monitoring/collection_registry.cc:81] Cannot register 2 metrics with the same name: /tensorflow/api/oss-keras/optimizers


AlreadyExistsError: Another metric with the same name already exists.

In [None]:
# `data` is expected to be a DataFrame with the columns:
# year, month, country, product_code, quantity, temperature

# Parameters

In [None]:
# Names of additional time-varying *unknown* real-valued covariates (columns of `data`).
features = []

# Run configuration.
# List-valued options are kept as string-encoded Python literals and decoded with
# ast.literal_eval below.
# Entries marked TODO were empty in the template; the values here are placeholders
# that must be reviewed before a real run.
args = {
    "data_path": None,  # TODO: location of the input data
    "max_epochs": 30,  # TODO: placeholder training epoch budget
    "n_gpu": 1,
    "max_prediction_length": 12,  # forecast horizon
    "max_encoder_length": 24,  # look-back window
    "group_id": "['country']",  # columns identifying one series, e.g. ['country', 'sku'] -- TODO confirm
    "target": "quantity",  # target variable to forecast -- TODO confirm
    "time_varying_known_reals": "['time_idx']",
    "time_varying_known_categoricals": "['month']",
    "time_varying_unknown_categoricals": "[]",
    "time_varying_unknown_reals": str(features),

    "hyperparameter_tuning": 0,  # 1 runs the Optuna study, anything else uses the values below
    "allow_missing_timesteps": True,
    "batch_size": 32,
    "gradient_clip_val": 0.11910545520702305,
    "learning_rate": 0.0008,
    "hidden_size": 14,
    "attention_head_size": 4,
    "dropout": 0.17212,
    "hidden_continuous_size": 11,
    "output_size": 1,
    "reduce_on_plateau_patience": 4,
    "stop_randomization": True,
    "num_workers": 6,
    "accelerator": 'dp',
    "weights_summary": 'top',
    "limit_train_batches": 30,
    "min_delta": 0.1,
    "monitor": 'val_loss',
}

# --- data / horizon ---
data_path = args['data_path']
max_epochs = args['max_epochs']
n_gpu = args['n_gpu']
max_prediction_length = args['max_prediction_length']
max_encoder_length = args['max_encoder_length']

# --- column roles (decoded from their string form into real lists) ---
group_id = ast.literal_eval(args['group_id'])
target = args['target']
time_varying_known_reals = ast.literal_eval(args['time_varying_known_reals'])
time_varying_known_categoricals = ast.literal_eval(args['time_varying_known_categoricals'])
time_varying_unknown_categoricals = ast.literal_eval(args['time_varying_unknown_categoricals'])
time_varying_unknown_reals = ast.literal_eval(args['time_varying_unknown_reals'])

# --- model / training hyperparameters ---
hyperparameter_tuning = args['hyperparameter_tuning']
allow_missing_timesteps = args['allow_missing_timesteps']
batch_size = args['batch_size']
gradient_clip_val = args['gradient_clip_val']
learning_rate = args['learning_rate']
hidden_size = args['hidden_size']
attention_head_size = args['attention_head_size']
dropout = args['dropout']
hidden_continuous_size = args['hidden_continuous_size']
output_size = args['output_size']
reduce_on_plateau_patience = args['reduce_on_plateau_patience']
stop_randomization = args['stop_randomization']
num_workers = args['num_workers']
accelerator = args['accelerator']
weights_summary = args['weights_summary']
limit_train_batches = args['limit_train_batches']
min_delta = args['min_delta']
monitor = args['monitor']

In [None]:
def _as_int(series):
    """Return `series` as plain integers, even when it is already string- or category-typed.

    Going through str -> numeric makes this cell safe to re-run after `year`/`month`
    have been converted to categoricals below.
    """
    return pd.to_numeric(series.astype(str)).astype(int)


# Month counter per row, shifted so the earliest month in `data` is 0.
data["time_idx"] = _as_int(data["year"]) * 12 + _as_int(data["month"])
data["time_idx"] -= data["time_idx"].min()

for col in data.columns:
    if col in ('year', 'month'):
        # Use string-valued categories so they match the prediction cell, which
        # builds `month` with .astype(str).astype("category").
        data[col] = _as_int(data[col]).astype(str).astype('category')
    elif col == 'time_idx':
        data[col] = data[col].astype('int')
    else:
        # NOTE(review): every remaining column is cast to float, including any
        # identifier columns -- this raises if they hold non-numeric strings.
        data[col] = data[col].astype('float')

# Create dataset and dataloader

## Training set

In [None]:
# Hold out the last `max_prediction_length` time steps; they are used for validation.
training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    data.loc[data["time_idx"] <= training_cutoff],
    time_idx="time_idx",
    target=target,
    group_ids=group_id,  # the parameter cell defines `group_id` (singular)

    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,

    time_varying_known_categoricals=time_varying_known_categoricals,
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_categoricals=time_varying_unknown_categoricals,
    time_varying_unknown_reals=time_varying_unknown_reals,

    # The keyword is `categorical_encoders` (plural).
    categorical_encoders={'month': pytorch_forecasting.data.encoders.NaNLabelEncoder(add_nan=True)},

    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=allow_missing_timesteps,
)

## Validation set

In [None]:
# Build the validation set from the training set's configuration.
# predict=True means: predict the last max_prediction_length points in time for each series.
# `stop_randomization` comes from the parameter cell instead of being hard-coded here.
validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    predict=True,
    stop_randomization=stop_randomization,
)

## Create dataloader for model

In [None]:
# Create dataloaders for the model.
# `batch_size` and `num_workers` come from the parameter cell; this cell used to
# overwrite batch_size with 128 and force num_workers=0, ignoring the configuration.
train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=num_workers,
)
# Validation uses a 10x larger batch than training.
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=batch_size * 10,
    num_workers=num_workers,
)

# Create baseline model

In [None]:
# Baseline mean absolute error: predict the next value as the last available value from the history.
# Each validation batch is (inputs, (target, weight)); only the targets are collected.
target_batches = []
for _inputs, (batch_target, _weight) in val_dataloader:
    target_batches.append(batch_target)
actuals = torch.cat(target_batches)

baseline_predictions = Baseline().predict(val_dataloader)

baseline_abs_error = (actuals - baseline_predictions).abs()
baseline_abs_error.mean().item()

# Train the temporal fusion transformer

## Hyperparameter tuning

In [None]:
# Configure callbacks and logging shared by the tuning and training steps.
pl.seed_everything(42)
# Early-stopping criteria come from the parameter cell (`monitor`, `min_delta`),
# which were previously defined but never used.
early_stop_callback = EarlyStopping(monitor=monitor, min_delta=min_delta, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

if hyperparameter_tuning == 1:
    # Search for hyperparameters with Optuna.
    study = optimize_hyperparameters(
        train_dataloader,
        val_dataloader,
        model_path="optuna_test",
        n_trials=200,
        max_epochs=50,
        gradient_clip_val_range=(0.01, 1.0),
        hidden_size_range=(8, 128),
        hidden_continuous_size_range=(8, 128),
        attention_head_size_range=(1, 4),
        learning_rate_range=(0.001, 0.1),
        dropout_range=(0.1, 0.3),
        trainer_kwargs=dict(limit_train_batches=limit_train_batches),
        reduce_on_plateau_patience=reduce_on_plateau_patience,
        use_learning_rate_finder=True,  # use Optuna to find ideal learning rate or use in-built learning rate finder
    )
    best_param = study.best_trial.params
    print('best_param', best_param)
else:
    # No tuning requested: fall back to the fixed values from the parameter cell.
    # The keys must match the ones read by the training cell.
    best_param = {
        'gradient_clip_val': gradient_clip_val,
        'learning_rate': learning_rate,
        'hidden_size': hidden_size,
        'hidden_continuous_size': hidden_continuous_size,
        'attention_head_size': attention_head_size,
        'dropout': dropout,
    }

## Train model

In [None]:
# NOTE(review): `accelerator` is read in the parameter cell but is not passed to the Trainer here.
trainer = pl.Trainer(
    max_epochs=max_epochs,
    gpus=n_gpu,
    weights_summary=weights_summary,
    gradient_clip_val=best_param['gradient_clip_val'],
    limit_train_batches=limit_train_batches,  # limit on the training batches used per epoch
    # fast_dev_run=True,  # uncomment to check that the network or dataset has no serious bugs
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=best_param['learning_rate'],
    hidden_size=best_param['hidden_size'],
    attention_head_size=best_param['attention_head_size'],
    # Prefer a tuned dropout when the study produced one, else the configured value.
    dropout=best_param.get('dropout', dropout),
    hidden_continuous_size=best_param['hidden_continuous_size'],
    output_size=output_size,  # was the undefined name `out_size`
    loss=RMSE(),
    log_interval=10,  # log every 10 batches
    reduce_on_plateau_patience=reduce_on_plateau_patience,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Fit the network. The keywords are `train_dataloaders` / `val_dataloaders`
# (previously misspelled as "*_dateloaders", together with the variable names).
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# load the best model according to the validation loss
# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# Evaluate performance

In [None]:
# Mean absolute error of the best checkpoint on the validation set.
# Each batch is (inputs, y); the first element of y is the target.
validation_targets = []
for _inputs, batch_y in val_dataloader:
    validation_targets.append(batch_y[0])
actuals = torch.cat(validation_targets)

predictions = best_tft.predict(val_dataloader)

validation_abs_error = (actuals - predictions).abs()
validation_abs_error.mean()

# Prediction

In [None]:
# Encoder part: the most recent `max_encoder_length` time steps of the known data.
latest_time_idx = data["time_idx"].max()
encoder_data = data[data["time_idx"] > latest_time_idx - max_encoder_length]

# Decoder part: repeat the last known rows once per future month, advancing `date`.
# The covariates are simply forward-filled from the last observation; in a real-world
# dataset the future values of known covariates should be specified explicitly.
# Requires `data` to carry a datetime `date` column.
last_data = data[data["time_idx"] == latest_time_idx]
decoder_data = pd.concat(
    [
        last_data.assign(date=last_data["date"] + pd.offsets.MonthBegin(step))
        for step in range(1, max_prediction_length + 1)
    ],
    ignore_index=True,
)

# Add a time index consistent with "data". It is derived from the advanced `date`:
# the copied `year`/`month` columns still hold the last observed month, which would
# give every decoder row the same index.
decoder_data["time_idx"] = decoder_data["date"].dt.year * 12 + decoder_data["date"].dt.month
decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()

# adjust additional time feature(s)
decoder_data["month"] = decoder_data.date.dt.month.astype(str).astype("category")  # categories have to be strings

# combine encoder and decoder data
new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)

# Shift every column except `date` down by `max_prediction_length` rows, then drop the
# leading rows that the shift left empty.
# NOTE(review): the shift is positional over the whole frame, not per series -- confirm
# this is intended when `data` holds more than one group.
for col in new_prediction_data.columns:
    if col != 'date':
        new_prediction_data[col] = new_prediction_data[col].shift(max_prediction_length)

# .copy() so the assignment below writes to an independent frame, not a slice.
new_prediction_data = new_prediction_data.iloc[max_prediction_length:, :].copy()
new_prediction_data['time_idx'] = new_prediction_data['time_idx'].map(int)

In [None]:
# Forecast on the assembled frame; return_x=True also returns the network inputs.
prediction_with_inputs = best_tft.predict(new_prediction_data, return_x=True)
new_predictions, new_x = prediction_with_inputs
new_predictions

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

# ---- Forecast table ---------------------------------------------------------
new_raw_prediction, new_x = best_tft.predict(new_prediction_data, return_x=True)

# Index frame for the predicted samples; `time_idx` is dropped further down.
prediction_df = training.x_to_index(new_x)
prediction_df['Max_data_date'] = encoder_data['date'].max()

# One column per forecast step (0 .. horizon-1).
pred = pd.DataFrame(new_raw_prediction.cpu().numpy()).astype('float')

# Join index and forecasts side by side. This previously concatenated the
# `predictions` tensor from the evaluation cell instead of `prediction_df`.
predictions_df = pd.concat([prediction_df, pred], axis=1)
predictions_df = predictions_df.drop(columns=['time_idx'])

# Wide -> long: one row per (series, forecast step). Using `group_id` for the id
# columns keeps every group column out of the melted values (was hard-coded 'country').
predictions_df = predictions_df.melt(
    id_vars=[*group_id, 'Max_data_date'],
    var_name='Predicted_month',
    value_name='Predicted',
)

predictions_df['Predicted'] = np.floor(predictions_df['Predicted'])
# Forecast steps become 1-based month offsets from the last observed date.
predictions_df['Predicted_month'] = predictions_df['Predicted_month'].astype(int) + 1
predictions_df['Date'] = [
    (start + relativedelta(months=int(offset))).strftime("%Y-%m")
    for start, offset in zip(predictions_df['Max_data_date'], predictions_df['Predicted_month'])
]

# list.extend() returns None, so build the list by concatenation instead.
default_features = ['month', 'time_idx', 'relative_time_idx'] + list(time_varying_unknown_reals)

# ---- Validation error -------------------------------------------------------
print('--------------\nmodel error:')
print((actuals - predictions).abs().mean())

print((actuals, predictions))
mape = mean_absolute_percentage_error(actuals, predictions)
print(mape)

# NOTE(review): `output_df` was never defined in this notebook; it is assumed to be
# the forecast table plus an accuracy column -- confirm against downstream use.
output_df = predictions_df.assign(Accuracy=100 - round(mape * 100, 2))

# ---- Interpretation ---------------------------------------------------------
raw_predictions, x = best_tft.predict(val_dataloader, mode='raw', return_x=True)
interpretation = best_tft.interpret_output(raw_predictions, reduction='sum')

# Encoder variable importances scaled by 100. Despite the variable name, these values
# come from interpret_output, not from the `shap` package.
# NOTE(review): assumes `default_features` lists the encoder variables in the same
# order and number as interpretation['encoder_variables'] -- confirm.
shap = pd.DataFrame({
    'Features': default_features,
    'Importance': interpretation['encoder_variables'].numpy() * 100,
}).sort_values(by='Importance', ascending=False)

# Attention scaled by 100, indexed from -(n-1) up to 0.
# The data column is 'Month'; the frame used to be declared with 'Months', which
# left an extra, empty column behind.
attention = interpretation['attention'].numpy()
att = pd.DataFrame({
    'Month': list(range(-(len(attention) - 1), 1)),
    'Attention': attention * 100,
})