<a href="https://colab.research.google.com/github/Bassie1/notebooks/blob/main/model_github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import warnings

# Silence every warning for the rest of the session so that library warnings
# do not clutter the cell output.
# NOTE(review): this also hides deprecation and data-quality warnings.
warnings.filterwarnings("ignore")  # avoid printing out absolute paths

# Left disabled: the data file is addressed relative to the current working
# directory, so no directory change is performed here.
#os.chdir("../../..")


In [None]:
import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch
#from pytorch_forecasting import pytorch_lightning as pl
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting.data.encoders import NaNLabelEncoder

In [None]:
# Record the installed PyTorch version next to the results.
torch.__version__

'1.9.0+cu102'

In [None]:
# Record the installed PyTorch Lightning version next to the results.
pl.__version__

'1.3.3'

In [None]:
# Load the raw data set from the working directory.
df = pd.read_csv("github_data.csv")

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# The dataset below declares `month` as a categorical, so store it as
# string-valued categories.
df["month"] = df["month"].astype(str).astype("category")

In [None]:
from pytorch_forecasting.data import (
    TimeSeriesDataSet,
    GroupNormalizer
)

# Forecast horizon and history window, both counted in `time_idx` steps.
max_prediction_length = 3  # forecast 3 steps ahead (3 months, reduced from 6)
# NOTE(review): earlier notes mentioned 12 or 24 months of history, but the
# value actually used is 10 steps -- confirm that 10 is intended.
max_encoder_length = 10
# Hold out the last `max_prediction_length` steps of the data for validation.
training_cutoff = df["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    df[lambda x: x.time_idx <= training_cutoff],
    allow_missings=True,
    # add_nan=True reserves a category for values not seen during training
    categorical_encoders={'product_id': NaNLabelEncoder(add_nan=True), 'month': NaNLabelEncoder(add_nan=True)},
    time_idx="time_idx",
    target="target",
    group_ids=["product_id"],
    # shortest usable sample is min_encoder_length + min_prediction_length
    # = 5 + 1 = 6 time steps
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["product_id"],
    time_varying_known_categoricals=['month'],
    time_varying_known_reals=[
        "time_idx",
        "feature_1",
        'feature_2'
    ],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        'target',
    ],
    target_normalizer=GroupNormalizer(
        groups=["product_id"], transformation='softplus'
    ),  # use softplus and normalize by group
    add_relative_time_idx=True,  # add as feature
    add_target_scales=True,  # add as feature
    add_encoder_length=True,  # add as feature
)

# create validation set (predict=True) which means to predict the
# last max_prediction_length points in time for each series
validation = TimeSeriesDataSet.from_dataset(
    training, df, predict=True, stop_randomization=True
)

# create dataloaders for model
batch_size = 128
train_dataloader = training.to_dataloader(
    train=True, batch_size=batch_size, num_workers=0
)
# Do not pass drop_last=True here: it would discard the incomplete final
# batch, which for a validation set smaller than `batch_size * 10` is the
# whole set, leaving nothing to evaluate or predict on.
val_dataloader = validation.to_dataloader(
    train=False, batch_size=batch_size * 10, num_workers=0
)

In [None]:
# Baseline CPU configuration: seed all RNGs, then build a trainer and a small TFT.
pl.seed_everything(42)
trainer = pl.Trainer(
    # gradient clipping is a tunable hyperparameter; it keeps the gradients
    # of the recurrent layers from diverging
    gradient_clip_val=0.1,
    gpus=0,
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    loss=QuantileLoss(),
    output_size=7,  # 7 quantiles by default
    # the learning rate matters a lot for training (it is irrelevant only
    # when a learning-rate finder is run instead)
    learning_rate=0.03,
    # lower the learning rate when the validation loss stalls for this many epochs
    reduce_on_plateau_patience=4,
    hidden_size=16,  # most influential hyperparameter after the learning rate
    hidden_continuous_size=8,  # keep <= hidden_size
    # number of attention heads; up to 4 can make sense for large datasets
    attention_head_size=1,
    dropout=0.1,  # values between 0.1 and 0.3 work well
    log_interval=0,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Configure callbacks, logger, trainer and the model that is actually trained.
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

trainer = pl.Trainer(
    max_epochs=30,
    # use one GPU when CUDA is available and fall back to the CPU otherwise,
    # instead of failing on machines without a GPU
    gpus=1 if torch.cuda.is_available() else 0,
    weights_summary="top",  # can change to "full"
    gradient_clip_val=0.1,
    # limit_train_batches=30,  # comment in to run validation every 30 training batches
    # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    # output_size is left at its default (the earlier configuration notes
    # 7 quantiles by default)
    loss=QuantileLoss(),
    reduce_on_plateau_patience=4,
    # 0 disables interval logging; added as a workaround for the histogram
    # logging error (pytorch-forecasting GitHub issue 376). Set to e.g. 10 to
    # log every 10 batches.
    log_interval=0,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Train the TFT on the training loader, validating on the held-out loader.
trainer.fit(tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)
    

In [None]:
# Persist the trainer and model objects to disk so they can be read back
# later without retraining.
# NOTE(review): torch.save pickles the full objects, which ties these files
# to the installed library versions.
torch.save(trainer, "trainer.pt")
torch.save(tft, "tft.pt")

In [None]:

# Read the saved model and trainer back in.
# NOTE(review): torch.load unpickles arbitrary objects -- only load files
# that were written by this notebook.
tft1 = torch.load('tft.pt')
trainer1 = torch.load("trainer.pt")

In [None]:
# Load the best model according to the validation loss.
# Early stopping is in use, so the best checkpoint is not necessarily the
# one from the last epoch. The path is taken from the reloaded trainer's
# checkpoint callback.
best_model_path = trainer1.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [None]:
# Mean absolute error on the validation set: gather the targets batch by
# batch, predict with the best model, and average the absolute differences.
actuals = torch.cat([batch_y[0] for _, batch_y in val_dataloader])
predictions = best_tft.predict(val_dataloader, show_progress_bar=True)
torch.abs(actuals - predictions).mean()

In [None]:
# calculate mean absolute error on validation set
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
# One prediction pass is enough: it returns the point predictions together
# with the network inputs (`x`) and the index of each predicted sample
# (`idx`), both of which are needed by later cells.
predictions, x, idx = best_tft.predict(
    val_dataloader, mode='prediction', show_progress_bar=True, return_x=True, return_index=True
)
(actuals - predictions).abs().mean()

In [None]:


# turn predictions into dataframe: one row per predicted sample, one column per
# forecast horizon. Horizons are 0-based, and the time_idx in `idx` is the
# time step of the first prediction, i.e. of horizon 0.
df_pred_before = pd.DataFrame(
    predictions.numpy(),
    index=pd.MultiIndex.from_frame(idx),
    columns=pd.RangeIndex(0, predictions.size(1), name="horizon"),
)

# change time_idx to correspond to each prediction: the value at horizon h
# belongs to time step `time_idx + h` (no "- 1", because horizon starts at 0)
df_pred = (
    df_pred_before
    .stack()
    .reset_index(["time_idx", "horizon"])
    .assign(time_idx=lambda x: x.time_idx + x.horizon)
    .set_index(["time_idx", "horizon"], append=True)[0]
    .unstack("horizon")
    .add_prefix("prediction_at_horizon_")
)

# add predictions to original dataframe
original_df_with_predictions = df.join(df_pred, on=df_pred.index.names)


In [None]:
# Raw predictions are a dictionary from which all kinds of information,
# including the quantiles, can be extracted.
raw_predictions, x = best_tft.predict(
    val_dataloader,
    mode="raw",
    show_progress_bar=True,
    return_x=True,
)


In [None]:
# Plot 10 different validation examples. The loop variable is passed through
# as the sample index (a constant 0 would draw the same example ten times) and
# is named so that it does not overwrite the `idx` index frame used elsewhere.
for sample_idx in range(10):
    best_tft.plot_prediction(x, raw_predictions, idx=sample_idx, add_loss_to_title=True);


In [None]:
# Rank validation samples by their mean SMAPE and plot the 10 with the
# largest loss.
predictions = best_tft.predict(val_dataloader)
mean_losses = SMAPE(reduction="none")(predictions, actuals).mean(dim=1)
indices = torch.argsort(mean_losses, descending=True)  # highest loss first
for idx in range(10):  # plot 10 examples
    best_tft.plot_prediction(
        x,
        raw_predictions,
        idx=indices[idx],
        add_loss_to_title=SMAPE(quantiles=best_tft.loss.quantiles),
    );


In [None]:

# Compare predictions with actuals broken down by input variable, then plot
# the comparison.
predictions, x = best_tft.predict(val_dataloader, return_x=True)
predictions_vs_actuals = best_tft.calculate_prediction_actual_by_variable(x, predictions)
best_tft.plot_prediction_actual_by_variable(predictions_vs_actuals);

In [None]:
# Quantile forecast for a single product, restricted to the training sample
# whose first predicted time step is 9.
best_tft.predict(
    training.filter(
        lambda index: (index.product_id == 'B0000AA8UL') & (index.time_idx_first_prediction == 9)
    ),
    mode="quantiles",
)

In [None]:
# Raw forecast for a single product, restricted to the training sample whose
# first predicted time step is 9, followed by a plot of that forecast.
# The dataset's group id column is `product_id`, so the filter must use that
# name (there is no `asin` column in the dataset index).
raw_prediction, x = best_tft.predict(
    training.filter(lambda x: (x.product_id == "B0000AA8UL") & (x.time_idx_first_prediction == 9)),
    mode="raw",
    return_x=True,
)
best_tft.plot_prediction(x, raw_prediction, idx=0);


In [None]:
# select the most recent `max_encoder_length` time steps as encoder (history) data.
# `df` is read from CSV without date parsing, so `date` is converted explicitly;
# pd.to_datetime leaves a column that is already datetime unchanged.
encoder_data = df[lambda x: x.time_idx > x.time_idx.max() - max_encoder_length].assign(
    date=lambda x: pd.to_datetime(x.date)
)

# select last known data point and create decoder data from it by repeating it and incrementing the month
# in a real world dataset, the covariates should not just be forward filled but specified explicitly
# to account for their future changes
last_data = df[lambda x: x.time_idx == x.time_idx.max()].assign(date=lambda x: pd.to_datetime(x.date))
decoder_data = pd.concat(
    [last_data.assign(date=lambda x: x.date + pd.offsets.MonthBegin(i)) for i in range(1, max_prediction_length + 1)],
    ignore_index=True,
)

# add time index consistent with "data": first derive a monotonically increasing
# month counter, then shift it so it continues right after the encoder data
decoder_data["time_idx"] = decoder_data["date"].dt.year * 12 + decoder_data["date"].dt.month
decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()

# adjust additional time feature(s)
decoder_data["month"] = decoder_data.date.dt.month.astype(str).astype("category")  # categories have to be strings

# combine encoder and decoder data
new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)

In [None]:
# Forecast beyond the end of the data and plot the first 10 series; future
# observations are hidden because none exist for the forecast window.
new_raw_predictions, new_x = best_tft.predict(
    new_prediction_data,
    return_x=True,
    mode="raw",
)

for idx in range(10):  # plot 10 examples
    best_tft.plot_prediction(
        new_x,
        new_raw_predictions,
        idx=idx,
        show_future_observed=False,
    );

In [None]:
# Interpret the model from the raw validation predictions, summed over all
# samples (reduction="sum"), and plot the result.
interpretation = best_tft.interpret_output(raw_predictions, reduction="sum")
best_tft.plot_interpretation(interpretation)

In [None]:
# Dependency of the prediction on one input variable: the variable is
# overwritten with each value of a grid and the validation set is re-predicted.
# The variable must be one that the dataset actually contains; "feature_1" is
# one of its known reals. The grid spans the range observed in the data.
dependency_variable = "feature_1"
dependency_values = np.linspace(df[dependency_variable].min(), df[dependency_variable].max(), 30)
dependency = best_tft.predict_dependency(
    val_dataloader.dataset, dependency_variable, dependency_values, show_progress_bar=True, mode="dataframe"
)
# plotting median and 25% and 75% percentile
agg_dependency = dependency.groupby(dependency_variable).normalized_prediction.agg(
    median="median", q25=lambda x: x.quantile(0.25), q75=lambda x: x.quantile(0.75)
)
ax = agg_dependency.plot(y="median")
ax.fill_between(agg_dependency.index, agg_dependency.q25, agg_dependency.q75, alpha=0.3);