In [1]:
!pip install -q -U pytorch-lightning==2.5.0
!pip install -q -U pytorch-forecasting==1.2.0
!pip install -q -U pytorch_optimizer==3.3.4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.4/819.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.9/181.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.9/221.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# Load data

In [3]:
# Read each split from its matching file; the first CSV column becomes the index.
# `train` has to come from train.csv: the cells below read train['num_sold']
# (the forecasting target) and fit the model on it.
train = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv', index_col=0)
test = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv', index_col=0)

# Preprocessing data

Fill the missing values, then drop any rows that still contain `NaN`s.

For each affected ('country', 'store', 'product') series, select its rows and apply the ffill() method to the 'num_sold' column to propagate the last valid observation forward within that series.

In [None]:
# Series (country, store, product) whose `num_sold` gaps are forward-filled.
# Each series is handled separately so a value is only carried forward
# within the rows of the same combination.
series_to_fill = [
    ('Canada', 'Discount Stickers', 'Kerneler'),
    ('Kenya', 'Discount Stickers', 'Kerneler'),
    ('Kenya', 'Discount Stickers', 'Kerneler Dark Mode'),
]

for country, store, product in series_to_fill:
    # Boolean mask selecting the rows of this one series.
    condition = (
        train['country'].eq(country)
        & train['store'].eq(store)
        & train['product'].eq(product)
    )
    train.loc[condition, 'num_sold'] = train.loc[condition, 'num_sold'].ffill()

# Drop every row that still has a NaN in any column after the fill.
train = train.dropna()

# Create a PyTorch Forecasting model-ready TimeSeriesDataSet

In [None]:
# Parse the `date` column of both frames into pandas datetime values.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

### Creating a Time Index for Sequential Ordering

Temporal Fusion Transformer models require a numerical time index (`time_idx`) so that the architecture can recognize the temporal sequence of data points.

* **max_prediction_length**: Defines the forecast horizon (the number of future time steps to predict).
* **max_encoder_length**: Determines the maximum historical time steps used as input
* **training_cutoff**: Separates the training data from the validation period by reserving the last **max_prediction_length** time steps for validation.

In [None]:
# Integer time index with one step per calendar month (year * 12 + month),
# shifted so the earliest month in `train` is step 0.
# NOTE(review): if a series has more than one row per month (e.g. daily rows),
# several rows share the same time_idx — confirm month granularity is intended
# and that it is consistent with max_encoder_length below, which counts
# distinct dates rather than distinct time_idx steps.
train["time_idx"] = train["date"].dt.year * 12 + train["date"].dt.month
time_idx_origin = train["time_idx"].min()
train["time_idx"] -= time_idx_origin

# Index `test` against the same origin as `train`, so that equal time_idx
# values refer to the same calendar month in both frames (re-zeroing test on
# its own minimum would make its index overlap the training index).
test["time_idx"] = test["date"].dt.year * 12 + test["date"].dt.month - time_idx_origin

max_prediction_length = 6  # forecast horizon, in time_idx steps
max_encoder_length = train.date.nunique()  # upper bound on history length fed to the encoder
training_cutoff = train["time_idx"].max() - max_prediction_length  # last time_idx kept for training

## Feature Engineering

Temporal patterns often vary by month. TFT can leverage this information when months are provided as categorical variables.

In [None]:
# Calendar month as a string-valued categorical, added to both frames.
for frame in (train, test):
    frame["month"] = frame["date"].dt.month.astype(str).astype("category")

# Log of the target; the tiny offset keeps log() finite when num_sold is 0.
train["log_num_sold"] = np.log(train["num_sold"] + 1e-8)

# Mean num_sold per time step within each country / store / product,
# broadcast back onto every row of the corresponding group.
for level in ("country", "store", "product"):
    sold_per_step = train.groupby(["time_idx", level], observed=True)["num_sold"]
    train[f"avg_num_sold_by_{level}"] = sold_per_step.transform("mean")

# Set up the TimeSeriesDataSet

* **Static Categoricals**: Group-level identifiers (country, store, product).
* **Time-Varying Features**
    * **Known**: Features available for all time steps (e.g., month, time_idx).
    * **Unknown**: Features only known for historical data (e.g., num_sold, averages).
* **Normalization**: GroupNormalizer normalizes the target variable within each group using the softplus transformation.

In [None]:
# Columns that together identify one time series.
series_keys = ["country", "store", "product"]

# Real-valued inputs passed as "unknown": the target itself plus the
# features derived from it.
observed_reals = [
    "num_sold",
    "log_num_sold",
    "avg_num_sold_by_country",
    "avg_num_sold_by_store",
    "avg_num_sold_by_product",
]

# Only rows at or before the cutoff are used to build the training dataset.
rows_before_cutoff = train[train["time_idx"] <= training_cutoff]

# Target is normalized per series with a softplus transformation.
target_scaler = GroupNormalizer(groups=list(series_keys), transformation="softplus")

training = TimeSeriesDataSet(
    rows_before_cutoff,
    time_idx="time_idx",
    target="num_sold",
    group_ids=list(series_keys),
    # encoder / decoder window bounds
    min_encoder_length=1,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # feature roles
    static_categoricals=list(series_keys),
    time_varying_known_categoricals=["month"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=observed_reals,
    allow_missing_timesteps=True,
    target_normalizer=target_scaler,
    # extra features generated by the dataset itself
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)


# Validation dataset: same configuration as `training`, built from the full
# `train` frame in prediction mode with randomization disabled.
validation = TimeSeriesDataSet.from_dataset(
    training,
    train,
    predict=True,
    stop_randomization=True,
)

## Creating Dataloaders

In [None]:
# Training batch size; validation uses batches ten times larger.
batch_size = 128
val_batch_size = batch_size * 10

train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=0,
)
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=val_batch_size,
    num_workers=0,
)

# Training the TFT Model

In [None]:
# pytorch-forecasting 1.x models are `lightning.pytorch` LightningModules, so
# the Trainer and its callbacks must come from the same `lightning.pytorch`
# namespace (the standalone `pytorch_lightning` classes are a different class
# hierarchy). `Trainer` was also not imported anywhere before this cell.
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor

early_stop_callback = EarlyStopping(
    monitor="val_loss",  # monitors validation loss
    min_delta=1e-4,  # minimum change in the monitored quantity to qualify as an improvement
    patience=10,  # stops training if no improvement is seen after 10 epochs
    verbose=False,
    mode="min",  # lower validation loss is better
)

lr_logger = LearningRateMonitor()  # tracks learning rate during training

# Initiate the trainer
trainer = Trainer(
    max_epochs=50,  # train the model for a maximum of 50 epochs
    accelerator="gpu",
    enable_model_summary=True,  # prints a summary of the model architecture
    gradient_clip_val=0.1,  # clips gradients to prevent exploding gradients
    limit_train_batches=50,  # limits the number of training batches per epoch
    callbacks=[lr_logger, early_stop_callback],  # include callbacks for logging and early stopping
)

# Build the model, taking its input/output configuration from the training dataset
tft = TemporalFusionTransformer.from_dataset(
    training,  # the training dataset
    learning_rate=0.03,
    hidden_size=16,  # size of hidden layers in the model
    attention_head_size=2,  # number of attention heads
    dropout=0.1,  # dropout for regularization
    hidden_continuous_size=8,  # size of hidden layers for continuous variables
    loss=QuantileLoss(),  # uses Quantile Loss for probabilistic forecasting
    log_interval=10,  # logs metrics every 10 batches
    optimizer="adam",  # uses the Adam optimizer
    reduce_on_plateau_patience=4,  # reduces learning rate if validation performance plateaus
)

# Fit on the training loader, validating on the validation loader
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# Forecast with the model

In [None]:
# Path of the best checkpoint recorded by the trainer's checkpoint callback.
checkpoint_callback = trainer.checkpoint_callback
best_model_path = checkpoint_callback.best_model_path

# Restore the model weights saved at that checkpoint.
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [None]:
# Predict on the validation loader, keeping the true targets alongside the
# outputs (return_y=True) so the error can be computed directly.
predictions = best_tft.predict(
    val_dataloader,
    return_y=True,
    trainer_kwargs={"accelerator": "gpu"},
)

# Mean absolute error of the predictions against the actuals (cell output).
MAE()(predictions.output, predictions.y)