In [None]:
import pickle

import lightning.pytorch as pl
import matplotlib
import numpy as np
import pandas as pd
import pytorch_optimizer
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.tuner import Tuner
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, QuantileLoss, GroupNormalizer, Baseline, MAE, SMAPE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
# The metric functions live in sklearn.metrics; importing them from the
# top-level sklearn package raises ImportError.
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Read the raw transaction export into `content` and render it for a first
# visual inspection.
content = pd.read_csv(filepath_or_buffer='data/Final Transactions.csv')
display(content)

In [None]:
# Number of distinct customer ids in the raw data (missing ids are not counted).
content['CUSTOMER_ID'].nunique(dropna=True)

In [None]:
# Remove rows that have no customer id and discard the 'Count' column.
#
# The frame is rebound instead of mutated in place, and the column drop uses
# errors="ignore", so this cell can be executed more than once: the original
# in-place drop raised KeyError on a second run because 'Count' was already
# gone.
content = (
    content
    .dropna(subset=["CUSTOMER_ID"])
    .drop(columns=["Count"], errors="ignore")
)
# Row count after cleaning.
print(len(content))

In [None]:
# Parse timestamps and turn the id columns into strings so they can be used
# as categorical / grouping keys.
content['TX_DATETIME'] = pd.to_datetime(content['TX_DATETIME'])
content['CUSTOMER_ID'] = content['CUSTOMER_ID'].astype('str')
content['TERMINAL_ID'] = content['TERMINAL_ID'].astype('str')

# Order all rows chronologically with a stable sort (ties keep their current
# relative order) and renumber the index.
# The previous version also called content.sort_values(by=['CUSTOMER_ID'], ...)
# without assigning the result; that statement had no effect besides paying
# for a full sort, so it has been removed.
content = content.sort_values(by='TX_DATETIME', kind='mergesort', ascending=True).reset_index(drop=True)

# Drop the alternative time columns. errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
content = content.drop(columns=['TX_TIME_SECONDS', 'TX_TIME_DAYS'], errors='ignore')

# Per-customer running counter (0, 1, 2, ...) in chronological order; this is
# the integer time axis used by the dataset cells below.
content["time_idx"] = content.groupby("CUSTOMER_ID").cumcount()
display(content)

In [None]:
# Store the fraud flag, the amount and the fraud-scenario code as 32-bit floats.
for floatColumn in ("TX_FRAUD", "TX_AMOUNT", "TX_FRAUD_SCENARIO"):
    content[floatColumn] = content[floatColumn].astype("float32")

In [None]:
#groupNorm = GroupNormalizer(groups=["CUSTOMER_ID"], transformation="softplus")
#groupNorm.fit(content["TX_FRAUD"], content["CUSTOMER_ID"])

In [None]:
# Chronological per-customer train/validation split.
#
# Only customers with at least `minTransactions` rows are kept; for each of
# them the first `trainFraction` of their transactions goes to training and
# the remainder to validation.
minTransactions = 200
trainFraction = 0.8

trainDataList = []
valDataList = []
count = 0        # number of customers that passed the size filter
groupCount = 0   # total number of rows belonging to those customers
for customerID, group in content.groupby('CUSTOMER_ID'):
    # Stable sort, consistent with the global mergesort used when `content`
    # was ordered: rows sharing a timestamp keep their existing order instead
    # of being shuffled by the default (unstable) algorithm.
    groupSorted = group.sort_values('TX_DATETIME', kind='mergesort')
    numTransactions = len(groupSorted)
    if numTransactions < minTransactions:
        continue

    splitIdx = int(numTransactions * trainFraction)
    trainDataList.append(groupSorted.iloc[:splitIdx])
    valDataList.append(groupSorted.iloc[splitIdx:])
    count += 1
    groupCount += numTransactions
print(count)
print(groupCount)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual cause instead.
if not trainDataList:
    raise ValueError(
        f"No customer has at least {minTransactions} transactions; nothing to split."
    )

trainData = pd.concat(trainDataList, ignore_index=True)
valData = pd.concat(valDataList, ignore_index=True)

# Restart the per-customer time index at 0 inside each split.
# NOTE(review): because the validation index restarts at 0, validation
# sequences can only draw encoder history from validation rows - confirm
# this is intended.
trainData["time_idx"] = trainData.groupby('CUSTOMER_ID').cumcount()
valData["time_idx"] = valData.groupby('CUSTOMER_ID').cumcount()

In [None]:
# Window sizes handed to the dataset: maximum encoder length and maximum
# prediction length, both measured in time_idx steps.
maxEncoderLength = 200
maxPredLength = 3

# The target is normalised separately for every customer.
perCustomerNormalizer = GroupNormalizer(groups=["CUSTOMER_ID"])

# NOTE(review): TERMINAL_ID is declared as a static categorical; static
# variables are expected not to change within a group - confirm that holds
# per CUSTOMER_ID.
# NOTE(review): min_encoder_length is not set; per the TimeSeriesDataSet docs
# it presumably defaults to max_encoder_length - confirm every series is long
# enough to yield samples.
datasetOptions = dict(
    time_idx='time_idx',
    target='TX_AMOUNT',
    group_ids=['CUSTOMER_ID'],
    static_categoricals=['CUSTOMER_ID','TERMINAL_ID'],
    time_varying_unknown_reals=['TX_AMOUNT'],
    max_encoder_length=maxEncoderLength,
    max_prediction_length=maxPredLength,
    target_normalizer=perCustomerNormalizer,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)
training = TimeSeriesDataSet(trainData, **datasetOptions)

In [None]:
batchSize = 128
# Worker settings shared by the training and validation loaders.
workerOptions = dict(num_workers=2, persistent_workers=True)

trainDataloader = training.to_dataloader(
    train=True, batch_size=batchSize, **workerOptions
)

# The validation dataset reuses the training dataset's configuration and is
# built in predict mode with randomisation switched off.
validation = TimeSeriesDataSet.from_dataset(
    training, valData, predict=True, stop_randomization=True
)
valDataloader = validation.to_dataloader(
    train=False, batch_size=batchSize * 10, **workerOptions
)

pl.seed_everything(42)

# Trainer and small model that the learning-rate finder cell operates on.
trainer = pl.Trainer(accelerator="auto", gradient_clip_val=0.1)
TFT = TemporalFusionTransformer.from_dataset(
    training,
    loss=QuantileLoss(),
    optimizer="ranger",
    learning_rate=0.03,
    hidden_size=8,
    hidden_continuous_size=8,
    attention_head_size=3,
    dropout=0.1,
)

In [None]:
# Sweep learning rates between min_lr and max_lr on the small model, report
# the suggested value and plot the sweep.
lrTuner = Tuner(trainer)
res = lrTuner.lr_find(
    TFT,
    train_dataloaders=trainDataloader,
    val_dataloaders=valDataloader,
    min_lr=1e-6,
    max_lr=10.0,
)
suggestedLr = res.suggestion()
print(f"suggested learning rate: {suggestedLr}")
fig = res.plot(show=True, suggest=True)
fig.show()

In [None]:
# Stop training once val_loss has not improved by at least min_delta for
# `patience` consecutive validation checks.
earlyStopper = EarlyStopping(
    monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min"
)
learningRateLogger = LearningRateMonitor()
# Keep the checkpoint with the lowest val_loss. Without an explicit callback
# that monitors a metric, the trainer's default checkpointing tracks only the
# most recent checkpoint, so `trainer.checkpoint_callback.best_model_path`
# would point at the last epoch - which, with early stopping, can be several
# epochs past the best one.
checkpointer = ModelCheckpoint(monitor="val_loss", mode="min", save_top_k=1)
logger = TensorBoardLogger(save_dir="lightning_logs", name="Model")

trainer = pl.Trainer(
    max_epochs=50,
    accelerator="auto",
    enable_model_summary=True,
    gradient_clip_val=0.1,
    limit_train_batches=50,  # cap on training batches per epoch
    callbacks=[learningRateLogger, earlyStopper, checkpointer],
    logger=logger,
)

# Larger model that is actually trained in the next cell.
biggerTFT = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.02,
    hidden_size=40,
    attention_head_size=3,
    dropout=0.1,
    hidden_continuous_size=38,
    loss=QuantileLoss(),
    log_interval=10,
    optimizer="ranger",
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {biggerTFT.size() / 1e3:.1f}k")

In [None]:
# Train the larger model on the training loader, validating on the
# validation loader.
trainer.fit(model=biggerTFT, train_dataloaders=trainDataloader, val_dataloaders=valDataloader)

In [None]:
#study = optimize_hyperparameters( 
#    trainDataloader,
#    valDataloader,
#    model_path="optunaTest",
#    n_trials=200,
#    max_epochs=50,
#    gradient_clip_val_range=(0.01,1.0),
#    hidden_size_range=(8, 128),
#    attention_head_size_range=(1,4),
#    learning_rate_range=(0.001,0.1),
#    dropout_range=(0.1,0.3),
#    trainer_kwargs=dict(limit_train_batches=30),
#    reduce_on_plateau_patience=4,
#    use_learning_rate_finder=False,
#)
#
#with open("testStudy.pkl", "wb") as fout:
#    pickle.dump(study,fout)
#
#print(study.best_trial.params)


In [None]:
# Reload the model from the checkpoint path recorded by the trainer's
# checkpoint callback.
bestPath = trainer.checkpoint_callback.best_model_path
# best_model_path is an empty string until a checkpoint has been written
# (e.g. the training cell was skipped or interrupted). Fail with a clear
# message instead of an obscure file-loading error.
if not bestPath:
    raise RuntimeError(
        "No checkpoint has been saved by the trainer; run the training cell "
        "before loading the best model."
    )
bestTFT = TemporalFusionTransformer.load_from_checkpoint(bestPath)


In [None]:
# Reference score: mean absolute error of the library's Baseline model on the
# validation loader.
baselineModel = Baseline()
baselinePrediction = baselineModel.predict(valDataloader, return_y=True)
baselineMae = MAE()
baselineMae(baselinePrediction.output, baselinePrediction.y)

In [None]:
# Mean absolute error of the reloaded model on the validation loader, for
# comparison with the baseline score.
predictTrainerOptions = dict(accelerator="auto")
predictions = bestTFT.predict(
    valDataloader,
    return_y=True,
    trainer_kwargs=predictTrainerOptions,
)
modelMae = MAE()
modelMae(predictions.output, predictions.y)


In [None]:
# Plot attention and predictions for the first few validation samples.
rawPredictions = bestTFT.predict(
    valDataloader, mode="raw", return_x=True, trainer_kwargs=dict(accelerator="auto")
)
# Plot at most 10 examples, but never more than the validation loader
# provides: a fixed range(10) raised IndexError on smaller validation sets.
numExamples = min(10, len(valDataloader.dataset))
for idx in range(numExamples):
    bestTFT.plot_prediction(
        rawPredictions.x, rawPredictions.output, idx=idx, add_loss_to_title=True
    )

In [None]:
# Plot the validation samples with the highest SMAPE.
# Per-sample loss: unreduced SMAPE, averaged over dimension 1 of the loss tensor.
meanLosses = SMAPE(reduction="none").loss(predictions.output, predictions.y[0]).mean(1)
# Sample positions ordered from highest to lowest mean loss.
indices = meanLosses.argsort(descending=True)
# Slicing yields at most 10 positions and simply yields fewer when the
# validation set is smaller; indexing with a fixed range(10) raised
# IndexError in that case.
for worstIdx in indices[:10]:
    bestTFT.plot_prediction(
        rawPredictions.x,
        rawPredictions.output,
        idx=worstIdx,
        add_loss_to_title=SMAPE(quantiles=bestTFT.loss.quantiles),
    )

In [None]:
# Actual results vs. predicted values, broken down by input variable.
#
# calculate_prediction_actual_by_variable needs the network inputs `x`.
# The earlier `predictions` object was requested with return_y=True only, so
# its `.x` is not populated; run a dedicated prediction pass with
# return_x=True here so this cell has everything it needs.
predictionsWithX = bestTFT.predict(
    valDataloader, return_x=True, trainer_kwargs=dict(accelerator="auto")
)
predictionsVsReality = bestTFT.calculate_prediction_actual_by_variable(
    predictionsWithX.x, predictionsWithX.output
)
bestTFT.plot_prediction_actual_by_variable(predictionsVsReality)

In [None]:
# TODO: prediction on subsets of data
# TODO: predict on new data and define covariates

In [None]:
# Summarise the raw model output into an interpretation (summed over samples)
# and plot it.
interpretation = bestTFT.interpret_output(
    rawPredictions.output,
    reduction="sum",
)
bestTFT.plot_interpretation(interpretation)