# TFT (Long forecasting)
#### Sliding Window Forecasting - 3 years in, 1 year out

### Need for runnning colab

In [95]:
# !pip install torch torchvision torchaudio pandas numpy scikit-learn pytorch-forecasting

# from google.colab import files
# uploaded = files.upload()

# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

## Importing Data

In [96]:
import pandas as pd

# Correct the file path to the actual location of the CSV file
df = pd.read_csv('/Users/casper/Documents/GitHub/p9-energy/Dataset/ConsumptionIndustry.csv', sep=';')

# Load the dataset for colab
# df = pd.read_csv('ConsumptionIndustry.csv', sep=';')


# Convert HourDK to datetime
df['HourDK'] = pd.to_datetime(df['HourDK'])

# Convert ConsumptionkWh to numeric
df['ConsumptionkWh'] = df['ConsumptionkWh'].str.replace(",", ".").astype(float)


## Data preperation + Feature Engineering

In [97]:
import numpy as np
import torch

df['HourDK'] = pd.to_datetime(df['HourDK'])

# Create lag features, rolling averages, and other engineered features
df['ConsumptionkWh_lag1'] = df['ConsumptionkWh'].shift(1)
df['ConsumptionkWh_lag24'] = df['ConsumptionkWh'].shift(24)
df['ConsumptionkWh_lag168'] = df['ConsumptionkWh'].shift(168)
df['ConsumptionkWh_roll24'] = df['ConsumptionkWh'].rolling(window=24).mean()
df['ConsumptionkWh_roll168'] = df['ConsumptionkWh'].rolling(window=168).mean()

holidays = [
    '2021-01-01', '2021-04-01', '2021-04-02', '2021-04-05', '2021-05-13', '2021-05-21',
    # Add more holidays here...
]
holidays = pd.to_datetime(holidays)
df['is_holiday'] = df['HourDK'].dt.date.isin(holidays.date)
df['day_of_week'] = df['HourDK'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['hour_sin'] = np.sin(2 * np.pi * df['HourDK'].dt.hour / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['HourDK'].dt.hour / 24)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
df['month_sin'] = np.sin(2 * np.pi * df['HourDK'].dt.month / 12)
df['month_cos'] = np.cos(2 * np.pi * df['HourDK'].dt.month / 12)

# Assuming hourly data, use 'h' for hours
df['time_idx'] = (df['HourDK'] - df['HourDK'].min()) // np.timedelta64(1, 'h')


# Drop rows with NaN values after feature engineering
##df.dropna(inplace=True)

# Ensure 'HourDK' is in datetime format
df['HourDK'] = pd.to_datetime(df['HourDK'])

# Create a full time index (hourly frequency from min to max time in your dataset)
full_time_index = pd.date_range(
    df['HourDK'].min(), df['HourDK'].max(), freq='H')

# Create a DataFrame with this full time index
df_full = pd.DataFrame(full_time_index, columns=['HourDK'])

# Merge with the existing data to find missing values
df_full = df_full.merge(df, on='HourDK', how='left')

# Fill or handle missing values
# Forward fill missing consumption values
df_full['ConsumptionkWh'].fillna(method='ffill', inplace=True)

# Create a new time index as an integer (starting from 0 for the first timestamp)
df['time_idx'] = (df['HourDK'] - df['HourDK'].min()
                  ).dt.total_seconds() // 3600  # Hours as integers

# Now use df_full for model training
train_df = df_full

print(df.head())
print(df.info())

            HourUTC              HourDK  MunicipalityNo Branche  \
0  2021-01-01 00:00 2021-01-01 01:00:00             851  Privat   
1  2021-01-01 01:00 2021-01-01 02:00:00             851  Privat   
2  2021-01-01 02:00 2021-01-01 03:00:00             851  Privat   
3  2021-01-01 03:00 2021-01-01 04:00:00             851  Privat   
4  2021-01-01 04:00 2021-01-01 05:00:00             851  Privat   

   ConsumptionkWh  ConsumptionkWh_lag1  ConsumptionkWh_lag24  \
0       35086.772                  NaN                   NaN   
1       31777.762            35086.772                   NaN   
2       28423.659            31777.762                   NaN   
3       25675.926            28423.659                   NaN   
4       24283.909            25675.926                   NaN   

   ConsumptionkWh_lag168  ConsumptionkWh_roll24  ConsumptionkWh_roll168  \
0                    NaN                    NaN                     NaN   
1                    NaN                    NaN               

  full_time_index = pd.date_range(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_full['ConsumptionkWh'].fillna(method='ffill', inplace=True)
  df_full['ConsumptionkWh'].fillna(method='ffill', inplace=True)


## Splitting the dataset

In [98]:
# Define the dataset split ranges
train_start = "2021-01-01"
train_end = "2023-06-30"
val_start = "2023-07-01"
val_end = "2023-11-30"
test_start = "2023-12-01"
test_end = "2024-11-01"

# Split datasets
train_df = df[(df['HourDK'] >= train_start) & (df['HourDK'] <= train_end)]
val_df = df[(df['HourDK'] >= val_start) & (df['HourDK'] <= val_end)]
test_df = df[(df['HourDK'] >= test_start) & (df['HourDK'] <= test_end)]

print(f"Training Set: {train_df.shape[0]} rows")
print(f"Validation Set: {val_df.shape[0]} rows")
print(f"Test Set: {test_df.shape[0]} rows")

Training Set: 21839 rows
Validation Set: 3650 rows
Test Set: 8065 rows


## Define the TFT Model

In [99]:
from pytorch_forecasting import GroupNormalizer, TimeSeriesDataSet 

max_encoder_length = 336 # 14 days
max_prediction_length = 24 # 1 day

#Traning Dataset 
train_dataset = TimeSeriesDataSet(
  train_df,
  time_idx="time_idx",
  target="ConsumptionkWh",
  group_ids=["HourDK"],
  max_encoder_length=max_encoder_length, 
  max_prediction_length=max_prediction_length,
  time_varying_known_reals=["hour_sin", "hour_cos", "day_sin", "day_cos", "month_sin", "month_cos", "is_holiday"],
  time_varying_unknown_reals=["ConsumptionkWh", "ConsumptionkWh_lag1", "ConsumptionkWh_lag24", "ConsumptionkWh_roll24"],
  target_normalizer=GroupNormalizer(groups=["HourDK"], transformation="softplus"),
  add_relative_time_idx=True,
  add_target_scales=True,
  add_encoder_length=True,
)

#Validation Dataset
val_dataset = TimeSeriesDataSet.from_dataset(train_dataset, val_df)

AssertionError: Timeseries index should be of type integer

## Create DataLoaders

In [None]:
from torch.utils.data import DataLoader

batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Define and Train the TFT model

In [None]:
from pytorch_forecasting import QuantileLoss
from pytorch_forecasting.models import TemporalFusionTransformer

# Define the model 
tft = TemporalFusionTransformer.from_dataset(
  dataset=train_dataset,
  learning_rate=0.01, 
  hidden_size=64,
  attention_head_size=4,
  dropout=0.1,
  hidden_continuous_size=32,
  output_size=7, # 7 quantiles by default
  loss=QuantileLoss(),
  log_interval=10,
  reduce_on_plateau_patience=4
)

# Train the model 
trainer = tft.trainer(
  max_epochs=10,
  gpus=1 if torch.cuda.is_available() else 0,
  gradient_clip_val=0.1,
)

trainer.fit(tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)

## Predict the test period

In [None]:
# Create the test dataset
test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_df)

# Create test DataLoader
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

# Predict using the trained model
raw_predictions, x = tft.predict(test_dataloader, return_x=True)

# Extract predictions
predictions = raw_predictions.numpy()

# Map predictions back to the original test dataframe
predicted_dates = test_df["HourDK"].iloc[-len(predictions):].reset_index(drop=True)
predicted_df = pd.DataFrame({"Date": predicted_dates, "Predicted_ConsumptionkWh": predictions})

# True values (actual consumption) from the test set
# Adjust based on the length of predictions
true_values = test_df["ConsumptionkWh"].values[-len(predictions):]

# Predictions (model's predicted consumption)
predictions = predicted_df["Predicted_ConsumptionkWh"].values

## Calculate error on model

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Calculate the metrics
mae = mean_absolute_error(true_values, predictions)
mse = mean_squared_error(true_values, predictions)
rmse = np.sqrt(mse)

# Mean Absolute Scaled Error (MASE)
mase = mean_absolute_error(true_values, predictions) / \
    np.mean(np.abs(np.diff(true_values)))

# Symmetric Mean Absolute Percentage Error (sMAPE)
sMAPE = 100 * np.mean(2 * np.abs(predictions - true_values) /
                      (np.abs(true_values) + np.abs(predictions)))

# Mean Absolute Percentage Error (MAPE)
mape = 100 * np.mean(np.abs(predictions - true_values) / np.abs(true_values))

# R-squared score
r2 = r2_score(true_values, predictions)

# Print the performance metrics
print("Energy Transformer Model Performance:")
print("Total Training time: 9min 23sec")  # Adjust as needed
print("-------------------------------------")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")
print('\n')
print(f"Mean Absolute Scaled Error: {mase:.2f}")
print(f"Symmetric Mean Absolute Percentage Error: {sMAPE:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}")

## Visualize predictions

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.plot(test_df["HourDK"], test_df["ConsumptionkWh"],
         label="Actual", alpha=0.8)
plt.plot(predicted_df["Date"], predicted_df["Predicted_ConsumptionkWh"],
         label="Predicted", linestyle="--")
plt.legend()
plt.title("Electricity Consumption Predictions (1/12/2023 - 1/11/2024)")
plt.xlabel("Date")
plt.ylabel("Consumption (kWh)")
plt.show()

predicted_df.to_csv("predictions_2023_12_to_2024_11.csv", index=False)