### Imports

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import torch
import torch.nn as nn
import torch.nn.functional as T
from torch.utils.data import Dataset, random_split, DataLoader
from pyspark.sql.functions import from_utc_timestamp, to_utc_timestamp, lit, date_format, to_timestamp
from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from datetime import timedelta
import mlflow
import mlflow.pytorch
from torch.utils.data import DataLoader, TensorDataset
from math import pi, sin, cos
import math
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

### Necessary variables

In [None]:

# --- Downsampling ---------------------------------------------------------
# Desired resolution of the downsampled power consumption data.
# NOTE(review): "60S" does not obviously agree with low_res_freq = 10 below;
# confirm which of the two drives the low-resolution series.
res = "60S"

# --- Window geometry ------------------------------------------------------
# window_length and values_to_predict are both the low-resolution value
# divided by the high-resolution value.
# NOTE(review): the units are presumably seconds (the example data uses a
# 2-second resolution) - confirm.
high_res_freq = 2
low_res_freq = 10
window_length = int(low_res_freq / high_res_freq)
values_to_predict = window_length

# --- Dataset split --------------------------------------------------------
# Length of the train, validation and test sets, given as a number of rows.
train_rows = 4320
validation_rows = 1440
test_rows = 1440

# --- Optimisation ---------------------------------------------------------
# Mini-batch size.
batch_size = 32

# Number of epochs and learning rate.
num_epochs = 100
learning_rate = 0.0001

# --- T2SR framework parameters --------------------------------------------
input_size = 1
output_size = values_to_predict
d_model = 64
nhead = 4
num_encoder_layers = 2
num_decoder_layers = 2
dropout_p = 0.0


### Example data 

* In the initial experiment we use real-world data obtained from a building in Norway. Here we create some example data to illustrate the format of that data.

* We create 5 days of power consumption data with a 2-second resolution.

* We also simulate missing data to make the example data as realistic as possible.

In [None]:


# Synthetic stand-in for the real measurements: 5 days of power readings at a
# 2-second resolution, with 5 % of the rows removed to mimic missing data.

# Seeded local generator so the example frame is identical on every run,
# without touching NumPy's global random state.
rng = np.random.default_rng(42)

# Base date (timezone-aware, UTC offset +0000)
start_date = datetime.strptime("2023-01-01T00:00:00.000+0000", "%Y-%m-%dT%H:%M:%S.%f%z")

# Generate timestamps with 2-second resolution for 5 days:
# total seconds in 5 days divided by 2 gives the number of intervals.
total_intervals = 5 * 24 * 60 * 60 // 2
timestamps = [start_date + timedelta(seconds=2*i) for i in range(total_intervals)]

# Generate random power consumption values
power_consumption = rng.uniform(low=50, high=300, size=len(timestamps))

# Simulate missing values by removing 5 % of the data points.
# A boolean mask drops the chosen rows in a single pass; testing
# `i in missing_indices` per element would rescan the index array every time.
missing_indices = rng.choice(len(timestamps), size=int(len(timestamps) * 0.05), replace=False)
keep_mask = np.ones(len(timestamps), dtype=bool)
keep_mask[missing_indices] = False
timestamps = [ts for ts, keep in zip(timestamps, keep_mask) if keep]
power_consumption = list(power_consumption[keep_mask])

# Create DataFrame
df = pd.DataFrame({
    "timestamp": timestamps,
    "value": power_consumption
})

print (df.head(10))

# df.size is the number of cells (rows x columns), not the number of rows.
print (df.size)


                  timestamp       value
0 2023-01-01 00:00:00+00:00  119.443437
1 2023-01-01 00:00:02+00:00  174.756214
2 2023-01-01 00:00:04+00:00  125.664476
3 2023-01-01 00:00:06+00:00  213.255499
4 2023-01-01 00:00:10+00:00  155.417678
5 2023-01-01 00:00:12+00:00  224.596200
6 2023-01-01 00:00:14+00:00  226.341309
7 2023-01-01 00:00:16+00:00  260.997628
8 2023-01-01 00:00:18+00:00  113.396241
9 2023-01-01 00:00:20+00:00  235.358306
410400


## Data Preprocessing

* The data preprocessing is performed in the 'data_preprocessing' notebook. We call this notebook by using the %run command.
* We then create a function 'final_preprocessed_data' where we call the methods created in 'data_preprocessing'.

In [None]:
%run " /path/to/data_preprocessing"


In [None]:
def final_preprocessed_data(df):
    """Run the complete preprocessing pipeline and return the dataloaders.

    Chains the helper functions made available by running the
    'data_preprocessing' notebook. Besides ``df`` it reads the module-level
    settings ``res``, ``window_length``, ``train_rows``, ``validation_rows``
    and ``test_rows``.

    Args:
        df: Raw power consumption DataFrame passed to
            ``interpolate_and_fill_missing``.

    Returns:
        A ``(train_loader, valid_loader, test_loader)`` tuple.
    """
    # 1. High-resolution series with missing values interpolated / filled.
    hi_res = interpolate_and_fill_missing(df)

    # 2. Low-resolution series, downsampled to the configured resolution.
    lo_res = downsample_data(hi_res, res)

    # 3. Pair the low-resolution data with the high-resolution data.
    paired = mapping_function(hi_res, lo_res, window_length)

    # 4. Split into train / validation / test partitions by row count.
    train_part, valid_part, test_part = data_splitting(
        paired, train_rows, validation_rows, test_rows
    )

    # 5. Separate features from targets for every partition.
    X_tr, y_tr, X_va, y_va, X_te, y_te = split_feature_target(
        train_part, valid_part, test_part
    )

    # 6. Scale features and targets.
    X_tr, y_tr, X_va, y_va, X_te, y_te = scale_data(
        X_tr, y_tr, X_va, y_va, X_te, y_te
    )

    # 7. Wrap each (features, targets) pair in a dataloader.
    loader_train, loader_valid, loader_test = to_dataloaders(
        (X_tr, y_tr), (X_va, y_va), (X_te, y_te)
    )

    return loader_train, loader_valid, loader_test


# Run the preprocessing pipeline on the example DataFrame to obtain the
# train, validation and test dataloaders used by the cells below.
train_loader, valid_loader, test_loader = final_preprocessed_data(df)



## Implementation of T2SR Framework

* We implement the architecture of the T2SR framework in the 'model' notebook.

In [None]:
%run " /path/to/model"

## Model Training and Evaluation

* We implement functions for training, validation and testing in the 'train' notebook.

In [None]:
%run " /path/to/train"

## Train the Model


In [None]:
# Per-epoch metric history (one entry per epoch) for training and validation.
train_losses_mse, train_maes = [], []
valid_losses_mse, valid_maes = [], []

# Lowest validation MSE seen so far; starts at infinity so that the first
# comparable validation loss always counts as an improvement.
best_valid_mse = math.inf

# Place the model on `device` (both names come from the notebooks run above).
model.to(device)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # --- Training pass: returns the epoch's MSE and MAE ---
    train_loss, train_mae = train_model(model, optimizer, criterion_mse, train_loader, device)
    train_losses_mse.append(train_loss)
    train_maes.append(train_mae)
    print(f"Train MSE: {train_loss:.4f}, Train MAE: {train_mae:.4f}")

    # --- Validation pass: returns the epoch's MSE and MAE ---
    valid_loss, valid_mae = validate_model(model, criterion_mse, valid_loader, device)
    valid_losses_mse.append(valid_loss)
    valid_maes.append(valid_mae)
    print(f"Valid MSE: {valid_loss:.4f}, Valid MAE: {valid_mae:.4f}")

    # --- Checkpointing: keep the weights with the lowest validation MSE ---
    if valid_loss < best_valid_mse:
        best_valid_mse = valid_loss
        torch.save(model.state_dict(), 'best_model_checkpoint.pth')


## Postprocessing of Data
* We must perform some postprocessing of the data, including reshaping the data and applying inverse transforms.

In [None]:
# Evaluate the weights that achieved the lowest validation MSE rather than
# whatever the model holds after the final epoch. The training loop writes
# 'best_model_checkpoint.pth' whenever the validation MSE improves; a finite
# best_valid_mse means at least one such checkpoint was written in this run.
if math.isfinite(best_valid_mse):
    model.load_state_dict(torch.load('best_model_checkpoint.pth', map_location=device))

test_loss, test_mae, test_predictions, test_targets = evaluate_model(model, criterion_mse, test_loader, device)


# Convert the returned predictions and targets to numpy arrays
predictions_array = np.array(test_predictions)
targets_array = np.array(test_targets)


# Reshape to one row per sample with `values_to_predict` columns
predictions_reshaped = predictions_array.reshape(-1, values_to_predict)
targets_reshaped = targets_array.reshape(-1, values_to_predict)

# Inverse transform back to the original scale.
# NOTE(review): scaler_target is not defined in this notebook; presumably it is
# the target scaler created by the 'data_preprocessing' notebook - confirm.
predicted_values_original_scale = scaler_target.inverse_transform(predictions_reshaped)
actual_values_original_scale = scaler_target.inverse_transform(targets_reshaped)

# Flatten to 1-D so the metrics are computed over every predicted value
predicted_values_original_scale_flattened = predicted_values_original_scale.flatten()
actual_values_original_scale_flattened = actual_values_original_scale.flatten()




## Model Evaluation

* Finally we are able to calculate MAE and MSE and evaluate the model

In [None]:
# Error metrics on the original (inverse-transformed) scale, computed with NumPy.

# Element-wise difference between actual and predicted values, shared by both metrics
residuals = actual_values_original_scale_flattened - predicted_values_original_scale_flattened

# Mean absolute error
mae_original = np.mean(np.abs(residuals))

# Mean squared error
mse_original = np.mean(residuals**2)

# Report both metrics in original units
print("MAE in original units:", mae_original)
print("MSE in original units:", mse_original)
