# Inference
#### Calculate NSE for all train and test periods of the trained model

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Define hyperparameters
# Number of validation boreholes (not used in training or testing)
num_validation = 4
# Sequence length: number of consecutive timesteps in each input window.
# It also acts similarly to the warm-up period in conventional models.
seq_length = 730
# Fraction of each borehole's data used for training; the rest is used for testing
train_split = 0.8
# Batch size (conventionally a power of 2)
batch_size = 512
# Hidden size of LSTM
hidden_size = 5
# Number of stacked layers of LSTM
num_layers = 5
# Embedding size
embedding_size = 4
# Initial learning rate
lr = 0.0001
# Number of epochs
epochs = 100
# Learning rate scheduler patience
patience = 20


# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: shows the selected device when this runs as a notebook cell
device

In [None]:
# Function to load data
def load_data(file_path: str|Path, prefix: str) -> list[pd.DataFrame]:
    """
    Load and concatenate CSV files from a directory into a DataFrame.

    This function reads all CSV files in the specified directory that match the provided prefix, 
    parses the "Date" column into datetime format with day first, and concatenates the data into a single DataFrame.

    Args:
        file_path (str|Path): The directory path where the CSV files are located.
        prefix (str): The prefix of the CSV files to be loaded.

    Returns:
        pd.DataFrame: The concatenated DataFrame of all matched CSV files.

    """
    return pd.concat(
        [
            pd.read_csv(p, parse_dates=["Date"], dayfirst=True)
            for p in Path(file_path).glob(f"{prefix}*.csv")
        ]
    )

In [None]:
# Read the observed groundwater levels and the meteorological drivers
data_dir = "../data_processed/"
df_gwl = load_data(data_dir, "AquiMod_")
df_met = load_data(data_dir, "ukcp18_")

# Number the boreholes 0, 1, 2, ... in order of appearance: a new id starts on every
# row whose "Borehole" value differs from the row before it
is_new_borehole = df_gwl['Borehole'] != df_gwl['Borehole'].shift()
df_gwl['bhid'] = is_new_borehole.cumsum() - 1
num_boreholes = df_gwl["bhid"].max() + 1
num_training = num_boreholes - num_validation

# Inner-join the two sources on borehole, model and date, discard rows with any
# missing value, and renumber the index from zero
df_data = (
    df_gwl.merge(df_met, on=["Borehole", "Model", "Date"], how="inner")
    .dropna()
    .reset_index(drop=True)
)

# The separate frames are no longer needed; release them to save memory
del df_gwl, df_met
df_data

In [None]:
# Split each training borehole's rows into a training part and a testing part.
# (Scaling and sequence creation happen in the cells that follow.)
# NOTE(original author): the held-out data also needs the final seq_length values of the
# training data in front of it so that its first sequences are complete.

# Keep only the boreholes used for training/testing; validation boreholes are excluded
df_train_test = df_data[df_data["bhid"] < num_training]
# df_validation = df_data.query(f"bhid >= {num_training}")
train_list, test_list = [], []

for i in range(num_training):
    # All rows belonging to borehole i
    df = df_train_test[df_train_test["bhid"] == i].copy()
    # Row position at which the training part ends and the testing part begins
    train_size = int((len(df) - seq_length) * train_split)
    train_part, test_part = df.iloc[:train_size], df.iloc[train_size:]
    train_list.append(train_part)
    test_list.append(test_part)

# Stack the per-borehole pieces back into one frame per split
df_train = pd.concat(train_list)
df_test = pd.concat(test_list)

In [None]:
def _as_column(series):
    """Return the values of a pandas Series as an (n, 1) numpy array."""
    return series.values.reshape(-1, 1)


# One scaler per meteorological variable (shared by all boreholes) and one
# groundwater-level scaler per borehole
precip_scaler = StandardScaler()
pet_scaler = StandardScaler()
gwl_scalers = [StandardScaler() for _ in range(num_boreholes)]

# Shared scalers: fit on the training rows only, then apply the same transform to the test rows
precip_train = precip_scaler.fit_transform(_as_column(df_train["precipwsnow"]))
precip_test = precip_scaler.transform(_as_column(df_test["precipwsnow"]))
pet_train = pet_scaler.fit_transform(_as_column(df_train["PET"]))
pet_test = pet_scaler.transform(_as_column(df_test["PET"]))

# Borehole ids are kept unscaled, as column vectors aligned with the arrays above
bhid_train = _as_column(df_train["bhid"])
bhid_test = _as_column(df_test["bhid"])

# Per-borehole scalers: fit each on that borehole's training observations and reuse it
# for the same borehole's test observations
gwl_train = []
gwl_test = []

for i, scaler in enumerate(gwl_scalers[:num_training]):
    obs_train = df_train.loc[df_train["bhid"] == i, "Obs"]
    obs_test = df_test.loc[df_test["bhid"] == i, "Obs"]
    gwl_train.append(scaler.fit_transform(_as_column(obs_train)))
    gwl_test.append(scaler.transform(_as_column(obs_test)))

# Stack the per-borehole blocks back into single column vectors
gwl_train = np.vstack(gwl_train)
gwl_test = np.vstack(gwl_test)

In [None]:
# Prepend the final (seq_length - 1) training timesteps of EACH borehole to that borehole's
# test rows, so the first test timestep already has a complete input sequence.
# With (seq_length - 1) rows of context, create_sequences yields exactly one sequence per
# test timestep; prepending seq_length rows would add one sequence whose target lies in
# the training period.
# The tail must be taken per borehole: the train arrays hold all boreholes stacked one
# after another, so a plain arr[-(seq_length - 1):] only contains rows of the last borehole.
context_len = seq_length - 1

# Row selectors per borehole, computed once before any *_test array is overwritten
train_masks = [(bhid_train == i).ravel() for i in range(num_training)]
test_masks = [(bhid_test == i).ravel() for i in range(num_training)]


def _prepend_train_context(train_arr, test_arr):
    """Return test_arr with each borehole's last context_len training rows placed before its test rows."""
    parts = []
    for train_rows_mask, test_rows_mask in zip(train_masks, test_masks):
        train_rows = train_arr[train_rows_mask]
        # Explicit start index: a plain [-context_len:] would return every row when context_len == 0
        parts.append(train_rows[max(len(train_rows) - context_len, 0):])
        parts.append(test_arr[test_rows_mask])
    return np.concatenate(parts, axis=0)


precip_test = _prepend_train_context(precip_train, precip_test)
pet_test = _prepend_train_context(pet_train, pet_test)
gwl_test = _prepend_train_context(gwl_train, gwl_test)
bhid_test = _prepend_train_context(bhid_train, bhid_test)

In [None]:
def create_sequences(data: np.ndarray, seq_length: int) -> np.ndarray:
    """
    Transforms time-series data into an array of overlapping sequences of a specified length.

    Sequence k holds rows k .. k + seq_length - 1 of `data`; consecutive sequences are
    shifted by one row.

    Parameters:
    data (np.ndarray): Array whose first axis is time. Usually 2D (one row per time step,
        one column per feature); a 1D series is also accepted.
    seq_length (int): The number of time steps to include in each output sequence (>= 1).

    Returns:
    np.ndarray: A new, writable array of shape
        (num_samples - seq_length + 1, seq_length, num_features) for 2D input, or
        (num_samples - seq_length + 1, seq_length) for 1D input. If `data` has fewer than
        `seq_length` rows, the first dimension is 0.

    Raises:
    ValueError: If `seq_length` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be at least 1, got {seq_length}")

    data = np.asarray(data)

    # Too few rows for even one sequence: return an empty array that still has the
    # documented number of dimensions
    if len(data) < seq_length:
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    # Strided view of every window along the time axis; the window axis comes out last
    windows = np.lib.stride_tricks.sliding_window_view(data, seq_length, axis=0)
    # Put the window axis directly after the sample axis, then copy: the view is read-only
    # and shares memory with `data`, whereas callers get an independent array
    return np.moveaxis(windows, -1, 1).copy()

In [None]:
# Sequences are built one borehole at a time: create_sequences is called on each
# borehole's own series, never on the stacked arrays.
# One list per data type and split, holding one tensor per borehole.
dynamic_train_list, dynamic_test_list = [], []
static_train_list, static_test_list = [], []
gwl_train_list, gwl_test_list = [], []

for i in range(num_boreholes):
    # Rows belonging to borehole i in the train and test arrays
    train_mask = bhid_train == i
    test_mask = bhid_test == i

    # Dynamic inputs: precipitation and PET as two columns, cut into sequences, cast with .float()
    dynamic_train = np.column_stack((precip_train[train_mask], pet_train[train_mask]))
    dynamic_train_list.append(torch.from_numpy(create_sequences(dynamic_train, seq_length)).float())
    dynamic_test = np.column_stack((precip_test[test_mask], pet_test[test_mask]))
    dynamic_test_list.append(torch.from_numpy(create_sequences(dynamic_test, seq_length)).float())

    # Static input: the borehole id cut into sequences (dtype left unchanged)
    static_train = create_sequences(bhid_train[train_mask], seq_length)
    static_train_list.append(torch.from_numpy(static_train))
    static_test = create_sequences(bhid_test[test_mask], seq_length)
    static_test_list.append(torch.from_numpy(static_test))

    # Targets: skip the first (seq_length - 1) values so that target k is the
    # groundwater level at the last timestep of sequence k
    gwl_train_target = gwl_train[train_mask][seq_length - 1:]
    gwl_train_list.append(torch.from_numpy(gwl_train_target.reshape(-1, 1)).float())
    gwl_test_target = gwl_test[test_mask][seq_length - 1:]
    gwl_test_list.append(torch.from_numpy(gwl_test_target.reshape(-1, 1)).float())

# Disabled: merge the per-borehole pieces into single arrays and drop the lists
# dynamic_train_arr = np.concatenate(dynamic_train_list)
# dynamic_test_arr = np.concatenate(dynamic_test_list)
# static_train_arr = np.concatenate(static_train_list)
# static_test_arr = np.concatenate(static_test_list)
# gwl_train_arr = np.concatenate(gwl_train_list)
# gwl_test_arr = np.concatenate(gwl_test_list)

# # Save memory
# del dynamic_train_list
# del dynamic_test_list
# del static_train_list
# del static_test_list
# del gwl_train_list
# del gwl_test_list

In [None]:
def inference(model, dynamic_sequence, static_data):
    """
    Perform inference with the trained LSTM model.

    The forward pass runs without gradient tracking and with the model in evaluation
    mode, so layers such as dropout behave deterministically. If the model was in
    training mode on entry, that mode is restored before returning.

    Parameters:
    - model: Trained model, called as `model(dynamic_sequence, static_data)`
    - dynamic_sequence: Input dynamic sequences
      (presumably shape [batch_size, sequence_length, dynamic_size] - confirm against the model)
    - static_data: Input static data
      (presumably shape [batch_size, sequence_length] - confirm against the model)

    Returns:
    - predictions: Model output copied to the CPU as a NumPy array
    """
    # Remember the current mode; objects without a `training` flag are left untouched
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        # Forward pass through the model; no gradients are needed for prediction
        with torch.no_grad():
            predictions = model(dynamic_sequence, static_data)
    finally:
        # Hand the model back in the mode the caller left it in
        if was_training:
            model.train()

    return predictions.cpu().numpy()  # Convert predictions to a NumPy array

In [None]:
# Load the saved model onto the device selected earlier in the notebook. Without
# map_location, a checkpoint saved on a GPU fails to load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which rejects a fully
# pickled module; if "model.pt" holds a whole model, weights_only=False may be needed -
# confirm against the installed PyTorch version.
model = torch.load("model.pt", map_location=device)