In [1]:
import os
from filelock import FileLock
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from models import new_models
from config import load_data

In [3]:
# Root directory under which the raw workbook ("raw_data/") and the cached
# lagged matrices ("clean_data/") are looked up.
data_dir = "./data/"
# Name of the column used as the prediction target when building lagged matrices.
target_variable = "Q_Kalltveit"

In [4]:
def load_data_from_file(file_path):
    """Load a pandas DataFrame from a CSV or XLSX file.

    The read is performed while holding the ``~/.data.lock`` file lock.

    Args:
        file_path: Path to a ``.csv`` or ``.xlsx`` file. The extension is
            matched case-insensitively.

    Returns:
        pandas.DataFrame: For CSV files the ``Datetime`` column becomes the
        index; XLSX files are returned exactly as ``pd.read_excel`` yields them.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    # Normalise the extension so that e.g. ".CSV" is accepted as well.
    file_ext = os.path.splitext(file_path)[1].lower()

    # Reject unsupported formats before taking the lock. Without this check the
    # function fell through both branches and failed on an unbound ``data``.
    if file_ext not in (".csv", ".xlsx"):
        raise ValueError(
            "Unsupported file extension {!r} for {!r}; expected '.csv' or '.xlsx'".format(
                file_ext, file_path))

    with FileLock(os.path.expanduser("~/.data.lock")):
        if file_ext == ".csv":
            return pd.read_csv(file_path, index_col='Datetime')
        return pd.read_excel(file_path)

In [5]:
def get_raw_data():
    """Return the raw use-case data set.

    The workbook is looked up at
    ``<data_dir>/raw_data/cascaded_use_case_data.xlsx`` and read through
    ``load_data_from_file``.

    Raises:
        FileNotFoundError: If no file exists at that location.
    """
    workbook_path = os.path.join(data_dir, "raw_data", "cascaded_use_case_data.xlsx")

    # Guard clause: fail early when the workbook is missing.
    if not os.path.isfile(workbook_path):
        raise FileNotFoundError("Raw data file does not exist at path: {}".format(workbook_path))

    return load_data_from_file(workbook_path)

In [6]:
def create_lagged_matrix(window_size, vars_to_lag=None, pca=False, mi=False): #TODO: Fix decimal (five values in dataframe four in X, y)
    """
    Create (or load from cache) a lagged matrix from the raw time series.

    The result is cached as a CSV under
    ``<data_dir>/clean_data/multivariate/<target_variable>/``; when that file
    already exists it is loaded instead of being rebuilt.

    Args:
    - window_size: number of lags to include (lags 1..window_size).
    - vars_to_lag: list of variable names to lag. If None (or empty), the
      target variable itself is lagged (univariate case).
    - pca: currently unused; kept for interface compatibility.
    - mi: currently unused; kept for interface compatibility.
    Returns:
    - X: float tensor. Univariate: shape (n_samples, window_size, 1), columns
      ordered from lag 1 to lag window_size. With vars_to_lag: 2-D, shape
      (n_samples, n_columns), holding the unlagged variables followed by
      their lags.
    - y: float tensor of shape (n_samples,) with the target variable.
    """

    # Construct the cache file path using os.path.join
    cache_path = os.path.join(
        data_dir,
        "clean_data",
        "multivariate",
        target_variable,
        f"{window_size}_lag_" + ("_".join(vars_to_lag) if vars_to_lag else "") + ".csv")

    if os.path.isfile(cache_path):
        lagged_df = load_data_from_file(cache_path)
    else:
        data = get_raw_data()

        # Univariate case lags the target itself; otherwise lag the requested
        # variables and keep their unlagged columns next to the target.
        if vars_to_lag:
            lag_sources = list(vars_to_lag)
            base_columns = [target_variable] + lag_sources
        else:
            lag_sources = [target_variable]
            base_columns = [target_variable]

        lagged_df = data[base_columns].copy()
        for lag in range(1, window_size + 1):
            for column in lag_sources:
                lagged_df[f'{column}_{lag}'] = lagged_df[column].shift(lag)

        # set datetime to index
        lagged_df.index = data['Datetime']

        # remove rows with NaN values (the first `window_size` rows have
        # incomplete lags)
        lagged_df = lagged_df.dropna()

        # save lagged matrix; create the cache directory first, otherwise
        # to_csv fails on the very first run when the folder does not exist
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        lagged_df.to_csv(cache_path, index=True)

    # separate the target variable from the input variables
    X = lagged_df.drop(columns=[target_variable])
    y = lagged_df[target_variable] # TODO: methods such as Granger causality or structural equation modeling to determine whether the lagged values of "Nedbør Nilsebu" and "Q_Lyngsaana" are predictive of future values of "Q_Kalltveit".

    X = torch.tensor(np.array(X)).float()
    y = torch.tensor(np.array(y)).float()
    if not vars_to_lag:
        # reshape X into a 3D tensor with dimensions
        # (number of sequences, sequence length, 1) if univariate
        X = X.unsqueeze(-1)

    return X, y

In [7]:
def split_data(X, y, train_size=0.7, val_size=0.2, test_size=0.1, shuffle=True, random_state=None):
    """
    Splits the dataset into training, validation, and test sets.

    The training set is carved off first; the remainder is then divided,
    in order and without shuffling, into validation and test sets.

    Parameters:
        X (array-like): The input data.
        y (array-like): The target data.
        train_size (float): The proportion of the dataset to use for training.
        val_size (float): The proportion of the dataset to use for validation.
        test_size (float): The proportion of the dataset to use for testing.
        shuffle (bool): Whether to shuffle before carving off the training
            set. Defaults to True (the historical behaviour). Pass False to
            keep the samples in their original order, e.g. for time series
            where a random split would mix past and future samples.
        random_state (int or None): Seed forwarded to the shuffled split so
            that it is reproducible.

    Returns:
        A tuple (X_train, y_train, X_val, y_val, X_test, y_test) containing the
        training, validation, and test sets.

    Raises:
        ValueError: If the three sizes do not add up to 1.0 (to two decimals).
    """
    # Check that the sizes add up to 1.0
    if round(train_size + val_size + test_size, 2) != 1.0:
        raise ValueError("Train, validation, and test sizes must add up to 1.0")

    # First split: training set versus everything else (validation + test)
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=train_size, shuffle=shuffle, random_state=random_state)

    # Share of the hold-out part that goes to the test set
    test_share_of_holdout = test_size / (val_size + test_size)

    # Second split: hold-out part into validation and test sets, order preserved
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=test_share_of_holdout, shuffle=False)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [8]:
from tqdm import tqdm
from torchmetrics.functional import mean_absolute_error

def fit(model, loss_function, optimizer, data_loader, num_epochs, mode, use_amp=False):
    """Train ``model`` and collect per-epoch metrics.

    Every epoch runs the "train" and "val" loaders; the final epoch also runs
    the "test" loader, whose loss is printed but not stored.

    Args:
        model: Module to train; called as ``model(X_batch.float())``.
        loss_function: Callable ``(output, target) -> scalar loss tensor``.
        optimizer: Optimizer stepping the parameters of ``model``.
        data_loader: Mapping with "train", "val" and "test" iterables that
            yield ``(X_batch, y_batch)`` pairs.
        num_epochs: Number of passes over the training data.
        mode: Mapping whose "device" entry is the device batches are moved to.
        use_amp: Enables ``torch.cuda.amp`` autocast and gradient scaling.

    Returns:
        dict: ``{"train": {"loss": [...], "mae": [...]}, "val": {...}}`` with
        one sample-weighted entry per epoch. "mae" is ``nan`` for an epoch in
        which output and target shapes could not be matched.

    Raises:
        ZeroDivisionError: If one of the evaluated loaders yields no samples.
    """
    history = {"train": {"loss": [], "mae": []}, "val": {"loss": [], "mae": []}}
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)  # Mixed-precision support for compatible GPUs
    print("\nTraining the model:")
    for epoch in range(num_epochs):
        print("\nEpoch", epoch+1)
        if epoch < num_epochs - 1:
            keys = ["train", "val"]
        else:
            keys = ["train", "val", "test"]
        for key in keys:
            is_training = key == "train"
            dataset_size = 0
            dataset_loss = 0.0
            abs_error_sum = 0.0
            abs_error_count = 0
            if is_training:
                model.train()
            else:
                model.eval()
            for X_batch, y_batch in tqdm(data_loader[key]):
                X_batch, y_batch = X_batch.to(mode["device"]), y_batch.to(mode["device"])
                batch_size = y_batch.shape[0]
                with torch.set_grad_enabled(mode=is_training):  # Autograd activated only during training
                    with torch.cuda.amp.autocast(enabled=use_amp):  # Mixed-precision support for compatible GPUs
                        batch_output = model(X_batch.float())
                        # A (batch, 1) output against a (batch,) target would be
                        # broadcast to (batch, batch) by element-wise losses and
                        # give a wrong loss, so align the target when only the
                        # shape (not the element count) differs.
                        if (torch.is_tensor(batch_output)
                                and batch_output.shape != y_batch.shape
                                and batch_output.numel() == y_batch.numel()):
                            y_batch = y_batch.reshape(batch_output.shape)
                        batch_loss = loss_function(batch_output, y_batch)
                    if is_training:
                        # Clear stale gradients before accumulating new ones.
                        optimizer.zero_grad()
                        scaler.scale(batch_loss).backward()
                        scaler.step(optimizer)
                        scaler.update()
                dataset_size += batch_size
                dataset_loss += batch_size * batch_loss.item()
                # Mean absolute error is only meaningful when output and target line up.
                if torch.is_tensor(batch_output) and batch_output.shape == y_batch.shape:
                    abs_error_sum += (batch_output.detach().float() - y_batch.float()).abs().sum().item()
                    abs_error_count += y_batch.numel()
            dataset_loss /= dataset_size
            if key in ["train", "val"]:
                history[key]["loss"].append(dataset_loss)
                history[key]["mae"].append(
                    abs_error_sum / abs_error_count if abs_error_count else float("nan"))
            else:
                print("\nEvaluating the model:")
            print(key, "loss:", dataset_loss)
    return history


In [9]:
def create_dataloader(X, y, sequence_length, batch_size, shuffle):
    """
    Creates a PyTorch DataLoader from input data X and target data y.

    A 2-D ``X`` of shape (n_samples, n_features) is expanded into sliding
    windows of shape (n_samples, sequence_length, n_features), where window
    ``i`` holds rows ``i - sequence_length .. i - 1``. Any other ``X`` (for
    example an already windowed 3-D tensor) is used unchanged.

    Parameters:
        X (tensor or ndarray): The input data.
        y (tensor): The target data, one entry per row of X.
        sequence_length (int): The length of each sequence in the input data.
        batch_size (int): The batch size to use for the DataLoader.
        shuffle (bool): Whether the DataLoader shuffles the samples.

    Returns:
        A PyTorch DataLoader object.
    """
    # Decide on the number of dimensions rather than on the size of the last
    # axis: the windowing below is only valid for 2-D input, and 3-D input
    # with more than one feature must not be windowed a second time.
    if X.ndim == 2:
        num_sequences = X.shape[0]
        num_features = X.shape[1]
        X_3d = np.zeros((num_sequences, sequence_length, num_features))
        # NOTE(review): the first `sequence_length` windows stay all-zero while
        # their targets are kept, so the sample count matches y. Confirm that
        # this padding is intended rather than dropping those samples.
        for i in range(sequence_length, num_sequences):
            X_3d[i] = X[i-sequence_length:i, :]
        X = torch.tensor(X_3d.astype(np.float32))

    # create a PyTorch dataset and dataloader
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader

In [10]:
def train_model(config):
    """Train and evaluate an LSTM on the lagged ``Q_Kalltveit`` series.

    Builds the train/validation/test loaders through ``load_data``, trains
    ``new_models.LSTM`` with Adam and an MSE loss for a fixed number of
    epochs, steps a ``ReduceLROnPlateau`` scheduler on the validation loss
    after every epoch and prints the test loss once, during the final epoch.

    Args:
        config: Currently unused; all hyperparameters are defined in the body.

    Returns:
        None.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = {"name": device.type, "device": device}

    # Define hyperparameters
    # NOTE(review): the three split sizes are not forwarded to ld.split_data()
    # below, so that method's own defaults apply. Confirm its signature and
    # pass them explicitly if 0.6 / 0.2 / 0.2 is the intended split.
    train_size = 0.6
    val_size = 0.2
    test_size = 0.2

    sequence_length = 25
    batch_size = 256
    num_epochs = 20
    lr = 0.00001
    weight_decay = 0

    # None selects the univariate setup; e.g. ["Nedbør Nilsebu", "Q_Lyngsaana"]
    # for a multivariate one. (Renamed from `vars`, which shadowed the builtin.)
    lag_variables = None

    ########################################################################################################################
    ld = load_data(data_dir = data_dir, target_variable = 'Q_Kalltveit')

    X, y = ld.create_lagged_matrix(window_size=sequence_length, vars_to_lag=lag_variables)

    X_train, y_train, X_val, y_val, X_test, y_test = ld.split_data(X, y)

    train_dataloader = ld.create_dataloader(X_train, y_train, sequence_length, batch_size=batch_size, shuffle=True)
    val_dataloader = ld.create_dataloader(X_val, y_val, sequence_length, batch_size=batch_size, shuffle=True)
    test_dataloader = ld.create_dataloader(X_test, y_test, sequence_length, batch_size=batch_size, shuffle=False)
    ########################################################################################################################

    # Sanity check: show the shape of one test batch (printed once; the
    # original ran this identical loop twice).
    for sample_inputs, _ in test_dataloader:
        print(sample_inputs.shape)
        break

    # Model inputs
    input_size = X_train.shape[-1]
    hidden_size = 64
    num_layers = 2
    output_size = 1

    net = new_models.LSTM(input_size,
                    hidden_size,
                    num_layers,
                    output_size,
                    )

    data_loader = {
        "train": train_dataloader,
        "val": val_dataloader,
        "test": test_dataloader,
    }

    net.to(mode["device"])

    loss_function = nn.MSELoss().to(mode["device"])
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    # Halve the learning rate when the validation loss stops improving
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

    scaler = torch.cuda.amp.GradScaler(enabled=False) # Mixed-precision support for compatible GPUs
    for epoch in range(num_epochs):
        print("\nEpoch", epoch+1)
        if epoch < num_epochs - 1:
            keys = ["train", "val"]
        else:
            keys = ["train", "val", "test"]
        for key in keys:
            is_training = key == "train"
            dataset_size = 0
            dataset_loss = 0.0
            if is_training:
                net.train()
            else:
                net.eval()
            for X_batch, y_batch in tqdm(data_loader[key]):
                X_batch, y_batch = X_batch.to(mode["device"]), y_batch.to(mode["device"])
                batch_len = y_batch.shape[0]
                with torch.set_grad_enabled(mode=is_training): # Autograd activated only during training
                    with torch.cuda.amp.autocast(enabled=False): # Mixed-precision support for compatible GPUs
                        batch_output = net(X_batch.float())
                        # MSELoss broadcasts a (batch, 1) output against a
                        # (batch,) target to (batch, batch) and returns a wrong
                        # loss, so align the target when only the shape differs.
                        if (torch.is_tensor(batch_output)
                                and batch_output.shape != y_batch.shape
                                and batch_output.numel() == y_batch.numel()):
                            y_batch = y_batch.reshape(batch_output.shape)
                        batch_loss = loss_function(batch_output, y_batch)
                    if is_training:
                        # Clear stale gradients before accumulating new ones.
                        optimizer.zero_grad()
                        scaler.scale(batch_loss).backward()
                        scaler.step(optimizer)
                        scaler.update()
                dataset_size += batch_len
                dataset_loss += batch_len * batch_loss.item()

            dataset_loss /= dataset_size

            if key == "val":
                # Update learning rate based on the validation loss
                lr_scheduler.step(metrics=dataset_loss)
            elif key == "test":
                print("\nEvaluating the model:")
                print(key, "loss:", dataset_loss)


In [11]:
# Kick off a training run. The argument is only a placeholder: train_model()
# never reads its `config` parameter.
train_model("")

ValueError: could not broadcast input array from shape (25,25) into shape (25,1)