In [1]:
%load_ext autoreload
%autoreload 2

import torch
import math
import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split


from main import train, TPALSTM

## Load Data in Pytorch DataLoaders

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# TODO Rethink how this dataset works
class ElectricityDataset(Dataset):
    def __init__(self, df, window_size):
        self.df = df
        self.w_size = window_size
        self.label_col_name = "price actual"

        self.X = torch.tensor(self.df.values, dtype=torch.float32).to(device)
        self.y = torch.tensor(self.df.loc[:, self.label_col_name].values, dtype=torch.float32).unsqueeze(1).to(device)
    
    def __getitem__(self, idx):
        idcs = range(idx, idx + self.w_size)
        return self.X[idx:idx + self.w_size, :], self.y[idx + self.w_size]

    def __len__(self):
        return len(self.df) - self.w_size


In [4]:
data_df = pd.read_csv('data/clean_and_merge_data.csv', index_col=0)
data_df.head()

Unnamed: 0_level_0,generation biomass,generation fossil brown coal/lignite,generation fossil gas,generation fossil hard coal,generation fossil oil,generation hydro pumped storage consumption,generation hydro run-of-river and poundage,generation hydro water reservoir,generation nuclear,generation other,...,weekday,month,business hour,temp_range_Barcelona,temp_range_Bilbao,temp_range_Madrid,temp_range_Seville,temp_range_Valencia,temp_weighted,generation coal all
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31 23:00:00+00:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,43.0,...,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,273.181801,5150.0
2015-01-01 00:00:00+00:00,449.0,328.0,5196.0,4755.0,158.0,920.0,1009.0,1658.0,7096.0,43.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,273.188663,5083.0
2015-01-01 01:00:00+00:00,448.0,323.0,4857.0,4581.0,157.0,1164.0,973.0,1371.0,7099.0,43.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,272.557335,4904.0
2015-01-01 02:00:00+00:00,438.0,254.0,4314.0,4131.0,160.0,1503.0,949.0,779.0,7098.0,43.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,272.554211,4385.0
2015-01-01 03:00:00+00:00,428.0,187.0,4130.0,3840.0,156.0,1826.0,953.0,720.0,7097.0,43.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,272.572446,4027.0


In [5]:
train_frac = 0.7
val_frac = 0.1
test_frac = 0.2

train_lim = math.floor(train_frac * data_df.shape[0])
val_lim = math.floor(val_frac * data_df.shape[0]) + train_lim

window_size = 24

train_dataset = ElectricityDataset(data_df.iloc[:train_lim], window_size=window_size)
val_dataset = ElectricityDataset(data_df.iloc[train_lim:val_lim], window_size=window_size)
test_dataset = ElectricityDataset(data_df.iloc[val_lim:], window_size=window_size)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Train the model

In [6]:
model = TPALSTM(
    input_size=data_df.shape[1],
    output_horizon=1,
    hidden_size=25,
    obs_len=24,
    n_layers=2
)
model.to(device)

criterion = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.005)

In [7]:
def train_one_epoch(train_loader):
    running_loss = 0.
    last_loss = 0.0

    for i, data in enumerate(train_loader):
        inputs, label = data 
        # print(inputs.shape, label.shape)

        optimiser.zero_grad()

        outputs = model(inputs)

        loss = criterion(outputs, label)
        loss.backward()

        optimiser.step()

        running_loss += loss.item()
        if i % batch_size == 0:
            last_loss = running_loss / batch_size # loss per batch
            # print('  batch {} loss: {}'.format(i // batch_size, last_loss))
            running_loss = 0.

    return last_loss

In [9]:
EPOCHS = 6
epoch_nb = 0

best_vloss = 1_000_000

for epoch in range(EPOCHS):
    model.train(True)
    avg_loss = train_one_epoch(train_loader)


    if epoch % 5 == 0:
        # model.train(False)
        with torch.no_grad():
            running_vloss = 0.0
            for i, vdata in enumerate(val_loader):
                vinputs, vlabels = vdata
                voutputs = model(vinputs)
                vloss = criterion(voutputs, vlabels)
                running_vloss += vloss

            avg_vloss = running_vloss / len(val_loader)
            print(f'Epoch {epoch}, LOSS train {avg_loss:.3f} valid {avg_vloss:.3f}')

    epoch_nb += 1

Epoch 0, LOSS train 201.532 valid 260.157
Epoch 5, LOSS train 172.586 valid 232.746
