In [23]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset, random_split

In [24]:
def normalize(data):
    """Min-max scale ``data`` to roughly ``[0, 1]``, independently per feature.

    The last axis is treated as the feature axis: each feature is shifted by
    its own minimum and divided by its own range. A 1-D input is treated as a
    single feature and scaled by its overall minimum and range.

    Args:
        data: Array-like of numbers, e.g. ``(time, features)`` or
            ``(batch, time, features)``.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with the same shape as ``data``.
    """
    data = np.asarray(data)

    # Reduce over every axis except the trailing feature axis. Chaining two
    # axis-0 reductions only yields per-feature statistics for 3-D input; on a
    # 2-D (time, features) array it collapses the feature axis and scales
    # every feature by one global min/max.
    reduce_axes = tuple(range(data.ndim - 1)) if data.ndim > 1 else None

    min_val = np.min(data, axis=reduce_axes)
    data = data - min_val

    max_val = np.max(data, axis=reduce_axes)
    # The small epsilon keeps constant features from dividing by zero.
    data = data / (max_val + 1e-7)

    return data.astype(np.float32)

In [25]:
class MakeDATA(Dataset):
    """Dataset of stride-1 sliding windows over a normalised series.

    The input is converted to ``float32``, passed through ``normalize`` and cut
    into every contiguous window of ``seq_len`` rows. The windows are stored in
    ``self.samples`` with shape ``(num_windows, seq_len, ...)``.

    Args:
        data: Array-like whose first axis is time.
        seq_len: Number of consecutive rows per window; must be >= 1.
        shuffle: When True (the default, matching the historical behaviour)
            the windows are stored in a random order drawn with
            ``torch.randperm``. Pass False to keep chronological order.
        generator: Optional ``torch.Generator`` used for the permutation.
            Two datasets built with identically seeded generators and the same
            number of windows are permuted identically, which keeps parallel
            series (e.g. targets and their conditioning inputs) aligned.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """

    def __init__(self, data, seq_len, shuffle=True, generator=None):
        if seq_len < 1:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

        data = np.asarray(data, dtype=np.float32)
        norm_data = normalize(data)

        # A series shorter than seq_len simply yields an empty dataset.
        num_windows = max(len(norm_data) - seq_len + 1, 0)
        if shuffle:
            starts = torch.randperm(num_windows, generator=generator).numpy()
        else:
            starts = np.arange(num_windows)

        # Row i of window_index lists the seq_len consecutive time steps of
        # the i-th stored window, so one fancy-indexing call builds (and
        # orders) every window without a Python loop.
        window_index = starts[:, None] + np.arange(seq_len)[None, :]
        self.samples = norm_data[window_index].astype(np.float32, copy=False)

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the window(s) at ``idx`` as a NumPy array."""
        return self.samples[idx]

In [26]:
def data_preprocess(
    subset_len=25000,
    data_dir='./preprocessing/data/meter_data/LCL-June2015v2_0.csv',
    cond_data_dir="./preprocessing/data/conditioning_data/weather_hourly_darksky.csv",
):
    """Load the meter CSV and its conditioning CSV as two row-aligned objects.

    Meter data: every literal ``'Null'`` cell is replaced by ``0`` and the
    column at position 3 is returned (presumably the consumption reading --
    TODO confirm against the CSV header).

    Conditioning data: the ``humidity``, ``temperature`` and ``windSpeed``
    columns are indexed by the parsed ``time`` column, resampled to a 30-minute
    grid and linearly interpolated.

    Both outputs are truncated to the same number of rows, because they are
    consumed pairwise downstream.

    Args:
        subset_len: Maximum number of leading rows to keep. ``None`` (or the
            legacy string ``"None"``) keeps as many rows as both sources share.
        data_dir: Path of the meter CSV file.
        cond_data_dir: Path of the conditioning CSV file.

    Returns:
        ``(data, cond_data)``: a ``pd.Series`` and a ``pd.DataFrame`` with the
        same number of rows.
    """
    # Non-inplace replace: the freshly read frame is never mutated.
    meter = pd.read_csv(data_dir).replace('Null', 0)

    # Built as one chain so no column is ever assigned into a slice of the
    # raw frame (which can trigger pandas' SettingWithCopyWarning).
    cond_data = (
        pd.read_csv(cond_data_dir)[["time", "humidity", "temperature", "windSpeed"]]
        .assign(time=lambda frame: pd.to_datetime(frame["time"]))
        .set_index("time")
        .resample("30min")
        .interpolate(method="linear")
    )

    # Never return more rows than both sources can supply.
    n_rows = min(len(meter), len(cond_data))
    # Historically "no subset" was spelled as the string "None"; a real None
    # used to fall into the subset branch and skip the length alignment.
    if not (subset_len is None or subset_len == "None"):
        n_rows = min(n_rows, subset_len)

    data = meter.iloc[:n_rows, 3]
    cond_data = cond_data.iloc[:n_rows, :]

    return data, cond_data

In [37]:
def data_preprocess_house_zero(meter_files=None, cond_files=None):
    """Load and stack the HouseZero load and conditioning CSV files.

    The files of each kind are read in the given order and concatenated
    row-wise with a fresh ``0..n-1`` index.

    Args:
        meter_files: Iterable of load CSV paths. Defaults to the two bundled
            ``*_Loads_hourly.csv`` files (presumably one per year -- TODO
            confirm).
        cond_files: Iterable of conditioning CSV paths, read with
            ``encoding='unicode_escape'``. Defaults to the two bundled
            ``*_Weather.csv`` files.

    Returns:
        ``(meter_data, cond_data)``: ``meter_data`` holds the eight selected
        load columns; ``cond_data`` holds every conditioning column except the
        first one (presumably a timestamp -- TODO confirm).
    """
    if meter_files is None:
        meter_files = (
            './preprocessing/data/meter_data/48190963_Loads_hourly.csv',
            './preprocessing/data/meter_data/48190948_Loads_hourly.csv',
        )
    if cond_files is None:
        cond_files = (
            "./preprocessing/data/conditioning_data/48190936_Weather.csv",
            "./preprocessing/data/conditioning_data/48190930_Weather.csv",
        )

    # ignore_index=True renumbers the stacked rows. Resetting each piece
    # before concatenating would leave duplicate labels (0..n-1 twice).
    meter_data = pd.concat(
        [pd.read_csv(path) for path in meter_files],
        axis=0,
        ignore_index=True,
    )
    cond_data = pd.concat(
        [pd.read_csv(path, encoding='unicode_escape') for path in cond_files],
        axis=0,
        ignore_index=True,
    )

    meter_data = meter_data[["NET", "Cooling", "PV_meter1_load (kW)", "PV_meter2_load (kW)", "Battery cabinet", "Sumppump", "Plug_Basement", "Solar rapid shutdown"]]
    cond_data = cond_data.iloc[:, 1:]

    return meter_data, cond_data

In [38]:
def LoadData(seq_len, subset_len, dataset="HouseZero"):
    """Build aligned train/test splits of target windows and conditioning windows.

    The selected dataset is loaded, both series are windowed with ``MakeDATA``
    and split 80/20. Window ``i`` of each target split corresponds to window
    ``i`` of the matching conditioning split.

    Args:
        seq_len: Window length passed to ``MakeDATA``.
        subset_len: Row limit forwarded to ``data_preprocess``; only used for
            the ``"LondonDataStore"`` dataset.
        dataset: ``"HouseZero"`` (default) or ``"LondonDataStore"``.

    Returns:
        ``(train_data, test_data, cond_data_train, cond_data_test)`` as
        ``torch.utils.data.Subset`` objects.

    Raises:
        ValueError: If ``dataset`` is not a known name, or if the target and
            conditioning series produce a different number of windows.
    """
    if dataset == "HouseZero":
        data, cond_data = data_preprocess_house_zero()
    elif dataset == "LondonDataStore":
        data, cond_data = data_preprocess(subset_len)
    else:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'HouseZero' or 'LondonDataStore'"
        )

    tts_split = 0.8

    # MakeDATA shuffles its windows with the global torch RNG. Rewinding the
    # RNG between the two constructions makes both datasets draw the same
    # permutation, so each target window stays paired with the conditioning
    # window covering the same time steps.
    rng_state = torch.get_rng_state()
    data = MakeDATA(data, seq_len)
    torch.set_rng_state(rng_state)
    cond_data = MakeDATA(cond_data, seq_len)

    if len(data) != len(cond_data):
        raise ValueError(
            f"Target and conditioning data yield different numbers of windows "
            f"({len(data)} vs {len(cond_data)}); they cannot be paired"
        )

    train_size = int(len(data) * tts_split)
    test_size = len(data) - train_size
    train_data, test_data = random_split(data, [train_size, test_size])

    # Reuse the indices drawn for the target split. A second random_split
    # would draw a new permutation and break the pairing.
    cond_data_train = Subset(cond_data, train_data.indices)
    cond_data_test = Subset(cond_data, test_data.indices)

    return train_data, test_data, cond_data_train, cond_data_test

In [39]:
def _with_feature_axis(split):
    """Materialise a split as an array with an explicit trailing feature axis."""
    array = np.asarray(split)
    if array.ndim < 3:
        # Univariate windows come as (samples, seq_len); add the feature axis.
        array = np.expand_dims(array, axis=-1)
    return array


def serve_data(seq_len, batch_size, subset_len=25000, dataset="HouseZero"):
    """Create train/test ``DataLoader`` objects of (target, conditioning) pairs.

    Splits are obtained from ``LoadData``, converted to arrays of shape
    ``(samples, features, seq_len)`` and wrapped in ``TensorDataset`` /
    ``DataLoader``. The loaders are created without ``shuffle``, so batches
    follow the order of the splits returned by ``LoadData``.

    Args:
        seq_len: Window length forwarded to ``LoadData``.
        batch_size: Batch size of both loaders.
        subset_len: Row limit forwarded to ``LoadData``.
        dataset: Dataset name forwarded to ``LoadData``; defaults to
            ``"HouseZero"``, which is what ``LoadData`` used when no name was
            passed.

    Returns:
        ``(train_loader, test_loader, features, cond_features)`` where
        ``features`` and ``cond_features`` are the number of target and
        conditioning channels.
    """
    splits = LoadData(seq_len=seq_len, subset_len=subset_len, dataset=dataset)
    # The feature axis is added to all four arrays, so a univariate
    # conditioning series is handled like a univariate target series.
    train_data, test_data, cond_data_train, cond_data_test = (
        _with_feature_axis(split) for split in splits
    )

    features = train_data.shape[2]
    cond_features = cond_data_train.shape[2]

    print(f"num of channels in transformer: {features} \nnum of cond feature: {cond_features}")

    # (samples, seq_len, features) -> (samples, features, seq_len)
    train_data, test_data, cond_data_train, cond_data_test = (
        array.transpose(0, 2, 1)
        for array in (train_data, test_data, cond_data_train, cond_data_test)
    )
    print(f"Train shape (batch, features, seq_len): {train_data.shape}")
    print(f"Cond shape (batch, features, seq_len): {cond_data_train.shape}")

    train_dataset = TensorDataset(torch.from_numpy(train_data), torch.from_numpy(cond_data_train))
    train_loader = DataLoader(train_dataset, batch_size)

    test_dataset = TensorDataset(torch.from_numpy(test_data), torch.from_numpy(cond_data_test))
    test_loader = DataLoader(test_dataset, batch_size)

    # Pull one batch as a sanity check of the batched shape.
    real_data, _ = next(iter(train_loader))
    print(f"batched data shape: {real_data.shape}")

    return train_loader, test_loader, features, cond_features

In [41]:
# Build the loaders from 15-step windows in mini-batches of 32
# (default subset length and dataset).
train_loader, test_loader, features, cond_features = serve_data(seq_len=15, batch_size=32)

num of channels in transformer: 8 
num of cond feature: 8
Train shape (batch, features, seq_len): (14024, 8, 15)
Cond shape (batch, features, seq_len): (14024, 8, 15)
batched data shape: torch.Size([32, 8, 15])
