In [None]:
import json
import math
import os
from datetime import datetime, timedelta
from pathlib import Path
from random import Random, shuffle

import numpy as np
import scipy.io as sio
import torch
from scipy import signal
from tqdm.notebook import tqdm

In [None]:
# Notebook-wide configuration: data locations, station/channel selection,
# file geometry, band-pass filter and train/validation split ratio.

# --- Input / output locations ---
data_location_raw = "./data/"
labels_location = "./etna_dataset/labels.pt"
data_location_processed = "./etna_dataset/data_5Hz/"
data_location_concat = "./etna_dataset/data_concat_5Hz/"
data_location_concat_percentile = "./etna_dataset/data_concat_5Hz/percentiles/"
files_list_location = "./etna_dataset/files_list/"
dataset_files_list = "./etna_dataset/dataset/"
save_folders = ["save_2011_2012", "save_2013_2014", "save_2015"]

# --- Stations and channels ---
stations = [
    "EBELz", "ECANz", "ECBDz", "ECCSz", "ECPNz",
    "ECZMz", "EFIUz", "EMFOz", "EMGRz", "EMNRz",
    "EMPLz", "EPDNz", "EPLCz", "EPOZz", "EPZFz",
    "ESCVz", "ESMLz", "ESPCz", "ESVOz", "EZPOz",
]
# "<index>:<station>" labels, one per station and in the same order
channels_name = [str(idx) + ":" + station for idx, station in enumerate(stations)]
# Indices (into `stations`) of the channels to keep
channels = [2, 4, 7, 10, 15, 17]

# --- File geometry ---
desired_file_length = 10    # minutes
file_frequency = 100        # Hz
subsample_frequency = 5     # Hz
files_day = 144             # number of files in one day
days_to_skip = 7            # number of days before and after an event

# --- Train / validation split ---
ratio = 80    # % ratio between training set and validation set

# --- Butterworth band-pass filter (second-order sections) ---
fs = file_frequency                  # sampling frequency, default=100
lowcut = 0.01                        # low cut-off frequency of the filter
highcut = subsample_frequency / 2    # high cut-off frequency of the filter
nyq = 0.5 * fs
# Cut-off frequencies expressed as a fraction of the Nyquist frequency
low = lowcut / nyq
high = highcut / nyq
sos = signal.butter(N=2, Wn=[low, high], btype='bandpass', analog=False, output='sos')

In [None]:
# Concatenate the dataset one channel at a time and, for every channel, compute:
# - the 0.01 / 99.99 and the 0.1 / 99.9 percentiles
# - max, min, mean and std of the samples lying inside each percentile pair
#
# Outputs:
# - <data_location_concat>/data_concat_CH<ch>.pt         concatenated samples + labels
# - <data_location_concat_percentile>/percentiles.pt     the four percentile lists
# - <data_location_concat_percentile>/mean_std_max_min__*.pt   trimmed statistics

Path(data_location_concat).mkdir(parents=True, exist_ok=True)
Path(data_location_concat_percentile).mkdir(parents=True, exist_ok=True)

n_channels = len(channels_name)
# One slot per channel in every list. The max/min lists carry a trailing underscore
# so the Python builtins max()/min() are not shadowed for the rest of the kernel session.
p_0_01, p_99_99, p_0_1, p_99_9, mean, std, max_, min_, mean1, std1, max1, min1 = [[None] * n_channels for _ in range(12)]


def _stats_within(values, lower, upper):
    """Return (mean, std, max, min) of the entries of `values` lying in [lower, upper]."""
    kept = values[(values >= lower) & (values <= upper)]
    return kept.mean().item(), kept.std().item(), kept.max().item(), kept.min().item()


files = list()
for (dirpath, dirnames, filenames) in os.walk(data_location_processed):
    files += filenames
# os.walk yields names in arbitrary, filesystem-dependent order: sort them so the
# concatenation order (and the saved label order) is the same on every run.
# NOTE(review): presumably sorted file names are chronological - confirm naming scheme.
files.sort()

label = []
for ch in tqdm(range(n_channels), desc="Channels"):
    t = []
    # Dataset concatenation (labels are collected only once, on the first channel)
    for file in tqdm(files, desc="Files"):
        load = torch.load(os.path.join(data_location_processed, file))
        t.append(load['DATA'][ch][0])
        if ch == 0:
            label.append(load['LABEL'][0])
    t = torch.cat(t, 0)

    # Percentiles calculation: all four are computed in a single call.
    # They are stored as plain Python floats (not numpy scalars) so the saved file
    # contains only primitive types and stays loadable with torch.load(weights_only=True).
    p_0_01[ch], p_99_99[ch], p_0_1[ch], p_99_9[ch] = (
        float(p) for p in np.percentile(t, [0.01, 99.99, 0.1, 99.9])
    )

    # max, min, mean and std excluding values outside the percentiles
    mean[ch], std[ch], max_[ch], min_[ch] = _stats_within(t, p_0_01[ch], p_99_99[ch])
    mean1[ch], std1[ch], max1[ch], min1[ch] = _stats_within(t, p_0_1[ch], p_99_9[ch])

    data_dict = dict()
    data_dict['DATA_CONCAT'] = t
    data_dict['LABEL_CONCAT'] = label
    torch.save(data_dict, os.path.join(data_location_concat, "data_concat_CH" + str(ch) + ".pt"))

perc_dict = dict()
perc_dict['perc_0.01'] = p_0_01
perc_dict['perc_99.99'] = p_99_99
perc_dict['perc_0.1'] = p_0_1
perc_dict['perc_99.9'] = p_99_9
torch.save(perc_dict, os.path.join(data_location_concat_percentile, "percentiles.pt"))

data_dict = dict()
data_dict['perc_0.01'] = p_0_01
data_dict['perc_99.99'] = p_99_99
data_dict['MEAN'] = torch.Tensor(mean)
data_dict['STD'] = torch.Tensor(std)
data_dict['MAX'] = torch.Tensor(max_)
data_dict['MIN'] = torch.Tensor(min_)
torch.save(data_dict, os.path.join(data_location_concat_percentile, "mean_std_max_min__0_01_99_99.pt"))

data_dict = dict()
data_dict['perc_0.1'] = p_0_1
data_dict['perc_99.9'] = p_99_9
data_dict['MEAN'] = torch.Tensor(mean1)
data_dict['STD'] = torch.Tensor(std1)
data_dict['MAX'] = torch.Tensor(max1)
data_dict['MIN'] = torch.Tensor(min1)
torch.save(data_dict, os.path.join(data_location_concat_percentile, "mean_std_max_min__0_1_99_9.pt"))

In [None]:
# Create following files lists:
# - Whole dataset
# - Files with label=0
# - Files with label=1 or 2
# - Files in which specific channels are ON (max and min != 0)
# - Files in which specific channels are ON and with label=0
# - Files in which specific channels are ON and with label=1 or 2
#
# The label is read from the file name: character 11 is "0" for label=0,
# anything else is counted as label=1 or 2.

Path(files_list_location).mkdir(parents=True, exist_ok=True)

files = list()
for (dirpath, dirnames, filenames) in os.walk(data_location_processed):
    files += filenames
# os.walk yields names in arbitrary, filesystem-dependent order: sort them so the
# saved lists are identical on every run and machine.
files.sort()
print("Number of files: " + str(len(files)))

files_0 = [name for name in files if name[11:12] == "0"]
print("Number of files with label=0: " + str(len(files_0)))

files_1_2 = [name for name in files if name[11:12] != "0"]
print("Number of files with label=1 or label=2: " + str(len(files_1_2)))

data_dict = dict()
data_dict['files_list'] = files
data_dict['files_list_0'] = files_0
data_dict['files_list_1_2'] = files_1_2
torch.save(data_dict, os.path.join(files_list_location, "files_lists.pt"))

files_ON = list()
for file in tqdm(files, desc="Find ON"):
    dict_load = torch.load(os.path.join(data_location_processed, file))
    # A channel is OFF when all of its samples are zero, i.e. max(|x|) == 0
    # (the minimum of |x| is then necessarily 0 as well, so one check is enough).
    # The file is kept only if every selected channel is ON; all() stops at the first OFF one.
    if all(np.abs(dict_load['DATA'][ch]).max() != 0 for ch in channels):
        files_ON.append(file)
print("Number of files in which channels: " + str(channels) + " are ON: " + str(len(files_ON)))

files_0_ON = [name for name in files_ON if name[11:12] == "0"]
print("Number of files in which channels: " + str(channels) + " are ON and with label=0: " + str(len(files_0_ON)))

files_1_2_ON = [name for name in files_ON if name[11:12] != "0"]
print("Number of files in which channels: " + str(channels) + " are ON and with label=1 or 2: " + str(len(files_1_2_ON)))

data_dict = dict()
data_dict['channels'] = channels
data_dict['files_ON'] = files_ON
data_dict['files_0_ON'] = files_0_ON
data_dict['files_1_2_ON'] = files_1_2_ON
torch.save(data_dict, os.path.join(files_list_location, "files_lists_ON.pt"))

In [None]:
# Create following files lists:
# - Files with label = 0 and x days before and after every event
# - Files in which specific channels are ON, with label = 0 and x days before and after every event
# - Files in which specific channels are ON, with label = 0, x days before and after every event and without values outside the percentiles
#
# "x days" is expressed as a number of files (days_to_skip * files_day) and applied as
# an index window on the file list, so the list must be in a stable order.

Path(files_list_location).mkdir(parents=True, exist_ok=True)
skip_x_days = days_to_skip*files_day

files = list()
for (dirpath, dirnames, filenames) in os.walk(data_location_processed):
    files += filenames
# os.walk yields names in arbitrary, filesystem-dependent order; the window below is
# index-based, so the list has to be sorted first.
# NOTE(review): presumably sorted file names are chronological - confirm naming scheme.
files.sort()
print("Number of files: " + str(len(files)))

# events_before[k] = number of files among files[:k] whose label (character 11 of the
# name) is not "0". The number of events inside files[a:b] is then
# events_before[b] - events_before[a], which avoids rescanning the window for every file.
events_before = [0]
for name in files:
    events_before.append(events_before[-1] + (name[11:12] != "0"))

files_0_OK = list()
for i in tqdm(range(len(files)), desc="Files " + str(days_to_skip) + " days before and after"):
    # Symmetric window: skip_x_days files before i, file i itself, skip_x_days files after i.
    # (Conditional expressions are used instead of max()/min() on purpose: those builtins
    # may be shadowed by variables of the same name in a running kernel.)
    start = 0 if (i - skip_x_days < 0) else i - skip_x_days
    stop = len(files) if (i + skip_x_days + 1 > len(files)) else i + skip_x_days + 1
    if events_before[stop] - events_before[start] == 0:
        files_0_OK.append(files[i])
print("Number of files with label=0 and " + str(days_to_skip) + " days before and after any event: " + str(len(files_0_OK)))

files_0_OK_ON = list()
for file in tqdm(files_0_OK, desc="Find ON"):
    dict_load = torch.load(os.path.join(data_location_processed, file))
    file_ok = True
    for ch in channels:
        data_current = dict_load['DATA'][ch]
        # A channel whose samples are all zero is considered OFF
        if np.abs(data_current).max() == 0 and np.abs(data_current).min() == 0:
            file_ok = False
            break
    if file_ok:
        files_0_OK_ON.append(file)
print("Number of files in which channels: " + str(channels) + " are ON, with label=0 and " + str(days_to_skip) + " days before and after any event: " + str(len(files_0_OK_ON)))

# Per-channel percentiles written by the concatenation cell
perc = torch.load(os.path.join(data_location_concat_percentile, "percentiles.pt"))

files_0_OK_ON_wo_percentiles = list()
for file in tqdm(files_0_OK_ON, desc="Find without values outside the percentiles"):
    load = torch.load(os.path.join(data_location_processed, file))
    file_ok = True
    for ch in channels:
        data = load['DATA'][ch]
        # Reject the file as soon as one selected channel leaves the [0.01, 99.99] percentile range
        if (data.min() < perc['perc_0.01'][ch] or data.max() > perc['perc_99.99'][ch]):
            file_ok = False
            break
    if file_ok:
        files_0_OK_ON_wo_percentiles.append(file)
print("Number of files in which channels: " + str(channels) + " are ON, with label=0, " + str(days_to_skip) + " days before and after any event and without values outside the percentiles: " + str(len(files_0_OK_ON_wo_percentiles)))

data_dict = dict()
data_dict['channels'] = channels
data_dict['files_0_' + str(days_to_skip) + 'days'] = files_0_OK
data_dict['files_0_ON_' + str(days_to_skip) + 'days'] = files_0_OK_ON
data_dict['files_0_ON_' + str(days_to_skip) + 'days_wo_perc'] = files_0_OK_ON_wo_percentiles
torch.save(data_dict, os.path.join(files_list_location, "files_lists_0_ON_" + str(days_to_skip) + "days_wo_perc.pt"))

In [None]:
# Create the list of label=0 files (selected channels ON) that did NOT pass the
# filters of the previous cell, i.e. every file of 'files_0_ON' that is missing from
# the 'files_0_ON_<x>days_wo_perc' list.

Path(files_list_location).mkdir(parents=True, exist_ok=True)

load = torch.load(os.path.join(files_list_location, "files_lists_0_ON_" + str(days_to_skip) + "days_wo_perc.pt"))
files_0_OK_ON_wo_percentiles = load['files_0_ON_' + str(days_to_skip) + 'days_wo_perc']

load = torch.load(os.path.join(files_list_location, "files_lists_ON.pt"))
files_0_ON = load['files_0_ON'] # ALL 0 label files when ON

# A set difference has no defined order (and string hashing changes between runs),
# so sort the result to make the saved list reproducible.
files_excluded = sorted(set(files_0_ON) - set(files_0_OK_ON_wo_percentiles))

print("Number of files in which channels: " + str(channels) + " are ON, with label=0, or inside " + str(days_to_skip) + " days before and after any event or with values inside the percentiles: " + str(len(files_excluded)))

data_dict = dict()
data_dict['files_0_excluded_ON'] = files_excluded
torch.save(data_dict, os.path.join(files_list_location, "files_0_excluded_ON.pt"))

In [None]:
# Create files lists for train/validation and test
#
# - train/validation: the filtered label=0 files, truncated to a whole number of days,
#   shuffled with a fixed seed and split according to `ratio`
# - test: files with label=1 or 2, label=0 files rejected by the filters, and the
#   label=0 files dropped by the truncation

Path(dataset_files_list).mkdir(parents=True, exist_ok=True)


def _save_files_list(folder, name, key, items):
    """Save {key: items} as both <name>.pt and <name>.json inside `folder`."""
    payload = {key: items}
    torch.save(payload, os.path.join(folder, name + ".pt"))
    with open(os.path.join(folder, name + ".json"), "w") as write_file:
        json.dump(payload, write_file)


load = torch.load(os.path.join(files_list_location, "files_lists_0_ON_" + str(days_to_skip) + "days_wo_perc.pt"))
files_0_OK_ON_wo_percentiles = load['files_0_ON_' + str(days_to_skip) + 'days_wo_perc']

# Truncate the number of files to a multiple of files_day.
# Floor division is required: round() can round UP past the end of the list, in which
# case nothing is truncated and the remainder never reaches the test set.
round_files = files_day * (len(files_0_OK_ON_wo_percentiles) // files_day)
train_val_files = files_0_OK_ON_wo_percentiles[:round_files]
excluded_train_val_files = files_0_OK_ON_wo_percentiles[round_files:]

# Seeded shuffle so the train/validation split is reproducible across runs
split_seed = 0
Random(split_seed).shuffle(train_val_files)
split = ratio/100
split_index = math.floor(len(train_val_files) * split)
trainingSet = train_val_files[:split_index]
validationSet = train_val_files[split_index:]

# Test set is formed by files with label=1 or 2, label=0 excluded due to conditions above,
# label=0 excluded by rounding training and validation set

files_1_2_ON = torch.load(os.path.join(files_list_location, "files_lists_ON.pt"))['files_1_2_ON']
files_excluded = torch.load(os.path.join(files_list_location, "files_0_excluded_ON.pt"))['files_0_excluded_ON']
testSet = sorted(files_1_2_ON + files_excluded + excluded_train_val_files)

print("Number of files of trainingSet: " + str(len(trainingSet)))
print("Number of files of validationSet: " + str(len(validationSet)))
print("Number of files of testSet: " + str(len(testSet)))

# if testSet is too big (1year = 52560 files) split it in different lists
files_year = files_day*365
if len(testSet) > files_year:
    testSet_file_list = os.path.join(dataset_files_list, "testSet/")
    Path(testSet_file_list).mkdir(parents=True, exist_ok=True)
    big_testSet = [testSet[x:x + files_year] for x in range(0, len(testSet), files_year)]
    print("\tTest set is bigger than 1 year (" + str(files_year) + " files) and is splitted in:")
    for i, chunk in enumerate(big_testSet):
        print("\t\tNumber of files of testSet" + str(i) + ": " + str(len(chunk)))
        _save_files_list(testSet_file_list, "testSet" + str(i), 'testSet', chunk)

# The complete lists are always saved (sorted), in both .pt and .json form
_save_files_list(dataset_files_list, "trainingSet", 'trainingSet', sorted(trainingSet))
_save_files_list(dataset_files_list, "validationSet", 'validationSet', sorted(validationSet))
_save_files_list(dataset_files_list, "testSet", 'testSet', sorted(testSet))

In [None]:
# Calculate mean and std of trainingSet, one selected channel at a time, and save
# them as normalize_params.pt / normalize_params.json.
# mean[k] and std[k] refer to channel channels[k].

Path(dataset_files_list).mkdir(parents=True, exist_ok=True)

mean, std = [[None for i in range(len(channels))] for i in range(2)]
# Paths are built with os.path.join (as in the other cells) so they do not depend on
# the configured directories ending with a "/".
trainingSet = torch.load(os.path.join(dataset_files_list, "trainingSet.pt"))['trainingSet']

for ch in tqdm(range(len(channels)), desc="Channels"):
    t = []
    # Concatenate this channel over every training file before computing the statistics
    for file in tqdm(trainingSet, desc="Files trainingSet"):
        load = torch.load(os.path.join(data_location_processed, file))
        t.append(load['DATA'][channels[ch]][0])
    t = torch.cat(t, 0)
    mean[ch] = t.mean().item()
    std[ch] = t.std().item()

data_dict = dict()
data_dict['mean'] = mean
data_dict['std'] = std
torch.save(data_dict, os.path.join(dataset_files_list, "normalize_params.pt"))
with open(os.path.join(dataset_files_list, "normalize_params.json"), "w") as write_file:
    json.dump(data_dict, write_file)