In [40]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import regex as re
from tqdm import tqdm

# Data Reading

In [2]:
data = pd.read_pickle("./WESAD/S2/S2.pkl")

In [3]:
data.keys()

dict_keys(['signal', 'label', 'subject'])

In [4]:
data['signal'].keys()

dict_keys(['chest', 'wrist'])

In [5]:
data['signal']['chest'].keys()

dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])

In [6]:
data['signal']['wrist'].keys()

dict_keys(['ACC', 'BVP', 'EDA', 'TEMP'])

In [7]:
len(data['label']) # This is taken at 700Hz

4255300

In [8]:
len(data['signal']['chest']['ACC']) # This one is also sampled at 700Hz

# Notice we have a total of 6079 seconds of samples, which is the amount of samples divided by the frequency.

4255300

In [9]:
len(data['signal']['wrist']['ACC']) # Here instead is sampled at 32Hz

194528

# Data Encoding
We would like to encode our data into a pyarrow tables. This will allow us to save our files into parquet which will be later read into apache spark from HDFS.

In [49]:
# First we need to decide what columns to make. These will be our features.
# We will ignore wrist signals for the moment and will see later if we can use them.
print('FEATURES: \n' + str(data['signal']['chest'].keys())) # Features
print('DATA STRUCTURE: \n' + str(data['signal']['chest']))        # Some data
print('LABELS: \n' + str(data['label']))                    # Labels

FEATURES: 
dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
DATA STRUCTURE: 
{'ACC': array([[ 0.95539999, -0.222     , -0.55799997],
       [ 0.92579997, -0.2216    , -0.55379999],
       [ 0.90820003, -0.21960002, -0.53920001],
       ...,
       [ 0.87179995, -0.12379998, -0.30419999],
       [ 0.87300003, -0.12339997, -0.30260003],
       [ 0.87020004, -0.12199998, -0.30220002]]), 'ECG': array([[ 0.02142334],
       [ 0.02032471],
       [ 0.01652527],
       ...,
       [-0.00544739],
       [ 0.00013733],
       [ 0.0040741 ]]), 'EMG': array([[-0.00444031],
       [ 0.00434875],
       [ 0.00517273],
       ...,
       [-0.01716614],
       [-0.02897644],
       [-0.02357483]]), 'EDA': array([[5.25054932],
       [5.26733398],
       [5.24330139],
       ...,
       [0.36048889],
       [0.36582947],
       [0.365448  ]]), 'Temp': array([[30.120758],
       [30.129517],
       [30.138214],
       ...,
       [31.459229],
       [31.484283],
       [31.456268]], dtype=float3

In [51]:
# Split the multi-value column in three different features
ACC = np.squeeze(np.split(data['signal']['chest']['ACC'], indices_or_sections=3, axis=1))

In [54]:
# Create a dictionary squeezing the data (this is due to their bi-dimensional nature, we remove the empty dimension)
data_dict = {
    'ACC_1': ACC[0],
    'ACC_2': ACC[1],
    'ACC_3': ACC[2],
    'ECG': np.squeeze(data['signal']['chest']['ECG']),
    'EMG': np.squeeze(data['signal']['chest']['EMG']),
    'EDA': np.squeeze(data['signal']['chest']['EDA']),
    'Temp': np.squeeze(data['signal']['chest']['Temp']),
    'Resp': np.squeeze(data['signal']['chest']['Resp']),
    'label': data['label']
}

data_dict

{'ACC_1': array([0.95539999, 0.92579997, 0.90820003, ..., 0.87179995, 0.87300003,
        0.87020004]),
 'ACC_2': array([-0.222     , -0.2216    , -0.21960002, ..., -0.12379998,
        -0.12339997, -0.12199998]),
 'ACC_3': array([-0.55799997, -0.55379999, -0.53920001, ..., -0.30419999,
        -0.30260003, -0.30220002]),
 'ECG': array([ 0.02142334,  0.02032471,  0.01652527, ..., -0.00544739,
         0.00013733,  0.0040741 ]),
 'EMG': array([-0.00444031,  0.00434875,  0.00517273, ..., -0.01716614,
        -0.02897644, -0.02357483]),
 'EDA': array([5.25054932, 5.26733398, 5.24330139, ..., 0.36048889, 0.36582947,
        0.365448  ]),
 'Temp': array([30.120758, 30.129517, 30.138214, ..., 31.459229, 31.484283,
        31.456268], dtype=float32),
 'Resp': array([-1.14898682, -1.12457275, -1.15203857, ..., -1.10321045,
        -1.08642578, -1.09710693]),
 'label': array([0, 0, 0, ..., 0, 0, 0])}

In [55]:
table = pa.Table.from_pydict(data_dict)
table.schema

ACC_1: double
ACC_2: double
ACC_3: double
ECG: double
EMG: double
EDA: double
Temp: float
Resp: double
label: int32

In [41]:
pq.write_table(table, './WESAD/S2/S2.parquet')

# Automatize Process
Now we want to repeat the steps we did before for every data file.

In [69]:
def write_data(path_to_pkl, path_to_parquet):
    """Function that given the path to a pickle file, loads into memory, squeeze some features and writes a pyarrow table to parquet file

    Args:
        path_to_pkl (string): the path to the pickle file
        path_to_parquet (_type_): that path to the parquet file that will be saved
    """

    data = pd.read_pickle(path_to_pkl)
    
    # Split the multi-value column in three different features
    ACC = np.squeeze(np.split(data['signal']['chest']['ACC'], indices_or_sections=3, axis=1))

    # Build the dictionary squeezing the features
    data_dict = {
        'ACC_1': ACC[0],
        'ACC_2': ACC[1],
        'ACC_3': ACC[2],
        'ECG': np.squeeze(data['signal']['chest']['ECG']),
        'EMG': np.squeeze(data['signal']['chest']['EMG']),
        'EDA': np.squeeze(data['signal']['chest']['EDA']),
        'Temp': np.squeeze(data['signal']['chest']['Temp']),
        'Resp': np.squeeze(data['signal']['chest']['Resp']),
        'label': data['label']
    }

    # Create pyarrow table
    table = pa.Table.from_pydict(data_dict)

    # Write parquet file into memory
    pq.write_table(table, path_to_parquet)

In [72]:
file_names = os.listdir('./WESAD/')
for file, i in zip(file_names, tqdm(range(len(file_names)))):
    if re.match(r'S[0-9]', file) != None:
        write_data(f'./WESAD/{file}/{file}.pkl', f'./WESAD/parquet_data/{file}.parquet')

 94%|█████████▍| 16/17 [02:16<00:08,  8.56s/it]
