In [1]:
%pip install -q \
  antropy \
  openpyxl \
  pandas \
  scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Folder the raw dataset is extracted into; the sibling '<name>.zip' archive
# is unpacked when this folder does not exist yet.
RAW_DATA_DIR: str = '../datasets/swell/raw'

# Base name (without extension) of the zip/csv written by the save cell.
TRANSFORMED_DATA_OUTPUT_NAME: str = 'test-custom-1'

# Fraction of the extracted feature rows kept by the random sub-sampling step.
TRANSFORM_RATE: float = 0.1

# Sampling rate of the RR-interval series, in samples per second.
SAMPLE_RATE_HZ: int = 4

In [3]:
# set up the environment
import pandas as pd

# let wide feature tables render up to 128 columns before being truncated
pd.options.display.max_columns = 128

In [4]:
# prepare the raw data
from pathlib import Path
import pandas as pd
from zipfile import ZipFile

# extract the compressed raw data files
raw_data_dir = Path(RAW_DATA_DIR)

if raw_data_dir.exists():
  print('Raw data files were already extracted')
else:
  # The archive sits next to the target folder ('<raw_data_dir>.zip').
  # Opening it in a `with` block guarantees the file handle is released
  # even if extraction fails part-way through.
  # NOTE(review): members are extracted relative to '..', so the archive is
  # presumably expected to contain 'datasets/swell/raw/...' paths -- confirm.
  with ZipFile(raw_data_dir.with_suffix('.zip'), 'r') as raw_archive:
    raw_archive.extractall('..')
  print('Raw data files have been extracted')

# load the data
print(f'Loading raw data files from "{raw_data_dir}"')

# combine the label data
# sheet_name=None makes read_excel return every worksheet, keyed by sheet name
label_sheets = pd.read_excel(
  io=raw_data_dir.joinpath('labels/hrv stress labels.xlsx'),
  index_col=0,
  sheet_name=None,
)
label_columns = ['subject', 'ElapsedTime', 'label']

# stack all worksheets, keep only the columns needed downstream and key the
# rows by (subject, ElapsedTime) so they can be joined with the RRI data
LABEL_DATA = (
  pd.concat(objs=label_sheets, copy=False)
  .loc[:, label_columns]
  .set_index(['subject', 'ElapsedTime'])
  .sort_index()
  .astype({'label': 'category'})
)
print('Label data has been loaded:')
display(LABEL_DATA)

# combine the RRI data
# one CSV per subject: the file stem becomes the 'subject' column and the
# 'Time' column is renamed to 'ElapsedTime (sec)'
RRI_DATA = (
  pd.concat(
    objs=[
      pd.read_csv(csv_path)
      .rename(columns={'Time': 'ElapsedTime (sec)'})
      .assign(subject=csv_path.stem)
      for csv_path in raw_data_dir.glob('rri/*.csv')
    ],
    copy=False,
  )
  # truncate elapsed seconds to whole minutes so the rows share a join key
  # with the label table
  .assign(
    ElapsedTime=lambda frame: (frame['ElapsedTime (sec)'] / 60).astype(int),
  )
  .set_index(['subject', 'ElapsedTime'])
  .sort_index()
)

print('RR interval data has been loaded:')
display(RRI_DATA)

Raw data files were already extracted
Loading raw data files from "../datasets/swell/raw"
Label data has been loaded:


Unnamed: 0_level_0,Unnamed: 1_level_0,label
subject,ElapsedTime,Unnamed: 2_level_1
p1,0,rest
p1,1,rest
p1,2,rest
p1,3,rest
p1,4,rest
...,...,...
p9,158,interruption
p9,159,interruption
p9,160,interruption
p9,161,interruption


RR interval data has been loaded:


Unnamed: 0_level_0,Unnamed: 1_level_0,ElapsedTime (sec),rri
subject,ElapsedTime,Unnamed: 2_level_1,Unnamed: 3_level_1
p1,0,1.265625,870.11719
p1,0,1.515625,885.36996
p1,0,1.765625,890.18974
p1,0,2.015625,886.73851
p1,0,2.265625,877.17820
...,...,...,...
p9,127,7644.475100,884.87308
p9,127,7644.725100,903.45423
p9,127,7644.975100,920.95615
p9,127,7645.225100,936.70456


In [5]:
# merge label and RRI data
# The whole pipeline is one chain so that no column is ever assigned on a
# boolean-filtered frame (which triggers SettingWithCopyWarning) and nothing
# is mutated in place after the frame has been created.
MERGED_DATA = (
  pd.merge(
    left=LABEL_DATA,
    right=RRI_DATA,
    left_index=True,
    right_index=True,
    how='inner',
    copy=False,
  )
  # keep only 'subject' as the row index
  .droplevel(level='ElapsedTime', axis=0)
  .sort_index(axis=1)
  # discard the 'rest' recordings ...
  .loc[lambda merged: merged['label'] != 'rest']
  # ... and drop 'rest' from the categorical's category list as well
  .assign(label=lambda merged: merged['label'].cat.remove_unused_categories())
  .drop(columns='ElapsedTime (sec)')
)

# subject data chunks
MERGED_DATA

Unnamed: 0_level_0,label,rri
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
p1,no stress,732.20732
p1,no stress,709.33594
p1,no stress,690.18264
p1,no stress,683.69979
p1,no stress,687.40428
...,...,...
p9,interruption,884.87308
p9,interruption,903.45423
p9,interruption,920.95615
p9,interruption,936.70456


In [6]:
# split the merged data into raw JSON data
import json
from pathlib import Path

# resolve the output folder once instead of on every iteration
rri_json_dir = Path(RAW_DATA_DIR).joinpath('rri')

# 'label' is categorical: observed=True restricts the groups to
# subject/label combinations that actually occur and avoids relying on the
# deprecated implicit default of the `observed` argument.
for (subject, label), data in MERGED_DATA.groupby(
  ['subject', 'label'],
  observed=True,
):
  # file name pattern: '<subject>-<label with spaces replaced by dashes>.json'
  subject_label = subject + '-' + label.replace(' ', '-')
  raw_json_path = rri_json_dir.joinpath(f'{subject_label}.json')
  # NOTE(review): 'category' is written as 'stress' for every label,
  # including 'no stress' -- confirm this is what the consumer expects.
  raw_json_path.write_text(json.dumps({
    'category': 'stress',
    'rr_intervals': data['rri'].to_list(),
  }))

In [7]:
# transform label and RRI data
from multiprocess.pool import Pool
import pandas as pd
from tqdm import tqdm
import utils.hrv_feature_extraction as hfe

# SAMPLE_RATE_HZ is taken from the configuration cell at the top of the
# notebook; it is deliberately not redefined here so that changing the
# configuration actually affects the window size.
# NOTE(review): 300 is presumably the window length in seconds (5 minutes),
# which multiplied by the sample rate gives the window length in samples.
SAMPLE_WINDOW_SECONDS = 300
SAMPLE_WINDOW_SIZE = SAMPLE_WINDOW_SECONDS * SAMPLE_RATE_HZ

def extract_features_from_data(data: pd.DataFrame):
  """Build one feature row per window of a frame with 'rri' and 'label' columns.

  Windows are produced by ``hfe.get_window_iterator`` with a window size of
  ``SAMPLE_WINDOW_SIZE``. For each window the 'rri' values are passed to
  ``hfe.extract_hrv_features_from_rri_window`` and the most frequent label
  inside the window is stored under the 'condition' key.

  Returns a DataFrame with one row per window.
  """
  feature_rows = []
  windows = hfe.get_window_iterator(
    values=data,
    window_size=SAMPLE_WINDOW_SIZE,
  )
  for window in windows:
    row = hfe.extract_hrv_features_from_rri_window(
      rri_window=window['rri'].values,
    )
    # a window may contain several labels; keep the one that occurs most often
    label_counts = window['label'].value_counts()
    row['condition'] = label_counts.idxmax()
    feature_rows.append(row)
  return pd.DataFrame(feature_rows)

# one frame per subject, so that every worker processes a whole subject
data = [subject_frame for _, subject_frame in MERGED_DATA.groupby('subject')]

with Pool() as pool:
  # imap yields the per-subject feature frames in submission order; tqdm only
  # wraps the iterator to report progress
  feature_frames = tqdm(
    pool.imap(func=extract_features_from_data, iterable=data),
    total=len(data),
  )
  TRANSFORMED_DATA = pd.concat(
    objs=feature_frames,
    copy=False,
    ignore_index=True,
  )

# keep a reproducible random fraction of the rows and renumber them
TRANSFORMED_DATA = (
  TRANSFORMED_DATA
  .sample(frac=TRANSFORM_RATE, random_state=123)
  .reset_index(drop=True)
  .astype({'condition': 'category'})
)
display(TRANSFORMED_DATA)

# drop the reference to the per-subject frames so they can be garbage collected
data = None

100%|██████████| 23/23 [03:03<00:00,  7.97s/it]


Unnamed: 0,MEAN_RR,MEDIAN_RR,SDRR,RMSSD,SDSD,SDRR_RMSSD,HR,pNN25,pNN50,KURT,SKEW,SD1,SD2,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR,VLF,VLF_PCT,LF,LF_PCT,LF_NU,HF,HF_PCT,HF_NU,TP,LF_HF,HF_LF,sampen,higuci,datasetId,condition
0,925.973041,925.025710,90.448194,12.474020,12.473995,7.250926,64.796703,4.920767,0.166806,-0.675211,-0.230193,8.820447,127.608586,0.000027,-0.000747,0.013945,0.005296,0.005296,2.633378,1.234719,0.390740,3535.805739,85.170236,611.477121,14.729217,99.321997,4.174135,0.100546,0.678003,4151.456995,146.491951,0.006826,2.181946,1.143381,2,time pressure
1,1076.515693,637.400120,564.783533,11.314435,11.296663,49.917079,55.735370,2.001668,0.750626,-1.802077,0.375396,7.987947,798.684588,-0.000635,-0.000351,0.011469,0.005695,0.005695,2.013698,22.463016,-3.076520,14597.515794,99.100656,130.876556,0.888504,98.794691,1.596712,0.010840,1.205309,14729.989062,81.966294,0.012200,0.828993,1.131304,2,no stress
2,766.189745,768.997310,43.452527,9.331512,9.331464,4.656537,78.309584,0.917431,0.000000,-0.228441,-0.169693,6.598341,61.095876,-0.000037,-0.000109,0.012298,0.007727,0.007727,1.591680,0.236484,-0.044622,885.520380,77.420800,220.140265,19.246802,85.241292,38.115166,3.332398,14.758708,1143.775810,5.775661,0.173140,2.194468,1.406244,2,interruption
3,1101.564476,647.894545,558.199446,11.445375,11.434589,48.770743,54.467987,2.001668,0.750626,-1.827185,0.280964,8.085475,789.371818,-0.000616,-0.000291,0.011450,0.005677,0.005677,2.016836,22.638796,-3.102406,10425.271517,99.245099,78.602481,0.748269,99.121481,0.696658,0.006632,0.878519,10504.570655,112.827940,0.008863,0.925353,1.122202,2,no stress
4,1006.461402,1007.827650,86.938644,21.506201,21.506105,4.042492,59.614805,27.439533,1.084237,0.117197,-0.327944,15.207112,122.005735,0.000070,0.000486,0.021831,0.010566,0.010566,2.066241,-0.221181,-0.110189,3652.140818,70.310476,1531.669251,29.487470,99.319445,10.495282,0.202054,0.680555,5194.305350,145.938839,0.006852,2.191665,1.340752,2,interruption
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39255,668.029882,661.452990,42.751319,8.520780,8.520639,5.017301,89.816341,1.167640,0.000000,3.898732,1.440920,6.025002,60.158540,0.000076,0.000276,0.012708,0.008201,0.008201,1.549650,2.425193,-0.158177,494.288795,66.657028,210.299343,28.359796,85.054793,36.952265,4.983176,14.945207,741.540403,5.691108,0.175713,1.985269,1.314885,2,no stress
39256,1141.242442,1268.573750,419.921721,18.718405,18.712000,22.433627,52.574280,15.012510,2.919099,-0.871343,-0.495696,13.231382,593.711575,0.000485,-0.000105,0.026613,0.017214,0.017214,1.545985,10.944985,-0.710971,43411.642401,99.252249,326.383279,0.746212,99.794153,0.673236,0.001539,0.205847,43738.698916,484.797485,0.002063,1.838795,1.111082,2,time pressure
39257,769.308096,766.139865,59.214160,9.335741,9.335682,6.342738,77.992160,1.334445,0.000000,-0.540895,0.174771,6.601324,83.480872,-0.000044,-0.000432,0.012180,0.005306,0.005306,2.295692,0.134157,0.370290,2945.248118,86.298060,457.476590,13.404420,97.828625,10.154016,0.297521,2.171375,3412.878724,45.053759,0.022196,2.211720,1.182334,2,interruption
39258,997.487882,1006.964800,101.327783,19.960864,19.960863,5.076323,60.151107,23.185988,0.750626,0.556747,-0.707483,14.114461,142.602319,-0.000007,-0.000033,0.020096,0.010049,0.010049,1.999791,-0.088426,-0.004106,2643.733169,60.085952,1745.125087,39.662665,99.370189,11.060646,0.251383,0.629811,4399.918902,157.777867,0.006338,2.102354,1.354164,2,interruption


In [8]:
# save the compressed and transformed data
from pathlib import Path
from zipfile import ZipFile, ZIP_DEFLATED

transformed_data_output_path = Path('../datasets/swell/final').joinpath(
  TRANSFORMED_DATA_OUTPUT_NAME
)

save_zipfile_path = transformed_data_output_path.with_suffix('.zip')

# make sure the destination folder exists so the cell also works on a fresh
# checkout
save_zipfile_path.parent.mkdir(parents=True, exist_ok=True)

with ZipFile(
  file=save_zipfile_path,
  mode='w',
  compression=ZIP_DEFLATED,
  compresslevel=9,
) as comp_file:
  # The member's write handle must be closed before the archive itself is
  # closed; the inner `with` does that explicitly instead of relying on the
  # handle being garbage collected in time.
  # NOTE(review): the member name keeps the relative '../datasets/...' path,
  # presumably so the archive can be unpacked the same way as the raw data
  # archive -- confirm before changing it.
  with comp_file.open(
    name=str(transformed_data_output_path.with_suffix('.csv')),
    mode='w',
  ) as csv_member:
    TRANSFORMED_DATA.to_csv(
      index=False,
      path_or_buf=csv_member,
    )
  print(f'Transformed data has been saved to {save_zipfile_path}')

Transformed data has been saved to ../datasets/swell/final/test-custom-1.zip


In [9]:
def data_memory_mb(data):
  """Return the deep memory footprint of a DataFrame in MiB.

  The per-column byte counts (index included, object contents measured via
  ``deep=True``) are summed and divided by 1024 * 1024.
  """
  bytes_per_mb = 1024 * 1024
  total_bytes = data.memory_usage(deep=True).sum()
  return total_bytes / bytes_per_mb

# memory footprint (MiB) of each pipeline stage, in processing order
[
  data_memory_mb(stage_frame)
  for stage_frame in (LABEL_DATA, RRI_DATA, MERGED_DATA, TRANSFORMED_DATA)
]

[0.08281803131103516, 28.89668846130371, 27.50631618499756, 10.52141284942627]