In [2]:
# Third-party packages this notebook needs. `multiprocess` and `tqdm` are
# imported by the transformation cell further down, so they are installed
# here as well to keep a fresh environment runnable.
%pip install -q \
  antropy \
  multiprocess \
  openpyxl \
  pandas \
  scipy \
  tqdm

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Directory holding the extracted raw files (`labels/` and `rri/` are read
# from it); when it is missing, the archive with the same name plus '.zip'
# is unpacked instead.
RAW_DATA_DIR = '../datasets/swell/raw'
# Base name (without extension) of the transformed dataset archive that is
# written under '../datasets/swell/final'.
TRANSFORMED_DATA_OUTPUT_NAME = 'test-custom-1'

In [4]:
# notebook-wide display settings
import pandas as pd
# render up to 128 columns before pandas starts eliding wide frames
pd.options.display.max_columns = 128

In [5]:
# prepare the raw data
from pathlib import Path
import pandas as pd
from zipfile import ZipFile

# extract the compressed raw data files
raw_data_dir = Path(RAW_DATA_DIR)

if raw_data_dir.exists():
  print('Raw data files were already extracted')
else:
  # The archive sits next to the target directory ("<raw dir>.zip").
  # Opening it in a context manager releases the file handle even when
  # extraction fails part-way.
  with ZipFile(raw_data_dir.with_suffix('.zip'), 'r') as raw_archive:
    # NOTE(review): members are unpacked relative to '..' (resolved against
    # the current working directory) - presumably the archive stores paths
    # beginning with 'datasets/'; confirm.
    raw_archive.extractall('..')
  print('Raw data files have been extracted')

# load the data
print(f'Loading raw data files from "{raw_data_dir}"')

# combine the label data
# `sheet_name=None` makes read_excel return every worksheet (as a dict of
# frames); concat stacks them into one frame.
label_sheets = pd.read_excel(
  io=raw_data_dir.joinpath('labels/hrv stress labels.xlsx'),
  index_col=0,
  sheet_name=None,
)
label_columns = ['subject', 'ElapsedTime', 'label']
LABEL_DATA = pd.concat(objs=label_sheets, copy=False)[label_columns]
# key the labels by (subject, ElapsedTime), the same index the RRI data uses
LABEL_DATA = LABEL_DATA.set_index(['subject', 'ElapsedTime']).sort_index()
LABEL_DATA['label'] = LABEL_DATA['label'].astype('category')
print('Label data has been loaded:')
display(LABEL_DATA)

# combine the RRI data
# One CSV per subject: the file stem becomes the subject id and the `Time`
# column is renamed so its unit is explicit.
RRI_DATA = (
  pd.concat(
    objs=[
      pd.read_csv(rri_csv_path)
      .assign(subject=rri_csv_path.stem)
      .rename(columns={'Time': 'ElapsedTime (sec)'})
      for rri_csv_path in raw_data_dir.glob('rri/*.csv')
    ],
    copy=False,
  )
  # seconds truncated to whole minutes, giving the second index level
  .assign(ElapsedTime=lambda frame: (frame['ElapsedTime (sec)'] / 60).astype(int))
  .set_index(['subject', 'ElapsedTime'])
  .sort_index()
)

print('RR interval data has been loaded:')
display(RRI_DATA)

Raw data files were already extracted
Loading raw data files from "../datasets/swell/raw"
Label data has been loaded:


Unnamed: 0_level_0,Unnamed: 1_level_0,label
subject,ElapsedTime,Unnamed: 2_level_1
p1,0,rest
p1,1,rest
p1,2,rest
p1,3,rest
p1,4,rest
...,...,...
p9,158,interruption
p9,159,interruption
p9,160,interruption
p9,161,interruption


RR interval data has been loaded:


Unnamed: 0_level_0,Unnamed: 1_level_0,ElapsedTime (sec),rri
subject,ElapsedTime,Unnamed: 2_level_1,Unnamed: 3_level_1
p1,0,1.265625,870.11719
p1,0,1.515625,885.36996
p1,0,1.765625,890.18974
p1,0,2.015625,886.73851
p1,0,2.265625,877.17820
...,...,...,...
p9,127,7644.475100,884.87308
p9,127,7644.725100,903.45423
p9,127,7644.975100,920.95615
p9,127,7645.225100,936.70456


In [6]:
# merge label and RRI data
# The inner join on the shared (subject, ElapsedTime) index keeps only rows
# whose key is present in both frames.
MERGED_DATA = (
  pd.merge(
    left=LABEL_DATA,
    right=RRI_DATA,
    how='inner',
    left_index=True,
    right_index=True,
    copy=False,
  )
  .droplevel(level='ElapsedTime', axis=0)
  .sort_index(axis=1)
  # discard the 'rest' rows, then prune categories that no longer occur
  .loc[lambda merged: merged['label'] != 'rest']
  .assign(label=lambda merged: merged['label'].cat.remove_unused_categories())
  .drop(columns='ElapsedTime (sec)')
)

# subject data chunks
MERGED_DATA

Unnamed: 0_level_0,label,rri
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
p1,no stress,732.20732
p1,no stress,709.33594
p1,no stress,690.18264
p1,no stress,683.69979
p1,no stress,687.40428
...,...,...
p9,interruption,884.87308
p9,interruption,903.45423
p9,interruption,920.95615
p9,interruption,936.70456


In [7]:
# Sampling rate of the RRI series in Hz (the `ElapsedTime (sec)` column in
# the loaded data advances in 0.25 s steps).
SAMPLE_FREQ_HZ = 4

In [8]:
# transform label and RRI data
from multiprocess.pool import Pool
import pandas as pd
from tqdm import tqdm
import utils.hrv_feature_extraction as hfe

# NOTE(review): SAMPLE_FREQ_HZ is also assigned in the preceding cell; keep
# both values in sync.
SAMPLE_FREQ_HZ = 4
# samples per feature window: 300 s of signal at SAMPLE_FREQ_HZ
SAMPLE_WINDOW_SIZE = SAMPLE_FREQ_HZ * 300
# number of feature rows sampled into the final dataset
TRANSFORMED_DATA_ROWS = 41_000

def extract_features_from_subject(subject: pd.DataFrame) -> pd.DataFrame:
  """Build one row of HRV features per window of a single subject's data.

  Args:
    subject: rows belonging to one subject; the `rri` and `label` columns
      are read from every window.

  Returns:
    A frame with one row per window yielded by `hfe.get_window_iterator`,
    holding the extracted features plus a `condition` column.
  """
  rows = []
  windows = hfe.get_window_iterator(
    values=subject,
    window_size=SAMPLE_WINDOW_SIZE,
  )
  for chunk in windows:
    row = hfe.extract_hrv_features_from_rri_window(
      rri_window=chunk['rri'].values,
      fs=SAMPLE_FREQ_HZ,
    )
    # a window's condition is its most frequent label
    row['condition'] = chunk['label'].value_counts().idxmax()
    rows.append(row)
  return pd.DataFrame(rows)

# one frame per subject, in groupby order
subjects = [subject_rows for _, subject_rows in MERGED_DATA.groupby('subject')]

with Pool() as pool:
  # feature extraction runs in the pool's workers; tqdm wraps the result
  # iterator to report one tick per finished subject
  progress = tqdm(
    pool.imap(func=extract_features_from_subject, iterable=subjects),
    total=len(subjects),
  )
  # `sample` draws without replacement by default, so the concatenated
  # frame must hold at least TRANSFORMED_DATA_ROWS rows; the fixed
  # random_state keeps the draw reproducible
  TRANSFORMED_DATA = (
    pd.concat(objs=progress, copy=False, ignore_index=True)
    .sample(n=TRANSFORMED_DATA_ROWS, random_state=123)
    .reset_index(drop=True)
  )
  TRANSFORMED_DATA['condition'] = TRANSFORMED_DATA['condition'].astype('category')
  display(TRANSFORMED_DATA)
  # drop the reference to the per-subject frames
  subjects = None

100%|██████████| 23/23 [03:08<00:00,  8.17s/it]


Unnamed: 0,MEAN_RR,MEDIAN_RR,SDRR,RMSSD,SDSD,SDRR_RMSSD,HR,pNN25,pNN50,KURT,SKEW,SD1,SD2,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR,VLF,VLF_PCT,LF,LF_PCT,LF_NU,HF,HF_PCT,HF_NU,TP,LF_HF,HF_LF,sampen,higuci,datasetId,condition
0,925.973041,925.025710,90.448194,12.474020,12.473995,7.250926,64.796703,4.920767,0.166806,-0.675211,-0.230193,8.820447,127.608586,0.000027,-0.000747,0.013945,0.005296,0.005296,2.633378,1.234719,0.390740,4013.932766,85.346729,683.282549,14.528378,99.147676,5.873849,0.124893,0.852324,4703.089164,116.326212,0.008597,2.181946,1.143381,2,time pressure
1,1076.515693,637.400120,564.783533,11.314435,11.296663,49.917079,55.735370,2.001668,0.750626,-1.802077,0.375396,7.987947,798.684588,-0.000635,-0.000351,0.011469,0.005695,0.005695,2.013698,22.463016,-3.076520,23081.254383,99.020709,220.413401,0.945594,96.559010,7.854682,0.033697,3.440990,23309.522465,28.061404,0.035636,0.828993,1.131304,2,no stress
2,766.189745,768.997310,43.452527,9.331512,9.331464,4.656537,78.309584,0.917431,0.000000,-0.228441,-0.169693,6.598341,61.095876,-0.000037,-0.000109,0.012298,0.007727,0.007727,1.591680,0.236484,-0.044622,1042.302847,78.206893,254.438411,19.091224,87.602125,36.009352,2.701882,12.397875,1332.750609,7.065898,0.141525,2.194468,1.406244,2,interruption
3,1101.564476,647.894545,558.199446,11.445375,11.434589,48.770743,54.467987,2.001668,0.750626,-1.827185,0.280964,8.085475,789.371818,-0.000616,-0.000291,0.011450,0.005677,0.005677,2.016836,22.638796,-3.102406,27036.886485,99.160394,223.041407,0.818026,97.429716,5.884035,0.021580,2.570284,27265.811927,37.906203,0.026381,0.925353,1.122202,2,no stress
4,1006.461402,1007.827650,86.938644,21.506201,21.506105,4.042492,59.614805,27.439533,1.084237,0.117197,-0.327944,15.207112,122.005735,0.000070,0.000486,0.021831,0.010566,0.010566,2.066241,-0.221181,-0.110189,3918.562382,67.274253,1896.046155,32.551502,99.467559,10.149363,0.174245,0.532441,5824.757901,186.814294,0.005353,2.191665,1.340752,2,interruption
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40995,1014.513755,1016.379550,94.023480,20.853638,20.853629,4.508733,59.141633,24.603837,1.334445,-0.275500,-0.228049,14.745743,132.149131,-0.000019,0.000770,0.020424,0.009499,0.009498,2.150286,-0.153826,-0.242806,3523.630373,65.086190,1884.920611,34.817018,99.722768,5.240138,0.096792,0.277232,5413.791122,359.708216,0.002780,2.190086,1.290532,2,interruption
40996,789.745559,798.233630,57.674330,8.160873,8.160644,7.067177,75.973837,0.166806,0.000000,-0.593011,-0.316444,5.770447,81.359441,-0.000079,0.000301,0.010505,0.005240,0.005240,2.004969,0.497635,-0.120310,1455.825265,79.558852,362.705716,19.821369,96.967984,11.341160,0.619779,3.032016,1829.872141,31.981358,0.031268,2.183018,1.264438,2,no stress
40997,665.434517,681.729395,97.121313,17.874366,17.874366,5.433553,90.166648,10.508757,2.919099,0.887466,-0.996118,12.639085,136.767512,0.000003,-0.000824,0.029587,0.019027,0.019027,1.554977,8.181196,0.567968,3204.985394,53.482102,2577.163535,43.005539,92.449446,210.482752,3.512359,7.550554,5992.631681,12.244060,0.081672,1.917141,1.264783,2,no stress
40998,757.267899,760.507415,55.096318,9.335227,9.335157,5.901979,79.232198,1.584654,0.000000,-0.718788,-0.181565,6.600953,77.637851,0.000045,0.000684,0.012656,0.007986,0.007986,1.584821,0.872605,-0.006643,1106.136756,72.043586,387.964832,25.268465,90.385213,41.270017,2.687950,9.614787,1535.371605,9.400646,0.106376,2.204726,1.331896,2,no stress


In [9]:
# save the compressed and transformed data
from pathlib import Path
from zipfile import ZipFile, ZIP_DEFLATED

transformed_data_output_path = Path(
  '../datasets/swell/final'
).joinpath(TRANSFORMED_DATA_OUTPUT_NAME)

save_zipfile_path = transformed_data_output_path.with_suffix('.zip')

# create the output directory if needed so the cell also works on a fresh
# checkout where it does not exist yet
save_zipfile_path.parent.mkdir(parents=True, exist_ok=True)

with ZipFile(
  file=save_zipfile_path,
  mode='w',
  compression=ZIP_DEFLATED,
  compresslevel=9,
) as comp_file:
  # The member name is unchanged: the '.csv' path next to the archive.
  # The member's write handle is closed explicitly, because the archive
  # cannot be closed while a write handle is still open and relying on
  # garbage collection to close it is fragile.
  with comp_file.open(
    name=str(transformed_data_output_path.with_suffix('.csv')),
    mode='w',
  ) as csv_member:
    TRANSFORMED_DATA.to_csv(index=False, path_or_buf=csv_member)

# report success only after the archive has been closed, i.e. fully written
print(f'Transformed data has been saved to {save_zipfile_path}')

Transformed data has been saved to ../datasets/swell/final/test-custom-1.zip


In [10]:
# transform the raw RRI data into JSON
import json
from pathlib import Path

# one JSON document per subject, written into the same `rri` directory the
# subject CSV files were read from
rri_json_dir = Path(RAW_DATA_DIR).joinpath('rri')
for subject, data in RRI_DATA.groupby('subject'):
  payload = {
    'sampling_frequency': SAMPLE_FREQ_HZ,
    'rr_intervals': data['rri'].to_list(),
  }
  rri_json_dir.joinpath(f'{subject}.json').write_text(json.dumps(payload))

In [11]:
def data_memory_mb(data):
  """Return the deep memory footprint of a DataFrame in mebibytes.

  `memory_usage(deep=True)` reports bytes per column (plus the index); the
  total is converted from bytes to MiB.
  """
  total_bytes = data.memory_usage(deep=True).sum()
  return total_bytes / 1024 / 1024

# memory footprint (MiB) of each pipeline stage, in processing order
list(map(data_memory_mb, (LABEL_DATA, RRI_DATA, MERGED_DATA, TRANSFORMED_DATA)))

[0.08281803131103516, 28.89668846130371, 27.50631618499756, 10.987702369689941]