In [1]:
import datasets
from datasets import load_dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
import scipy
from scipy import stats
import collections

In [4]:
from tqdm import tqdm

In [5]:
import neurokit2 as nk

In [6]:
import cupy as cp

In [7]:
import scipy
from scipy import stats

def TINN(x:np.array):
  """ Compute all the triangular interpolation to calculate the TINN scores. It also computes HRV index from an array x which contains 
      all the interbeats times for a given ECG signal.

      The axis is divided in 2 parts respectively on the right and left of the abscissa of the maximum value of the gaussian distribution
      The TINN score calculation is defined in the WESAD Dataset paper, to calculate it we needthe closest triangular interpolation 
      of the gaussian distribution of the interbeats times. The triangular interpolation is defined by 2 lines that meet at the maximum value
      of the gaussian distribution and cross the x-axis in N on the first half of the x-axis and M on the second half of the x-axis. 
      Thus inside ]N;M[ the interpolation function != 0
      Outside of ]N;M[ the interpolation function equals 0.
  """

  kernel = stats.gaussian_kde(x) #Create an approximated kernel for gaussian distribution from the x array (interbeats times)
  absi=np.linspace(np.min(x),np.max(x),len(x)) # Compute the x-axis of the interbeats distribution (from minimum interbeat time to maximum interbeat time)
  val=kernel.evaluate(absi) # Fit the gaussian distribution to the created x-axis
  ecart=absi[1]-absi[0] # Space between 2 values on the axis
  maxind=np.argmax(val) # Select the index for which the gaussian distribution (val array) is maximum 
  max_pos=absi[maxind]  # Interbeat time (abscissa) for which the gaussian distribution is maximum
  maxvalue=np.amax(val) # Max of the gaussian distribution
  N_abs=absi[0:maxind+1] # First half of the x-axis
  M_abs=absi[maxind:] # Second half of the x-axis
  HRVindex=len(x)/maxvalue
  err_N=[]
  err_M=[]

  for i in range(0,len(N_abs)-1):
    N=N_abs[i]
    slope=(maxvalue)/(max_pos-N)
    D=val[0:maxind+1]
    q=np.clip(slope*ecart*np.arange(-i,-i+maxind+1),0,None) #Triangular interpolation on the First half of the x-axis
    diff=D-q 
    err=np.multiply(diff,diff)
    err1=np.delete(err,-1)
    err2=np.delete(err, 0)
    errint=(err1+err2)/2
    errtot=np.linalg.norm(errint) # Error area between the triangular interpolation and the gaussian distribution on the first half of the x-axis
    err_N.append((errtot,N,N_abs,q))
  
  for i in range(1,len(M_abs)):
    M=M_abs[i]
    slope=(maxvalue)/(max_pos-M)
    D=val[maxind:]
    q=np.clip(slope*ecart*np.arange(-i,len(D)-i),0,None) #Triangular interpolation on the second half of the x-axis
    diff=D-q
    err=np.multiply(diff,diff)
    err1=np.delete(err,-1)
    err2=np.delete(err, 0)
    errint=(err1+err2)/2
    errtot=np.linalg.norm(errint) # Error area between the triangular interpolation and the gaussian distribution on the second half of the x-axis
    err_M.append((errtot,M,M_abs,q))

  return (err_N,err_M,absi,val,HRVindex)

def best_TINN(x:np.array):
  """Select the best N and M that give the best triangular interpolation function approximation of the gaussian distrbution and return
    N; M; the TINN score = M-N ; and the HRV index
  
  """
  err_N,err_M,_,_,HRVindex=TINN(x)
  N=np.argmin(np.array(err_N,dtype=object)[:,0])
  M=np.argmin(np.array(err_M,dtype=object)[:,0])
  absN=err_N[N][1]
  absM=err_M[M][1]
  return float(absN),float(absM),float(absM-absN),HRVindex

# _,_,T,HRVindex=best_TINN(hrv)
# T, HRVindex

In [8]:
def num_compare_NN50(x,i):
  """Count the number of HRV intervals differing more than 50 ms for a given HRV interval x[i]
  
  """
  ref=x[i]
  k=0
  diff=np.absolute(x-ref)
  k+=np.sum(np.where(diff>0.05,1,0))
  return k 

def compare_NN50(x):
  """ Returns the number and percentage of HRV intervals differing more than 50ms for all intervals
  
  """
  k=0
  for i in range(0,len(x)):
    k+=num_compare_NN50(x,i)
  if k==0:
    k=1
  return k,(k/(len(x)*len(x)))

# num50,p50=compare_NN50(hrv)
# num50, p50

In [9]:
def get_freq_features_ecg(x):
  """ Returns frequential features of the Heart Rate Variability signal (interbeats times) by computing FFT, to compute the Fouriers 
  Frequencies the mean of the Heart Rate variability is used as sampling period  
  """
  mean=np.mean(x)
  yf=np.array(scipy.fft.fft(x-mean))
  xf=scipy.fft.fftfreq(len(x),mean)[0:len(x)//2]
  psd=(2/len(yf))*np.abs(yf)[0:len(x)//2]
  fmean=np.mean(xf)
  fstd=np.std(xf)
  sumpsd=np.sum(psd)
  return fmean,fstd,sumpsd

# fmean,fstd,sumpsd=get_freq_features_ecg(hrv)
# fmean,fstd, sumpsd


In [10]:
# https://github.com/Edouard99/Stress_Detection_ECG/tree/main
# pd.DataFrame({
#     'meanHR': meanHR,
#     'stdHR': stdHR,
#     'TINN': hrv_indices['HRV_TINN'],
#     'HRVindex': HRVindex,
#     '%NN50': num50,
#     'pnn50': hrv_indices['HRV_pNN50'],
#     'meanHRV': meanHRV,
#     'stdHRV': stdHRV,
#     'rmsHRV': rmsHRV,
#     'Mean Fourier Frequencies': fmean,
#     'STD Fourier Frequencies': fstd,
#     'Sum PSD components': sumpsd
# }, index=[0])

In [11]:
# https://dl-acm-org.vu-nl.idm.oclc.org/doi/epdf/10.1145/3242969.3242985
# pd.DataFrame({
#     'μHR': meanHR,
#     'σHR': stdHR,
#     'μHRV': meanHRV,
#     'σHRV': stdHRV,
#     'NN50': num50, 
#     'pNN50': hrv_indices['HRV_pNN50'],
#     'TINN': hrv_indices['HRV_TINN'],
#     'rmsHRV': rmsHRV,
#     'ULF': frequencies['HRV_ULF'],
#     'LF': frequencies['HRV_LF'],
#     'HF': frequencies['HRV_HF'],
#     'UHF': frequencies['HRV_VHF'],
#     'LF_HF_Ratio': frequencies['HRV_LF'] / frequencies['HRV_HF'],
#     'total_power': total_power,
#     'relative_power_ulf': (frequencies['HRV_ULF'] / total_power) * 100,
#     'relative_power_lf': (frequencies['HRV_LF'] / total_power) * 100,
#     'relative_power_hf': (frequencies['HRV_HF'] / total_power) * 100,
#     'relative_power_vhf': (frequencies['HRV_VHF'] / total_power) * 100,
#     'LF_norm': np.nan,  ## Can only be normalised after all the LF and HF are calculated
#     'HF_norm': np.nan,  ## Can only be normalised after all the LF and HF are calculated
# }, index=[0])

The “heart rate” can be described as a true rate in beats per minute (HR) or as the RR interval in milliseconds. 
The RR interval is the time elapsed between two successive R waves of the QRS signal on the electrocardiogram
“Heart rate variability” has become the conventionally accepted term to describe variations of both instantaneous heart rate and RR intervals.

The RR interval and HR are hyperbolically related (HR x RR interval = 60000; see figure 1).

In [12]:
import math 

In [13]:
import sleepecg

In [14]:
def _get_average_t(signal, peaks, sampling_rate):
    try:
        _, signals = nk.ecg_delineate(signal, rpeaks=peaks, sampling_rate=sampling_rate)

        t_peak_indices = signals['ECG_T_Peaks']
        t_peaks = [signal[idx] for idx in t_peak_indices]

        return np.mean(t_peaks)
    except:
        return np.nan

def modified_moving_average(signal, sampling_rate):
    peaks = sleepecg.detect_heartbeats(signal, sampling_rate)

    if len(peaks) % 2 != 0:
        # We want balanced buckets, so pop the last item to make it even.
        peaks = peaks[:-1]

    even_bucket = peaks[1::2]
    odd_bucket = peaks[::2]

    average_t_even = _get_average_t(signal, even_bucket, sampling_rate)
    average_t_odd = _get_average_t(signal, odd_bucket, sampling_rate)

    if average_t_even is None or average_t_odd is None:
        return np.nan
    else:
        twa = abs(average_t_even - average_t_odd)
        return twa

Dataset

In [15]:
from pathlib import Path
from glob import glob

In [16]:
sampling_rate = 1000
n_window = 60 * sampling_rate

In [17]:
window_shift_size = 0.25
step_size = int(window_shift_size * sampling_rate)

In [18]:
base_directory = f'../data/ecg_preprocessed_cleaned'
out_directory = f'../data/ecg_final'

In [19]:
files = glob(f'{base_directory}/*.csv')

In [20]:
baseline = ['Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6']
mental_stress = ['TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat']
high_physical_stress = ['Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4', 'Walking_fast_pace', 'Cycling', 'stairs_up_and_down']
moderate_physical_stress = ['Walking_own_pace', 'Dishes', 'Vacuum']
low_physical_stress = ['Standing', 'Lying_supine', 'Recov_standing']

In [21]:
from scipy.signal import find_peaks
from scipy.fft import fft, fftfreq

In [22]:
import pyhrv

In [23]:
def f_fr_n(freq, max_freq, l ):
    if freq < max_freq:
        return int(freq * l/max_freq)
    else:
        return l - 1
    
def detect_peaks_ECG(peaks, window_size,timestep_data,distance):
    # f_p = find_peaks(sample, distance=distance)# height = 0.4, distance = distance)
    #time features
    # f_p_diff = np.diff(f_p[0]) * timestep_data
    f_p_diff = np.diff(peaks) * timestep_data
    
    # heart rate mean std min max 
    HR_mean = (60/f_p_diff).mean()
    HR_std = (60/f_p_diff).std()
    HR_max = (60/f_p_diff).max()
    HR_min = (60/f_p_diff).min()
    #NN50
    #pNN50
    NN50 = sum(np.abs(np.diff(f_p_diff)) > 0.050)
    N_HRV_50 = NN50
    P_HRV_50 = NN50/len(f_p_diff)
    #rr_features
    rmssd = np.sqrt(np.mean(np.square(np.diff(f_p_diff))))
    rr_mean = f_p_diff.mean()
    rr_std = f_p_diff.std()
    # freq features
    # f_p_diff_fft = savgol_filter(np.diff(f_p_diff), 5,2)
    
    T = window_size * timestep_data
    k = np.arange(len(f_p_diff))
    freqs = k/T
    m = freqs.max()/2
    l = int(len(freqs)/2)
    ffts = abs(np.fft.fft(f_p_diff)*np.hamming(len(k)))**2
    ULF = sum( ffts[ f_fr_n(0.01,m,l):f_fr_n(0.04,m,l) ] )
    HF = sum( ffts[ f_fr_n(0.15,m,l):f_fr_n(0.4,m,l) ] )
    LF = sum( ffts[ f_fr_n(0.04,m,l):f_fr_n(0.15,m,l) ] )
    UHF = sum( ffts[ f_fr_n(0.4,m,l):f_fr_n(1,m,l) ] )
    
    TP = ULF + LF + HF + UHF

    rate_L_H = LF/HF
    lfN = LF / TP 
    hfN = HF / TP
    
    return {
        'μhr' : HR_mean,
        'σhr' : HR_std,
        'HR_max': HR_max,
        'HR_min' : HR_min,
        'NN50' : N_HRV_50,
        'pNN50' : P_HRV_50,
        'rmssd' : rmssd,
        'rr_mean' : rr_mean,
        'rr_std' : rr_std,
        'ULF' : ULF,
        'HF': HF,
        'LF': LF,
        'UHF': UHF,
        'LF_HF_ratio': rate_L_H,
        'Σ': TP,
        'relative_power_ULF': (ULF / TP) * 100,
        'relative_power_LF': (LF / TP) * 100,
        'relative_power_HF': (HF / TP) * 100,
        'relative_power_UHF': (UHF / TP) * 100,
        'LF_norm': lfN,
        'HF_norm': hfN,
    }

In [24]:
def calculate_twa(signal, tpeaks, tsets):
    even_bucket = tpeaks[1::2]
    odd_bucket = tpeaks[::2]

    average_t_even = cp.mean(cp.take(signal, even_bucket))
    average_t_odd = cp.mean(cp.take(signal, odd_bucket))

    tsets = cp.abs(cp.diff(tsets))

    tset_even = tsets[1::2]
    tset_odd = tsets[::2]
    
    average_tset_even = cp.mean(tset_even)
    average_tset_odd = cp.mean(tset_odd)

    if average_t_even is None or average_t_odd is None:
        return {
            'twa': cp.nan.item(),
            'twa_width': cp.nan.item()
        }
    else:
        twa = abs(average_t_even - average_t_odd)
        tset_width = abs(average_tset_even - average_tset_odd)
        return {
            'twa': twa.item(),
            'twa_width': tset_width.item()
        }

In [25]:
def get_frequency_domain_crappy_features(rpeaks, window_size, timestep, sampling_rate=1000):
    T = window_size * timestep
    k = cp.arange(len(rpeaks))
    freqs = k/T
    m = freqs.max()/2
    l = int(len(freqs)/2)
    ffts = abs(cp.fft.fft(rpeaks)*cp.hamming(len(k)))**2
    ULF = cp.sum( ffts[ f_fr_n(0.01,m,l):f_fr_n(0.04,m,l) ] )
    HF = cp.sum( ffts[ f_fr_n(0.15,m,l):f_fr_n(0.4,m,l) ] )
    LF = cp.sum( ffts[ f_fr_n(0.04,m,l):f_fr_n(0.15,m,l) ] )
    UHF = cp.sum( ffts[ f_fr_n(0.4,m,l):f_fr_n(1,m,l) ] )
    
    TP = ULF + LF + HF + UHF

    rate_L_H = LF/HF
    lfN = LF / TP 
    hfN = HF / TP

    return {
        "ulf": ULF.item(),
        "vlf": 0,
        "lf": LF.item(),
        "hf": HF.item(),
        "vhf": 0,
        "uhf": UHF.item(),
        "tp": TP.item(),
        "lf_hf_ratio": rate_L_H.item(),
        "lp_ulf": ((ULF / TP) * 100).item(),
        "lp_vlf": ((0 / TP) * 100).item(),
        "lp_lf": ((LF / TP) * 100).item(),
        "lp_hf": ((HF / TP) * 100).item(),
        "lp_vhf": ((0 / TP) * 100).item(),
        "lp_uhf": ((UHF / TP) * 100).item(),
        "lf_normalized": lfN.item(),
        "hf_normalized": hfN.item(),
    }
    
def get_frequency_domain_features(rpeaks, sampling_rate=1000, prefix=''):
    result = {}

    for statistic in ['min', 'max', 'mean', 'median', 'std', 'power', 'covariance', 'energy', 'entropy']:
        df = nk.hrv_frequency(rpeaks.get(), statistic=statistic, sampling_rate=sampling_rate)
        df.fillna(0, inplace=True)
        result.update({
            f"{prefix}ulf_{statistic}": df["HRV_ULF"].item(),
            f"{prefix}vlf_{statistic}": df["HRV_VLF"].item(),
            f"{prefix}lf_{statistic}": df["HRV_LF"].item(),
            f"{prefix}hf_{statistic}": df["HRV_HF"].item(),
            f"{prefix}vhf_{statistic}": df["HRV_VHF"].item(),
            f"{prefix}uhf_{statistic}": df["HRV_UHF"].item(),
            f"{prefix}tp_{statistic}": df["HRV_TP"].item(),
            f"{prefix}lf_hf_ratio_{statistic}": df["HRV_LFHF"].item(),
            f"{prefix}lp_ulf_{statistic}": (df["HRV_ULF"].item() / df["HRV_TP"].item()) * 100,
            f"{prefix}lp_vlf_{statistic}": (df["HRV_VLF"].item() / df["HRV_TP"].item()) * 100,
            f"{prefix}lp_lf_{statistic}": (df["HRV_LF"].item() / df["HRV_TP"].item()) * 100,
            f"{prefix}lp_hf_{statistic}": (df["HRV_HF"].item() / df["HRV_TP"].item()) * 100,
            f"{prefix}lp_vhf_{statistic}": (df["HRV_VHF"].item() / df["HRV_TP"].item()) * 100,
            f"{prefix}lp_uhf_{statistic}": (df["HRV_UHF"].item() / df["HRV_TP"].item()) * 100,
            f"{prefix}lf_normalized_{statistic}": df["HRV_LFn"].item(),
            f"{prefix}hf_normalized_{statistic}": df["HRV_HFn"].item(),
            f"{prefix}lf/hf+lf_{statistic}": df["HRV_LF/HF+LF"].item(),
            f"{prefix}hf/hf+lf_{statistic}": df["HRV_HF/HF+LF"].item(),
        })
        
    return result

In [26]:
def calculate_rr_interval(rpeaks, sampling_rate=1000):
    rri = cp.diff(rpeaks) * (1 / sampling_rate) # Compute R-R intervals (also referred to as NN) in milliseconds
    return rri

def get_time_domain_features(rpeaks, sampling_rate=1000, prefix=''):
    rri = calculate_rr_interval(rpeaks, sampling_rate)
    hr = 60 / rri # HR = 60/RR interval in seconds
    nn50 = cp.sum(cp.abs(cp.diff(rri)) > 0.050) # "Number of N-N intervals greater than 50ms",

    return {
        f"{prefix}hr_mean": hr.mean().item(),
        f"{prefix}hr_min": hr.min().item(),
        f"{prefix}hr_max": hr.max().item(),
        f"{prefix}hr_std": hr.std(ddof=1).item(),

        f"{prefix}rr_mean": cp.nanmean(rri).item(),
        f"{prefix}rr_min": cp.nanmin(rri).item(),
        f"{prefix}rr_max": cp.nanmax(rri).item(),
        f"{prefix}rr_std": cp.nanstd(rri, ddof=1).item(), 

        f"{prefix}nn50": nn50.item(),
        f"{prefix}pnn50": (nn50 / len(rri)).item(),  # Proportion of NN50 divided by total number of NN intervals
        f"{prefix}rmssd": cp.sqrt(cp.mean(cp.square(cp.diff(rri)))).item(),
    }

In [27]:
from timeit import default_timer as timer   
import datetime

In [28]:
def get_rqa(rpeaks, sampling_rate=1000, prefix=''):
    rqa = nk.hrv_rqa(rpeaks.get(), sampling_rate=sampling_rate)
    rqa.fillna(0, inplace=True)

    return {
        f"{prefix}w": rqa['W'].item(),
        f"{prefix}wmax": rqa['WMax'].item(),
        f"{prefix}wen": rqa['WEn'].item()
    }

In [29]:
def get_nonlinear(rpeaks, sampling_rate=1000, prefix=''):
    hrv_nonlinear = {}
    try:
        hrv_nonlinear = nk.hrv_nonlinear(rpeaks.get(), sampling_rate=sampling_rate)
        hrv_nonlinear.fillna(0, inplace=True)
        hrv_nonlinear = hrv_nonlinear.iloc[0].to_dict()
    except Exception as e:
        print(e)

    return {
        # Poincare Plot Geometry
        f'{prefix}SD1': hrv_nonlinear.get("HRV_SD1", 0),
        f'{prefix}SD2': hrv_nonlinear.get("HRV_SD2", 0),
        f'{prefix}SD1SD2': hrv_nonlinear.get("HRV_SD1SD2", 0),
        f'{prefix}S': hrv_nonlinear.get("HRV_S", 0),
        f'{prefix}CSI': hrv_nonlinear.get("HRV_CSI", 0),
        f'{prefix}CVI': hrv_nonlinear.get("HRV_CVI", 0),
        f'{prefix}CSI_Modified': hrv_nonlinear.get("HRV_CSI_Modified", 0),

        # Indices of Heart Rate Assymetry
        f'{prefix}GI': hrv_nonlinear.get("HRV_GI", 0),
        f'{prefix}SI': hrv_nonlinear.get("HRV_SI", 0),
        f'{prefix}AI': hrv_nonlinear.get("HRV_AI", 0),
        f'{prefix}PI': hrv_nonlinear.get("HRV_PI", 0),
        f'{prefix}SD1d': hrv_nonlinear.get("HRV_SD1d", 0),
        f'{prefix}SD1a': hrv_nonlinear.get("HRV_SD1a", 0),
        f'{prefix}C1d': hrv_nonlinear.get("HRV_C1d", 0),
        f'{prefix}C1a': hrv_nonlinear.get("HRV_C1a", 0),
        f'{prefix}SD2d': hrv_nonlinear.get("HRV_SD2d", 0),
        f'{prefix}SD2a': hrv_nonlinear.get("HRV_SD2a", 0),
        f'{prefix}C2d': hrv_nonlinear.get("HRV_C2d", 0),
        f'{prefix}C2a': hrv_nonlinear.get("HRV_C2a", 0),
        f'{prefix}SDNNd': hrv_nonlinear.get("HRV_SDNNd", 0),
        f'{prefix}SDNNa': hrv_nonlinear.get("HRV_SDNNa", 0),
        f'{prefix}Cd': hrv_nonlinear.get("HRV_Cd", 0),
        f'{prefix}Ca': hrv_nonlinear.get("HRV_Ca", 0),

        # Indices of Heart Rate Fragmentation
        f'{prefix}PIP': hrv_nonlinear.get("HRV_PIP", 0),
        f'{prefix}IALS': hrv_nonlinear.get("HRV_IALS", 0),
        f'{prefix}PSS': hrv_nonlinear.get("HRV_PSS", 0),
        f'{prefix}PAS': hrv_nonlinear.get("HRV_PAS", 0),

        # Indices of Complexity
        f'{prefix}ApEn': hrv_nonlinear.get("HRV_ApEn", 0),
        f'{prefix}SampEn': hrv_nonlinear.get("HRV_SampEn", 0),
        f'{prefix}ShanEn': hrv_nonlinear.get("HRV_ShanEn", 0),
        f'{prefix}FuzzyEn': hrv_nonlinear.get("HRV_FuzzyEn", 0),
        f'{prefix}MSEn': hrv_nonlinear.get("HRV_MSEn", 0),
        f'{prefix}CMSEn': hrv_nonlinear.get("HRV_CSMEn", 0),
        f'{prefix}CD': hrv_nonlinear.get("HRV_CD", 0),
        f'{prefix}HFD': hrv_nonlinear.get("HRV_HFD", 0),
        f'{prefix}KFD': hrv_nonlinear.get("HRV_KFD", 0),
        f'{prefix}LZC': hrv_nonlinear.get("HRV_LZC", 0),
        f'{prefix}DFA_alpha1': hrv_nonlinear.get("HRV_DFA_alpha1", 0),
        f'{prefix}DFA_alpha2': hrv_nonlinear.get("HRV_DFA_alpha2", 0),
    }

In [30]:
def get_time(rpeaks, sampling_rate=1000, prefix=''):
    hrv_time = {}
    try:
        hrv_time = nk.hrv_time(rpeaks.get(), sampling_rate=sampling_rate)
        hrv_time.fillna(0, inplace=True)
        hrv_time = hrv_time.iloc[0].to_dict()
    except Exception as e:
        print(e)

    return {
        f'{prefix}MeanNN': hrv_time.get("HRV_MeanNN", 0),
        f'{prefix}SDNN': hrv_time.get("HRV_SDNN", 0),
        f'{prefix}SDANN1': hrv_time.get("HRV_SDANN1", 0),
        f'{prefix}SDNNI1': hrv_time.get("HRV_SDNNI1", 0),
        f'{prefix}SDANN2': hrv_time.get("HRV_SDANN2", 0),
        f'{prefix}SDNNI2': hrv_time.get("HRV_SDNNI2", 0),
        f'{prefix}SDANN5': hrv_time.get("HRV_SDANN5", 0),
        f'{prefix}SDNNI5': hrv_time.get("HRV_SDNNI5", 0),
        f'{prefix}RMSSD': hrv_time.get("HRV_RMSSD", 0),
        f'{prefix}SDSD': hrv_time.get("HRV_SDSD", 0),
        f'{prefix}CVNN': hrv_time.get("HRV_CVNN", 0),
        f'{prefix}CVSD': hrv_time.get("HRV_CVSD", 0),
        f'{prefix}MedianNN': hrv_time.get("HRV_MedianNN", 0),
        f'{prefix}MadNN': hrv_time.get("HRV_MadNN", 0),
        f'{prefix}MCVNN': hrv_time.get("HRV_MCVNN", 0),
        f'{prefix}IQRNN': hrv_time.get("HRV_IQRNN", 0),
        f'{prefix}SDRMSSD': hrv_time.get("HRV_SDRMSSD", 0),
        f'{prefix}Prc20NN': hrv_time.get("HRV_Prc20NN", 0),
        f'{prefix}Prc80NN': hrv_time.get("HRV_Prc80NN", 0),
        f'{prefix}pNN50': hrv_time.get("HRV_pNN50", 0),
        f'{prefix}pNN20': hrv_time.get("HRV_pNN20", 0),
        f'{prefix}MinNN': hrv_time.get("HRV_MinNN", 0),
        f'{prefix}MaxNN': hrv_time.get("HRV_MaxNN", 0),
        f'{prefix}HTI': hrv_time.get("HRV_HTI", 0),
        f'{prefix}TINN': hrv_time.get("HRV_TINN", 0)
    }

In [31]:
def preprocess_and_save(file, out_directory, n_window, step_size):
    df = pd.read_csv(
        file,
        skiprows=[0],
        names=['timestamp','signal','signal_normalised','subject_id','category','code','ECG_Raw','ECG_Clean','ECG_Rate','ECG_Quality','ECG_R_Peaks','ECG_P_Peaks','ECG_P_Onsets','ECG_P_Offsets','ECG_Q_Peaks','ECG_R_Onsets','ECG_R_Offsets','ECG_S_Peaks','ECG_T_Peaks','ECG_T_Onsets','ECG_T_Offsets','ECG_Phase_Atrial','ECG_Phase_Completion_Atrial','ECG_Phase_Ventricular','ECG_Phase_Completion_Ventricular'],
        dtype={
            'timestamp': str, 
            'signal': float, 
            'signal_normalised': float, 
            'subject_id': str, 
            'category': str, 
            'code': str, 
            'ECG_Raw': float, 
            'ECG_Clean': float, 
            'ECG_Rate': float, 
            'ECG_Quality': float, 
            'ECG_R_Peaks': float, 
            'ECG_P_Peaks': float, 
            'ECG_P_Onsets': float, 
            'ECG_P_Offsets': float, 
            'ECG_Q_Peaks': float, 
            'ECG_R_Onsets': float, 
            'ECG_R_Offsets': float, 
            'ECG_S_Peaks': float, 
            'ECG_T_Peaks': float, 
            'ECG_T_Onsets': float, 
            'ECG_T_Offsets': float, 
            'ECG_Phase_Atrial': float, 
            'ECG_Phase_Completion_Atrial': float, 
            'ECG_Phase_Ventricular': float, 
            'ECG_Phase_Completion_Ventricular': float
        }
    )

    peaks, _ = nk.ecg_delineate(df['ECG_Clean'], sampling_rate=1000, method='peaks')

    data = []
    # with tqdm(total=len(dataset)) as pbar:
    for start_idx in range(0, len(df), step_size): ## Window shift
        # start = timer()
        try:
            if start_idx + n_window > len(df):
                break
            sample = df[start_idx:start_idx+n_window]
            delineation = peaks[start_idx:start_idx+n_window]
            if len(sample['ECG_Clean']) < n_window:
                continue

            counter = collections.Counter(sample['category'])
            if len(counter) > 1:
                continue
            label = counter.most_common(1)[0][0]

            X = cp.column_stack([cp.array(sample['ECG_Clean']), cp.array(sample['ECG_R_Peaks']), cp.array(delineation['ECG_T_Peaks']), cp.array(sample['ECG_T_Onsets']), cp.array(sample['ECG_T_Offsets'])])
                        
            rpeaks = cp.where(X[:, 1] == 1.0)[0]
            tpeaks = cp.where(X[:, 2] == 1.0)[0]

            if len(cp.where(X[:, 3] == 1.0)[0]) > len(cp.where(X[:, 4] == 1.0)[0]):
                offsets = cp.where(X[:, 4] == 1.0)[0]
                onsets = cp.where(X[:, 3] == 1.0)[0][:len(offsets)]
            elif len(cp.where(X[:, 3] == 1.0)[0]) < len(cp.where(X[:, 4] == 1.0)[0]):
                onsets = cp.where(X[:, 3] == 1.0)[0]
                offsets = cp.where(X[:, 4] == 1.0)[0][:len(onsets)]
            else:
                onsets = cp.where(X[:, 3] == 1.0)[0]
                offsets = cp.where(X[:, 4] == 1.0)[0]
            tsets = cp.column_stack([onsets, offsets])

            ## HRV
            hrv = cp.array([(rpeaks[i]-rpeaks[i-1])/sampling_rate for i in range(1,len(rpeaks))])
            # _,_,tinn,_ = best_TINN(hrv.get()) 
            
            features = {
                'label': label,
                
                'hrv_mean': hrv.mean().item(),
                'hrv_min': hrv.min().item(),
                'hrv_max': hrv.max().item(),
                'hrv_std': hrv.std(ddof=1).item(),
                'hrv_rms': cp.sqrt(cp.mean(hrv**2)).item(),

                **get_time_domain_features(rpeaks, sampling_rate=sampling_rate),
                **get_frequency_domain_features(rpeaks, sampling_rate=sampling_rate),
                **get_rqa(rpeaks, sampling_rate=sampling_rate),
                **get_time(rpeaks, sampling_rate=sampling_rate),
                **get_nonlinear(rpeaks, sampling_rate=sampling_rate),

                **calculate_twa(X[:, 0], tpeaks, tsets),
            }
            data.append(features)

            print(features)
            # pbar.update(step_size)
            
            # time_secs = timer()-start
        except Exception as e:
            print(file, ":", start_idx, "->", e)
            # pbar.update(step_size)
            continue
        
    result = pd.DataFrame(data)

    stem = Path(file).stem
    result.to_csv(f'{out_directory}/{stem}.csv')

In [32]:
from joblib import Parallel, delayed

In [36]:
files[90:]

['../data/ecg_preprocessed_cleaned\\30194.csv',
 '../data/ecg_preprocessed_cleaned\\30195.csv',
 '../data/ecg_preprocessed_cleaned\\30196.csv',
 '../data/ecg_preprocessed_cleaned\\30197.csv',
 '../data/ecg_preprocessed_cleaned\\30198.csv',
 '../data/ecg_preprocessed_cleaned\\30200.csv',
 '../data/ecg_preprocessed_cleaned\\30201.csv',
 '../data/ecg_preprocessed_cleaned\\30202.csv',
 '../data/ecg_preprocessed_cleaned\\30204.csv',
 '../data/ecg_preprocessed_cleaned\\30205.csv',
 '../data/ecg_preprocessed_cleaned\\30206.csv',
 '../data/ecg_preprocessed_cleaned\\30207.csv',
 '../data/ecg_preprocessed_cleaned\\30208.csv',
 '../data/ecg_preprocessed_cleaned\\30209.csv',
 '../data/ecg_preprocessed_cleaned\\30210.csv',
 '../data/ecg_preprocessed_cleaned\\30211.csv',
 '../data/ecg_preprocessed_cleaned\\30212.csv',
 '../data/ecg_preprocessed_cleaned\\30213.csv',
 '../data/ecg_preprocessed_cleaned\\30214.csv',
 '../data/ecg_preprocessed_cleaned\\30215.csv',
 '../data/ecg_preprocessed_cleaned\\3021

In [34]:
# for file in files:
#     preprocess_and_save(file, out_directory, n_window, step_size)
#     break

In [37]:
Parallel(n_jobs=8)(delayed(preprocess_and_save)(file, out_directory, n_window, step_size) for file in files[91:]) 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
second = sampling_rate
minute = 60 * second

In [None]:
# Parallel(n_jobs=8)(delayed(preprocess_and_save)(file, '../data/ecg_features_60s_clean_twa_rqa_2m', minute * 2, step_size) for file in files) 

KeyboardInterrupt: 

In [None]:
# Parallel(n_jobs=8)(delayed(preprocess_and_save)(file, '../data/ecg_features_60s_clean_twa_rqa_5m', minute * 5, step_size) for file in files) 

In [None]:
# Parallel(n_jobs=8)(delayed(preprocess_and_save)(file, '../data/ecg_features_60s_clean_twa_rqa_30s', second * 30, step_size) for file in files) 

In [None]:
# Parallel(n_jobs=8)(delayed(preprocess_and_save)(file, '../data/ecg_features_60s_clean_twa_rqa_10s', second * 10, step_size) for file in files) 