In [1]:
import datasets
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd

In [3]:
import scipy
from scipy import stats
import collections

In [4]:
from tqdm import tqdm

In [5]:
import neurokit2 as nk

In [6]:
import scipy
from scipy import stats

def TINN(x:np.array):
  """ Compute all the triangular interpolation to calculate the TINN scores. It also computes HRV index from an array x which contains 
      all the interbeats times for a given ECG signal.

      The axis is divided in 2 parts respectively on the right and left of the abscissa of the maximum value of the gaussian distribution
      The TINN score calculation is defined in the WESAD Dataset paper, to calculate it we needthe closest triangular interpolation 
      of the gaussian distribution of the interbeats times. The triangular interpolation is defined by 2 lines that meet at the maximum value
      of the gaussian distribution and cross the x-axis in N on the first half of the x-axis and M on the second half of the x-axis. 
      Thus inside ]N;M[ the interpolation function != 0
      Outside of ]N;M[ the interpolation function equals 0.
  """

  kernel = stats.gaussian_kde(x) #Create an approximated kernel for gaussian distribution from the x array (interbeats times)
  absi=np.linspace(np.min(x),np.max(x),len(x)) # Compute the x-axis of the interbeats distribution (from minimum interbeat time to maximum interbeat time)
  val=kernel.evaluate(absi) # Fit the gaussian distribution to the created x-axis
  ecart=absi[1]-absi[0] # Space between 2 values on the axis
  maxind=np.argmax(val) # Select the index for which the gaussian distribution (val array) is maximum 
  max_pos=absi[maxind]  # Interbeat time (abscissa) for which the gaussian distribution is maximum
  maxvalue=np.amax(val) # Max of the gaussian distribution
  N_abs=absi[0:maxind+1] # First half of the x-axis
  M_abs=absi[maxind:] # Second half of the x-axis
  HRVindex=len(x)/maxvalue
  err_N=[]
  err_M=[]

  for i in range(0,len(N_abs)-1):
    N=N_abs[i]
    slope=(maxvalue)/(max_pos-N)
    D=val[0:maxind+1]
    q=np.clip(slope*ecart*np.arange(-i,-i+maxind+1),0,None) #Triangular interpolation on the First half of the x-axis
    diff=D-q 
    err=np.multiply(diff,diff)
    err1=np.delete(err,-1)
    err2=np.delete(err, 0)
    errint=(err1+err2)/2
    errtot=np.linalg.norm(errint) # Error area between the triangular interpolation and the gaussian distribution on the first half of the x-axis
    err_N.append((errtot,N,N_abs,q))
  
  for i in range(1,len(M_abs)):
    M=M_abs[i]
    slope=(maxvalue)/(max_pos-M)
    D=val[maxind:]
    q=np.clip(slope*ecart*np.arange(-i,len(D)-i),0,None) #Triangular interpolation on the second half of the x-axis
    diff=D-q
    err=np.multiply(diff,diff)
    err1=np.delete(err,-1)
    err2=np.delete(err, 0)
    errint=(err1+err2)/2
    errtot=np.linalg.norm(errint) # Error area between the triangular interpolation and the gaussian distribution on the second half of the x-axis
    err_M.append((errtot,M,M_abs,q))

  return (err_N,err_M,absi,val,HRVindex)

def best_TINN(x:np.array):
  """Pick the triangular fit with the smallest error on each side of the density peak.

  Calls ``TINN(x)`` and, among the candidates it returns, keeps the left foot N and the
  right foot M whose fit error is lowest (the first one wins on ties).

  Returns ``(N, M, M - N, HRVindex)``; the first three are Python floats and ``M - N``
  is the TINN score.

  Raises ``ValueError`` when ``TINN`` produced no candidate on one side, which happens
  when the density maximum lies on the first or last grid point.
  """
  err_N,err_M,_,_,HRVindex=TINN(x)
  if not err_N or not err_M:
    # Without this guard the failure surfaced as an opaque IndexError from slicing
    # an empty object array.
    raise ValueError(
      "best_TINN: the density peak lies on the edge of the interbeat range, "
      "so no triangular fit exists on one side"
    )
  # Each entry is (error, abscissa, half-axis, line); compare on the error only.
  absN=min(err_N,key=lambda fit: fit[0])[1]
  absM=min(err_M,key=lambda fit: fit[0])[1]
  return float(absN),float(absM),float(absM-absN),HRVindex

# _,_,T,HRVindex=best_TINN(hrv)
# T, HRVindex

In [7]:
def num_compare_NN50(x,i):
  """Count how many entries of x differ from x[i] by more than 0.05 (50 ms when x is in seconds)."""
  gaps = np.absolute(x - x[i])
  flags = np.where(gaps > 0.05, 1, 0)
  return np.sum(flags)

def compare_NN50(x):
  """Total the per-interval counts from num_compare_NN50 over every index of x.

  Returns ``(count, ratio)``: ``count`` is the summed number of entries differing by more
  than 0.05 from each reference interval (a total of 0 is reported as 1), and ``ratio`` is
  that count divided by ``len(x) ** 2``.
  """
  size = len(x)
  total = sum(num_compare_NN50(x, idx) for idx in range(size))
  if total == 0:
    total = 1  # a zero total is deliberately reported as 1
  return total, (total / (size * size))

# num50,p50=compare_NN50(hrv)
# num50, p50

In [8]:
def get_freq_features_ecg(x):
  """Frequency features of the interbeat-time series x, obtained with an FFT.

  The series is mean-centred before the transform, and the mean interbeat time is used
  as the sample spacing when building the frequency axis. Only the first half of the
  spectrum is kept.

  Returns ``(fmean, fstd, sumpsd)``: the mean and standard deviation of the kept Fourier
  frequencies, and the sum of the kept spectral magnitudes scaled by ``2 / len(x)``.
  """
  count = len(x)
  half = count // 2
  spacing = np.mean(x)

  spectrum = np.array(scipy.fft.fft(x - spacing))
  frequencies = scipy.fft.fftfreq(count, spacing)[0:half]
  magnitudes = (2 / len(spectrum)) * np.abs(spectrum)[0:half]

  return np.mean(frequencies), np.std(frequencies), np.sum(magnitudes)

# fmean,fstd,sumpsd=get_freq_features_ecg(hrv)
# fmean,fstd, sumpsd


In [9]:
# https://github.com/Edouard99/Stress_Detection_ECG/tree/main
# pd.DataFrame({
#     'meanHR': meanHR,
#     'stdHR': stdHR,
#     'TINN': hrv_indices['HRV_TINN'],
#     'HRVindex': HRVindex,
#     '%NN50': num50,
#     'pnn50': hrv_indices['HRV_pNN50'],
#     'meanHRV': meanHRV,
#     'stdHRV': stdHRV,
#     'rmsHRV': rmsHRV,
#     'Mean Fourier Frequencies': fmean,
#     'STD Fourier Frequencies': fstd,
#     'Sum PSD components': sumpsd
# }, index=[0])

In [10]:
# https://dl-acm-org.vu-nl.idm.oclc.org/doi/epdf/10.1145/3242969.3242985
# pd.DataFrame({
#     'μHR': meanHR,
#     'σHR': stdHR,
#     'μHRV': meanHRV,
#     'σHRV': stdHRV,
#     'NN50': num50, 
#     'pNN50': hrv_indices['HRV_pNN50'],
#     'TINN': hrv_indices['HRV_TINN'],
#     'rmsHRV': rmsHRV,
#     'ULF': frequencies['HRV_ULF'],
#     'LF': frequencies['HRV_LF'],
#     'HF': frequencies['HRV_HF'],
#     'UHF': frequencies['HRV_VHF'],
#     'LF_HF_Ratio': frequencies['HRV_LF'] / frequencies['HRV_HF'],
#     'total_power': total_power,
#     'relative_power_ulf': (frequencies['HRV_ULF'] / total_power) * 100,
#     'relative_power_lf': (frequencies['HRV_LF'] / total_power) * 100,
#     'relative_power_hf': (frequencies['HRV_HF'] / total_power) * 100,
#     'relative_power_vhf': (frequencies['HRV_VHF'] / total_power) * 100,
#     'LF_norm': np.nan,  ## Can only be normalised after all the LF and HF are calculated
#     'HF_norm': np.nan,  ## Can only be normalised after all the LF and HF are calculated
# }, index=[0])

The “heart rate” can be described as a true rate in beats per minute (HR) or as the RR interval in milliseconds.
The RR interval is the time elapsed between two successive R waves of the QRS signal on the electrocardiogram.
“Heart rate variability” has become the conventionally accepted term to describe variations of both instantaneous heart rate and RR intervals.

The RR interval and HR are hyperbolically related (HR × RR interval = 60000, with HR in beats per minute and the RR interval in milliseconds; see figure 1).

In [11]:
import math 

In [12]:
def _get_average_t(epochs, sampling_rate=1000):
    """Return the amplitude of the T peak on the average heartbeat of ``epochs``.

    The epochs are averaged with ``nk.epochs_average``; the averaged beat is repeated
    until it spans at least four seconds of samples, delineated with
    ``nk.ecg_delineate(method='peak')``, and the signal value at the first detected
    T peak is returned.

    Parameters:
        epochs: heartbeat epochs in the form accepted by ``nk.epochs_average``.
        sampling_rate: sampling rate forwarded to NeuroKit; also sets the minimum
            signal length (``sampling_rate * 4`` samples).

    Returns:
        The signal value at the first T peak, or ``np.nan`` when no T peak is found
        or when any step of the computation fails.
    """
    try:
        average = nk.epochs_average(epochs, which="Signal")

        min_length = sampling_rate * 4
        beat = list(average['Signal_Mean'])
        # NeuroKit cannot delineate the P-QRS-T waves on a single heartbeat, so the
        # averaged beat is repeated until the minimum length is reached.
        repeats = math.ceil(min_length / len(beat))
        signal = np.concatenate([beat] * repeats)

        _, waves_grand = nk.ecg_delineate(signal, rpeaks=None, method='peak', sampling_rate=sampling_rate)
        t_index = waves_grand['ECG_T_Peaks'][0]

        if np.isnan(t_index):
            return np.nan
        # NOTE(review): the index may come back as a float when other peaks in the
        # list are NaN - confirm against NeuroKit; int() keeps the lookup valid.
        return signal[int(t_index)]
    except Exception:
        # Best effort: any ordinary failure yields NaN. A bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and make long runs impossible to stop.
        return np.nan

def modified_moving_average(signal, sampling_rate=1000):
    """Estimate T-wave alternans as the gap between alternating-beat T-peak amplitudes.

    The signal is cut into heartbeats with ``nk.ecg_segment``. Beats are split into two
    alternating buckets (1st, 3rd, ... and 2nd, 4th, ...), each bucket is reduced to the
    T-peak amplitude of its average beat by ``_get_average_t``, and the absolute
    difference of the two amplitudes is returned.

    Parameters:
        signal: ECG signal passed to ``nk.ecg_segment``.
        sampling_rate: sampling rate forwarded to NeuroKit and ``_get_average_t``.

    Returns:
        ``abs(T_even - T_odd)``, or ``np.nan`` when either amplitude is unavailable.
    """
    epochs = nk.ecg_segment(signal, rpeaks=None, sampling_rate=sampling_rate)

    if len(epochs) % 2 != 0:
        # We want balanced buckets, so drop the last beat to make the count even.
        epochs.popitem()

    keys = list(epochs.keys())
    even_bucket = {key: epochs[key] for key in keys[1::2]}
    odd_bucket = {key: epochs[key] for key in keys[::2]}

    average_t_even = _get_average_t(even_bucket, sampling_rate)
    average_t_odd = _get_average_t(odd_bucket, sampling_rate)

    # _get_average_t signals failure with NaN (never None), so test for NaN explicitly;
    # the None check is kept only as a defensive guard.
    for amplitude in (average_t_even, average_t_odd):
        if amplitude is None or np.isnan(amplitude):
            return np.nan

    return abs(average_t_even - average_t_odd)

Dataset

In [13]:
from pathlib import Path
from glob import glob

In [14]:
# Samples per second; forwarded to the NeuroKit calls as `sampling_rate`.
sampling_rate = 1000
# Number of samples in one analysis window: 60 times the sampling rate.
n_window = 60 * sampling_rate

In [15]:
# Shift between two consecutive windows, as a multiple of `sampling_rate` samples.
window_shift_size = 0.25
# The same shift expressed in samples; used as the stride of the sliding-window loop.
step_size = int(window_shift_size * sampling_rate)

In [16]:
# Every preprocessed ECG CSV; each path is later handed to preprocess_and_save.
files = glob('../data/ecg_preprocessed/*.csv')

In [17]:
# Activity names grouped by stress category.
# NOTE(review): these lists are not referenced anywhere else in the visible notebook;
# presumably they map the dataset's activity labels to stress classes - confirm.
baseline = ['Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6']
mental_stress = ['TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat']
high_physical_stress = ['Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4', 'Walking_fast_pace', 'Cycling', 'stairs_up_and_down']
moderate_physical_stress = ['Walking_own_pace', 'Dishes', 'Vacuum']
low_physical_stress = ['Standing', 'Lying_supine', 'Recov_standing']

In [18]:
from scipy.signal import find_peaks
from scipy.fft import fft, fftfreq

In [19]:
import pyhrv

In [20]:
def f_fr_n(freq, max_freq, l ):
    """Map a frequency to an index into the first ``l`` FFT bins.

    Frequencies below ``max_freq`` are scaled linearly onto ``[0, l)``; anything at or
    above ``max_freq`` is pinned to the last index ``l - 1``.
    """
    if freq < max_freq:
        return int(freq * l/max_freq)
    else:
        return l - 1

def detect_peaks_ECG(peaks, window_size,timestep_data,distance):
    """Compute time- and frequency-domain HRV features from R-peak positions.

    Parameters:
        peaks: R-peak positions, in samples, inside the analysed window.
        window_size: number of samples in the window.
        timestep_data: duration of one sample (seconds when rates are per minute / Hz).
        distance: unused; kept so existing callers keep working.

    Returns:
        dict with heart-rate statistics ('μhr', 'σhr', 'HR_max', 'HR_min'), NN50 / pNN50,
        rmssd, RR mean / std, the band powers ULF (0.01-0.04), LF (0.04-0.15),
        HF (0.15-0.4) and UHF (0.4-1), their sum 'Σ', the LF/HF ratio, the relative
        band powers in percent, and LF / HF normalised by the total power.
        Ratios whose denominator is zero are returned as NaN.

    Raises:
        ValueError: when fewer than two peaks are given (no RR interval can be formed).
    """
    rr = np.diff(np.asarray(peaks)) * timestep_data
    if rr.size == 0:
        raise ValueError("detect_peaks_ECG needs at least two R peaks to form an RR interval")

    # --- time-domain features -------------------------------------------------
    heart_rate = 60 / rr
    rr_steps = np.diff(rr)  # successive RR differences
    nn50 = int(np.sum(np.abs(rr_steps) > 0.050))
    pnn50 = nn50 / len(rr)
    # With a single RR interval there is no successive difference to average.
    rmssd = np.sqrt(np.mean(np.square(rr_steps))) if rr_steps.size else np.nan

    # --- frequency-domain features --------------------------------------------
    T = window_size * timestep_data
    freqs = np.arange(len(rr)) / T
    m = freqs.max() / 2
    l = len(freqs) // 2
    # The taper belongs on the time series, before the transform; multiplying the
    # spectrum by the window (as was done previously) just rescales the bins by
    # position. The mean is removed first so the DC component does not end up in,
    # or leak around, the ULF band.
    tapered = (rr - rr.mean()) * np.hamming(len(rr))
    power = np.abs(np.fft.fft(tapered)) ** 2

    def band_power(low, high):
        # Sum of the spectral power between the bins mapped from `low` and `high`.
        return float(np.sum(power[f_fr_n(low, m, l):f_fr_n(high, m, l)]))

    def ratio(numerator, denominator):
        # Undefined ratios become NaN instead of raising ZeroDivisionError.
        return numerator / denominator if denominator != 0 else np.nan

    ULF = band_power(0.01, 0.04)
    LF = band_power(0.04, 0.15)
    HF = band_power(0.15, 0.4)
    UHF = band_power(0.4, 1)

    TP = ULF + LF + HF + UHF

    return {
        'μhr' : heart_rate.mean(),
        'σhr' : heart_rate.std(),
        'HR_max': heart_rate.max(),
        'HR_min' : heart_rate.min(),
        'NN50' : nn50,
        'pNN50' : pnn50,
        'rmssd' : rmssd,
        'rr_mean' : rr.mean(),
        'rr_std' : rr.std(),
        'ULF' : ULF,
        'HF': HF,
        'LF': LF,
        'UHF': UHF,
        'LF_HF_ratio': ratio(LF, HF),
        'Σ': TP,
        'relative_power_ULF': ratio(ULF, TP) * 100,
        'relative_power_LF': ratio(LF, TP) * 100,
        'relative_power_HF': ratio(HF, TP) * 100,
        'relative_power_UHF': ratio(UHF, TP) * 100,
        'LF_norm': ratio(LF, TP),
        'HF_norm': ratio(HF, TP),
    }

In [21]:
def preprocess_and_save(file):
    """Extract HRV features from one preprocessed ECG recording and save them as CSV.

    The CSV named after ``file`` is loaded from '../data/ecg_preprocessed' and scanned
    with a sliding window of ``n_window`` samples moved by ``step_size`` samples. For
    each complete window the most frequent 'category' value becomes the label, and HRV,
    spectral and T-wave-alternans features are computed from the 'ECG_Clean' column.
    A window whose processing raises is reported on stdout and skipped.

    The resulting table is written to
    '../data/ecg_features_60s_clean_twa/<stem>.csv' (the directory is created if needed).

    Parameters:
        file: path of the recording; only its stem is used to locate input and output.
    """
    stem = Path(file).stem
    dataset = load_dataset(
        '../data/ecg_preprocessed', 
        data_files=[f'{stem}.csv'],
    )['train']

    data = []
    # Only start positions that leave room for a complete window are visited.
    last_start = len(dataset) - n_window
    for start_idx in range(0, last_start + 1, step_size): ## Window shift
        try:
            sample = dataset[start_idx:start_idx+n_window]
            ecg = sample['ECG_Clean']

            label = collections.Counter(sample['category']).most_common(1)[0][0]

            peaks, _ = nk.ecg_peaks(ecg, sampling_rate=sampling_rate)
            peaks_indices = peaks[peaks['ECG_R_Peaks'] == 1].index

            ## HRV: successive R-peak distances divided by the sampling rate
            hrv = np.diff(np.asarray(peaks_indices)) / sampling_rate
            mean_hrv = np.mean(hrv)
            std_hrv = np.std(hrv)
            rms_hrv = np.sqrt(np.mean(hrv**2))
            _,_,tinn,_ = best_TINN(hrv)

            r_peaks = nk.ecg_findpeaks(ecg, sampling_rate=sampling_rate)['ECG_R_Peaks']
            fp_data = detect_peaks_ECG(r_peaks, n_window, 1/sampling_rate, 200)

            twa = modified_moving_average(ecg, sampling_rate)

            data.append({
                'label': label,
                'μhr': fp_data['μhr'],
                'σhr': fp_data['σhr'],
                'μhrv': mean_hrv,
                'σhrv': std_hrv,
                'NN50': fp_data['NN50'],
                'pNN50': fp_data['pNN50'],
                'TINN': tinn,
                'rmsHRV': rms_hrv,
                'ULF': fp_data['ULF'],
                'LF': fp_data['LF'],
                'HF': fp_data['HF'],
                'UHF': fp_data['UHF'],
                'LF_HF_ratio': fp_data['LF_HF_ratio'],
                'Σ': fp_data['Σ'],
                'relative_power_ulf': fp_data['relative_power_ULF'],
                'relative_power_lf': fp_data['relative_power_LF'],
                'relative_power_hf': fp_data['relative_power_HF'],
                'relative_power_uhf': fp_data['relative_power_UHF'],
                # Previously filled from 'HF_norm', which duplicated the HF column.
                'LF_norm': fp_data['LF_norm'],
                'HF_norm': fp_data['HF_norm'],
                'hr_max': fp_data['HR_max'],
                'hr_min': fp_data['HR_min'],
                'rmssd': fp_data['rmssd'],
                'rr_mean': fp_data['rr_mean'],
                'rr_std': fp_data['rr_std'],
                'twa': twa
            })
        except Exception as e:
            # Best effort: report the failing window and carry on with the next one.
            print(file, ":", start_idx, "->", e)
            continue

    result = pd.DataFrame(data)

    # Parallel workers may race to create the directory; exist_ok makes that harmless.
    output_dir = Path('../data/ecg_features_60s_clean_twa')
    output_dir.mkdir(parents=True, exist_ok=True)
    result.to_csv(output_dir / f'{stem}.csv')

In [22]:
from joblib import Parallel, delayed

In [23]:
# for file in files:
#     preprocess_and_save(file)
#     break

In [24]:
# Run preprocess_and_save once per input file, spread over 6 joblib workers.
Parallel(n_jobs=6)(delayed(preprocess_and_save)(file) for file in files) 

KeyboardInterrupt: 