In [1]:
import datasets
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd

In [43]:
import scipy
from scipy import stats

In [3]:
from tqdm import tqdm

In [4]:
from sia.transformers import as_window

In [6]:
# Read one recording with the `datasets` CSV loader and keep its 'train' split.
ds = load_dataset('csv', data_files='./data/ecg_model/30100.csv')['train']

In [5]:
import neurokit2 as nk

In [8]:
# Window length in samples. The cells below pass sampling_rate=1000, so this is
# presumably a 20 s window — TODO confirm against the recording's real sampling rate.
n = 20 * 1000

In [9]:
# 'signal' column of the first n rows of the recording.
window = ds[0:0+n]['signal']

In [62]:
# R-peak detection with NeuroKit2. `peaks` is filtered on its 'ECG_R_Peaks' column
# further down; `info` is not used by the cells visible here.
peaks, info = nk.ecg_peaks(window, sampling_rate=1000)

In [63]:
# ECG_Rate_Mean, ECG_Rate_STD
# Derive the rate series once, then summarise it by its mean and standard deviation.
heart_rate = nk.signal_rate(peaks, sampling_rate=1000)
meanHR, stdHR = np.mean(heart_rate), np.std(heart_rate)

In [64]:
# Row labels of the samples flagged as R-peaks (rows where 'ECG_R_Peaks' == 1).
peak_indices = peaks[peaks['ECG_R_Peaks'] == 1].index

In [69]:
# Distance between consecutive R-peaks divided by 1000 (the sampling rate used above),
# and its reciprocal as a beat-to-beat frequency.
periods = np.diff(np.asarray(peak_indices, dtype=float)) / 1000
frequency = 1 / periods
meanfreq, stdfreq = np.mean(frequency), np.std(frequency)
frequency, meanfreq, stdfreq

(array([0.86730269, 0.91827365, 0.9718173 , 0.95877277, 0.99700897,
        1.02564103, 1.001001  , 1.02040816, 1.04712042, 1.03305785,
        1.11731844, 1.14416476, 1.00908174, 1.00806452, 1.01317123,
        0.9487666 , 0.93632959, 0.93896714, 0.96061479]),
 0.9956254016145706,
 0.06392357793790138)

In [67]:
# Intervals between consecutive R-peaks (sample distance / 1000), summarised by
# their mean, standard deviation and root mean square.
hrv = np.diff(np.asarray(peak_indices, dtype=float)) / 1000
meanHRV, stdHRV = np.mean(hrv), np.std(hrv)
rmsHRV = np.sqrt(np.mean(np.square(hrv)))
meanHRV, stdHRV, rmsHRV

In [7]:
import scipy
from scipy import stats

def TINN(x:np.array):
  """Score every candidate triangle fitted to the density of the inter-beat intervals.

  A Gaussian kernel density estimate of ``x`` is sampled on ``len(x)`` evenly
  spaced points between ``min(x)`` and ``max(x)``. The grid is split at the
  sample where the density is highest. For each grid point N left of the peak
  (and each grid point M right of it) a straight line is drawn from the peak
  down to zero at N (resp. M), clipped at zero, and compared with the density
  on that side of the grid.

  Parameters
  ----------
  x : np.array
      Inter-beat intervals of one ECG segment.

  Returns
  -------
  tuple
      ``(err_N, err_M, absi, val, HRVindex)``:
      ``err_N`` / ``err_M`` are lists of ``(error, abscissa, side_grid, triangle)``
      tuples, one per candidate on the left / right side; ``absi`` is the grid;
      ``val`` the density sampled on it; ``HRVindex`` is ``len(x)`` divided by
      the density maximum.
  """
  density = stats.gaussian_kde(x)  # kernel density estimate of the intervals
  absi = np.linspace(np.min(x), np.max(x), len(x))  # grid from shortest to longest interval
  val = density.evaluate(absi)  # density sampled on the grid
  step = absi[1] - absi[0]  # grid spacing
  peak_idx = np.argmax(val)  # grid index of the density maximum
  peak_x = absi[peak_idx]  # abscissa of the maximum
  peak_y = np.amax(val)  # height of the maximum
  N_abs = absi[:peak_idx + 1]  # left part of the grid, peak included
  M_abs = absi[peak_idx:]  # right part of the grid, peak included
  HRVindex = len(x) / peak_y

  def _fit_error(curve, triangle):
    # Squared residuals, averaged over neighbouring samples, reduced with the Euclidean norm.
    residual = curve - triangle
    squared = residual * residual
    return np.linalg.norm((squared[:-1] + squared[1:]) / 2)

  # Left side: the line reaches zero at grid point k and the peak height at the peak.
  left_curve = val[:peak_idx + 1]
  err_N = []
  for k, N in enumerate(N_abs[:-1]):
    slope = peak_y / (peak_x - N)
    q = np.clip(slope * step * np.arange(-k, -k + peak_idx + 1), 0, None)
    err_N.append((_fit_error(left_curve, q), N, N_abs, q))

  # Right side: the line starts at the peak height and reaches zero k points after the peak.
  right_curve = val[peak_idx:]
  err_M = []
  for k, M in enumerate(M_abs[1:], start=1):
    slope = peak_y / (peak_x - M)
    q = np.clip(slope * step * np.arange(-k, len(right_curve) - k), 0, None)
    err_M.append((_fit_error(right_curve, q), M, M_abs, q))

  return (err_N, err_M, absi, val, HRVindex)

def best_TINN(x:np.array):
  """Pick the triangle base points N and M with the smallest fitting error.

  Parameters
  ----------
  x : np.array
      Inter-beat intervals of one ECG segment, forwarded to ``TINN``.

  Returns
  -------
  tuple
      ``(N, M, M - N, HRVindex)`` where N and M are the abscissas of the best
      left and right candidates as floats, ``M - N`` is the TINN score, and
      ``HRVindex`` is passed through from ``TINN``.

  Notes
  -----
  ``TINN`` produces no candidate on a side when the density maximum sits on the
  first or last grid point. Indexing the empty candidate array used to raise
  IndexError in that case; the boundary abscissa (which is the peak itself) is
  now used for that side instead.
  """
  err_N,err_M,absi,_,HRVindex=TINN(x)

  if err_N:
    # np.argmin returns the first minimum, so ties resolve to the leftmost candidate.
    absN=err_N[int(np.argmin([candidate[0] for candidate in err_N]))][1]
  else:
    absN=absi[0]  # peak on the first grid point: no room for a left candidate

  if err_M:
    absM=err_M[int(np.argmin([candidate[0] for candidate in err_M]))][1]
  else:
    absM=absi[-1]  # peak on the last grid point: no room for a right candidate

  return float(absN),float(absM),float(absM-absN),HRVindex

# _,_,T,HRVindex=best_TINN(hrv)
# T, HRVindex

In [8]:
def num_compare_NN50(x,i):
  """Count the entries of x whose absolute difference from x[i] exceeds 0.05.

  The comparison is strict (> 0.05) and runs over every entry of x; x[i] itself
  is included but its difference is 0, so it never counts.
  """
  exceeds = np.abs(x - x[i]) > 0.05
  return np.sum(exceeds)

def compare_NN50(x):
  """Total num_compare_NN50 over every reference interval of x.

  Returns ``(count, count / len(x)**2)``. A count of zero is reported as 1.
  The second value is a fraction of all ordered pairs; it is not scaled to percent.
  """
  total = sum(num_compare_NN50(x, idx) for idx in range(len(x)))
  if total == 0:
    total = 1
  n_pairs = len(x) * len(x)
  return total, total / n_pairs

# num50,p50=compare_NN50(hrv)
# num50, p50

In [9]:
def get_freq_features_ecg(x):
  """Frequency-domain summary of the inter-beat interval series x.

  The series is mean-centred and Fourier transformed. The frequency axis is
  built with the mean of x as sample spacing, and only the first
  ``len(x) // 2`` bins are kept.

  Returns ``(fmean, fstd, sumpsd)``: the mean and standard deviation of the kept
  bin frequencies, and the sum of the kept FFT magnitudes scaled by ``2 / len(x)``.
  """
  n_samples = len(x)
  half = n_samples // 2
  mean_interval = np.mean(x)

  spectrum = np.array(scipy.fft.fft(x - mean_interval))
  bin_freqs = scipy.fft.fftfreq(n_samples, mean_interval)[:half]
  scaled_magnitude = (2 / len(spectrum)) * np.abs(spectrum)[:half]

  return np.mean(bin_freqs), np.std(bin_freqs), np.sum(scaled_magnitude)

# fmean,fstd,sumpsd=get_freq_features_ecg(hrv)
# fmean,fstd, sumpsd


In [44]:
# NeuroKit2 HRV index table for the same R-peaks; its 'HRV_TINN' and 'HRV_pNN50'
# columns are reused in the feature tables below.
hrv_indices = nk.hrv(peaks, sampling_rate=1000)
hrv_indices

Unnamed: 0,HRV_MeanNN,HRV_SDNN,HRV_SDANN1,HRV_SDNNI1,HRV_SDANN2,HRV_SDNNI2,HRV_SDANN5,HRV_SDNNI5,HRV_RMSSD,HRV_SDSD,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,1008.473684,65.574697,,,,,,,45.192182,46.059495,...,inf,4.247928,1.254404,,,,1.525146,,1.373195,0.894301


In [76]:
# https://github.com/Edouard99/Stress_Detection_ECG/tree/main
# Compute the hand-rolled features in this cell rather than relying on values
# left in the kernel by calls that are now commented out, so the cell also
# works after Restart & Run All.
_, _, _, HRVindex = best_TINN(hrv)
num50, p50 = compare_NN50(hrv)
fmean, fstd, sumpsd = get_freq_features_ecg(hrv)

pd.DataFrame({
    'meanHR': meanHR,
    'stdHR': stdHR,
    'TINN': hrv_indices['HRV_TINN'],
    'HRVindex': HRVindex,
    # NOTE: despite the label, this is the pair count from compare_NN50, not a percentage.
    '%NN50': num50,
    'pnn50': hrv_indices['HRV_pNN50'],
    'meanHRV': meanHRV,
    'stdHRV': stdHRV,
    'rmsHRV': rmsHRV,
    'Mean Fourier Frequencies': fmean,
    'STD Fourier Frequencies': fstd,
    'Sum PSD components': sumpsd
}, index=[0])

Unnamed: 0,meanHR,stdHR,TINN,HRVindex,%NN50,pnn50,meanHRV,stdHRV,rmsHRV,Mean Fourier Frequencies,STD Fourier Frequencies,Sum PSD components
0,59.72544,3.738671,132.8125,3.229072,204,26.315789,1.008474,0.063826,1.010491,0.208757,0.134752,0.186687


In [81]:
# Spectral HRV power with custom band edges.
# NOTE(review): in the output below HRV_ULF and HRV_LF are NaN — presumably the
# window is too short to resolve those bands; confirm before using them as features.
frequencies = nk.hrv_frequency(
    peaks, 
    sampling_rate=1000,
    ulf=[0.01,0.04],
    lf=[0.04,0.15],
    hf=[0.15,0.4],
    vhf=[0.4,1]
)
frequencies

Unnamed: 0,HRV_ULF,HRV_VLF,HRV_LF,HRV_HF,HRV_VHF,HRV_TP,HRV_LFHF,HRV_LFn,HRV_HFn,HRV_LnHF
0,,,,0.016549,0.001722,0.018271,,,0.905738,-4.101447


In [88]:
# Total power as the NaN-ignoring sum of the ULF, LF, HF and VHF columns (HRV_VLF is not included).
total_power = np.nansum([frequencies['HRV_ULF'], frequencies['HRV_LF'], frequencies['HRV_HF'], frequencies['HRV_VHF']])

In [91]:
# https://dl-acm-org.vu-nl.idm.oclc.org/doi/epdf/10.1145/3242969.3242985
# Compute the NN50 pair count here: it is otherwise only assigned by a call that
# is commented out, which makes this cell fail on a fresh kernel.
num50, _ = compare_NN50(hrv)

pd.DataFrame({
    'μHR': meanHR,
    'σHR': stdHR,
    'μHRV': meanHRV,
    'σHRV': stdHRV,
    'NN50': num50, 
    'pNN50': hrv_indices['HRV_pNN50'],
    'TINN': hrv_indices['HRV_TINN'],
    'rmsHRV': rmsHRV,
    'ULF': frequencies['HRV_ULF'],
    'LF': frequencies['HRV_LF'],
    'HF': frequencies['HRV_HF'],
    'UHF': frequencies['HRV_VHF'],
    'LF_HF_Ratio': frequencies['HRV_LF'] / frequencies['HRV_HF'],
    'total_power': total_power,
    # Each band as a percentage of the summed band power.
    'relative_power_ulf': (frequencies['HRV_ULF'] / total_power) * 100,
    'relative_power_lf': (frequencies['HRV_LF'] / total_power) * 100,
    'relative_power_hf': (frequencies['HRV_HF'] / total_power) * 100,
    'relative_power_vhf': (frequencies['HRV_VHF'] / total_power) * 100,
    'LF_norm': np.nan,  ## Can only be normalised after all the LF and HF are calculated
    'HF_norm': np.nan,  ## Can only be normalised after all the LF and HF are calculated
}, index=[0])

Unnamed: 0,μHR,σHR,μHRV,σHRV,NN50,pNN50,TINN,rmsHRV,ULF,LF,HF,UHF,LF_HF_Ratio,total_power,relative_power_ulf,relative_power_lf,relative_power_hf,relative_power_vhf,LF_norm,HF_norm
0,59.72544,3.738671,1.008474,0.063826,204,26.315789,132.8125,1.010491,,,0.016549,0.001722,,0.018271,,,90.573825,9.426175,,


The “heart rate” can be described as a true rate in beats per minute (HR) or as the RR interval in milliseconds. 
The RR interval is the time elapsed between two successive R waves of the QRS signal on the electrocardiogram.
“Heart rate variability” has become the conventionally accepted term to describe variations of both instantaneous heart rate and RR intervals.

The RR interval and HR are hyperbolically related (HR in beats per minute × RR interval in milliseconds = 60000; see figure 1).

In [15]:
# List every column of the nk.hrv result computed above.
hrv_indices.columns

Index(['HRV_MeanNN', 'HRV_SDNN', 'HRV_SDANN1', 'HRV_SDNNI1', 'HRV_SDANN2',
       'HRV_SDNNI2', 'HRV_SDANN5', 'HRV_SDNNI5', 'HRV_RMSSD', 'HRV_SDSD',
       'HRV_CVNN', 'HRV_CVSD', 'HRV_MedianNN', 'HRV_MadNN', 'HRV_MCVNN',
       'HRV_IQRNN', 'HRV_SDRMSSD', 'HRV_Prc20NN', 'HRV_Prc80NN', 'HRV_pNN50',
       'HRV_pNN20', 'HRV_MinNN', 'HRV_MaxNN', 'HRV_HTI', 'HRV_TINN', 'HRV_ULF',
       'HRV_VLF', 'HRV_LF', 'HRV_HF', 'HRV_VHF', 'HRV_TP', 'HRV_LFHF',
       'HRV_LFn', 'HRV_HFn', 'HRV_LnHF', 'HRV_SD1', 'HRV_SD2', 'HRV_SD1SD2',
       'HRV_S', 'HRV_CSI', 'HRV_CVI', 'HRV_CSI_Modified', 'HRV_PIP',
       'HRV_IALS', 'HRV_PSS', 'HRV_PAS', 'HRV_GI', 'HRV_SI', 'HRV_AI',
       'HRV_PI', 'HRV_C1d', 'HRV_C1a', 'HRV_SD1d', 'HRV_SD1a', 'HRV_C2d',
       'HRV_C2a', 'HRV_SD2d', 'HRV_SD2a', 'HRV_Cd', 'HRV_Ca', 'HRV_SDNNd',
       'HRV_SDNNa', 'HRV_DFA_alpha1', 'HRV_MFDFA_alpha1_Width',
       'HRV_MFDFA_alpha1_Peak', 'HRV_MFDFA_alpha1_Mean',
       'HRV_MFDFA_alpha1_Max', 'HRV_MFDFA_alpha1_Delta',
     

Dataset

In [58]:
from pathlib import Path
from glob import glob

In [10]:
# Sampling rate handed to every NeuroKit2 call in the batch loop.
sampling_rate = 1000
# Samples per analysis window: 20 times the sampling rate.
n_window = 20 * sampling_rate

In [56]:
# glob returns matches in arbitrary order; sorting keeps the batch order (and
# therefore whichever file is processed last) the same from run to run.
files = sorted(glob('./data/ecg_model/*.csv'))

In [60]:
# Create the destination once so to_csv cannot fail on a missing directory.
output_dir = Path('./data/ecg_features')
output_dir.mkdir(parents=True, exist_ok=True)

for file in files:
    dataset = load_dataset('csv', data_files=file)['train']
    n_windows = len(dataset) // n_window  # trailing samples that do not fill a window are ignored
    dataframes = []
    skipped = []  # (window number, error text) for windows whose features could not be computed

    for i in range(n_windows):
        try:
            start_idx = i * n_window
            sample = dataset[start_idx:start_idx+n_window]
            signal = sample['signal']
            # Most frequent label inside the window.
            label = stats.mode(sample['label'])[0]

            peaks, _ = nk.ecg_peaks(signal, sampling_rate=sampling_rate)
            peaks_indices = peaks[peaks['ECG_R_Peaks'] == 1].index

            ## HR
            signal_rate = nk.signal_rate(peaks, sampling_rate=sampling_rate)
            mean_hr = np.mean(signal_rate)
            std_hr = np.std(signal_rate)

            ## HRV
            # Distance between consecutive R-peaks divided by the sampling rate.
            # The spectral features below use this same series.
            hrv = np.diff(np.asarray(peaks_indices, dtype=float)) / sampling_rate
            mean_hrv = np.mean(hrv)
            std_hrv = np.std(hrv)
            rms_hrv = np.sqrt(np.mean(hrv**2))
            _, _, _, hrv_index = best_TINN(hrv)

            ## Frequencies
            mean_f, std_f, sum_psd = get_freq_features_ecg(hrv)

            ## %NN50
            # Only the count is stored; nk.hrv already provides HRV_pNN50.
            NN50, _ = compare_NN50(hrv)

            ## Power
            frequencies = nk.hrv_frequency(
                peaks,
                sampling_rate=sampling_rate,
                ulf=[0.01,0.04],
                lf=[0.04,0.15],
                hf=[0.15,0.4],
                vhf=[0.4,1]
            )
            total_power = np.nansum([frequencies['HRV_ULF'], frequencies['HRV_LF'], frequencies['HRV_HF'], frequencies['HRV_VHF']])

            ## Dataframe
            # Start from the NeuroKit2 index table and append the extra features as columns.
            df = nk.hrv(peaks, sampling_rate=sampling_rate)
            df['label'] = label
            df['mean_hr'] = mean_hr
            df['std_hr'] = std_hr
            df['hrv_index'] = hrv_index
            df['nn50'] = NN50
            df['mean_hrv'] = mean_hrv
            df['std_hrv'] = std_hrv
            df['rms_hrv'] = rms_hrv
            df['mean_fourier_frequencies'] = mean_f
            df['std_fourier_frequencies'] = std_f
            df['sum_psd'] = sum_psd
            df['ulf'] = frequencies['HRV_ULF']
            df['lf'] = frequencies['HRV_LF']
            df['hf'] = frequencies['HRV_HF']
            df['uhf'] = frequencies['HRV_VHF']
            df['lf_hf_ratio'] = frequencies['HRV_LF'] / frequencies['HRV_HF']
            df['total_power'] = total_power
            df['relative_power_ulf'] = (frequencies['HRV_ULF'] / total_power) * 100
            df['relative_power_lf'] = (frequencies['HRV_LF'] / total_power) * 100
            df['relative_power_hf'] = (frequencies['HRV_HF'] / total_power) * 100
            df['relative_power_vhf'] = (frequencies['HRV_VHF'] / total_power) * 100
            dataframes.append(df)
        except Exception as e:
            # Best effort: one bad window must not abort the file, but remember
            # what failed so the loss is visible instead of silent.
            skipped.append((i, repr(e)))
            continue

    if skipped:
        first_window, first_error = skipped[0]
        print(f"{file}: skipped {len(skipped)}/{n_windows} windows; first failure at window {first_window}: {first_error}")

    if not dataframes:
        # pd.concat raises on an empty list; skip this file rather than abort the batch.
        print(f"{file}: no window produced features, nothing written")
        continue

    result = pd.concat(dataframes, ignore_index=True)

    stem = Path(file).stem
    result.to_csv(output_dir / f'{stem}.csv')

Generating train split: 8344000 examples [00:05, 1466368.61 examples/s]
Generating train split: 8174000 examples [00:05, 1501210.86 examples/s]
Generating train split: 9684000 examples [00:06, 1472055.45 examples/s]
Generating train split: 7842000 examples [00:05, 1432841.18 examples/s]
Generating train split: 7729000 examples [00:05, 1457442.11 examples/s]
Generating train split: 7737000 examples [00:05, 1489999.87 examples/s]
Generating train split: 9241000 examples [00:06, 1451260.84 examples/s]
Generating train split: 8718000 examples [00:05, 1457704.18 examples/s]
Generating train split: 10272000 examples [00:07, 1446480.51 examples/s]
Generating train split: 8212000 examples [00:05, 1435243.77 examples/s]
Generating train split: 7871000 examples [00:05, 1483389.34 examples/s]
Generating train split: 8530000 examples [00:05, 1462965.23 examples/s]
Generating train split: 7112000 examples [00:05, 1366895.17 examples/s]
Generating train split: 8624000 examples [00:05, 1467109.58 exa

In [77]:
# Column names of `result`, one quoted and comma-terminated entry per line.
for column in result.columns:
    print(f"'{column}',")

# The same columns paired with their dtype, formatted as 'name': Value('dtype') entries.
for column, dtype in result.dtypes.items():
    print(f"'{column}': Value('{dtype}'),")

'HRV_MeanNN',
'HRV_SDNN',
'HRV_SDANN1',
'HRV_SDNNI1',
'HRV_SDANN2',
'HRV_SDNNI2',
'HRV_SDANN5',
'HRV_SDNNI5',
'HRV_RMSSD',
'HRV_SDSD',
'HRV_CVNN',
'HRV_CVSD',
'HRV_MedianNN',
'HRV_MadNN',
'HRV_MCVNN',
'HRV_IQRNN',
'HRV_SDRMSSD',
'HRV_Prc20NN',
'HRV_Prc80NN',
'HRV_pNN50',
'HRV_pNN20',
'HRV_MinNN',
'HRV_MaxNN',
'HRV_HTI',
'HRV_TINN',
'HRV_ULF',
'HRV_VLF',
'HRV_LF',
'HRV_HF',
'HRV_VHF',
'HRV_TP',
'HRV_LFHF',
'HRV_LFn',
'HRV_HFn',
'HRV_LnHF',
'HRV_SD1',
'HRV_SD2',
'HRV_SD1SD2',
'HRV_S',
'HRV_CSI',
'HRV_CVI',
'HRV_CSI_Modified',
'HRV_PIP',
'HRV_IALS',
'HRV_PSS',
'HRV_PAS',
'HRV_GI',
'HRV_SI',
'HRV_AI',
'HRV_PI',
'HRV_C1d',
'HRV_C1a',
'HRV_SD1d',
'HRV_SD1a',
'HRV_C2d',
'HRV_C2a',
'HRV_SD2d',
'HRV_SD2a',
'HRV_Cd',
'HRV_Ca',
'HRV_SDNNd',
'HRV_SDNNa',
'HRV_DFA_alpha1',
'HRV_MFDFA_alpha1_Width',
'HRV_MFDFA_alpha1_Peak',
'HRV_MFDFA_alpha1_Mean',
'HRV_MFDFA_alpha1_Max',
'HRV_MFDFA_alpha1_Delta',
'HRV_MFDFA_alpha1_Asymmetry',
'HRV_MFDFA_alpha1_Fluctuation',
'HRV_MFDFA_alpha1_Increment',
'HRV_ApEn',