In [1]:
from glob import glob
from pathlib import Path

In [2]:
import pandas as pd

In [3]:
# One entry per participant directory found directly under ./data/wesad.
# - glob() returns matches in arbitrary order, so the list is sorted to make the
#   processing order reproducible between runs and machines.
# - Path.name (not .stem) is used because the value is later re-used as the
#   directory name; .stem would truncate a directory name that contains a dot.
# - Without recursive=True the '**' pattern only matches one path level.
participants = sorted(
    Path(path).name
    for path in glob('./data/wesad/**')
    if Path(path).is_dir()
)

In [4]:
# Convert every participant's pickle into a two-column CSV (chest ECG + label)
# stored next to the original pickle.
# NOTE: pd.read_pickle unpickles arbitrary objects -- only run this on trusted files.
for participant in participants:
    pickle_path = f'./data/wesad/{participant}/{participant}.pkl'
    csv_path = f'./data/wesad/{participant}/{participant}.csv'

    recording = pd.read_pickle(pickle_path)
    # The chest ECG array is flattened to 1-D so it can sit beside the label column.
    ecg_signal = recording['signal']['chest']['ECG'].flatten()

    pd.DataFrame({'signal': ecg_signal, 'label': recording['label']}).to_csv(csv_path)

In [5]:
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import collections
from datasets import load_dataset
import neurokit2 as nk

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def TINN(x:np.array):
  """Fit one-sided triangles to the estimated distribution of inter-beat times.

  A Gaussian kernel density estimate of ``x`` is evaluated on ``len(x)`` evenly
  spaced points between ``min(x)`` and ``max(x)``. The axis is split at the
  abscissa of the density maximum. On the left half every grid point N (except
  the peak itself) is tried as the foot of a line rising from 0 at N to the
  density maximum at the peak; on the right half every grid point M (except the
  peak) is tried as the foot of a line falling from the maximum to 0 at M.
  Each line is clipped at 0, so the triangle is zero outside ]N;M[.

  Args:
    x: array of inter-beat times of an ECG signal.

  Returns:
    A tuple ``(err_N, err_M, absi, val, HRVindex)`` where
      * ``err_N`` is a list of ``(error, N, left_axis, triangle)`` tuples, one
        per candidate N, and ``err_M`` is the same for every candidate M on
        the right half;
      * ``absi`` is the evaluation grid and ``val`` the density on that grid;
      * ``HRVindex`` is ``len(x)`` divided by the maximum of the density.

  NOTE(review): the fit error is the Euclidean norm of the averaged squared
  residuals of neighbouring grid points, not their sum/integral -- confirm this
  matches the intended TINN definition.
  """
  density = stats.gaussian_kde(x)
  grid = np.linspace(np.min(x), np.max(x), len(x))
  pdf = density.evaluate(grid)
  spacing = grid[1] - grid[0]  # distance between two consecutive grid points

  peak_idx = np.argmax(pdf)
  peak_x = grid[peak_idx]      # abscissa of the density maximum
  peak_y = np.amax(pdf)        # height of the density maximum
  hrv_index = len(x) / peak_y

  # Both halves include the peak sample itself.
  left_axis, right_axis = grid[:peak_idx + 1], grid[peak_idx:]
  left_pdf, right_pdf = pdf[:peak_idx + 1], pdf[peak_idx:]

  def _fit_error(target, triangle):
    # Squared residuals, averaged over each pair of neighbouring samples,
    # then reduced to a single number with the Euclidean norm.
    squared = np.square(target - triangle)
    return np.linalg.norm((squared[:-1] + squared[1:]) / 2)

  # Left half: the line is 0 at candidate index i and reaches peak_y at the peak.
  left_steps = np.arange(peak_idx + 1)
  err_N = []
  for i, foot in enumerate(left_axis[:-1]):
    slope = peak_y / (peak_x - foot)
    triangle = np.clip(slope * spacing * (left_steps - i), 0, None)
    err_N.append((_fit_error(left_pdf, triangle), foot, left_axis, triangle))

  # Right half: the line starts at peak_y on the peak and is 0 at candidate index i.
  right_steps = np.arange(len(right_pdf))
  err_M = []
  for i, foot in enumerate(right_axis[1:], start=1):
    slope = peak_y / (peak_x - foot)
    triangle = np.clip(slope * spacing * (right_steps - i), 0, None)
    err_M.append((_fit_error(right_pdf, triangle), foot, right_axis, triangle))

  return (err_N, err_M, grid, pdf, hrv_index)

def best_TINN(x:np.array):
  """Pick the triangle feet N and M with the smallest fit error reported by TINN.

  Args:
    x: array of inter-beat times of an ECG signal.

  Returns:
    ``(N, M, M - N, HRVindex)``: the selected left and right abscissas and
    their difference (the TINN score) as floats, followed by the HRV index
    returned unchanged from ``TINN``.
  """
  left_fits, right_fits, _, _, hrv_index = TINN(x)

  # Column 0 of each fit tuple holds the fit error.
  left_errors = np.array(left_fits, dtype=object)[:, 0]
  right_errors = np.array(right_fits, dtype=object)[:, 0]

  # Column 1 of the winning tuple is the abscissa of the triangle foot.
  best_left = left_fits[np.argmin(left_errors)][1]
  best_right = right_fits[np.argmin(right_errors)][1]

  return float(best_left), float(best_right), float(best_right - best_left), hrv_index

In [7]:
def get_freq_features_ecg(x):
  """Frequency-domain summary of a series of inter-beat times via the FFT.

  The series is mean-centred before the transform, and its mean is used as the
  sample spacing when building the Fourier frequency axis. Only the first
  ``len(x) // 2`` bins are kept.

  Args:
    x: numpy array of inter-beat times.

  Returns:
    ``(fmean, fstd, sumpsd)``: mean and standard deviation of the retained
    Fourier frequencies, and the sum of the retained spectrum magnitudes
    scaled by ``2 / len(x)``.
  """
  mean_interval = np.mean(x)
  spectrum = scipy.fft.fft(x - mean_interval)

  half = len(x) // 2
  frequencies = scipy.fft.fftfreq(len(x), mean_interval)[:half]
  magnitudes = (2 / len(spectrum)) * np.abs(spectrum)[:half]

  return np.mean(frequencies), np.std(frequencies), np.sum(magnitudes)


In [8]:
def num_compare_NN50(x,i):
  """Count the intervals of ``x`` that differ from ``x[i]`` by more than 0.05.

  The 0.05 threshold corresponds to 50 ms when ``x`` is expressed in seconds.

  Args:
    x: numpy array of HRV intervals.
    i: index of the reference interval.

  Returns:
    Number of entries whose absolute difference to ``x[i]`` exceeds 0.05.
  """
  exceeds_threshold = np.absolute(x - x[i]) > 0.05
  return np.sum(exceeds_threshold)

def compare_NN50(x):
  """Count, over all intervals, how many HRV intervals differ by more than 50 ms.

  Every interval is used in turn as the reference for ``num_compare_NN50``, so
  each differing pair is counted once per ordering.

  Args:
    x: numpy array of HRV intervals.

  Returns:
    ``(count, fraction)`` where ``fraction`` is ``count / len(x)**2``. A count
    of 0 is replaced by 1 before both values are produced (presumably so the
    count can safely be used as a divisor).
  """
  n_intervals = len(x)
  total = sum(num_compare_NN50(x, idx) for idx in range(n_intervals))
  if total == 0:
    total = 1
  return total, (total / (n_intervals * n_intervals))

In [22]:
# Samples per second of the ECG signal; passed as `sampling_rate` to every NeuroKit call.
sampling_rate = 700
# Length of one analysis window in samples (20 seconds of signal).
n_window = sampling_rate * 20

In [23]:
# Shift between two consecutive windows, in seconds.
window_shift_size = 1
# The same shift expressed as a whole number of samples.
step_size = int(sampling_rate * window_shift_size)

In [24]:
# (output column suffix, NeuroKit hrv_frequency column) for each frequency band,
# in the order the columns are written to the feature table.
_HRV_BANDS = (
    ('ulf', 'HRV_ULF'),
    ('lf', 'HRV_LF'),
    ('hf', 'HRV_HF'),
    ('uhf', 'HRV_VHF'),
)


def _extract_ecg_features(signal):
    """Detect R-peaks in one ECG segment and compute its un-normalised features.

    Uses the module-level ``sampling_rate``.

    Args:
        signal: sequence of ECG samples.

    Returns:
        ``(peaks, scalars, bands, total_power)`` where ``peaks`` is the
        ``nk.ecg_peaks`` frame, ``scalars`` maps output column names to scalar
        features (insertion order = output column order), ``bands`` is the
        ``nk.hrv_frequency`` frame and ``total_power`` is the NaN-ignoring sum
        of the four band powers.
    """
    peaks, _ = nk.ecg_peaks(signal, sampling_rate=sampling_rate)
    peak_positions = peaks[peaks['ECG_R_Peaks'] == 1].index

    heart_rate = nk.signal_rate(peaks, sampling_rate=sampling_rate)

    # Time between successive R-peaks (sample distance divided by the sampling rate).
    rr_intervals = np.diff(np.asarray(peak_positions)) / sampling_rate

    mean_f, std_f, sum_psd = get_freq_features_ecg(rr_intervals)
    _, _, _, hrv_index = best_TINN(rr_intervals)
    nn50, _ = compare_NN50(rr_intervals)

    bands = nk.hrv_frequency(
        peaks,
        sampling_rate=sampling_rate,
        ulf=[0.01,0.04],
        lf=[0.04,0.15],
        hf=[0.15,0.4],
        vhf=[0.4,1]
    )
    total_power = np.nansum([bands[column] for _, column in _HRV_BANDS])

    scalars = {
        'mean_hr': np.mean(heart_rate),
        'std_hr': np.std(heart_rate),
        'hrv_index': hrv_index,
        'nn50': nn50,
        'mean_hrv': np.mean(rr_intervals),
        'std_hrv': np.std(rr_intervals),
        'rms_hrv': np.sqrt(np.mean(rr_intervals**2)),
        'mean_fourier_frequencies': mean_f,
        'std_fourier_frequencies': std_f,
        'sum_psd': sum_psd,
    }
    return peaks, scalars, bands, total_power


def process_and_save_participant(participant):
    """Extract baseline-normalised ECG features for one participant and save them.

    Loads ``./data/wesad/{participant}/{participant}.csv``, computes reference
    features on the first ``90 * sampling_rate`` samples labelled 1 (baseline),
    then slides a window of ``n_window`` samples over the whole recording in
    steps of ``step_size``. Each window yields the ``nk.hrv`` feature row plus
    the window's majority label and the custom features divided by their
    baseline value. The rows are written to
    ``./data/wesad_features_20s/{participant}.csv``.

    Windows whose feature extraction raises are skipped (best effort); the
    number of skipped windows and the last error are reported with a warning.

    Args:
        participant: participant identifier, used as directory and file name.

    Raises:
        ValueError: if no window produced a feature row.
    """
    import warnings

    dataset = load_dataset('csv', data_files=f'./data/wesad/{participant}/{participant}.csv')['train']

    # 1 = baseline (neutral).
    neutral = dataset.filter(lambda x: x['label'] == 1).select(range(90 * sampling_rate))
    _, neutral_scalars, neutral_bands, neutral_total_power = _extract_ecg_features(neutral['signal'])

    dataframes = []
    skipped_windows = 0
    last_error = None
    # Only start positions that leave room for a full window are visited.
    for start_idx in range(0, len(dataset) - n_window + 1, step_size):
        try:
            sample = dataset[start_idx:start_idx+n_window]
            label = collections.Counter(sample['label']).most_common(1)[0][0]

            peaks, scalars, bands, total_power = _extract_ecg_features(sample['signal'])

            df = nk.hrv(peaks, sampling_rate=sampling_rate)
            df['label'] = label
            for column, value in scalars.items():
                df[column] = value / neutral_scalars[column]
            for suffix, band in _HRV_BANDS:
                df[suffix] = bands[band] / neutral_bands[band]
            df['lf_hf_ratio'] = (bands['HRV_LF'] / bands['HRV_HF']) / (neutral_bands['HRV_LF'] / neutral_bands['HRV_HF'])
            df['total_power'] = total_power / neutral_total_power
            for suffix, band in _HRV_BANDS:
                df[f'relative_power_{suffix}'] = ((bands[band] / total_power) * 100) / ((neutral_bands[band] / neutral_total_power) * 100)
            dataframes.append(df)
        except Exception as error:
            # Best effort: drop the window but remember that it happened.
            skipped_windows += 1
            last_error = error

    if skipped_windows:
        warnings.warn(
            f'{participant}: skipped {skipped_windows} window(s) whose feature '
            f'extraction failed; last error: {last_error!r}'
        )
    if not dataframes:
        # pd.concat would raise an opaque "No objects to concatenate" ValueError here.
        raise ValueError(
            f'No feature window could be computed for participant {participant!r} '
            f'(last error: {last_error!r})'
        )

    result = pd.concat(dataframes, ignore_index=True)
    output_path = f'./data/wesad_features_20s/{participant}.csv'
    # Make sure the target directory exists so the computed features are not lost.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(output_path)

In [25]:
from joblib import Parallel, delayed

In [26]:
# Process every participant, using up to 6 parallel joblib workers.
participant_jobs = (
    delayed(process_and_save_participant)(participant)
    for participant in participants
)
Parallel(n_jobs=6)(participant_jobs)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]