In [1]:
from pathlib import Path
from glob import glob

In [2]:
import datasets

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Directory searched for the input CSV files (presumably one file per participant — confirm).
root = './data/ecg_features_60s_clean_twa_rqa_60s'
# glob() returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# participant list identical across runs and machines.
participants = sorted(Path(file).name for file in glob(f'{root}/*.csv'))

In [5]:
# Load the dataset located at `root` and keep only its 'fit' split.
# `train_participants` is not a standard load_dataset argument — presumably it is
# forwarded to the loading script's builder config; confirm against that script.
# NOTE(review): trust_remote_code=True permits the dataset's loading script to run
# as ordinary Python, so `root` should only point at a directory you control.
dataset = datasets.load_dataset(
    root, 
    train_participants=participants,
    trust_remote_code=True,
    num_proc=8
)['fit']

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [6]:
def encode(baseline = 0, mental_stress = 1, high_physical_activity = 2, moderate_physical_activity = 3, low_physical_activity = 4):
    """Build a batch encoder that turns activity names into category codes.

    Every argument is the code emitted for one activity category. Passing the
    same code for several categories merges them into a single class.

    Args:
        baseline: code for 'Sitting' and 'Recov1' .. 'Recov6'.
        mental_stress: code for 'TA', 'SSST_Sing_countdown', 'Pasat', 'Raven',
            'TA_repeat' and 'Pasat_repeat'.
        high_physical_activity: code for 'Treadmill1' .. 'Treadmill4',
            'Walking_fast_pace', 'Cycling' and 'stairs_up_and_down'.
        moderate_physical_activity: code for 'Walking_own_pace', 'Dishes'
            and 'Vacuum'.
        low_physical_activity: code for 'Standing', 'Lying_supine' and
            'Recov_standing'.

    Returns:
        A function taking a list of activity names and returning
        ``{'label': [code, ...]}`` in the same order. Any name that belongs
        to none of the categories is encoded as -1.
    """
    categories = (
        (baseline, ('Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6')),
        (mental_stress, ('TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat')),
        (high_physical_activity, ('Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4', 'Walking_fast_pace', 'Cycling', 'stairs_up_and_down')),
        (moderate_physical_activity, ('Walking_own_pace', 'Dishes', 'Vacuum')),
        (low_physical_activity, ('Standing', 'Lying_supine', 'Recov_standing')),
    )
    # Built once per encoder instead of once per batch; the category name sets
    # are disjoint, so a flat dict gives the same answer as the former
    # if/elif chain with O(1) lookups instead of linear list scans.
    code_by_activity = {
        activity: code
        for code, activities in categories
        for activity in activities
    }

    def encode_multiclass(label):
        try:
            return code_by_activity.get(label, -1)
        except TypeError:
            # An unhashable label cannot be one of the known activity names.
            return -1

    def inner(labels):
        return {
            'label': [encode_multiclass(label) for label in labels],
        }
    return inner

def clean(dataset, mapping=None):
    """Encode the 'label' column and drop rows whose label is not recognised.

    Args:
        dataset: object exposing ``map`` and ``filter`` with the keyword
            arguments used below (the notebook passes a Hugging Face dataset).
        mapping: optional keyword arguments forwarded to ``encode`` to
            override the code emitted for each activity category. ``None``
            or an empty dict keeps ``encode``'s defaults.

    Returns:
        The dataset with 'label' replaced by the encoded values and every
        row that was encoded as -1 removed.
    """
    # `None` replaces the former mutable `{}` default; both mean "use defaults".
    encoder = encode(**(mapping or {}))
    dataset = dataset.map(
        encoder,
        batched=True,
        batch_size=2048,
        input_columns=['label'],
        num_proc=4
    )
    # -1 is the code encode() assigns to labels outside every category.
    return dataset.filter(
        lambda label: label != -1,
        input_columns=['label'],
    )


In [7]:
# NOTE(review): this cell rebinds `dataset`, so it is not safe to run twice. On a
# second run the labels are already integer codes, which encode() maps to -1 and
# clean() then filters out — leaving no rows. Reload the dataset before re-running.
dataset = clean(dataset)

In [8]:
# Show the dataset's repr to inspect the feature columns available after cleaning.
dataset

Dataset({
    features: ['label', 'hrv_mean', 'hrv_min', 'hrv_max', 'hrv_std', 'hrv_rms', 'hr_mean', 'hr_min', 'hr_max', 'hr_std', 'rr_mean', 'rr_min', 'rr_max', 'rr_std', 'nn50', 'pnn50', 'rmssd', 'ulf_min', 'vlf_min', 'lf_min', 'hf_min', 'vhf_min', 'uhf_min', 'tp_min', 'lf_hf_ratio_min', 'lp_ulf_min', 'lp_vlf_min', 'lp_lf_min', 'lp_hf_min', 'lp_vhf_min', 'lp_uhf_min', 'lf_normalized_min', 'hf_normalized_min', 'lf/hf+lf_min', 'hf/hf+lf_min', 'ulf_max', 'vlf_max', 'lf_max', 'hf_max', 'vhf_max', 'uhf_max', 'tp_max', 'lf_hf_ratio_max', 'lp_ulf_max', 'lp_vlf_max', 'lp_lf_max', 'lp_hf_max', 'lp_vhf_max', 'lp_uhf_max', 'lf_normalized_max', 'hf_normalized_max', 'lf/hf+lf_max', 'hf/hf+lf_max', 'ulf_mean', 'vlf_mean', 'lf_mean', 'hf_mean', 'vhf_mean', 'uhf_mean', 'tp_mean', 'lf_hf_ratio_mean', 'lp_ulf_mean', 'lp_vlf_mean', 'lp_lf_mean', 'lp_hf_mean', 'lp_vhf_mean', 'lp_uhf_mean', 'lf_normalized_mean', 'hf_normalized_mean', 'lf/hf+lf_mean', 'hf/hf+lf_mean', 'ulf_median', 'vlf_median', 'lf_media

In [9]:
# Keep only the 'label' column; the feature columns are not used by the
# class-distribution summary that follows.
dataset = dataset.select_columns(['label'])

In [10]:
# Materialise the label-only dataset as a pandas object for the summaries below.
df = dataset.to_pandas()

In [12]:
# Number of rows per encoded label. With encode()'s default codes:
# 0 = baseline, 1 = mental stress, 2 = high, 3 = moderate, 4 = low physical activity.
df.value_counts()

label
1        1187478
0        1040651
2         892154
4         287103
3         229474
Name: count, dtype: int64

In [13]:
# Total number of rows, obtained by summing the per-label counts.
df.value_counts().sum()

3636860

In [14]:
# Share of each label as a percentage string, rounded to one decimal place.
df.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

label
1        32.7%
0        28.6%
2        24.5%
4         7.9%
3         6.3%
Name: proportion, dtype: object