In [1]:
from pathlib import Path
from glob import glob

In [2]:
import datasets

In [3]:
import numpy as np
import pandas as pd

In [4]:
import sklearn
import sklearn.metrics

from sklearn.model_selection import train_test_split

In [5]:
base_path = './data/ecg_features_60s_clean_twa_rqa_60s'

In [6]:
participants = [Path(path).stem for path in glob(f'{base_path}/*.csv')]
train_participants, test_participants = train_test_split(participants, test_size=0.2, random_state=42)
train_participants, val_participants = train_test_split(participants, test_size=0.25, random_state=42)

In [7]:
dataset = datasets.load_dataset(
    f'{base_path}', 
    train_participants=train_participants,
    val_participants=val_participants,
    test_participants=test_participants,
    trust_remote_code=True
)

In [8]:
def encode(baseline = 0, mental_stress = -1, high_physical_activity = -1, moderate_physical_activity = -1, low_physical_activity = -1):
    def inner(labels):
        baseline_list = ['Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6']
        mental_stress_list = ['TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat']
        high_physical_stress_list = ['Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4', 'Walking_fast_pace', 'Cycling', 'stairs_up_and_down']
        moderate_physical_stress_list = ['Walking_own_pace', 'Dishes', 'Vacuum']
        low_physical_stress_list = ['Standing', 'Lying_supine', 'Recov_standing']
        
        def encode_multiclass(label):
            if label in baseline_list:
                return baseline
            elif label in mental_stress_list:
                return mental_stress
            elif label in high_physical_stress_list:
                return high_physical_activity
            elif label in moderate_physical_stress_list:
                return moderate_physical_activity
            elif label in low_physical_stress_list:
                return low_physical_activity
            else:
                return -1
            
        return {
            'label': [encode_multiclass(label) for label in labels],
        }
    return inner

def clean(dataset, mapping={}):
    dataset = dataset.map(
        encode(**mapping), 
        batched=True, 
        batch_size=2048, 
        input_columns=['label'],
        num_proc=4
    )
    return dataset.filter(
        lambda label: label != -1,
        input_columns=['label'],
    )


In [9]:
dataset = clean(dataset, { 'mental_stress': 1 })

In [26]:
X_labels =  [
    "hrv_mean",
    # "hrv_min",
    # "hrv_max",
    # "hrv_std",
    "hrv_rms",
    # "hr_mean",
    # "hr_min",
    # "hr_max",
    "rr_mean",
    # "rr_min",
    # "rr_max",
    # "rr_std",
    # "nn50",
    # "pnn50",
    # "rmssd",
    # "lf",
    # "hf",
    "vhf",
    # "uhf",
    # "tp",
    # "lp_hf",
    "lp_vhf",
    # "lp_uhf",
    # "hf_normalized",
    "w",
    "wmax",
    "wen",
    # "MeanNN",
    # "SDNN",
    # "SDANN1",
    # "SDNNI1",
    # "SDANN2",
    # "SDNNI2",
    # "SDANN5",
    # "SDNNI5",
    # "RMSSD",
    # "SDSD",
    # "CVNN",
    # "CVSD",
    # "MedianNN",
    # "MadNN",
    # "MCVNN",
    # "IQRNN",
    # "SDRMSSD",
    # "Prc20NN",
    "Prc80NN",
    # "pNN50",
    # "pNN20",
    # "MinNN",
    # "MaxNN",
    # "HTI",
    # "TINN",
    # "twa",
]
y_label = 'label'

In [27]:
X_train = dataset['fit'].select_columns(X_labels)
y_train = dataset['fit'].select_columns(y_label)

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
X_train

Dataset({
    features: ['hrv_mean', 'hrv_rms', 'rr_mean', 'vhf', 'lp_vhf', 'w', 'wmax', 'wen', 'Prc80NN'],
    num_rows: 1662743
})

In [30]:
rf = RandomForestClassifier(random_state=42, bootstrap=False)
rf.fit(X_train.to_pandas().replace([np.inf, -np.inf, np.nan], 0), y_train.to_pandas())

  return fit_method(estimator, *args, **kwargs)


In [31]:
val = dataset['validate'].to_pandas().replace([np.inf, -np.inf, np.nan], 0)
X_val, y_val = val[X_labels], val[y_label]

In [32]:
test = dataset['test'].to_pandas().replace([np.inf, -np.inf, np.nan], 0)

In [33]:
train = dataset['fit'].to_pandas().replace([np.inf, -np.inf, np.nan], 0)

In [34]:
train['label'].value_counts()

label
1    889218
0    773525
Name: count, dtype: int64

In [35]:
val['label'].value_counts()

label
1    298255
0    266278
Name: count, dtype: int64

In [36]:
test['label'].value_counts()

label
1    241122
0    218414
Name: count, dtype: int64

In [37]:
sklearn.metrics.accuracy_score(y_val, rf.predict(X_val))

0.5708311117330608