In [15]:
from enum import Enum

In [16]:
from pathlib import Path
from glob import glob
from joblib import Parallel, delayed

In [17]:
import numpy as np
import pandas as pd
import datasets

In [18]:
from prettytable import PrettyTable

In [19]:
from tqdm import tqdm

In [20]:
import sklearn

from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, balanced_accuracy_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [21]:
base_path = './data/ecg_features_60s_clean_twa_rqa_60s'

In [22]:
X_labels =  [
    'hrv_mean',
    'hrv_min',
    'hrv_max',
    'hrv_std',
    'hrv_rms',
    'TINN',
    'hr_mean',
    'hr_min',
    'hr_max',
    'hr_std',
    'rr_mean',
    'rr_min',
    'rr_max',
    'rr_std',
    'nn50',
    'pnn50',
    'rmssd',
    'twa',

    # new
    'vhf_entropy',  # 0.28
    'lp_vhf_entropy', # 0.28
    'lp_vhf_max',   # 0.25
    'vhf_max',   # 0.25
    'lp_vhf_mean',   # 0.24
    'lp_vhf_std',   # 0.24
    'lp_vhf_energy', # 0.22
    'lp_vhf_power', # 0.22
    'lp_vhf_median',      # 0.21
    'vhf_std',      # 0.21
    'vhf_power',    # 0.21
    'vhf_mean',    # 0.21
    'tp_entropy',   # 0.21
    'vhf_median', # 0.19
    'lp_vhf_covariance', # 0.17
    'lp_lf_min', # 0.17
    'w',            # 0.17
    'PSS',          # 0.17
    'wmax',         # 0.16
    'hr_min',       # 0.16
    'lp_uhf_entropy', # 0.16
    'wen',          # 0.15
    'hr_mean',      # 0.15
    'PIP',          # 0.15
    'hf_entropy',   # 0.15
    'uhf_entropy',  # 0.14
    'IALS',         # 0.14
    'FuzzyEn',      # 0.14
    'SampEn',       # 0.13
]
y_label = 'label'

In [23]:
participants = [Path(path).stem for path in glob(f'{base_path}/*.csv')]
train_participants, test_participants = train_test_split(participants, test_size=0.2, random_state=42)

In [24]:
def encode(baseline = 0, mental_stress = -1, high_physical_activity = -1, moderate_physical_activity = -1, low_physical_activity = -1):
    def inner(labels):
        baseline_list = ['Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6']
        mental_stress_list = ['TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat']
        high_physical_stress_list = ['Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4', 'Walking_fast_pace', 'Cycling', 'stairs_up_and_down']
        moderate_physical_stress_list = ['Walking_own_pace', 'Dishes', 'Vacuum']
        low_physical_stress_list = ['Standing', 'Lying_supine', 'Recov_standing']
        
        def encode_multiclass(label):
            if label in baseline_list:
                return baseline
            elif label in mental_stress_list:
                return mental_stress
            elif label in high_physical_stress_list:
                return high_physical_activity
            elif label in moderate_physical_stress_list:
                return moderate_physical_activity
            elif label in low_physical_stress_list:
                return low_physical_activity
            else:
                return -1
            
        return {
            'label': [encode_multiclass(label) for label in labels],
        }
    return inner

def clean(dataset, mapping={}):
    dataset = dataset.map(
        encode(**mapping), 
        batched=True, 
        batch_size=2048, 
        input_columns=['label'],
        num_proc=4
    )
    return dataset.filter(
        lambda label: label != -1,
        input_columns=['label'],
    )

In [25]:
class Model(Enum):
    RandomForest = 2
    DecisionTree = 1
    AdaBoost = 3
    LinearDiscriminantAnalysis = 4
    KNearestNeighbors = 5
    LogisticRegression = 6
    XGBoost = 7
    QuadraticDiscriminantAnalysis = 8

In [26]:
def train(model: int, train_indices, val_indices, params= {}, mapping = {}):
    dataset = datasets.load_dataset(
        f'{base_path}', 
        train_participants=[train_participants[i] for i in train_indices],
        val_participants=[train_participants[i] for i in val_indices],
        test_participants=test_participants,
        trust_remote_code=True
    )

    dataset = clean(dataset, mapping=mapping)
    train = dataset['fit'].to_pandas().replace([np.inf, -np.inf, np.nan], 0)

    X_train, y_train = train[X_labels], train[y_label]
    del train

    if model == 1:
        cls = DecisionTreeClassifier(criterion='entropy', min_samples_split=20, **params, random_state=42)
    elif model == 2:
        cls = RandomForestClassifier(**params, random_state=42, bootstrap=False)
    elif model == 3:
        cls = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', min_samples_split=20), n_estimators=100, **params)
    elif model == 4:
        cls = LinearDiscriminantAnalysis(**params)
    elif model == 5:
        cls = KNeighborsClassifier(n_neighbors=9, **params)
    elif model == 6:
        cls = LogisticRegression(**params)
    elif model == 7:
        cls = XGBClassifier(**params)
    elif model == 8:
        cls = QuadraticDiscriminantAnalysis(**params)
    else: 
        raise ValueError('Invalid model')

    cls.fit(X_train, y_train)

    del X_train

    val = dataset['validate'].to_pandas().replace([np.inf, -np.inf, np.nan], 0)
    X_val, y_val = val[X_labels], val[y_label]
    del val

    test = dataset['test'].to_pandas().replace([np.inf, -np.inf, np.nan], 0)
    X_test, y_test = test[X_labels], test[y_label]
    del test

    data = {
        'val_accuracy': sklearn.metrics.accuracy_score(y_val, cls.predict(X_val)),
        'val_balanced_accuracy': sklearn.metrics.balanced_accuracy_score(y_val, cls.predict(X_val)),
        'test_accuracy': sklearn.metrics.accuracy_score(y_test, cls.predict(X_test)),
        'test_balanced_accuracy': sklearn.metrics.balanced_accuracy_score(y_test, cls.predict(X_test)),
    }

    if len(y_train.unique()) == 2:
        ## binary
        data['val_f1'] = sklearn.metrics.f1_score(y_val, cls.predict(X_val))
        data['test_f1'] = sklearn.metrics.f1_score(y_test, cls.predict(X_test))
    else: 
        # multiclass
        data['val_f1'] = sklearn.metrics.f1_score(y_val, cls.predict(X_val), average='micro')
        data['test_f1'] = sklearn.metrics.f1_score(y_test, cls.predict(X_test), average='micro')

    del y_train, X_val, y_val
    del X_test, y_test

    return data 

### Baseline & Mental Stress

In [27]:
def attempt(model: Model):
    table = PrettyTable()
    table.title = f'{model.name}'
    table.field_names = [
        '',
        'Validation F1', 
        'Test F1', 
        'Validation Accuracy', 
        'Test Accuracy', 
        'Validation Balanced Accuracy', 
        'Test Balanced Accuracy'
    ]

    value = int(model.value)
    mappings = [ { "mental_stress": 1 } ] #, { "high_physical_activity": 1 }, { "baseline": -1, "mental_stress": 0, "high_physical_activity": 1 } ]
    with tqdm(total=len(mappings)) as pbar:
        for mapping in mappings:
            pbar.set_description(f'{model.name} - {mapping}')
            scores = Parallel(n_jobs=1)(delayed(train)(
                value,
                train_indices, 
                val_indices, 
                mapping=mapping
            ) for train_indices, val_indices in KFold(n_splits=2, shuffle=True, random_state=42).split(train_participants)) 
            df = pd.DataFrame(scores)

            titles = []
            if ("baseline" not in mapping) or (mapping["baseline"] != -1):
                titles.append("Baseline")
            if "mental_stress" in mapping and mapping["mental_stress"] >= 0:
                titles.append("Mental Stress")
            if "high_physical_activity" in mapping and mapping["high_physical_activity"] >= 0:
                titles.append("High Physical Activity")

            row = [
                f"{' & '.join(titles)}",
                f"{round(df['val_f1'].mean() * 100, 2)}% ± {round(df['val_f1'].std() * 100, 2)}%", 
                f"{round(df['test_f1'].mean() * 100, 2)}% ± {round(df['test_f1'].std() * 100, 2)}%", 
                f"{round(df['val_accuracy'].mean() * 100, 2)}% ± {round(df['val_accuracy'].std() * 100, 2)}%", 
                f"{round(df['test_accuracy'].mean() * 100, 2)}% ± {round(df['test_accuracy'].std() * 100, 2)}%", 
                f"{round(df['val_balanced_accuracy'].mean() * 100, 2)}% ± {round(df['val_balanced_accuracy'].std() * 100, 2)}%", 
                f"{round(df['test_balanced_accuracy'].mean() * 100, 2)}% ± {round(df['test_balanced_accuracy'].std() * 100, 2)}%"
            ]
            print("Intermediate results: ", row)
            table.add_row(row)
            pbar.update(1)
    
    print(table)

In [28]:
for model in Model:
    attempt(model)

RandomForest - {'mental_stress': 1}: 100%|██████████| 1/1 [3:45:00<00:00, 13500.22s/it]


Intermediate results:  ['Baseline & Mental Stress', '62.32% ± 1.56%', '62.74% ± 0.5%', '58.92% ± 0.53%', '58.73% ± 0.59%', '58.56% ± 0.3%', '58.34% ± 0.74%']
+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                       RandomForest                                                                       |
+--------------------------+----------------+---------------+---------------------+----------------+------------------------------+------------------------+
|                          | Validation F1  |    Test F1    | Validation Accuracy | Test Accuracy  | Validation Balanced Accuracy | Test Balanced Accuracy |
+--------------------------+----------------+---------------+---------------------+----------------+------------------------------+------------------------+
| Baseline & Mental Stress | 62.32% ± 1.56% | 62.74% ± 0.

DecisionTree - {'mental_stress': 1}: 100%|██████████| 1/1 [6:55:51<00:00, 24951.42s/it]


Intermediate results:  ['Baseline & Mental Stress', '58.59% ± 0.82%', '58.13% ± 0.47%', '55.63% ± 0.06%', '55.32% ± 0.12%', '55.4% ± 0.07%', '55.12% ± 0.19%']
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                        DecisionTree                                                                       |
+--------------------------+----------------+----------------+---------------------+----------------+------------------------------+------------------------+
|                          | Validation F1  |    Test F1     | Validation Accuracy | Test Accuracy  | Validation Balanced Accuracy | Test Balanced Accuracy |
+--------------------------+----------------+----------------+---------------------+----------------+------------------------------+------------------------+
| Baseline & Mental Stress | 58.59% ± 0.82% | 58.13

AdaBoost - {'mental_stress': 1}:   0%|          | 0/1 [00:00<?, ?it/s]