In [16]:
import optuna

In [17]:
import psutil

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import pandas as pd

In [19]:
import sklearn
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier

In [20]:
import datasets
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [21]:
from pathlib import Path
from glob import glob

In [22]:
# Directory that is globbed for per-participant `*.csv` files (the file stem is
# used as the participant id) and that is also passed to `load_dataset` as the
# dataset path.
base_path = './data/ecg_features_60s_clean'

In [23]:
# glob() yields paths in arbitrary, filesystem-dependent order. Sort the
# participant ids so the seeded split below assigns the same participants to
# train/test on every machine and on every run.
participants = sorted(Path(path).stem for path in glob(f'{base_path}/*.csv'))

# Hold out 20% of the participants; the split is done on participant ids, not on rows.
train_participants, test_participants = train_test_split(participants, test_size=0.2, random_state=42)

In [24]:
def encode(samples):
    """Translate a batch of activity names into integer class codes.

    Args:
        samples: mapping with a 'label' entry holding an iterable of activity
            names (the batched form that ``Dataset.map(..., batched=True)``
            passes in).

    Returns:
        A dict with a single 'label' key holding one integer per input name:
        0 for baseline activities, 1 for high physical stress, and -1 for
        every other name (including names not listed here). Rows coded -1
        are dropped afterwards by ``clean``.
    """
    # (class code, activity names) pairs; every group other than baseline and
    # high physical stress currently collapses to -1.
    groups = (
        (0, ('Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6')),          # baseline
        (-1, ('TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat')),    # mental stress
        (1, ('Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4',
             'Walking_fast_pace', 'Cycling', 'stairs_up_and_down')),                           # high physical stress
        (-1, ('Walking_own_pace', 'Dishes', 'Vacuum')),                                        # moderate physical stress
        (-1, ('Standing', 'Lying_supine', 'Recov_standing')),                                  # low physical stress
    )
    code_by_activity = {
        activity: code
        for code, activities in groups
        for activity in activities
    }

    # Anything not in the table falls back to -1.
    return {'label': [code_by_activity.get(activity, -1) for activity in samples['label']]}

def clean(dataset):
    """Encode labels, drop the rows coded -1, and return pandas frames.

    Args:
        dataset: object exposing ``map`` and ``filter`` and, after filtering,
            'fit' and 'validate' entries that have ``to_pandas()``
            (presumably a Hugging Face ``DatasetDict`` — TODO confirm).

    Returns:
        Tuple ``(fit_frame, validate_frame)`` of pandas DataFrames.
    """
    # Batched label encoding, run in 4 worker processes.
    encoded = dataset.map(encode, batched=True, batch_size=2048, num_proc=4)
    # -1 marks rows whose activity is not one of the classes kept by `encode`.
    kept = encoded.filter(lambda row: row['label'] != -1)

    fit_frame = kept['fit'].to_pandas()
    validate_frame = kept['validate'].to_pandas()
    return fit_frame, validate_frame


In [25]:
# Columns fed to the classifier, in the order used for training.
# NOTE(review): the grouping comments below are inferred from the column names
# only — confirm against the feature-extraction code.
X_labels = [
    # presumably time-domain HR / HRV statistics
    'μhr', 'σhr', 'μhrv', 'σhrv',
    'NN50', 'pNN50', 'TINN', 'rmsHRV',
    # presumably frequency-band powers and derived ratios
    'ULF', 'LF', 'HF', 'UHF',
    'LF_HF_ratio', 'Σ',
    'relative_power_ulf', 'relative_power_lf',
    'relative_power_hf', 'relative_power_uhf',
    'LF_norm', 'HF_norm',
    # presumably heart-rate extremes and RR-interval statistics
    'hr_max', 'hr_min',
    'rmssd', 'rr_mean', 'rr_std',
    # 'twa' is deliberately left out
]

# Name of the target column.
y_label = 'label'

In [26]:
from joblib import Parallel, delayed

In [27]:
# dataset = load_dataset(
#     f'{base_path}', 
#     train_participants=train_participants[:10],
#     trust_remote_code=True
# )

In [28]:
# for train_indices, val_indices in KFold(n_splits=10, shuffle=True, random_state=42).split(train_participants):
#     dataset = load_dataset(
#         f'{base_path}', 
#         train_participants=[train_participants[i] for i in train_indices],
#         val_participants=[train_participants[i] for i in val_indices],
#         trust_remote_code=True
#     )

#     print(dataset['fit'].shape)
#     break

In [29]:
def train(train_indices, val_indices, params):
    """Fit a random forest on one participant-level fold and score it.

    Args:
        train_indices: positions in the global ``train_participants`` list
            whose participants are used for fitting.
        val_indices: positions in ``train_participants`` whose participants
            are held out for validation.
        params: keyword arguments forwarded to ``RandomForestClassifier``.
            A ``random_state`` given here overrides the default seed of 42.

    Returns:
        ``rf.score(val_X, val_y)`` on the validation frame (mean accuracy for
        a scikit-learn classifier).
    """
    # The participant lists are passed as extra keyword arguments to
    # `load_dataset`; presumably they are consumed by a loading script under
    # `base_path` (hence trust_remote_code=True) — TODO confirm.
    dataset = load_dataset(
        base_path,
        train_participants=[train_participants[i] for i in train_indices],
        val_participants=[train_participants[i] for i in val_indices],
        # test_participants=test_participants,
        trust_remote_code=True
    )

    # Local names avoid shadowing this function's own name (`train`).
    train_df, val_df = clean(dataset)

    # Replace +/-inf with 0 so the classifier receives finite inputs.
    # Done without inplace=True so the frames returned by `clean` are not mutated.
    train_df = train_df.replace([np.inf, -np.inf], 0)
    val_df = val_df.replace([np.inf, -np.inf], 0)

    train_X, train_y = train_df[X_labels], train_df[y_label]
    val_X, val_y = val_df[X_labels], val_df[y_label]

    # Seed the forest by default: without a fixed random_state the bootstrap
    # and feature sampling differ between runs, so scores for identical
    # hyperparameters would not be comparable across trials or re-runs.
    rf = RandomForestClassifier(**{'random_state': 42, **params})
    rf.fit(train_X, train_y)
    return rf.score(val_X, val_y)

In [30]:
import uuid

In [34]:
def objective(trial):
    """Optuna objective: cross-validated score of one random-forest configuration.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``min_samples_split``
            and ``max_features``; ``n_estimators`` is fixed at 500.

    Returns:
        The smaller of the mean and the median of the 10 per-fold scores
        returned by ``train``.
    """
    # Insertion order and suggest-call order are kept as
    # max_depth -> min_samples_split -> max_features.
    params = {"n_estimators": 500}
    params["max_depth"] = trial.suggest_int("max_depth", 2, 10)
    params["min_samples_split"] = trial.suggest_int("min_samples_split", 10, 200, step=10)
    params["max_features"] = trial.suggest_int("max_features", 3, 15)

    # 10-fold split over participants (not rows), evaluated 5 folds at a time.
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_jobs = [
        delayed(train)(fit_idx, val_idx, params)
        for fit_idx, val_idx in splitter.split(train_participants)
    ]
    fold_scores = Parallel(n_jobs=5)(fold_jobs)

    # Report the lower of mean and median.
    mean_score = np.mean(fold_scores)
    median_score = np.median(fold_scores)
    return np.min([mean_score, median_score])

# Create a study in the local SQLite store; the uuid suffix makes the study
# name unique for every execution of this cell.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=42),
    storage="sqlite:///db.sqlite3",
    study_name=f'random_forest_{base_path}_{uuid.uuid4()}',
)

# Evaluate 10 randomly sampled configurations.
# NOTE(review): the saved output shows the first trial taking roughly 2h49m.
study.optimize(objective, n_trials=10, show_progress_bar=True)

[I 2024-05-08 00:04:17,942] A new study created in RDB with name: random_forest_./data/ecg_features_60s_clean_47cd115d-5643-473d-afb2-7b24aa30fedf
Best trial: 0. Best value: 0.866049:  10%|█         | 1/10 [2:49:19<25:23:53, 10159.30s/it]

[I 2024-05-08 02:53:37,234] Trial 0 finished with value: 0.8660487097997943 and parameters: {'max_depth': 5, 'min_samples_split': 200, 'max_features': 12}. Best is trial 0 with value: 0.8660487097997943.


Best trial: 0. Best value: 0.866049:  10%|█         | 1/10 [2:54:53<26:14:01, 10493.54s/it]


[W 2024-05-08 02:59:11,440] Trial 1 failed with parameters: {'max_depth': 7, 'min_samples_split': 40, 'max_features': 5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_14512\536576319.py", line 9, in objective
    scores = Parallel(n_jobs=5)(delayed(train)(train_indices, val_indices, params) for train_indices, val_indices in KFold(n_splits=10, shuffle=True, random_state=42).split(train_participants))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 19

KeyboardInterrupt: 

In [None]:
# Read the winning trial's score and hyperparameters from the study.
# NOTE(review): the saved output under this cell lists parameters such as
# 'criterion' and 'min_samples_leaf' that the current `objective` never
# suggests, so it appears to come from an earlier study — re-run to refresh.
best_params, best_score = study.best_params, study.best_value

for caption, value in (("Best Score:", best_score), ("Best Parameters:", best_params)):
    print(caption, value)

Best Score: 0.5990182092260431
Best Parameters: {'n_estimators': 331, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 10, 'criterion': 'entropy', 'max_features': 'sqrt'}
