In [1]:
import uuid

In [2]:
import importlib
from pathlib import Path
from glob import glob

In [3]:
import numpy as np 

import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier

In [4]:
import wandb

In [5]:
from train import prepare_model, train

In [6]:
# model = 'sia.models.wickstrom_2020'
# dataset = 'sia.datasets.wickstrom_2020'
model = 'sia.models.time_series'
dataset = 'sia.datasets.stepping_dataset'

data_dir = './data/ecg_model'

In [7]:
participants = [Path(path).stem for path in glob(f'{data_dir}/*.csv')]
train_participants, test_participants = train_test_split(participants[:20], test_size=0.2, random_state=42)

In [8]:
model_name = model.split('.')[-1]
model_module = importlib.import_module(model)
dataset_module = importlib.import_module(dataset)

In [9]:
sampling_rate = 1000

In [10]:
import optuna
import pandas as pd
from tabulate import tabulate

In [11]:
def encode(baseline = 0, mental_stress = -1, high_physical_activity = -1, moderate_physical_activity = -1, low_physical_activity = -1):
    def inner(labels):
        baseline_list = ['Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6']
        mental_stress_list = ['TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat']
        high_physical_stress_list = ['Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4', 'Walking_fast_pace', 'Cycling', 'stairs_up_and_down']
        moderate_physical_stress_list = ['Walking_own_pace', 'Dishes', 'Vacuum']
        low_physical_stress_list = ['Standing', 'Lying_supine', 'Recov_standing']
        
        def encode_multiclass(label):
            if label in baseline_list:
                return baseline
            elif label in mental_stress_list:
                return mental_stress
            elif label in high_physical_stress_list:
                return high_physical_activity
            elif label in moderate_physical_stress_list:
                return moderate_physical_activity
            elif label in low_physical_stress_list:
                return low_physical_activity
            else:
                return -1
            
        return {
            'label': [encode_multiclass(label) for label in labels],
        }
    return inner

def clean(dataset, mapping={}):
    print("--- Cleaning ---")
    dataset = dataset.map(
        encode(**mapping), 
        batched=True, 
        batch_size=2048, 
        input_columns=['label'],
        num_proc=4
    )
    print("--- Filtering ---")
    return dataset.filter(
        lambda label: label != -1,
        input_columns=['label'],
    )

In [12]:
def optimize(data_dir, k_fold=5, n_trials=10, mapping = {}):
    def objective(trial):
        params = {
            "epochs": 11,
            "num_workers": 8,

            "batch_size": 100, #trial.suggest_int("batch_size", 10, 1024),
            "learning_rate": 0.01,

            "window": 60000 # trial.suggest_int("window", 1000, 60000, step=1000),
        }
        
        wandb.init(
            project='stress-in-action',
            config={
                "epochs": params['epochs'],
                "batch_size": params['batch_size'],
                "learning_rate": params['learning_rate'],
                "window": params['window'],
                "architecture": model_name,
                "dataset": dataset
            }
        )

        scores = []
        for train_indices, val_indices in KFold(n_splits=k_fold, shuffle=True, random_state=42).split(train_participants):
            try: 
                print("--- Preparing Model ---")
                model = prepare_model(
                    model=model_module.Model, # assuming all models are named Model.
                    data=data_dir,
                    dataset=dataset_module.Dataset,
                    batch_size=params['batch_size'],
                    learning_rate=params['learning_rate'],
                    num_workers=params['num_workers'],
                    ignore_torch_format=True,
                    train_participants=[train_participants[i] for i in train_indices],
                    val_participants=[train_participants[i] for i in val_indices],
                    test_participants=test_participants,
                    dataset_kwargs={
                        'window': params['window']
                    },
                    dataset_preprocessor=lambda data: clean(data, mapping)
                )

                print("--- Training ---")
                trainer = train(
                    model_name,
                    model=model,
                    epochs=params['epochs']
                )

                print("--- Done ---")
                scores.append(trainer.callback_metrics["val_accuracy"].item())
                break ## only one fold for now
            except Exception as e:
                print(e)
                break

        wandb.finish()

        return np.min([np.mean(scores), np.median([scores])])

    def detailed(trial):
        params = {
            "epochs": 11,
            "num_workers": 8,

            "batch_size": 100, #trial.suggest_int("batch_size", 10, 1024),
            "learning_rate": 0.01,

            "window": 60000 # trial.suggest_int("window", 1000, 60000, step=1000),
        }
        
        wandb.init(
            project='stress-in-action',
            config={
                "epochs": params['epochs'],
                "batch_size": params['batch_size'],
                "learning_rate": params['learning_rate'],
                "window": params['window'],
                "architecture": model_name,
                "dataset": dataset
            }
        )

        scores = []
        for train_indices, val_indices in KFold(n_splits=k_fold, shuffle=True, random_state=42).split(train_participants):
            try: 
                model = prepare_model(
                    model=model_module.Model, # assuming all models are named Model.
                    data=data_dir,
                    dataset=dataset_module.Dataset,
                    batch_size=params['batch_size'],
                    learning_rate=params['learning_rate'],
                    num_workers=params['num_workers'],
                    ignore_torch_format=True,
                    train_participants=[train_participants[i] for i in train_indices],
                    val_participants=[train_participants[i] for i in val_indices],
                    test_participants=test_participants,
                    dataset_kwargs={
                        'window': params['window']
                    }
                )
                model.data = clean(model.data, mapping)
                trainer = train(
                    model_name,
                    model=model,
                    epochs=params['epochs']
                )

                trainer.test(model)

                scores.append({
                    "val_accuracy": trainer.callback_metrics["val_accuracy"].item(),
                    "val_precision": trainer.callback_metrics["val_precision"].item(),
                    "val_f1": trainer.callback_metrics["val_f1"].item(),
                    "test_accuracy": trainer.callback_metrics["test_accuracy"].item(),
                    "test_precision": trainer.callback_metrics["test_precision"].item(),
                    "test_f1": trainer.callback_metrics["test_f1"].item()
                })
            except Exception as e:
                print(e)
                break

        wandb.finish()

        return pd.DataFrame(scores)


    study = optuna.create_study(
        study_name=f'{model}_{dataset}_{data_dir}_{str(uuid.uuid4())}',
        storage="sqlite:///db.sqlite3",
        direction='maximize',
        sampler=optuna.samplers.RandomSampler(seed=42)
    )

    study.optimize(
        objective, 
        n_trials=n_trials,
        show_progress_bar=True
    )

    # Get the best hyperparameters
    best_params = study.best_params
    best_score = study.best_value

    print("Best Score:", best_score)
    print("Best Parameters:", best_params)

    # df = detailed(study.best_trial)

    # print(tabulate(
    #     [
    #         [
    #             'Validation F1', 
    #             'Test F1', 
    #             'Validation Accuracy', 
    #             'Test Accuracy', 
    #             'Validation Balanced Accuracy', 
    #             'Test Balanced Accuracy'
    #         ],
    #         [
    #             f"{round(df['val_f1'].mean() * 100, 2)}% ± {round(df['val_f1'].std() * 100, 2)}%", 
    #             f"{round(df['test_f1'].mean() * 100, 2)}% ± {round(df['test_f1'].std() * 100, 2)}%", 
    #             f"{round(df['val_accuracy'].mean() * 100, 2)}% ± {round(df['val_accuracy'].std() * 100, 2)}%", 
    #             f"{round(df['test_accuracy'].mean() * 100, 2)}% ± {round(df['test_accuracy'].std() * 100, 2)}%", 
    #             f"{round(df['val_balanced_accuracy'].mean() * 100, 2)}% ± {round(df['val_balanced_accuracy'].std() * 100, 2)}%", 
    #             f"{round(df['test_balanced_accuracy'].mean() * 100, 2)}% ± {round(df['test_balanced_accuracy'].std() * 100, 2)}%"
    #         ],
    #     ], tablefmt='fancy_grid')
    # )


In [13]:
optimize('./data/ecg_model', k_fold=2, n_trials=1, mapping={ 'mental_stress': 1 })

[I 2024-07-21 14:12:54,413] A new study created in RDB with name: sia.models.time_series_sia.datasets.stepping_dataset_./data/ecg_model_5bf51f8d-5f17-4fcd-8b1b-3d3e49b94098


  0%|          | 0/1 [00:00<?, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malex-antonides[0m. Use [1m`wandb login --relogin`[0m to force relogin


--- Preparing Model ---
--- Training ---


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


--- Cleaning ---
--- Filtering ---



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                 | Type            | Params
----------------------------------------------------------
0  | rnn                  | LSTM            | 14.4 M
1  | fc                   | Linear          | 61    
2  | dropout              | Dropout         | 0     
3  | batch_norm           | BatchNorm1d     | 120   
4  | train_accuracy       | BinaryAccuracy  | 0     
5  | train_f1score        | BinaryF1Score   | 0     
6  | train_precision      | BinaryPrecision | 0     
7  | validation_accuracy  | BinaryAccuracy  | 0     
8  | validation_f1score   | BinaryF1Score   | 0     
9  | validation_precision | BinaryPrecision | 0     
10 | test_accuracy        | BinaryAccuracy  | 0     
11 | test_f1score         | BinaryF1Score   | 0     
12 | test_precision       | BinaryPrecision | 0     
----------------------------------------------------------
14.4 M    Trainable params
0         Non-trainable params
14.4 M    Total params
57.660    Total 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

--- Done ---
'val_accuracy'


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[W 2024-07-21 14:14:47,294] Trial 0 failed with parameters: {} because of the following error: The value nan is not acceptable.
[W 2024-07-21 14:14:47,295] Trial 0 failed with value nan.


ValueError: Record does not exist.

In [None]:
import wandb

In [None]:
wandb.finish()