In [None]:
# %pip install -r requirements.txt

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Training
This notebook defines the training process. The project uses two pipelines: one that trains and optimizes the classical machine learning models, and one that trains the neural network.

In [None]:
import uuid
from glob import glob
from pathlib import Path
from enum import Enum
from joblib import Parallel, delayed

In [None]:
import datasets
import numpy as np
import pandas as pd

In [None]:
from prettytable import PrettyTable

In [None]:
# Directory holding one feature CSV per participant; the file stem is the participant id.
root_dir = './data/features/'
# glob() returns files in a filesystem-dependent order. Sorting makes the participant list,
# and therefore every seeded split derived from it, identical across machines and runs.
participants = sorted(Path(p).stem for p in glob(f'{root_dir}/*.csv'))

### Participant Split
The step defined below splits the participants into their respective buckets.

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the participants for testing; the fixed seed keeps the split repeatable
# for a given ordering of `participants`.
train_bucket, test_bucket = train_test_split(participants, test_size=0.2, random_state=42)

### Machine Learning Models
The steps defined below train and optimize the machine learning models.

In [None]:
import optuna
from sklearn import metrics

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

In [None]:
class Model(Enum):
    """Identifiers for the supported classifiers and baselines.

    The integer value of each member is the number that `train` receives
    and dispatches on.
    """

    # Tree-based and ensemble models
    DecisionTree = 1
    RandomForest = 2
    AdaBoost = 3

    # Discriminant, neighbour and linear models
    LinearDiscriminantAnalysis = 4
    KNearestNeighbors = 5
    LogisticRegression = 6

    # Gradient boosting
    XGBoost = 7

    QuadraticDiscriminantAnalysis = 8

    # Dummy classifiers used as reference points
    RandomBaseline = 9
    SophisticatedBaseline = 10

#### Encoding
The step defined below encodes the category labels for the machine learning models. This allows the user to train a model that targets a specific combination of categories (e.g. Baseline versus Mental Stress).

In [None]:
def encode(baseline = 0, mental_stress = -1, high_physical_activity = -1, moderate_physical_activity = -1, low_physical_activity = -1):
    """Build a batch encoder that turns category names into integer codes.

    Each keyword argument gives the integer code for the category of the same
    name. Categories mapped to -1, as well as any unknown category name, are
    encoded as -1 so that they can be discarded afterwards.

    Returns
    -------
    callable
        A function taking an iterable of category names and returning
        ``{'label': [code, ...]}`` with one code per input category.
    """
    codes = {
        'baseline': baseline,
        'mental_stress': mental_stress,
        'high_physical_activity': high_physical_activity,
        'moderate_physical_activity': moderate_physical_activity,
        'low_physical_activity': low_physical_activity,
    }

    def inner(categories):
        # Unknown category names fall back to -1, the "discard" marker.
        return {'label': [codes.get(category, -1) for category in categories]}

    return inner

def clean(dataset, mapping=None):
    """Encode the 'category' column into an integer 'label' column and drop discarded rows.

    Parameters
    ----------
    dataset:
        Object exposing the Hugging Face ``map``/``filter`` API and holding a
        'category' column.
    mapping: dict, optional
        Keyword arguments forwarded to `encode` (category name -> integer code).
        ``None`` or an empty dict uses the defaults of `encode`.

    Returns
    -------
    The dataset with an added 'label' column, without the rows whose label is -1.
    """
    dataset = dataset.map(
        encode(**(mapping or {})),
        batched=True,
        batch_size=2048,
        input_columns=['category'],
        num_proc=8
    )
    # The encoder writes its codes to 'label'; the raw 'category' column still holds the
    # original names, so filtering on it would never match -1 and would keep every row.
    return dataset.filter(
        lambda label: label != -1,
        input_columns=['label'],
    )


#### Training
The method below defines the step used to train a given model and calculate the metrics.

In [None]:
def train(
    model: int,
    data_dir: str,
    X_labels: list[str],
    y_label: str,
    train_bucket: list[str],
    val_bucket: list[str],
    test_bucket: list[str],
    params: dict,
    mapping: dict = None
):
    """Train one model on one participant split and compute validation/test metrics.

    Parameters
    ----------
    model: int
        The model to use. Unfortunately, the enum Model can't be passed over the network, thus a number is used.
        1: DecisionTree
        2: RandomForest
        3: AdaBoost
        4: LinearDiscriminantAnalysis
        5: KNearestNeighbors
        6: LogisticRegression
        7: XGBoost
        8: QuadraticDiscriminantAnalysis
        9: RandomBaseline
        10: SophisticatedBaseline
    data_dir: str
        The directory containing the dataset.
    X_labels: list[str]
        The features to use.
    y_label: str
        Name of the raw category column to load. It is encoded through ``mapping`` into
        the integer 'label' column, which is the actual training target. `clean` reads
        the column named 'category', so that is the value expected here.
    train_bucket: list[str]
        The participants to use for training.
    val_bucket: list[str]
        The participants to use for validation.
    test_bucket: list[str]
        The participants to use for testing.
    params: dict
        The hyperparameters for the model.
    mapping: dict, optional
        The mapping for the categories (category name -> integer class, -1 to discard).

    Returns
    -------
    dict
        ``val_*`` and ``test_*`` entries for accuracy, balanced_accuracy, f1, auc and
        cm (confusion matrix, rows/columns in sorted class order). For the random
        forest (model 2) the top-5 impurity and permutation importances and their
        standard deviations are added.

    Raises
    ------
    ValueError
        If ``model`` is not one of the numbers listed above.
    """
    # Imported locally: this function is executed inside joblib workers, and a plain
    # `import sklearn` does not guarantee that these submodules are loaded.
    import sklearn.inspection
    import sklearn.metrics
    import sklearn.preprocessing

    mapping = {} if mapping is None else mapping
    target = 'label'  # column produced by the category encoder in `clean`

    # Build the estimator first so that an invalid model number fails before any data is loaded.
    if model == 1:
        cls = DecisionTreeClassifier(**params, random_state=42)
    elif model == 2:
        cls = RandomForestClassifier(**params, random_state=42, bootstrap=False)
    elif model == 3:
        base_tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=20)
        try:
            # Current scikit-learn releases name the argument `estimator` ...
            cls = AdaBoostClassifier(estimator=base_tree, **params)
        except TypeError:
            # ... older releases only know `base_estimator`.
            cls = AdaBoostClassifier(base_estimator=base_tree, **params)
    elif model == 4:
        cls = LinearDiscriminantAnalysis(**params)
    elif model == 5:
        cls = KNeighborsClassifier(**params)
    elif model == 6:
        cls = LogisticRegression(**params)
    elif model == 7:
        cls = XGBClassifier(**params)
    elif model == 8:
        cls = QuadraticDiscriminantAnalysis(**params)
    elif model == 9:
        cls = DummyClassifier(strategy= 'uniform', random_state= 42, **params)
    elif model == 10:
        cls = DummyClassifier(strategy= 'stratified', random_state= 42, **params)
    else:
        raise ValueError('Invalid model')

    dataset = datasets.load_dataset(
        data_dir,
        train_participants=train_bucket,
        val_participants=val_bucket,
        test_participants=test_bucket,
        trust_remote_code=True
    )

    dataset = dataset.select_columns([y_label] + X_labels)
    dataset = clean(dataset, mapping=mapping)

    def load_split(split):
        """Return (features, encoded target) for one split, without discarded rows."""
        frame = dataset[split].to_pandas().replace([np.inf, -np.inf, np.nan], 0)
        # Drop rows encoded as -1 here as well, so the target only ever contains the
        # classes selected by `mapping`.
        frame = frame[frame[target] != -1]
        return frame[X_labels], frame[target]

    X_train, y_train = load_split('fit')
    # Sorted class list: gives every fold the same confusion-matrix row/column order.
    classes = np.unique(y_train)
    binary = len(classes) == 2

    cls.fit(X_train, y_train)
    del X_train, y_train

    def evaluate(X, y):
        """Compute all metrics for one split, predicting only once."""
        predicted = cls.predict(X)
        probabilities = cls.predict_proba(X)

        if binary:
            f1 = sklearn.metrics.f1_score(y, predicted)
            auc = sklearn.metrics.roc_auc_score(y, probabilities[:, 1])
        else:
            f1 = sklearn.metrics.f1_score(y, predicted, average='micro')
            # AUC (one-vs-rest) on the binarized targets
            y_bin = sklearn.preprocessing.label_binarize(y, classes=classes)
            auc = sklearn.metrics.roc_auc_score(y_bin, probabilities, average='macro', multi_class='ovr')

        return {
            'accuracy': sklearn.metrics.accuracy_score(y, predicted),
            'balanced_accuracy': sklearn.metrics.balanced_accuracy_score(y, predicted),
            'f1': f1,
            'auc': auc,
            'cm': sklearn.metrics.confusion_matrix(y, predicted, labels=classes),
        }

    X_val, y_val = load_split('validate')
    val_scores = evaluate(X_val, y_val)
    del X_val, y_val

    X_test, y_test = load_split('test')
    test_scores = evaluate(X_test, y_test)

    data = {
        'val_accuracy': val_scores['accuracy'],
        'val_balanced_accuracy': val_scores['balanced_accuracy'],
        'test_accuracy': test_scores['accuracy'],
        'test_balanced_accuracy': test_scores['balanced_accuracy'],
        'val_f1': val_scores['f1'],
        'test_f1': test_scores['f1'],
        'val_auc': val_scores['auc'],
        'test_auc': test_scores['auc'],
        'val_cm': val_scores['cm'],
        'test_cm': test_scores['cm'],
    }

    if model == 2:
        # Impurity-based importances, with the spread across the individual trees.
        feature_importance = pd.DataFrame(cls.feature_importances_, index=X_labels, columns=["importance"])
        feature_importance["std"] = np.std([tree.feature_importances_ for tree in cls.estimators_], axis=0)
        feature_importance = feature_importance.sort_values(by='importance', ascending=False)

        # Permutation importances measured on the test split.
        result = sklearn.inspection.permutation_importance(
            cls, X_test, y_test, n_repeats=10, random_state=42
        )
        permutation_importances = pd.DataFrame(result.importances_mean, index=X_labels, columns=["importance"])
        permutation_importances["std"] = result.importances_std
        permutation_importances = permutation_importances.sort_values(by='importance', ascending=False)

        data['feature_importance'] = feature_importance["importance"].head(5)
        data['fi_std'] = feature_importance["std"].head(5)
        data['permutation_importances'] = permutation_importances["importance"].head(5)
        data['pi_std'] = permutation_importances["std"].head(5)

    del X_test, y_test

    return data

#### Optimization
The step below optimizes the model using the Optuna library.

In [None]:
def optimize(
    model: Model,
    data_directory: str,
    X_labels: list[str],
    y_label: str,
    train_bucket: list[str],
    test_bucket: list[str],
    k_fold: int = 10,
    mapping: dict = None,
    params: dict = None,
    n_trials: int = 50,
    n_jobs: int = 8
):
    """Optimize the model using Optuna and report k-fold test metrics.

    When ``params`` is given, the search is skipped and those hyperparameters are
    evaluated directly. The baseline models have no search space, so they are
    evaluated with empty hyperparameters when ``params`` is omitted.

    Parameters
    ----------
    model: Model
        The model to use.
    data_directory: str
        The directory containing the dataset.
    X_labels: list[str]
        The features to use.
    y_label: str
        The target label.
    train_bucket: list[str]
        The participants to use for training; split into k folds for cross-validation.
    test_bucket: list[str]
        The participants to use for testing.
    k_fold: int
        The number of folds to use.
    mapping: dict, optional
        The mapping for the categories.
    params: dict, optional
        The hyperparameters for the model. If given, the optimization is skipped.
    n_trials: int
        The number of Optuna trials to run.
    n_jobs: int
        The number of folds trained in parallel.

    Returns
    -------
    dict
        The hyperparameters that were evaluated (best found, or the given ones).

    Raises
    ------
    ValueError
        If a search is requested for a model without a defined search space.
    """
    if mapping is None:
        mapping = {}

    # The dummy baselines have nothing to tune: evaluate them as-is instead of failing.
    if params is None and model in (Model.RandomBaseline, Model.SophisticatedBaseline):
        params = {}
    skip_optimization = params is not None

    table = PrettyTable()
    table.title = f'{model.name}'
    table.field_names = [
        '',
        'Test F1',
        'Test Accuracy',
        'Test AUC'
    ]
    # `train` runs in worker processes and receives the plain integer value of the enum.
    model_value = int(model.value)

    def suggest(trial):
        """Sample one hyperparameter set from the search space of the selected model."""
        if model == Model.DecisionTree:
            return {
                'max_depth': trial.suggest_int('max_depth', 1, 32),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            }
        if model == Model.RandomForest:
            return {
                "n_estimators": 500,
                "max_depth": trial.suggest_int("max_depth", 2, 10),
                "min_samples_split": trial.suggest_int("min_samples_split", 10, 200, step=10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 3, 15),
            }
        if model == Model.AdaBoost:
            return {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
            }
        if model == Model.XGBoost:
            return {
                'max_depth': trial.suggest_int('max_depth', 1, 16),
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 10, 300),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            }
        if model == Model.LogisticRegression:
            return {
                'C': trial.suggest_float('C', 1e-5, 1e5, log=True),
                'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
                'max_iter': trial.suggest_int('max_iter', 100, 1000),
            }
        if model == Model.KNearestNeighbors:
            return {
                'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
                'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
                'p': trial.suggest_int('p', 1, 2),
            }
        if model == Model.LinearDiscriminantAnalysis:
            return {
                'solver': trial.suggest_categorical('solver', ['lsqr', 'eigen']),
                'shrinkage': trial.suggest_float('shrinkage', 0.0, 1.0),
            }
        if model == Model.QuadraticDiscriminantAnalysis:
            return {
                'reg_param': trial.suggest_float('reg_param', 0.0, 1.0),
            }
        raise ValueError('Invalid model')

    def cross_validate(candidate):
        """Train one model per fold with ``candidate`` and return the per-fold metrics."""
        folds = sklearn.model_selection.KFold(n_splits=k_fold, shuffle=True, random_state=42)
        datasets.disable_progress_bars()
        try:
            scores = Parallel(n_jobs=n_jobs)(delayed(train)(
                model_value,
                data_directory,
                X_labels,
                y_label,
                [train_bucket[i] for i in train_indices],
                [train_bucket[i] for i in val_indices],
                test_bucket,
                candidate,
                mapping=mapping
            ) for train_indices, val_indices in folds.split(train_bucket))
        finally:
            # Restore the progress bars even if a fold fails.
            datasets.enable_progress_bars()
        return pd.DataFrame(scores)

    def objective(trial):
        df = cross_validate(suggest(trial))
        # Select hyperparameters on the validation folds only; scoring trials on the test
        # participants would leak the held-out set into the model selection.
        return np.mean([df['val_auc'].mean(), df['val_auc'].median()])

    if not skip_optimization:
        study = optuna.create_study(
            study_name=f'{model.name}_{data_directory}_{str(uuid.uuid4())}',
            storage="sqlite:///db.sqlite3",
            direction='maximize',
            sampler=optuna.samplers.RandomSampler(seed=42)
        )

        study.optimize(
            objective,
            n_trials=n_trials,
            show_progress_bar=True
        )

        # Get the best hyperparameters
        best_params = study.best_params
        best_score = study.best_value

        print("Best Score:", best_score)
        print("Best Parameters:", best_params)
    else:
        print("Skipping optimization, using given parameters...", params)
        best_params = params

    df = cross_validate(best_params)

    # Row title: the categories taking part in this task. Baseline is included unless
    # it is explicitly mapped to -1.
    titles = []
    if ("baseline" not in mapping) or (mapping["baseline"] != -1):
        titles.append("Baseline")
    if "mental_stress" in mapping and mapping["mental_stress"] >= 0:
        titles.append("Mental Stress")
    if "low_physical_activity" in mapping and mapping["low_physical_activity"] >= 0:
        titles.append("Low Physical Activity")
    if "moderate_physical_activity" in mapping and mapping["moderate_physical_activity"] >= 0:
        titles.append("Moderate Physical Activity")
    if "high_physical_activity" in mapping and mapping["high_physical_activity"] >= 0:
        titles.append("High Physical Activity")

    row = [
        f"{' & '.join(titles)}",
        f"{round(df['test_f1'].mean() * 100, 2)}% ± {round(df['test_f1'].std() * 100, 2)}%",
        f"{round(df['test_accuracy'].mean() * 100, 2)}% ± {round(df['test_accuracy'].std() * 100, 2)}%",
        f"{round(df['test_auc'].mean() * 100, 2)}% ± {round(df['test_auc'].std() * 100, 2)}%"
    ]
    table.add_row(row)
    # Show the summary; without this the table is built and then thrown away.
    print(table)

    return best_params

#### Features
The features listed below are the ones produced in the previous notebook.

In [None]:
# Time-Domain
_time_domain = [
    'hr_min', 'hr_max', 'hr_mean', 'hr_std',
    'hrv_mean', 'hrv_std', 'hrv_rms',
    'cvnn', 'cvsd',
    'nn20', 'pnn20', 'nn50', 'pnn50',
]

# Frequency-Domain: every statistic is available for every band, and the columns are
# named '<band>_Feature.<STATISTIC>'.
_frequency_bands = [
    'ulf', 'vlf', 'lf', 'hf', 'vhf', 'uhf', 'tp',
    'lp_ulf', 'lp_vlf', 'lp_lf', 'lp_hf', 'lp_vhf', 'lp_uhf',
    'lf_hf',
]
_frequency_statistics = ['MIN', 'MAX', 'MEAN', 'STD', 'POWER', 'COVARIANCE', 'ENERGY', 'ENTROPY']
_frequency_domain = [
    f'{band}_Feature.{statistic}'
    for statistic in _frequency_statistics
    for band in _frequency_bands
]

# Nonlinear
_nonlinear = ['apen', 'sampen', 'fuzzyen', 'sd1', 'sd2', 'sd1_sd2', 'w', 'wmax', 'wen', 'pss']

# Morphology
_morphology = ["twa"]

# Feature columns, in the order: time-domain, frequency-domain (grouped by statistic),
# nonlinear, morphology.
X_labels = _time_domain + _frequency_domain + _nonlinear + _morphology
y_label = "category"

#### Hyper-parameter Optimization
The step below optimizes the hyper-parameters of the model using the Optuna library.

**Uncomment the cell below to start the optimization process and find the best hyper-parameters for each model.**

In [None]:
# for model in [
#     Model.DecisionTree,
#     Model.RandomForest,
#     Model.AdaBoost,
#     Model.XGBoost,
#     Model.LogisticRegression,
#     Model.KNearestNeighbors, 
#     Model.LinearDiscriminantAnalysis,
#     Model.QuadraticDiscriminantAnalysis,
#     Model.RandomBaseline,
#     Model.SophisticatedBaseline,
# ]:
#     print(model)
#     for mapping in [
#         { "mental_stress": 1 },                                              # Baseline versus Mental Stress
#         { "high_physical_activity": 1 },                                     # Baseline versus High Physical Activity
#         { "baseline": -1, "mental_stress": 0, "high_physical_activity": 1 }, # Mental Stress versus High Physical Activity
#         { "mental_stress": 1, "high_physical_activity": 2 },                 # Baseline versus Mental Stress versus High Physical Activity
#     ]:
#         print(mapping)
#         optimize(
#             model, 
#             root_dir, 
#             X_labels,
#             y_label,
#             train_bucket, 
#             test_bucket,
#             mapping=mapping
#         )
    

**Since the best hyper-parameters have already been found, the optimization process is not run here; the stored values are passed in directly.**

In [None]:
# Hyper-parameters used for each model; passing them to `optimize` skips the search.
tuned_models = [
    (Model.DecisionTree, {'max_depth': 12, 'min_samples_split': 20, 'min_samples_leaf': 15}),
    (Model.RandomForest, {'max_depth': 8, 'min_samples_split': 50, 'max_features': 5}),
    (Model.AdaBoost, {'n_estimators': 100}),
    (Model.XGBoost, {'max_depth': 7, 'learning_rate': 0.007476312062252299, 'n_estimators': 188, 'min_child_weight': 2, 'subsample': 0.6460723242676091, 'colsample_bytree': 0.6831809216468459}),
    (Model.LogisticRegression, {'C': 0.05564180225431373, 'solver': 'newton-cg', 'max_iter': 152}),
    (Model.KNearestNeighbors, {'n_neighbors': 9}),
    (Model.LinearDiscriminantAnalysis, {'solver': 'lsqr', 'shrinkage': 0.15599452033620265}),
    (Model.QuadraticDiscriminantAnalysis, {'reg_param': 0.16}),
    (Model.RandomBaseline, {}),
    (Model.SophisticatedBaseline, {}),
]

# Classification tasks, expressed as category -> class mappings (-1 discards a category).
task_mappings = [
    { "mental_stress": 1 },                                              # Baseline versus Mental Stress
    { "high_physical_activity": 1 },                                     # Baseline versus High Physical Activity
    { "baseline": -1, "mental_stress": 0, "high_physical_activity": 1 }, # Mental Stress versus High Physical Activity
    { "mental_stress": 1, "high_physical_activity": 2 },                 # Baseline versus Mental Stress versus High Physical Activity
]

# Evaluate every model on every task.
for model, params in tuned_models:
    print(model)
    for mapping in task_mappings:
        print(mapping)
        optimize(
            model,
            root_dir,
            X_labels,
            y_label,
            train_bucket,
            test_bucket,
            mapping=mapping,
            params=params
        )
    

### Neural Network
The steps defined below train and optimize the deep learning models.

In [None]:
from src import prepare
from src.models import RnnModule
from src.datamodules import MultiParticipantDataModule
from src.datasets import WindowedDataset

**For this experiment, K-fold cross-validation and hyper-parameter optimization are not used; instead, the participants are split once into training, validation, and test sets.**

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding one signal CSV per participant; the file stem is the participant id.
root_dir = './data/signal/'
# glob() returns files in a filesystem-dependent order. Sorting makes the participant list,
# and therefore every seeded split derived from it, identical across machines and runs.
participants = sorted(Path(p).stem for p in glob(f'{root_dir}/*.csv'))

In [None]:
# Hold out 20% of the participants for testing ...
train_bucket, test_bucket = train_test_split(participants, test_size=0.2, random_state=42)
# ... then a quarter of the remainder (0.25 * 0.8 = 20% of all participants) for validation.
train_bucket, validation_bucket = train_test_split(train_bucket, test_size=0.25, random_state=42)

In [None]:
# Model is a combination between a module that tracks metrics, and a model defined in src/models/*
model = prepare(RnnModule)

# Datamodule is a combination between a dataset that is capable of loading multiple participants, 
# and a dataset defined in src/datasets/*.
datamodule = MultiParticipantDataModule(
    f'{root_dir}', 
    train_bucket, 
    validation_bucket, 
    test_bucket, 
    batch_size=64,
    dataset=WindowedDataset,
    standardize=True
)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

# Stop training once the validation loss has not improved for 15 epochs.
early_stopping = EarlyStopping(monitor="val_loss", patience=15)

# Keep the checkpoint with the highest validation accuracy, plus the most recent one.
best_checkpoint = ModelCheckpoint(
    save_top_k=1,
    monitor="val_BinaryAccuracy",
    mode="max",
    save_last=True,
)

callbacks = [early_stopping, best_checkpoint]

In [None]:
import lightning as L

# Metrics are logged to the "stress-in-action" project.
wandb_logger = L.pytorch.loggers.WandbLogger(project="stress-in-action")

# Checkpoints are grouped by the class name of the module being trained.
checkpoint_root = f"./checkpoints/{type(model).__name__}"

trainer = L.Trainer(
    max_epochs=100,
    callbacks=callbacks,
    accelerator="auto",
    devices="auto",
    strategy="auto",
    profiler="simple",
    default_root_dir=checkpoint_root,
    logger=wandb_logger,
)

In [None]:
# NOTE(review): the tuner is created but none of its methods are called in this cell.
tuner = L.pytorch.tuner.Tuner(
    trainer
)

# Train on the training participants, validating on the validation participants.
trainer.fit(
    model=model,
    datamodule=datamodule
)

# Evaluate the best checkpoint on the test participants. The model and datamodule are
# passed explicitly so the call does not depend on state left behind by `fit`.
trainer.test(
    model=model,
    datamodule=datamodule,
    ckpt_path="best"
)