# Pipelines

> The major task you will launch with `PiSCES` is a _Pipeline_, which combines the tasks of finding data sets and loading them into memory, configuring train-test splits, training models, and organizing and reporting results. To prevent test distribution statistics from leaking into the training data for each model, we strictly enforce that train-test splitting is done at the **study participant** level.  
>
> Said differently, under no circumstances will a PiSCES pipeline produce a set of training samples that contains data covering 6 hours of an 8-hour recording for Participant ABC123 while the remaining 2 hours are used for test evaluation. Giving a model access to recordings from a participant during training skews results in the model's favor on testing, since it is more familiar with ABC123's specific signatures when asleep, awake, and transitioning.

In [1]:
#| default_exp loader

In [2]:
#| hide 
%load_ext autoreload
%autoreload 2

In [3]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export

from dataclasses import dataclass
from typing import Optional


@dataclass
class ValidationConfiguration:
    """Pair a validation method with the parameter it is applied with.

    The class defines its own ``__init__``, so ``@dataclass`` keeps that
    constructor and only contributes the generated ``__repr__``/``__eq__``.
    """

    # Value forwarded unchanged to the method's ``make_splits``.
    parameter: float | int | str | None
    # Entry of ``ValidationMethod`` resolved by name in ``__init__``.
    method: ValidationMethod

    def __init__(self, parameter: float | int | str | None, method: str):
        """Store ``parameter`` and look ``method`` up in ``ValidationMethod``.

        Args:
            parameter: Value later passed to the method's ``make_splits``.
            method: Name used to index ``ValidationMethod``.

        Raises:
            KeyError: If ``ValidationMethod`` has no entry named ``method``.
        """
        self.parameter = parameter
        try:
            self.method = ValidationMethod[method]
        except KeyError as err:
            # Same exception type as the bare lookup, but with a message
            # that says which configuration value was rejected.
            raise KeyError(f"Unknown validation method: {method!r}") from err

    def make_splits(self, records: list[DataRecord]):
        """Return ``self.method.make_splits(self.parameter, records)``."""
        return self.method.make_splits(self.parameter, records)


@dataclass
class DatabaseConfiguration:
    """Settings naming the data-set CSV, the database, and a row selector.

    NOTE(review): the ``":memory:"`` default looks like the SQLite in-memory
    database name; confirm against the code that opens the connection.
    """

    data_set_csv: str
    database: str = ":memory:"
    # SQL text intended for a WHERE clause; defaults to the empty string.
    # Example: selector = "data_set_name = 'sleep_accel'" selects only the
    #   sleep_accel records from the database, i.e. the output of
    #   """SELECT * FROM data_sets WHERE data_set_name = 'sleep_accel'"""
    selector: str = ""


@dataclass
class PipelineConfiguration:
    """Pipeline settings built from plain dicts, lists, and strings.

    The constructor converts its arguments into the typed attributes below.
    The class defines its own ``__init__``, so ``@dataclass`` keeps that
    constructor and only contributes the generated ``__repr__``/``__eq__``.
    """

    data_config: DatabaseConfiguration
    validation: ValidationConfiguration
    models: list[KnownModel]
    features: list[KnownFeatures]
    split_models_saveto: str

    def __init__(
        self,
        data_config: dict[str, str],
        validation: dict[str, str],
        models: list[str],
        features: list[str],
        split_models_saveto: str = "",
    ):
        """Build the typed configuration from raw values.

        Args:
            data_config: Keyword arguments for ``DatabaseConfiguration``.
            validation: Keyword arguments for ``ValidationConfiguration``.
            models: Names used to index ``KnownModel``, in order.
            features: Names used to index ``KnownFeatures``, in order.
            split_models_saveto: Stored as given; defaults to ``""``.
        """
        # The dict keys must match the target constructors' parameter names,
        # because they are expanded as keyword arguments.
        self.data_config = DatabaseConfiguration(**data_config)
        self.validation = ValidationConfiguration(**validation)
        self.models = [KnownModel[name] for name in models]
        self.features = [KnownFeatures[name] for name in features]
        self.split_models_saveto = split_models_saveto


In [None]:
#| hide
# Run nbdev's export step so `#| export` cells are written to the library.
import nbdev

nbdev.nbdev_export()