---
title: Data Pipeline
subtitle: Base layer pipeline
description: Pipeline for a specific data type from a specific source
---

Roughly speaking, every data source corresponds to an instrument in the mission.

Generally, a data pipeline includes the following steps:

- Downloading data
- Loading data
- Preprocessing data
- Processing data
- Extracting features


In [None]:
#| hide
#| export
import polars as pl

from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

from typing import Any, Dict

In [None]:
#| default_exp pipelines/default/data

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

## Loading data

In [None]:
def download_data(
    start: str = None,
    end: str = None,
    ts = None,  # time resolution
    probe: str = None,
    coord: str = None,
):
    """Downloading data
    """
    ...


def load_data(
    start: str = None,
    end: str = None,
    ts = None,  # time resolution
    probe: str = None,
    coord: str = None,
):
    """Load data into a proper data structure, like dataframe.

    - Downloading data
    - Converting data structure
    """
    ...


### Preprocessing data

In [None]:
def preprocess_data(
    raw_data: Any | pl.DataFrame = None,
    start: str | None = None,
    end: str | None = None,
    ts=None,  # time resolution
    coord: str | None = None,
) -> pl.DataFrame:
    """
    Preprocess the raw dataset (only minor transformations).

    Transformations commonly performed at this stage:

    - Applying naming conventions for columns
    - Parsing and typing data (like from string to datetime for time columns)
    - Structuring the data (like pivoting, unpivoting, etc.)
    - Changing storing format (like from `csv` to `parquet`)
    - Dropping null columns
    - Dropping duplicate time
    - Resampling data to a given time resolution (better to do in the next stage)
    - ... other 'transformations' commonly performed at this stage.

    This base implementation is a placeholder: it does no work and
    returns ``None``.
    """
    pass


### Processing data

Some common processing steps are:

- Partition data by year, see `ids_finder.utils.basic.partition_data_by_year`

Note: we process the data year by year to minimize memory usage and to limit the impact of processing failures (otherwise, an occasional failure would force all the data to be processed again).

In [None]:
#| exports
def process_data(
    raw_data: Any | pl.DataFrame,
    ts: str | None = None,  # time resolution
    coord: str | None = None,
) -> pl.DataFrame | Dict[str, pl.DataFrame]:
    """
    Process the data for the primary data layer, where source data models
    are transformed into domain data models.

    Intended steps for this stage:

    - Transforming coordinate system if needed
    - Smoothing data
    - Resampling data to a given time resolution
    - Partitioning data, for the sake of memory

    This base implementation is a placeholder: it does no work and
    returns ``None``.
    """
    pass

def extract_features(
    data: Any = None,  # processed (primary layer) data
    tau: Any = None,  # value of the `params:tau` pipeline parameter
    extract_params: Any = None,  # value of the `params:extract_params` pipeline parameter
):
    """Extract features from the processed data (template step).

    The pipelines in this module wire three inputs into this function, in
    this order: the primary dataset, ``params:tau`` and
    ``params:extract_params``. The signature therefore accepts three
    optional arguments so that those inputs can be bound to it.

    This base implementation is a placeholder: it does no work and
    returns ``None``.
    """
    pass

### Pipeline

In [None]:
from ids_finder.utils.basic import load_params

In [None]:
# | exports
def create_pipeline(
    sat_id: str,  # satellite id, used for namespace
    source: str,  # source data, like "mag" or "plasma"
    **kwargs,
) -> Pipeline:
    """Build the namespaced pipeline: load -> preprocess -> process -> extract features.

    The project parameters are read with ``load_params``; ``tau`` and the
    time resolution found under ``params[SAT_ID][source]['time_resolution']``
    are used to build the dataset names. The resulting pipeline is
    namespaced as ``"<SAT_ID>.<source>"`` (``sat_id`` is upper-cased).

    ``**kwargs`` is accepted but currently unused.
    """
    # Local import: only this function needs these helpers.
    from functools import partial, update_wrapper

    params = load_params(project_path='../../')
    sat_id = sat_id.upper()
    namespace = f"{sat_id}.{source}"

    tau = params['tau']
    ts = params[sat_id][source]['time_resolution']
    tau_str = f"tau_{tau}s"
    ts_str = f"ts_{ts}s"

    # `ts` is a concrete value read from the parameters, not the name of a
    # dataset or parameter, so it must not be placed in the node's `inputs`
    # mapping. Bind it to the function instead; `update_wrapper` keeps the
    # original function name/docstring on the partial object.
    load_data_with_ts = update_wrapper(partial(load_data, ts=ts), load_data)

    node_load_data = node(
        load_data_with_ts,
        inputs=dict(
            start="params:start_date",
            end="params:end_date",
        ),
        outputs="raw_data",
        name="load_data",
    )

    node_preprocess_data = node(
        preprocess_data,
        inputs=dict(
            raw_data="raw_data",
            start="params:start_date",
            end="params:end_date",
        ),
        outputs=f"inter_mag_{ts_str}",
        name="preprocess_data",
    )

    node_process_data = node(
        process_data,
        inputs=f"inter_mag_{ts_str}",
        outputs=f"primary_mag_{ts_str}",
        name="process_data",
    )

    # NOTE(review): `tau_str` already starts with "tau_", so this output is
    # named "feature_tau_tau_<tau>s" — confirm against the data catalog.
    node_extract_features = node(
        extract_features,
        inputs=[f"primary_mag_{ts_str}", "params:tau", "params:extract_params"],
        outputs=f"feature_tau_{tau_str}",
        name=f"extract_{sat_id}_features",
    )

    nodes = [
        node_load_data,
        node_preprocess_data,
        node_process_data,
        node_extract_features,
    ]

    # NOTE(review): the date parameters are mapped to the `jno_*` entries
    # regardless of `sat_id` — confirm this is intended for other satellites.
    pipelines = pipeline(
        nodes,
        namespace=namespace,
        parameters={
            "params:start_date": "params:jno_start_date",
            "params:end_date": "params:jno_end_date",
            "params:tau": tau_str,
        },
    )

    return pipelines

In [None]:
class DatasetConfig:
    """Bundle a satellite id with the callables used for its pipeline steps."""

    def __init__(self, sat_id, download_func, preprocess_func, process_func):
        # All four arguments are stored verbatim; nothing is validated or
        # normalised here.
        (
            self.sat_id,
            self.download_func,
            self.preprocess_func,
            self.process_func,
        ) = (sat_id, download_func, preprocess_func, process_func)

class PipelineGenerator:
    """Build a download -> preprocess -> process -> extract pipeline from a `DatasetConfig`."""

    def __init__(self, config: DatasetConfig, ts='1s', tau='60s'):
        # `ts` and `tau` are interpolated as-is into dataset names below.
        self.config = config
        self.ts = ts
        self.tau = tau

    def _node(self, func, inputs, outputs, name):
        """Thin wrapper around `node` that forwards everything by keyword."""
        return node(func, inputs=inputs, outputs=outputs, name=name)

    def generate_pipeline(self):
        """Assemble the four nodes and return them as a pipeline namespaced by `sat_id`."""
        # Use one upper-cased id for every node name so that all four nodes
        # follow the same naming convention.
        sat_name = self.config.sat_id.upper()

        raw_name = f"raw_data_{self.ts}"
        inter_name = f"inter_data_{self.ts}"
        primary_name = f"primary_data_rtn_{self.ts}"

        node_download = self._node(
            self.config.download_func,
            inputs=dict(start="params:start_date", end="params:end_date"),
            outputs=raw_name,
            name=f"download_{sat_name}_data",
        )

        node_preprocess = self._node(
            self.config.preprocess_func,
            inputs=dict(
                raw_data=raw_name,
                start="params:start_date",
                end="params:end_date",
            ),
            outputs=inter_name,
            name=f"preprocess_{sat_name}_data",
        )

        node_process = self._node(
            self.config.process_func,
            inputs=inter_name,
            outputs=primary_name,
            name=f"process_{sat_name}_data",
        )

        node_extract = self._node(
            extract_features,
            inputs=[primary_name, "params:tau", "params:extract_params"],
            outputs=f"feature_tau_{self.tau}",
            name=f"extract_{sat_name}_features",
        )

        # NOTE(review): the date parameters are mapped to the `jno_*` entries
        # regardless of `sat_id` — confirm this is intended for other satellites.
        return pipeline(
            [node_download, node_preprocess, node_process, node_extract],
            namespace=self.config.sat_id,
            parameters={
                "params:start_date": "params:jno_start_date",
                "params:end_date": "params:jno_end_date",
                "params:tau": self.tau,
            },
        )
