---
title: Data Pipeline
subtitle: Base layer pipeline
description: Pipeline for a specific data type from a specific source
---

Roughly speaking, every data source corresponds to an instrument in the mission.

Generally, a data pipeline includes the following steps:

- Downloading data
- Loading data
- Preprocessing data
- Processing data
- Extracting features


In [None]:
# | hide
# | export
import polars as pl

from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline
from ids_finder.utils.basic import load_params
from typing import Callable, Optional, Any, Dict

In [None]:
# | default_exp pipelines/default/data

In [None]:
# | hide
%load_ext autoreload
%autoreload 2

## Loading data

In [None]:
def download_data(
    start: Optional[str] = None,  # start of the requested time range
    end: Optional[str] = None,  # end of the requested time range
    datatype=None,  # data type to download
    ts=None,  # time resolution
    probe: Optional[str] = None,  # probe identifier (assumed; TODO confirm)
    coord: Optional[str] = None,  # coordinate system (assumed; TODO confirm)
):
    """Download the data for the requested time range.

    This definition only fixes the expected signature of the downloading
    step: it has no implementation and returns ``None``.
    """
    ...


def load_data(
    start: Optional[str] = None,  # start of the requested time range
    end: Optional[str] = None,  # end of the requested time range
    datatype=None,  # data type to load
    ts=None,  # time resolution
    probe: Optional[str] = None,  # probe identifier (assumed; TODO confirm)
    coord: Optional[str] = None,  # coordinate system (assumed; TODO confirm)
    vars: Optional[dict] = None,  # variables to load (assumed; TODO confirm the expected schema)
):
    """Load data into a proper data structure, such as a dataframe.

    The loading step is expected to cover:

    - Downloading data
    - Converting data structure
    - Parsing original data (dealing with delimiters, missing values, etc.)

    This definition only fixes the expected signature of the loading step:
    it has no implementation and returns ``None``.
    """
    ...

## Preprocessing data

In [None]:
def preprocess_data(
    raw_data: Any | pl.DataFrame = None,  # output of the loading step
    start: Optional[str] = None,  # start of the requested time range
    end: Optional[str] = None,  # end of the requested time range
    ts=None,  # time resolution
    coord: Optional[str] = None,  # coordinate system (assumed; TODO confirm)
) -> pl.DataFrame:
    """
    Preprocess the raw dataset (only minor transformations)

    - Applying naming conventions for columns
    - Parsing and typing data (like from string to datetime for time columns)
    - Structuring the data (like pivoting, unpivoting, etc.)
    - Changing storing format (like from `csv` to `parquet`)
    - Dropping null columns
    - Dropping duplicate timestamps
    - Resampling data to a given time resolution (better to do in the next stage)
    - ... other 'transformations' commonly performed at this stage.

    This definition only fixes the expected signature of the preprocessing
    step: it has no implementation and currently returns ``None`` rather
    than the annotated ``pl.DataFrame``.
    """
    pass

## Processing data

Some common processing steps are:

- Partition data by year, see `ids_finder.utils.basic.partition_data_by_year`

Note: we process the data year by year to minimize memory usage and to limit the impact of a processing failure (so that we do not need to process all the data again if only some of it fails).

In [None]:
def process_data(
    raw_data: Any | pl.DataFrame,  # output of the preprocessing step
    ts: Optional[str] = None,  # time resolution
    coord: Optional[str] = None,  # coordinate system (assumed; TODO confirm)
) -> pl.DataFrame | Dict[str, pl.DataFrame]:
    """
    Corresponding to primary data layer, where source data models are transformed into domain data models

    - Transforming coordinate system if needed
    - Discarding unnecessary columns
    - Smoothing data
    - Resampling data to a given time resolution
    - Partitioning data, for the sake of memory

    This definition only fixes the expected signature of the processing
    step: it has no implementation and currently returns ``None`` rather
    than the annotated dataframe (or dictionary of dataframes).
    """
    pass


def extract_features():
    """Placeholder for the feature-extraction step; it has no implementation."""
    return None

## Pipeline

In [None]:
# | export
# Default inputs of the loading node: keyword argument of the loading
# function -> name of the parameter entry that supplies its value.
DEFAULT_LOAD_INPUTS = {
    "start": "params:start_date",
    "end": "params:end_date",
    "datatype": "params:datatype",
}

In [None]:
# | exports
def create_pipeline_template(
    sat_id: str,  # satellite id, used for namespace
    source: str,  # source data, like "mag" or "plasma", used for namespace
    load_data_fn: Callable,
    preprocess_data_fn: Callable,
    process_data_fn: Callable,
    load_inputs: dict = DEFAULT_LOAD_INPUTS,
    params: Optional[dict] = None,
    **kwargs,
) -> Pipeline:
    """Build the load -> preprocess -> process pipeline for one data source.

    Three nodes are chained through the datasets ``raw_data``,
    ``inter_data_{datatype}`` and ``primary_data_ts_{time_resolution}s``,
    and the result is wrapped in the namespace ``"{sat_id}.{source}"``.

    - `load_data_fn`, `preprocess_data_fn`, `process_data_fn`: callables
      wrapped by the three nodes, in that order.
    - `load_inputs`: inputs of the loading node.
    - `params`: project parameters; obtained from `load_params()` when
      ``None``. ``params[sat_id][source]`` must provide the keys
      ``"time_resolution"`` and ``"datatype"`` (a missing key raises
      ``KeyError``).
    - `**kwargs`: accepted but not used by this function.
    """
    params = load_params() if params is None else params

    # Only this (satellite, source) entry of the parameters is needed here,
    # and only to derive dataset names.
    source_params = params[sat_id][source]
    resolution = source_params["time_resolution"]
    data_kind = source_params["datatype"]

    intermediate_dataset = f"inter_data_{data_kind}"
    primary_dataset = f"primary_data_ts_{resolution}s"

    stages = [
        node(
            func=load_data_fn,
            inputs=load_inputs,
            outputs="raw_data",
            name="load_data",
        ),
        node(
            func=preprocess_data_fn,
            inputs="raw_data",
            outputs=intermediate_dataset,
            name="preprocess_data",
        ),
        node(
            func=process_data_fn,
            inputs={
                "raw_data": intermediate_dataset,
                "ts": "params:time_resolution",
            },
            outputs=primary_dataset,
            name="process_data",
        ),
    ]

    # NOTE(review): the start/end dates are always remapped to the `jno_*`
    # parameters, whatever `sat_id` is -- confirm this is intended.
    date_parameters = {
        "params:start_date": "params:jno_start_date",
        "params:end_date": "params:jno_end_date",
    }

    return pipeline(
        stages,
        namespace=f"{sat_id}.{source}",
        parameters=date_parameters,
    )