---
title: Data Pipeline
subtitle: Base layer pipeline
description: A data pipeline processes data of a specific type from a specific source; roughly speaking, every data source corresponds to an instrument in the mission.
---

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import show_doc

In [None]:
#| hide

import polars as pl
from typing import Any, Dict, List, Tuple, Union

In [None]:
# Kedro
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

## Magnetic field data pipeline

The product of this pipeline is a data set of interesting magnetic field events.

### Loading data

In [None]:
def download_mag_data(
    start: str = None,  # start date of the data to download
    end: str = None,  # end date of the data to download
    ts: str = None,  # time resolution
    probe: str = None,  # probe identifier (presumably which spacecraft/probe; confirm)
    coord: str = None,  # coordinate system (presumably of the field components; confirm)
):
    """Download the raw magnetic field data.

    Placeholder of the base-layer template: nothing is implemented here,
    the arguments are ignored and `None` is returned.
    """
    return None


def load_mag_data(
    start: str = None,  # start date of the data to load
    end: str = None,  # end date of the data to load
    ts: str = None,  # time resolution
    probe: str = None,  # probe identifier (presumably which spacecraft/probe; confirm)
    coord: str = None,  # coordinate system (presumably of the field components; confirm)
):
    """Load the data into a proper data structure, such as a dataframe.

    Intended steps:

    - Download the data
    - Convert the data structure

    Placeholder of the base-layer template: nothing is implemented here,
    the arguments are ignored and `None` is returned.
    """
    return None


### Preprocessing data

In [None]:
def preprocess_mag_data(
    raw_data: Any | pl.DataFrame = None,  # raw data as produced by the loading step
    start: str = None,  # start date of the data to preprocess
    end: str = None,  # end date of the data to preprocess
    ts: str = None,  # time resolution
    coord: str = None,  # coordinate system (presumably of the field components; confirm)
) -> pl.DataFrame:
    """Preprocess the raw dataset, applying only minor transformations.

    Transformations commonly performed at this stage:

    - Apply naming conventions to the columns
    - Parse and type the data (e.g. string to datetime for time columns)
    - Structure the data (pivoting, unpivoting, etc.)
    - Change the storage format (e.g. from `csv` to `parquet`)
    - Drop null columns
    - Drop duplicate times
    - Resample the data to a given time resolution (better done in the next stage)
    - ... any other light-weight transformation

    Placeholder of the base-layer template: nothing is implemented here and
    `None` is returned.
    """
    return None


### Processing data

Some common processing steps are:

- Partition data by year, see `ids_finder.utils.basic.partition_data_by_year`

Note: we process the data year by year to minimize memory usage and to limit the impact of a processing failure (so there is no need to process all the data again if only part of it fails).

In [None]:
def process_mag_data(
    raw_data: Any | pl.DataFrame,  # data to process (the pipelines feed in the preprocessing output)
    ts: str = None,  # time resolution
    coord: str = None,  # coordinate system (presumably the target system; confirm)
) -> pl.DataFrame | Dict[str, pl.DataFrame]:
    """Turn source data models into domain data models (primary data layer).

    Intended steps:

    - Transform the coordinate system if needed
    - Smooth the data
    - Resample the data to a given time resolution
    - Partition the data, for the sake of memory

    Placeholder of the base-layer template: nothing is implemented here and
    `None` is returned.
    """
    return None

def extract_features(
    raw_data: Any = None,  # processed (primary layer) data the features are extracted from
    tau: Any = None,  # time window
    params: Any = None,  # extra extraction parameters (presumably a mapping; confirm)
):
    """Extract features from the processed data.

    The pipelines in this file wire this function into a node with three
    positional inputs (the primary dataset, `params:tau` and
    `params:extract_params`), so the signature has to accept them: Kedro
    binds the declared inputs against the function signature when the node is
    created and raises a `TypeError` on a mismatch.

    All parameters are optional so that existing zero-argument calls keep
    working.

    Placeholder of the base-layer template: nothing is implemented here, the
    arguments are ignored and `None` is returned.
    """
    return None

### Pipeline

In [None]:
def create_pipeline(
    sat_id: str,  # satellite id, used for namespace
    ts: int = 1,  # time resolution
    tau: str = "60s",  # time window
    **kwargs,  # accepted but not used by this function
) -> Pipeline:
    """Build the magnetic field pipeline for one satellite.

    Four nodes are chained through their dataset names:
    load -> preprocess -> process -> extract features.
    The result is wrapped in a modular pipeline namespaced by `sat_id`.

    NOTE(review): the start/end date parameters are always mapped to
    `params:jno_start_date` / `params:jno_end_date`, whatever `sat_id` is.
    NOTE(review): `params:tau` is mapped to the *value* of `tau`; confirm
    that this is how the `parameters` mapping of `pipeline` is meant to be
    used.
    """
    sat_label = sat_id.upper()
    resolution_tag = f"ts_{ts}s"

    # Dataset names linking one stage to the next.
    raw_ds = "raw_mag"
    inter_ds = f"inter_mag_{resolution_tag}"
    primary_ds = f"primary_mag_{resolution_tag}"
    feature_ds = f"feature_tau_{tau}"

    # Date-range inputs shared by the loading and preprocessing nodes.
    date_inputs = {"start": "params:start_date", "end": "params:end_date"}

    stages = [
        node(
            load_mag_data,
            inputs=dict(date_inputs),
            outputs=raw_ds,
            name=f"load_{sat_label}_magnetic_field_data",
        ),
        node(
            preprocess_mag_data,
            inputs={"raw_data": raw_ds, **date_inputs},
            outputs=inter_ds,
            name=f"preprocess_{sat_label}_magnetic_field_data",
        ),
        node(
            process_mag_data,
            inputs=inter_ds,
            outputs=primary_ds,
            name=f"process_{sat_label}_magnetic_field_data",
        ),
        # The feature node is fed three positional inputs; unlike the other
        # nodes, its name uses `sat_id` as given (not upper-cased).
        node(
            extract_features,
            inputs=[primary_ds, "params:tau", "params:extract_params"],
            outputs=feature_ds,
            name=f"extract_{sat_id}_features",
        ),
    ]

    parameter_mapping = {
        "params:start_date": "params:jno_start_date",
        "params:end_date": "params:jno_end_date",
        "params:tau": tau,
    }

    return pipeline(stages, namespace=sat_id, parameters=parameter_mapping)

In [None]:
class DatasetConfig:
    """Bundle a satellite id with the functions used to build its pipeline."""

    def __init__(
        self,
        sat_id: str,  # satellite id
        download_func,  # function stored as the download step
        preprocess_func,  # function stored as the preprocessing step
        process_func,  # function stored as the processing step
    ):
        # The arguments are stored as given; nothing is validated here.
        self.process_func = process_func
        self.preprocess_func = preprocess_func
        self.download_func = download_func
        self.sat_id = sat_id

class PipelineGenerator:
    """Build a namespaced Kedro pipeline from a `DatasetConfig`.

    The pipeline chains four nodes: download -> preprocess -> process ->
    extract features.
    """

    def __init__(
        self,
        config: DatasetConfig,  # satellite id and the stage functions
        ts='1s',  # time resolution, used as suffix of the dataset names
        tau='60s',  # time window, used in the feature dataset name
    ):
        self.config = config
        self.ts = ts
        self.tau = tau

    def _node(self, func, inputs, outputs, name):
        """Create a Kedro node; thin wrapper around `node`."""
        return node(func, inputs=inputs, outputs=outputs, name=name)

    def generate_pipeline(self):
        """Assemble the four nodes and wrap them in a modular pipeline.

        NOTE(review): the start/end date parameters are always mapped to
        `params:jno_start_date` / `params:jno_end_date`, whatever the
        configured satellite id is, and `params:tau` is mapped to the *value*
        of `self.tau`; confirm both are intended.
        """
        cfg = self.config
        sat_id = cfg.sat_id
        sat_label = sat_id.upper()

        # Dataset names linking one stage to the next.
        raw_ds = f"raw_data_{self.ts}"
        inter_ds = f"inter_data_{self.ts}"
        primary_ds = f"primary_data_rtn_{self.ts}"
        feature_ds = f"feature_tau_{self.tau}"

        # Date-range inputs shared by the download and preprocessing nodes.
        date_inputs = {"start": "params:start_date", "end": "params:end_date"}

        stages = [
            self._node(
                cfg.download_func,
                inputs=dict(date_inputs),
                outputs=raw_ds,
                name=f"download_{sat_label}_data",
            ),
            self._node(
                cfg.preprocess_func,
                inputs={"raw_data": raw_ds, **date_inputs},
                outputs=inter_ds,
                name=f"preprocess_{sat_label}_data",
            ),
            self._node(
                cfg.process_func,
                inputs=inter_ds,
                outputs=primary_ds,
                name=f"process_{sat_label}_data",
            ),
            # The feature node is fed three positional inputs; unlike the
            # other nodes, its name uses the satellite id as given.
            self._node(
                extract_features,
                inputs=[primary_ds, "params:tau", "params:extract_params"],
                outputs=feature_ds,
                name=f"extract_{sat_id}_features",
            ),
        ]

        parameter_mapping = {
            "params:start_date": "params:jno_start_date",
            "params:end_date": "params:jno_end_date",
            "params:tau": self.tau,
        }

        return pipeline(stages, namespace=sat_id, parameters=parameter_mapping)
