---
title: Project Pipeline
description: The project pipeline produces a dataset for analysis tailored to a specific purpose (project).
---

Generally, it includes the following steps:

- Combining data/events from different missions into a single dataset for analysis.

In [None]:
#| default_exp pipelines/project/pipeline


In [1]:
#| output: False
from ids_finder.utils.basic import load_catalog

# Build the data catalog used by the cells below; '../../' is presumably the
# project root relative to this notebook — TODO confirm.
catalog = load_catalog('../../')

In [3]:
#| export
import polars as pl
from beforerr.basics import pmap


In [4]:
# Load the "events.JNO_ts_1s_tau_60s" dataset from the catalog; as the last
# expression of the cell, its value is shown as the cell output.
catalog.load('events.JNO_ts_1s_tau_60s')

In [None]:
#| export
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

In [11]:
# | export

from datetime import timedelta
from loguru import logger
import polars.selectors as cs


def process_events_l1(events: pl.LazyFrame):
    """Clean the events data by removing extreme values.

    Rows are kept only when ``d_star < 100``, ``v_mn > 10`` and ``duration``
    is shorter than 60 seconds. Every float column is cast to ``Float64`` and
    the base-10 logarithms of ``j0_norm`` and ``L_mn_norm`` are added as
    ``j0_norm_log`` and ``L_mn_norm_log``.

    Args:
        events: Lazy frame of events. It is collected eagerly here so that the
            row counts before and after filtering can be logged.

    Returns:
        The filtered and augmented events, converted back to a lazy frame.
    """
    events = events.collect()

    df = events.filter(
        pl.col("d_star") < 100,  # exclude extreme values
        pl.col("v_mn") > 10,
        pl.col("duration") < timedelta(seconds=60),
    ).with_columns(
        cs.float().cast(pl.Float64),
        j0_norm_log=pl.col("j0_norm").log10(),
        L_mn_norm_log=pl.col("L_mn_norm").log10(),
    )

    n_total = len(events)
    n_kept = len(df)
    # An empty input would otherwise raise ZeroDivisionError while logging.
    ratio = n_kept / n_total if n_total else 0.0
    logger.info(f"candidates_l1: {n_kept}, with effective ratio: {ratio:.2%}")

    return df.lazy()

In [None]:
#| export
def create_l1_node(sat="JNO", ts=1, tau=60):
    """Create the node that runs `process_events_l1` for one dataset.

    The node reads ``events.{sat}_ts_{ts}s_tau_{tau}s`` and writes
    ``events.l1.{sat}_ts_{ts}s_tau_{tau}s``.
    """
    dataset_suffix = f"{sat}_ts_{ts}s_tau_{tau}s"
    source_name = "events." + dataset_suffix
    target_name = "events.l1." + dataset_suffix
    return node(process_events_l1, inputs=source_name, outputs=target_name)

In [None]:
# | export
def combine_events(**datasets):
    """Concatenate several event frames into one, tagging each row's origin.

    Each keyword name is stored as a literal in a ``sat`` column of its frame
    before the frames are concatenated diagonally. Null ``radial_distance``
    values are then filled with 1, and ``r_bin`` holds the rounded (filled)
    radial distance.
    """
    tagged_frames = [
        frame.with_columns(sat=pl.lit(name)) for name, frame in datasets.items()
    ]
    # by default, fill with 1 AU
    filled_distance = pl.col("radial_distance").fill_null(1)
    return pl.concat(tagged_frames, how="diagonal").with_columns(
        filled_distance,
        # Derived from the filled expression so that r_bin never sees the nulls.
        r_bin=filled_distance.round(),
    )


def create_pipeline():
    """Assemble the project pipeline.

    It contains one level-1 cleaning node per source dataset, followed by a
    node that merges the cleaned datasets into
    ``events.l1.ALL_sw_ts_1s_tau_60s``.
    """
    combine_layer = "events.l1"
    suffix = "ts_1s_tau_60s"
    # Keyword passed to `combine_events` -> dataset prefix of that source.
    sources = {"JNO": "JNO", "STA": "STA", "THB": "THB_sw"}

    l1_nodes = [create_l1_node(prefix) for prefix in sources.values()]
    node_combine_events = node(
        combine_events,
        inputs={
            label: f"{combine_layer}.{prefix}_{suffix}"
            for label, prefix in sources.items()
        },
        outputs=f"{combine_layer}.ALL_sw_{suffix}",
        # namespace="events.l1",
    )

    return pipeline([*l1_nodes, node_combine_events])

In [None]:
def combine_candidates(dict):
    """Placeholder that is not implemented yet: ignores its argument and returns None."""

# node_thm_extract_features = node(
#     extract_features,
#     inputs=["primary_thm_rtn_1s", "params:tau", "params:thm_1s_params"],
#     outputs="candidates_thm_rtn_1s",
#     name="extract_ARTEMIS_features",
# )

# node_combine_candidates = node(
#     combine_candidates,
#     inputs=dict(
#         sta_candidates="candidates_sta_rtn_1s",
#         jno_candidates="candidates_jno_ss_se_1s",
#         thm_candidates="candidates_thm_rtn_1s",
#     ),
#     outputs="candidates_all_1s",
#     name="combine_candidates",
# )