---
title: THEMIS Magnetic field data pipeline
---

- For convenience, we use magnetic field data in the **GSE** coordinate system
- The `fgs` data have a time resolution of 3-4 s

In [None]:
# | export
from datetime import timedelta

import polars as pl

from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

In [None]:
# | hide
# | default_exp pipelines/themis/mag
%load_ext autoreload
%autoreload 2

## Loading data

In [None]:
# | export
import speasy as spz
from speasy import SpeasyVariable

In [None]:
# | export
def check_dataype(ts):
    """
    Pick the FGM data type name matching a time resolution.

    `ts` is the time resolution in seconds. The result is

    - `"fgs"` when `ts` is strictly greater than 3 seconds
    - `"fgl"` when `ts` is strictly greater than 0.1 seconds (and at most 3)
    - `"fgh"` otherwise
    """
    resolution = timedelta(seconds=ts)

    # Thresholds are checked from the coarsest to the finest resolution.
    if resolution > timedelta(seconds=3):
        return "fgs"
    if resolution > timedelta(seconds=0.1):
        return "fgl"
    return "fgh"

In [None]:
# | export
def download_data(
    trange, probe: str = "b", datatype="fgs", coord="gse"
) -> SpeasyVariable:
    """
    Download THEMIS FGM level-2 magnetic field data through `speasy`.

    - `trange`: time range, forwarded unchanged to `spz.get_data`
    - `probe`: THEMIS probe letter, one of `"a"`, `"b"`, `"c"`, `"d"`, `"e"`
    - `datatype`: FGM data type used to build the product name (e.g. `"fgs"`)
    - `coord`: coordinate system used to build the product name (e.g. `"gse"`)

    Returns whatever `spz.get_data` returns for the requested product.

    Raises `ValueError` when `probe` is not a known THEMIS probe letter.
    """
    themis_probes = ("a", "b", "c", "d", "e")
    if probe not in themis_probes:
        # Previously any probe other than "b" left `sat` unbound and failed
        # with an unrelated UnboundLocalError further down.
        raise ValueError(
            f"Unknown THEMIS probe {probe!r}, expected one of {themis_probes}"
        )

    # NOTE(review): assumes every probe follows the same CDAWeb naming scheme
    # as THB (dataset `TH?_L2_FGM`, variable `th?_<datatype>_<coord>`).
    sat = f"th{probe}"
    product = f"cda/{sat.upper()}_L2_FGM/{sat}_{datatype}_{coord}"

    return spz.get_data(product, trange, disable_proxy=True)


def spz2df(raw_data: SpeasyVariable):
    """
    Convert a speasy variable into a polars DataFrame.

    The variable is first turned into a dataframe whose index is moved into
    a regular column; that `index` column is then renamed to `time`.
    """
    indexed_frame = raw_data.to_dataframe().reset_index()
    polars_frame = pl.from_dataframe(indexed_frame)
    return polars_frame.rename({"index": "time"})


def load_data(
    start,
    end,
    datatype=None,
    ts=None,  # time resolution
    probe: str = "b",
    coord="gse",
):
    """
    Download THEMIS magnetic field data and return it as a polars LazyFrame.

    - `start`, `end`: bounds of the time range to download
    - `datatype`: FGM data type (e.g. `"fgs"`); when omitted it is chosen
      from `ts` with `check_dataype`
    - `ts`: time resolution in seconds, only used to choose `datatype`
      when `datatype` is not given
    - `probe`: THEMIS probe letter
    - `coord`: coordinate system of the magnetic field data

    Raises `ValueError` when neither `datatype` nor `ts` is provided.
    """
    if datatype is None:
        if ts is None:
            # Without this guard the literal string "None" would end up in
            # the requested product name.
            raise ValueError("Either `datatype` or `ts` must be provided")
        datatype = check_dataype(ts)

    trange = [start, end]

    data = download_data(trange, probe, datatype, coord)
    return spz2df(data).lazy()

## Preprocessing data

In [None]:
# | export
from ids_finder.utils.basic import resample

In [None]:
# | export
def preprocess_data(
    raw_data: pl.LazyFrame,
    datatype: str = None,
    coord: str = "gse",
) -> pl.LazyFrame:
    """
    Preprocess the raw dataset (only minor transformations)

    - Dropping duplicate time
    - Sorting by time
    - Applying naming conventions for columns
      (`Bx <DATATYPE>-D` -> `B_x`, and likewise for `B_y` and `B_z`)

    `datatype` is required (e.g. `"fgs"`); it is upper-cased to build the
    raw column names. `coord` is currently unused.

    This function does not write anything to disk; persisting the result
    (e.g. as `parquet`) is left to the caller.

    Raises `ValueError` when `datatype` is not provided.
    """

    if datatype is None:
        # Fail with an explicit message instead of an AttributeError on
        # `None.upper()`.
        raise ValueError("`datatype` must be provided, e.g. 'fgs'")

    datatype = datatype.upper()
    name_mapping = {
        f"Bx {datatype}-D": "B_x",
        f"By {datatype}-D": "B_y",
        f"Bz {datatype}-D": "B_z",
    }

    # `unique` does not preserve row order by default, so sorting has to
    # happen after de-duplication for the output to be ordered by time.
    return raw_data.unique(subset="time").sort("time").rename(name_mapping)

## Processing data

In [None]:
# | export
from ids_finder.utils.basic import partition_data_by_year

In [None]:
# | export
def process_data(
    raw_data: pl.LazyFrame,
    ts: int = None,  # time resolution
) -> pl.DataFrame | dict[str, pl.DataFrame]:
    """
    Resample the data, then partition it (for the sake of memory)

    - `resample` is called with `every` set to `ts` seconds and `period`
      set to twice that duration
    - the resampled data are then passed to `partition_data_by_year`
    """

    step = timedelta(seconds=ts)
    window = 2 * step

    resampled = resample(raw_data, every=step, period=window)
    return partition_data_by_year(resampled)

## Pipeline

In [1]:
# | export
from ids_finder.core.pipeline import extract_features
from ids_finder.pipelines.default.data_mag import create_pipeline_template


def create_pipeline(sat_id="THB", source="MAG"):
    """
    Build the magnetic field pipeline from the shared pipeline template,
    plugging in this module's load / preprocess / process functions and the
    shared `extract_features` function.
    """
    stage_functions = {
        "load_data_fn": load_data,
        "preprocess_data_fn": preprocess_data,
        "process_data_fn": process_data,
        "extract_features_fn": extract_features,
    }
    return create_pipeline_template(
        sat_id=sat_id, source=source, **stage_functions
    )