---
title: Finding magnetic discontinuities
order: 0
---

Finding magnetic discontinuities can be divided into two parts:

1. Finding the discontinuities, see [this notebook](./01_ids_detection.ipynb)
    - Corresponding to limited feature extraction / anomaly detection
2. Calculating the properties of the discontinuities, see [this notebook](./02_ids_properties.ipynb)
    - One can use higher time resolution data

In [None]:
#| default_exp core/pipeline

In [1]:
#| export
#| code-summary: "Import all the packages needed for the project"
from fastcore.utils import *
from fastcore.test import *
from ids_finder.utils.basic import *
import polars as pl
import xarray as xr
from ids_finder.core.detection import *
from ids_finder.core.propeties import *

try:
    import modin.pandas as pd
    import modin.pandas as mpd
    from modin.config import ProgressBar
    ProgressBar.enable()
except ImportError:
    import pandas as pd
import pandas
    
import numpy as np
from xarray_einstats import linalg

from datetime import timedelta

from loguru import logger

import pdpipe as pdp

from typing import Any, Callable

## Processing Stages

- [ ] Smoothing
- [ ] Interpolating

## Processing the whole dataset

In [None]:
# | export
def filter_indices(
    df: pl.DataFrame | pl.LazyFrame,
    index_std_threshold=2,
    index_fluc_threshold=1,
    index_diff_threshold=0.1,
    sparse_num=15,
) -> pl.DataFrame | pl.LazyFrame:
    """Keep only the rows whose indices mark them as possible IDs.

    A row is kept when `index_std`, `index_fluctuation` and `index_diff` each
    exceed their threshold, `index_std` is finite, and the `count`,
    `count_prev` and `count_next` columns are all greater than `sparse_num`.
    """
    index_std = pl.col("index_std")

    threshold_checks = [
        index_std > index_std_threshold,
        pl.col("index_fluctuation") > index_fluc_threshold,
        pl.col("index_diff") > index_diff_threshold,
        # `index_std` is not finite where neighboring groups have std=0
        index_std.is_finite(),
    ]
    # sparse intervals may give unreasonable results, so require enough
    # samples in the current, previous and next interval
    density_checks = [
        pl.col(count_col) > sparse_num
        for count_col in ("count", "count_prev", "count_next")
    ]

    # multiple predicates passed to `filter` are combined with AND
    return df.filter(*threshold_checks, *density_checks)

In [None]:
#| export
from pdpipe.util import out_of_place_col_insert

Patch `pdp.ApplyToRows` so that it works with `modin` and `xorbits` DataFrames.

In [None]:
#| export
@patch
def _transform(self: pdp.ApplyToRows, X, verbose):
    """Apply `self._func` to every row of `X` and attach the result as new column(s).

    - If the row-wise apply yields a Series, it is inserted as a single column
      named `self._colname`, right after `self._follow_column` when that is
      set and at the end otherwise.
    - If it yields a DataFrame, its columns are sorted by name and inserted
      one after another following `self._follow_column`, or assigned onto `X`
      when no follow column is set.

    `verbose` is accepted for interface compatibility and is not used here.

    Raises `TypeError` if the apply result is neither a Series nor a DataFrame.
    """
    new_cols = X.apply(self._func, axis=1)
    if isinstance(new_cols, (pd.Series, pandas.Series)):
        loc = len(X.columns)
        if self._follow_column:
            loc = X.columns.get_loc(self._follow_column) + 1
        return out_of_place_col_insert(
            X=X, series=new_cols, loc=loc, column_name=self._colname
        )
    # `pd` is `modin.pandas` when modin is importable and plain pandas
    # otherwise; `mpd` only exists in the former case, so checking against
    # `pd.DataFrame` avoids a NameError on the pandas fallback.
    if isinstance(new_cols, (pd.DataFrame, pandas.DataFrame)):
        new_cols = new_cols[sorted(new_cols.columns)]
        if self._follow_column:
            inter_X = X
            loc = X.columns.get_loc(self._follow_column) + 1
            for colname in new_cols.columns:
                inter_X = out_of_place_col_insert(
                    X=inter_X,
                    series=new_cols[colname],
                    loc=loc,
                    column_name=colname,
                )
                loc += 1
            return inter_X
        assign_map = {
            colname: new_cols[colname] for colname in new_cols.columns
        }
        return X.assign(**assign_map)
    raise TypeError(  # pragma: no cover
        "Unexpected type generated by applying a function to a DataFrame."
        " Only Series and DataFrame are allowed."
    )

In [None]:
#| export
def calc_candidate_mva_features(candidate, data: xr.DataArray):
    """Run `mva_features` on the candidate's `d_tstart`..`d_tstop` slice of `data`.

    The values returned by `mva_features` are labelled with the fixed feature
    names below and returned as a pandas Series.
    """
    interval = slice(candidate["d_tstart"], candidate["d_tstop"])
    values = mva_features(data.sel(time=interval).to_numpy())

    feature_names = [
        "Vl_x", "Vl_y", "Vl_z",
        "eig0", "eig1", "eig2",
        'Q_mva', 'b_mag', 'b_n', 'db_mag',
        'bn_over_b', 'db_over_b', 'db_over_b_max',
        'db_l', 'db_m', 'db_n',
    ]
    return pandas.Series(values, feature_names)

In [None]:
#| export
def convert_to_dataframe(
    data: pl.DataFrame | pl.LazyFrame # original DataFrame
)->pd.DataFrame:
    "convert data into a pandas/modin DataFrame"
    # materialize lazy input first so both polars types share one conversion
    if isinstance(data, pl.LazyFrame):
        data = data.collect()
    if isinstance(data, pl.DataFrame):
        data = data.to_pandas(use_pyarrow_extension_array=True)
    # `pd` may be `modin.pandas`; wrap anything that is not yet a `pd.DataFrame`
    return data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

`IDsPipeline` class for processing IDs

In [None]:
# | export
class IDsPipeline:
    """Factory for the `pdpipe` stages used to process ID candidates."""

    def __init__(self):
        """Nothing to initialise: each method builds a new stage from its arguments."""

    def calc_duration(self, sat_fgm: xr.DataArray):
        """Row-wise stage that calls `calc_candidate_duration(candidate, sat_fgm)`."""

        def per_candidate(row):
            return calc_candidate_duration(row, sat_fgm)

        return pdp.ApplyToRows(
            per_candidate, func_desc="calculating duration parameters"
        )

    def calibrate_duration(self, sat_fgm, data_resolution):
        """Row-wise stage that calls `calibrate_candidate_duration` with the data and its resolution."""

        def per_candidate(row):
            return calibrate_candidate_duration(row, sat_fgm, data_resolution)

        return pdp.ApplyToRows(
            per_candidate, func_desc="calibrating duration parameters if needed"
        )

    def calc_mva_features(self, sat_fgm):
        """Row-wise stage that calls `calc_candidate_mva_features(candidate, sat_fgm)`."""

        def per_candidate(row):
            return calc_candidate_mva_features(row, sat_fgm)

        return pdp.ApplyToRows(
            per_candidate,
            func_desc='calculating index "q_mva", "BnOverB" and "dBOverB"',
        )

    def calc_rotation_angle(self, sat_fgm):
        """Frame-wise stage that stores `calc_candidate_rotation_angle(df, sat_fgm)` in `rotation_angle`."""

        def per_frame(frame):
            return calc_candidate_rotation_angle(frame, sat_fgm)

        return pdp.ColByFrameFunc(
            "rotation_angle", per_frame, func_desc="calculating rotation angle"
        )

Note that the candidates only require a small portion of the data, so we can compress the data to speed up the processing.

In [None]:
# | export
def compress_data_by_cands(
    data: pl.DataFrame, candidates: pl.DataFrame, tau: timedelta
):
    """Compress the data for parallel processing.

    Keeps only the rows of `data` whose position falls between the sorted
    insertion points of `tstart - tau` and `tstop + tau` (inclusive) for at
    least one candidate. `data["time"]` is searched with `search_sorted`, so
    it is expected to be sorted.

    Returns an empty frame with the same columns when there are no candidates
    or no data rows.
    """
    n_rows = data.height
    if candidates.height == 0 or n_rows == 0:
        # `np.concatenate` raises on an empty list, and there is nothing to keep
        return data.head(0)

    ttstarts = candidates["tstart"] - tau
    ttstops = candidates["tstop"] + tau

    ttstarts_index = data["time"].search_sorted(ttstarts)
    ttstops_index = data["time"].search_sorted(ttstops)

    indices = np.concatenate(
        [
            # `search_sorted` can return `n_rows` for times past the last
            # sample, so cap the inclusive upper bound at the last valid row
            np.arange(ttstart_index, min(ttstop_index + 1, n_rows))
            for ttstart_index, ttstop_index in zip(ttstarts_index, ttstops_index)
        ]
    )  # faster than `pl.arange`
    indices_unique = (
        pl.Series(indices).unique().sort()
    )  # faster than `np.unique(index)`
    return data[indices_unique]


In [None]:
# | export
def sort_df(df: pl.DataFrame, col="time"):
    """Return `df` ordered by `col`.

    When the column is already in order, only the sorted flag is set instead
    of sorting again.
    """
    already_ordered = df.get_column(col).is_sorted()
    return df.set_sorted(col) if already_ordered else df.sort(col)


def process_candidates(
    candidates_pl: pl.DataFrame,  # potential candidates DataFrame
    sat_fgm: xr.DataArray,  # satellite FGM data
    data_resolution: timedelta,  # time resolution of the data
) -> pl.DataFrame:
    """Process candidates DataFrame

    Steps: convert the candidates to a pandas/modin frame, compute the
    duration columns, re-calibrate the rows whose `d_tstart`/`d_tstop` are
    null, drop rows that still contain NaN, compute MVA features and the
    rotation angle, and return the result as a polars frame sorted by `d_time`.

    Raises `TypeError` if the pipeline output cannot be converted to pandas.
    """

    candidates = convert_to_dataframe(candidates_pl)

    id_pipelines = IDsPipeline()
    candidates = id_pipelines.calc_duration(sat_fgm).apply(candidates)

    # calibrate duration
    temp_candidates = candidates.loc[
        lambda df: df["d_tstart"].isnull() | df["d_tstop"].isnull()
    ]  # temp_candidates = candidates.query('d_tstart.isnull() | d_tstop.isnull()') # not implemented in `modin`

    if not temp_candidates.empty:
        temp_candidates_updated = id_pipelines.calibrate_duration(
            sat_fgm, data_resolution
        ).apply(temp_candidates)
        candidates.update(temp_candidates_updated)

    ids = (
        id_pipelines.calc_mva_features(sat_fgm)
        + id_pipelines.calc_rotation_angle(sat_fgm)
    ).apply(
        candidates.dropna()  # Remove candidates with NaN values
    )

    # `mpd` only exists when modin could be imported, so detect a non-pandas
    # (e.g. modin) frame by its conversion hook instead of by type.
    if not isinstance(ids, pandas.DataFrame):
        to_pandas = getattr(ids, "_to_pandas", None)
        if to_pandas is None:
            raise TypeError(
                f"Cannot convert pipeline output of type {type(ids).__name__} to pandas."
            )
        ids = to_pandas()

    ids_pl = pl.DataFrame(ids)

    return ids_pl.pipe(sort_df, col="d_time")

## Pipeline

In [None]:
# | export
def ids_finder(data: pl.LazyFrame, tau: float, params: dict):
    """Detect ID candidates in `data` and compute their properties.

    `tau` and `params["time_resolution"]` are both given in seconds.
    `params["bcols"]` names the magnetic field columns and defaults to
    `["B_x", "B_y", "B_z"]`.
    """
    tau = timedelta(seconds=tau)
    ts = timedelta(seconds=params["time_resolution"])
    bcols = params.get("bcols", ["B_x", "B_y", "B_z"])
    data = data.sort("time").collect()

    # get candidates
    indices = compute_indices(data, tau, bcols)
    # minimum sample count per interval: a third of the samples in one `tau`
    sparse_num = tau / ts // 3
    filtered = filter_indices(indices, sparse_num=sparse_num)
    candidates = pl_format_time(filtered, tau)

    # only the data around the candidates is needed for the property calculation
    data_c = compress_data_by_cands(data, candidates, tau)
    sat_fgm = df2ts(data_c, bcols)
    return process_candidates(candidates, sat_fgm, ts)


def extract_features(
    partitioned_input: dict[str, Callable], tau: float, params
) -> pl.DataFrame:
    """Run `ids_finder` on every partition and merge the results.

    `partitioned_input` maps to zero-argument loaders; each is called and its
    result passed to `ids_finder` together with `tau` and `params` (the keys
    are ignored). Rows duplicated across partitions are removed by their
    (`d_time`, `d_tstart`, `d_tstop`) combination.
    """
    # builtin `dict[...]` is used in the annotation because `typing.Dict` is
    # not among the names this module imports explicitly
    ids = pl.concat(
        [
            ids_finder(partition_load(), tau, params)
            for partition_load in partitioned_input.values()
        ]
    )
    return ids.unique(["d_time", "d_tstart", "d_tstop"])

## Test

Generally, `mapply` and `modin` are the fastest. `xorbits` was expected to be the fastest, but in practice it is the slowest.

In [None]:
#| notest
# Settings for the manual test run.
sat = 'jno'
coord = 'se'
cols = ["BX", "BY", "BZ"]
tau = timedelta(seconds=60)
data_resolution = timedelta(seconds=1)

if True:
    year = 2012
    files = f'../data/{sat}_data_{year}.parquet'
    output = f'../data/{sat}_candidates_{year}_tau_{tau.seconds}.parquet'

    data = pl.scan_parquet(files).set_sorted('time').collect()

    # pass the field columns explicitly, as `ids_finder` does
    indices = compute_indices(data, tau, cols)
    # minimum sample count per interval: a third of the samples in one `tau`
    sparse_num = tau / data_resolution // 3

    # `filter_indices` takes the frame as its first argument, so it is applied
    # with `pipe`, and `pl_format_time` is called the same way as in `ids_finder`
    candidates = (
        indices.pipe(filter_indices, sparse_num=sparse_num)
        .pipe(pl_format_time, tau)
        .sort('time')
    )

    data_c = compress_data_by_cands(data, candidates, tau)
    sat_fgm = df2ts(data_c, cols, attrs={"units": "nT"})

### Test parallelization

In [None]:
#| notest
# pandas copy of the candidates
candidates_pd = candidates.to_pandas()
# modin copy of the same frame; `mpd` is only bound when the `modin` import succeeded
candidates_modin = mpd.DataFrame(candidates_pd)
# candidates_x = xpd.DataFrame(candidates_pd)

In [None]:
#| code-summary: Test different libraries to parallelize the computation
#| notest
if True:
    # Row-wise duration stage used by the `pdp_test(candidates_modin)` timing noted below.
    pdp_test = pdp.ApplyToRows(
        lambda candidate: calc_candidate_duration(candidate, sat_fgm),  # a little bit faster
        # lambda candidate: calc_duration(get_candidate_data_xr(candidate, sat_fgm)),
        # lambda candidate: calc_duration(sat_fgm.sel(time=slice(candidate['tstart'], candidate['tstop']))),
        func_desc="calculating duration parameters",
    )
    
    # process_candidates(candidates_modin, sat_fgm, sat_state, data_resolution)
    
    # ---
    # successful cases
    # ---
    # candidates_pd.mapply(lambda candidate: calc_candidate_duration(candidate, sat_fgm), axis=1) # this works, 4.2 secs
    # candidates_pd.mapply(calc_candidate_duration, axis=1, data=sat_fgm) # this works, but a little bit slower, 6.7 secs
    
    # candidates_pd.apply(calc_candidate_duration, axis=1, data=sat_fgm) # Standard case: 24+ secs
    # candidates_pd.swifter.apply(calc_candidate_duration, axis=1, data=sat_fgm) # this works with dask, 80 secs
    # candidates_pd.swifter.set_dask_scheduler(scheduler="threads").apply(calc_candidate_duration, axis=1, data=sat_fgm) # this works with dask, 60 secs
    # candidates_modin.apply(lambda candidate: calc_candidate_duration(candidate, sat_fgm), axis=1) # this works with ray, 6 secs # NOTE: does not work with dask
    # candidates_x.apply(calc_candidate_duration, axis=1, data=sat_fgm) # 30 seconds
    # pdp_test(candidates_modin) # this works, 8 secs
    
    # ---
    # failed cases
    # ---
    # candidates_modin.apply(calc_candidate_duration, axis=1, data=sat_fgm) # AttributeError: 'DataFrame' object has no attribute 'sel'

### Test feature engineering

In [None]:
# from tsflex.features import MultipleFeatureDescriptors, FeatureCollection

# from tsflex.features.integrations import catch22_wrapper
# from pycatch22 import catch22_all

In [None]:
# tau_pd = pd.Timedelta(tau)

# catch22_feats = MultipleFeatureDescriptors(
#     functions=catch22_wrapper(catch22_all),
#     series_names=bcols,  # list of signal names
#     windows = tau_pd, strides=tau_pd/2,
# )

# fc = FeatureCollection(catch22_feats)
# features = fc.calculate(data, return_df=True)  # calculate the features on your data

In [None]:
# features_pl = pl.DataFrame(features.reset_index()).sort('time')
# df = candidates_pl.join_asof(features_pl, on='time').to_pandas()

In [None]:
# profile = ProfileReport(df, title="JUNO Candidates Report")
# profile.to_file("jno.html")

### Benchmark

In [None]:
import timeit

In [None]:
def benchmark(task_dict, number=1):
    """Time each task and report the average seconds per call.

    `task_dict` maps a label to a `(data, task)` pair; `task(data)` is timed
    with `timeit` over `number` runs. If timing a task raises, the exception
    message is stored for that label instead of a duration.
    """
    timings = {}
    for label in task_dict:
        payload, fn = task_dict[label]
        try:
            elapsed = timeit.timeit(lambda: fn(payload), number=number)
            timings[label] = elapsed / number
        except Exception as err:
            timings[label] = str(err)
    return timings

In [None]:
#| notest
# Row function shared by every backend so that the timings are comparable.
func = lambda candidate: calc_candidate_duration(candidate, sat_fgm)
# label -> (frame, callable that runs the row-wise apply on that frame)
task_dict = {
    'pandas': (candidates_pd, lambda _: _.apply(func, axis=1)),
    'pandas-mapply': (candidates_pd, lambda _: _.mapply(func, axis=1)),
    'modin': (candidates_modin, lambda _: _.apply(func, axis=1)),
    # 'xorbits': (candidates_x, lambda _: _.apply(func, axis=1)),
}

# average seconds per run for each label, or the error message if a task failed
results = benchmark(task_dict)

## Notes

### TODOs

1. Feature engineering
2. Feature selection

## Obsolete code

This code is obsolete because the time windows now overlap, so there is no need to handle magnetic discontinuities that fall on the boundary of a time window.

In [None]:
def calc_candidate_d_duration(candidate, data) -> pd.Series:
    """Return `d_tstart`/`d_tstop` for a candidate, recomputing them if either is null.

    When both values are present they are passed through unchanged. Otherwise
    the candidate's data (fetched with `neighbor=1`) is handed to
    `calc_d_duration` together with the candidate's `d_time` and `threshold`.
    Any exception is printed with the candidate and then re-raised.
    """
    try:
        needs_recalc = pd.isnull(candidate['d_tstart']) or pd.isnull(candidate['d_tstop'])
        if not needs_recalc:
            return pandas.Series({
                'd_tstart': candidate['d_tstart'],
                'd_tstop': candidate['d_tstop'],
            })

        window = get_candidate_data(candidate, data, neighbor=1)
        return calc_d_duration(window, candidate['d_time'], candidate['threshold'])
    except Exception as e:
        # logger.debug(f"Error for candidate {candidate} at {candidate['time']}: {str(e)}")
        print(f"Error for candidate {candidate} at {candidate['time']}: {str(e)}")
        raise e

# Builds the row-wise stage for `calc_candidate_d_duration`; the result is not
# assigned, so in a notebook it is only displayed as the cell output.
pdp.ApplyToRows(
    lambda candidate: calc_candidate_d_duration(candidate, sat_fgm),
    func_desc="calculating duration parameters if needed"
)

Obsolete code for xarray-related calculations.

In [None]:
def calc_vec_mean_mag(vec: xr.DataArray):
    """Return the mean over `time` of the norm of `vec` along `v_dim`."""
    magnitude = linalg.norm(vec, dims="v_dim")
    return magnitude.mean(dim="time")


def calc_vec_std(vec: xr.DataArray):
    """
    Return the norm along `v_dim` of the standard deviation of `vec` over `time`.
    """
    std_over_time = vec.std(dim="time")
    return linalg.norm(std_over_time, dims="v_dim")


def calc_vec_relative_diff(vec: xr.DataArray):
    """
    Return the norm of (last - first) along `time`, divided by the
    time-averaged norm of `vec`; norms are taken along `v_dim`.
    """
    last = vec.isel(time=-1)
    first = vec.isel(time=0)
    change_mag = linalg.norm(last - first, dims="v_dim")
    mean_mag = linalg.norm(vec, dims="v_dim").mean(dim="time")
    return change_mag / mean_mag

#### `process_candidates`
Assigning coordinates using `DataFrame.apply` is not optimized and is quite slow...

In [None]:
# NOTE(review): obsolete variant. Running this cell rebinds the name
# `process_candidates`, replacing the three-argument version defined earlier
# in this notebook — confirm that is intended before executing it.
def process_candidates(
    candidates: pd.DataFrame, # potential candidates DataFrame
    sat_fgm: xr.DataArray, # satellite FGM data
    sat_state: xr.DataArray, # satellite state data
    data_resolution: timedelta, # time resolution of the data
) -> pd.DataFrame: # processed candidates DataFrame
    """Compute durations, MVA features, rotation angle and coordinates for the candidates.

    Unlike the exported version, this one takes and returns a pandas/modin
    frame and additionally takes the satellite state data.
    """
    id_pipelines = IDsPipeline()

    candidates = id_pipelines.calc_duration(sat_fgm).apply(candidates)

    # calibrate duration
    temp_candidates = candidates.loc[
        lambda df: df["d_tstart"].isnull() | df["d_tstop"].isnull()
    ]  # temp_candidates = candidates.query('d_tstart.isnull() | d_tstop.isnull()') # not implemented in `modin`

    if not temp_candidates.empty:
        # write the re-calibrated rows back into `candidates` in place
        candidates.update(
            id_pipelines.calibrate_duration(sat_fgm, data_resolution).apply(
                temp_candidates
            )
        )

    # NOTE(review): `assign_coordinates` is not among the methods of the
    # `IDsPipeline` class shown in this notebook; unless it is defined
    # elsewhere, this expression raises AttributeError.
    ids = (
        id_pipelines.calc_mva_features(sat_fgm)
        + id_pipelines.calc_rotation_angle(sat_fgm)
        + id_pipelines.assign_coordinates(sat_state)
    ).apply(
        candidates.dropna()  # Remove candidates with NaN values
    )

    return ids