---
title: ID properties
subtitle: Full feature extraction
---

In [None]:
# | default_exp core/propeties
# | export
# | code-summary: "Import all the packages needed for the project"
import polars as pl
import xarray as xr
import numpy as np

try:
    import modin.pandas as pd
    from modin.config import ProgressBar

    ProgressBar.enable()
except ImportError:
    import pandas as pd

from loguru import logger

from discontinuitypy.propeties.duration import calc_duration
from discontinuitypy.propeties.mva import calc_mva_features_all
from typing import Literal

In [None]:
# | export
def get_data_at_times(data: xr.DataArray, times) -> np.ndarray:
    """
    Return the values of `data` at the samples closest to `times`.

    The lookup goes through ``data.sel`` on the ``time`` coordinate with
    ``method="nearest"``, and the selection is converted to a NumPy array.
    """
    nearest = data.sel(time=times, method="nearest")
    return nearest.to_numpy()


def select_data_by_timerange(data: xr.DataArray, tstart, tstop, neighbor: int = 0):
    """
    Slice `data` along ``time`` between `tstart` and `tstop`.

    The window is widened on each side by ``neighbor`` times the interval
    length (``tstop - tstart``); with the default ``neighbor=0`` the slice
    bounds are exactly `tstart` and `tstop`.
    """
    padding = neighbor * (tstop - tstart)
    window = slice(tstart - padding, tstop + padding)
    return data.sel(time=window)


def get_candidate_data(candidate: dict, data: xr.DataArray, **kwargs):
    """
    Extract the part of `data` covered by one candidate.

    The candidate's ``"tstart"`` and ``"tstop"`` entries give the time range;
    extra keyword arguments (e.g. ``neighbor``) are forwarded to
    `select_data_by_timerange`.
    """
    tstart = candidate["tstart"]
    tstop = candidate["tstop"]
    return select_data_by_timerange(data, tstart, tstop, **kwargs)


def get_candidates(
    candidates: pd.DataFrame, candidate_type=None, num: "int | None" = 4
):
    """
    Pick candidates, optionally filtered by type and randomly down-sampled.

    Parameters
    ----------
    candidates : pd.DataFrame
        Candidate table. A ``"type"`` column is required only when
        `candidate_type` is given.
    candidate_type : optional
        When not ``None``, keep only rows whose ``"type"`` equals this value.
    num : int or None, default 4
        Maximum number of rows to return. If the (filtered) table has more
        rows than `num`, `num` rows are drawn with ``DataFrame.sample``.
        ``None`` disables sampling and returns every matching row.

    Returns
    -------
    pd.DataFrame
        The filtered table, or a random sample of it.
    """
    if candidate_type is not None:
        _candidates = candidates[candidates["type"] == candidate_type]
    else:
        _candidates = candidates

    # `None` means "no limit"; comparing it with an int would raise TypeError.
    if num is None or num >= len(_candidates):
        return _candidates

    logger.info(
        f"Sampling {num} {candidate_type} candidates out of {len(_candidates)}"
    )
    return _candidates.sample(num)

## Duration

In [None]:
# | exporti
def ld2dl(listdict: list[dict], func=np.array):
    """
    Convert a list of dictionaries to a dictionary of lists.

    Parameters
    ----------
    listdict : list of dict
        Records to transpose. The keys of the first record define the keys
        of the result, so every record must contain at least those keys.
    func : callable, default ``np.array``
        Applied to each collected list of values before it is stored.

    Returns
    -------
    dict
        ``{key: func([record[key] for record in listdict])}``; an empty dict
        when `listdict` is empty.
    """
    # With no records there is no first entry to take the keys from.
    if not listdict:
        return {}
    return {key: func([result[key] for result in listdict]) for key in listdict[0]}

In [None]:
# | export
def calc_candidate_duration(candidate, data, **kwargs):
    """
    Run `calc_duration` on the data belonging to a single candidate.

    The data window comes from `get_candidate_data(candidate, data)`; keyword
    arguments go to `calc_duration` only, not to the window selection.
    """
    return calc_duration(get_candidate_data(candidate, data), **kwargs)

In [None]:
# | export
def calc_events_duration(df: pl.DataFrame, data, tr_cols=["tstart", "tstop"], **kwargs):
    """
    Run `calc_duration` for every event and attach the results as columns.

    The first two columns named in `tr_cols` give each event's time range.
    Each per-event result is expected to be a mapping; its keys become new
    columns of `df`. Rows containing nulls are dropped from the returned frame.
    Extra keyword arguments are forwarded to `calc_duration`.
    """
    # TODO: Add support for parallel processing
    results = []
    for row in df.select(tr_cols).iter_rows():
        window = select_data_by_timerange(data, row[0], row[1])
        results.append(calc_duration(window, **kwargs))

    new_columns = ld2dl(results)
    return df.with_columns(**new_columns).drop_nulls()


def calc_events_mva_features(
    df: pl.DataFrame,
    data: xr.DataArray,
    method: Literal["fit", "derivative"],
    tr_cols=["t.d_start", "t.d_end"],
    **kwargs,
):
    """
    Run `calc_mva_features_all` for every event and attach the results as columns.

    The first two columns named in `tr_cols` give each event's time range.
    `method` and any extra keyword arguments are forwarded to
    `calc_mva_features_all`. Each per-event result is expected to be a
    mapping; its keys become new columns of `df`.
    """
    results = []
    for row in df.select(tr_cols).iter_rows():
        window = select_data_by_timerange(data, row[0], row[1])
        results.append(calc_mva_features_all(window, method=method, **kwargs))

    new_columns = ld2dl(results)
    return df.with_columns(**new_columns)

## Normal direction

In [None]:
# | export
def calc_normal_direction(v1, v2, normalize=True) -> np.ndarray:
    """
    Computes the normal direction of two vectors.

    The normal is the cross product ``v1 x v2``.

    Parameters
    ----------
    v1 : array_like
        The first vector(s).
    v2 : array_like
        The second vector(s).
    normalize : bool, default True
        If True, scale each result to unit length (norm taken over the last
        axis). If False, return the raw cross product.

    Returns
    -------
    np.ndarray
        The normal vector(s). With ``normalize=True``, a zero cross product
        (e.g. parallel inputs) produces NaNs because of the division by a
        zero norm.
    """
    c = np.cross(v1, v2)
    if not normalize:
        return c
    # keepdims keeps the norm broadcastable against a stack of vectors.
    return c / np.linalg.norm(c, axis=-1, keepdims=True)

In [None]:
# | export
def calc_events_normal_direction(events: pl.DataFrame, data: xr.DataArray, name="k"):
    """
    Add a column with the normal direction of each event.

    For every event, `data` is sampled at the times nearest to ``t.d_start``
    and ``t.d_end``; the two samples are passed to `calc_normal_direction`
    and the result is stored in a new column called `name`.
    """
    edge_times = (events["t.d_start"].to_numpy(), events["t.d_end"].to_numpy())
    before, after = (get_data_at_times(data, t) for t in edge_times)

    normals = calc_normal_direction(before, after)
    # NOTE(review): the array is handed to pl.Series as-is. An earlier comment
    # here said a list conversion was needed because only 1D arrays were
    # supported -- confirm the installed polars accepts this input.
    return events.with_columns(pl.Series(name, normals))

In [None]:
# | export
def calc_events_vec_change(events: pl.DataFrame, data: xr.DataArray, name="dB"):
    """
    Add a column with the change of `data` across each event.

    For every event, `data` is sampled at the times nearest to ``t.d_start``
    and ``t.d_end``; the difference (end minus start) is stored in a new
    column called `name`.
    """
    edge_times = (events["t.d_start"].to_numpy(), events["t.d_end"].to_numpy())
    before, after = (get_data_at_times(data, t) for t in edge_times)

    change = after - before
    return events.with_columns(pl.Series(name, change))

## Pipelines

In [None]:
# | export
def process_events(
    events: pl.DataFrame,  # potential candidates DataFrame
    data: xr.DataArray,  # passed to every feature-extraction stage
    method: Literal["fit", "derivative"] = "fit",  # selects the duration and MVA method
    **kwargs,  # accepted but not used by this function
):
    "Run the full feature pipeline (duration, MVA features, `dB`, `k`) on the candidates DataFrame"

    use_fit = method == "fit"
    duration_method = "distance" if use_fit else "derivative"

    if use_fit:
        # Duration taken as twice the fitted sigma column.
        duration_expr = pl.col("fit.vars.sigma") * 2
    else:
        # Duration as the start-to-end interval, converted from ns to seconds.
        interval = pl.col("t.d_end") - pl.col("t.d_start")
        duration_expr = interval.dt.total_nanoseconds() / 1e9

    with_duration = calc_events_duration(events, data=data, method=duration_method)
    with_mva = calc_events_mva_features(with_duration, data=data, method=method)
    with_change = calc_events_vec_change(with_mva, data=data, name="dB")
    with_normal = calc_events_normal_direction(with_change, data=data, name="k")

    # The `duration` column is (re)written last, from the expression chosen above.
    return with_normal.with_columns(duration=duration_expr)

In [None]:
# | hide
from nbdev import nbdev_export

# Run nbdev's export step, which turns `# | export` cells into library modules.
nbdev_export()

## Test

### Test parallelization


In general, `mapply` and `modin` are the fastest. `xorbits` was expected to be the fastest, but in practice it turned out to be the slowest.

```python
#| notest
sat = 'jno'
coord = 'se'
cols = ["BX", "BY", "BZ"]
tau = timedelta(seconds=60)
data_resolution = timedelta(seconds=1)

if True:
    year = 2012
    files = f'../data/{sat}_data_{year}.parquet'
    output = f'../data/{sat}_candidates_{year}_tau_{tau.seconds}.parquet'

    data = pl.scan_parquet(files).set_sorted('time').collect()

    indices = compute_indices(data, tau)
    # filter condition
    sparse_num = tau / data_resolution // 3
    filter_condition = filter_indices(sparse_num = sparse_num)

    candidates = indices.filter(filter_condition).with_columns(pl_format_time(tau)).sort('time')
    
    data_c = compress_data_by_events(data, candidates, tau)
    sat_fgm = df2ts(data_c, cols, attrs={"units": "nT"})
```

In [None]:
# | code-summary: Test different libraries to parallelize the computation
# | notest
def test_parallelization(candidates, sat_fgm):
    """
    Placeholder that records manual timings of different `apply` back-ends.

    The body does nothing (`pass`) and neither argument is used; the
    commented-out calls below are kept as notes on which variants worked,
    how long they took, and which one failed.
    """
    # process_events(candidates_modin, sat_fgm, sat_state, data_resolution)

    # ---
    # successful cases
    # ---
    # candidates_pd.mapply(lambda candidate: calc_candidate_duration(candidate, sat_fgm), axis=1) # this works, 4.2 secs
    # candidates_pd.mapply(calc_candidate_duration, axis=1, data=sat_fgm) # this works, but a little bit slower, 6.7 secs

    # candidates_pd.apply(calc_candidate_duration, axis=1, data=sat_fgm) # Standard case: 24+s secs
    # candidates_pd.swifter.apply(calc_candidate_duration, axis=1, data=sat_fgm) # this works with dask, 80 secs
    # candidates_pd.swifter.set_dask_scheduler(scheduler="threads").apply(calc_candidate_duration, axis=1, data=sat_fgm) # this works with dask, 60 secs
    # candidates_modin.apply(lambda candidate: calc_candidate_duration(candidate, sat_fgm), axis=1) # this works with ray, 6 secs # NOTE: can not work with dask
    # candidates_x.apply(calc_candidate_duration, axis=1, data=sat_fgm) # 30 seconds
    # ---
    # failed cases
    # ---
    # candidates_modin.apply(calc_candidate_duration, axis=1, data=sat_fgm) # AttributeError: 'DataFrame' object has no attribute 'sel'
    pass

In [None]:
import timeit
from functools import partial

In [None]:
def benchmark(task_dict, number=1):
    """
    Time each task in `task_dict` and report the average run time.

    `task_dict` maps a label to a ``(data, task)`` pair; ``task(data)`` is run
    `number` times through ``timeit``. The result maps each label to the mean
    seconds per run, or to the exception message if the task (or the
    averaging) raised.
    """
    timings = {}
    for label in task_dict:
        payload, runner = task_dict[label]
        try:
            total = timeit.timeit(lambda: runner(payload), number=number)
            timings[label] = total / number
        except Exception as err:
            # Best effort: keep benchmarking the remaining tasks.
            timings[label] = str(err)
    return timings

In [None]:
# | notest
import modin.pandas as mpd


def benchmark_results(candidates, sat_fgm):
    """
    Benchmark row-wise `calc_candidate_duration` with several DataFrame back-ends.

    `candidates` is converted with ``to_pandas()`` and additionally wrapped in
    a modin DataFrame. Plain pandas ``apply``, pandas ``mapply`` and modin
    ``apply`` are then timed through `benchmark`, whose result dict is returned.
    """
    pandas_frame = candidates.to_pandas()
    modin_frame = mpd.DataFrame(pandas_frame)
    # The xorbits variant (xpd.DataFrame + apply) is currently disabled.

    duration_of = partial(calc_candidate_duration, data=sat_fgm)

    def row_apply(frame):
        return frame.apply(duration_of, axis=1)

    def row_mapply(frame):
        return frame.mapply(duration_of, axis=1)

    task_dict = {
        "pandas": (pandas_frame, row_apply),
        "pandas-mapply": (pandas_frame, row_mapply),
        "modin": (modin_frame, row_apply),
    }
    return benchmark(task_dict)