---
title: ID properties
subtitle: Full feature extraction
---

In [None]:
#| default_exp core/propeties

In [4]:
#| export
#| code-summary: "Import all the packages needed for the project"
from fastcore.utils import *
from fastcore.test import *
from ids_finder.utils.basic import *
import polars as pl
import xarray as xr
import pyarrow as pa

try:
    import modin.pandas as pd
    import modin.pandas as mpd
    from modin.config import ProgressBar
    ProgressBar.enable()
except ImportError:
    import pandas as pd
import pandas
    
import numpy as np
from xarray_einstats import linalg

from datetime import timedelta

from loguru import logger


import pdpipe as pdp
from pdpipe.util import out_of_place_col_insert
from multipledispatch import dispatch

In [None]:
# | export
@dispatch(object, xr.DataArray)
def get_candidate_data(
    candidate, data, neighbor: int = 0
) -> xr.DataArray:
    """Slice `data` along `time` to the candidate's window, optionally padded.

    The window ``[tstart, tstop]`` is widened on each side by `neighbor`
    times the candidate's own duration; ``neighbor=0`` keeps the bare window.
    """
    tstart, tstop = candidate["tstart"], candidate["tstop"]
    padding = neighbor * (tstop - tstart)
    window = slice(tstart - padding, tstop + padding)
    return data.sel(time=window)


@dispatch(object, pl.DataFrame)
def get_candidate_data(
    candidate, data, neighbor: int = 0, bcols=None
) -> xr.DataArray:
    """
    Select the candidate's time window from a polars frame and convert it
    to a time series with `df2ts`.

    Parameters
    ----------
    candidate :
        Mapping-like object providing ``"tstart"`` and ``"tstop"``.
    data : pl.DataFrame
        Frame with a ``"time"`` column and the columns listed in `bcols`.
    neighbor : int
        The window is widened on each side by ``neighbor`` times the
        candidate's duration (``0`` keeps the bare window).
    bcols : list of str, optional
        Columns passed on to `df2ts`; defaults to ``["BX", "BY", "BZ"]``.

    Notes
    -----
    The original author notes that this is much slower than the
    `xr.DataArray` variant of this function.
    """
    if bcols is None:
        bcols = ["BX", "BY", "BZ"]

    # Duration must be positive (stop - start) so that `neighbor > 0`
    # widens the window, as the xarray overload does.
    duration = candidate["tstop"] - candidate["tstart"]
    offset = neighbor * duration
    temp_tstart = candidate["tstart"] - offset
    temp_tstop = candidate["tstop"] + offset

    temp_data = data.filter(pl.col("time").is_between(temp_tstart, temp_tstop))

    return df2ts(temp_data, bcols, attrs={"units": "nT"})


def get_candidates(candidates: pd.DataFrame, candidate_type=None, num: int = 4):
    """Select candidates, optionally by type, and randomly down-sample them.

    Parameters
    ----------
    candidates : pd.DataFrame
        Candidate table; must have a ``"type"`` column when
        `candidate_type` is given.
    candidate_type : optional
        Keep only rows whose ``"type"`` equals this value.
    num : int or None
        Maximum number of rows to return. When the selection holds more
        rows, `num` of them are drawn at random; ``None`` disables sampling.

    Returns
    -------
    The selected (and possibly sampled) rows.
    """
    selected = candidates
    if candidate_type is not None:
        selected = candidates[candidates["type"] == candidate_type]

    # Nothing to sample when no limit is requested or the pool is small enough.
    if num is None or num >= len(selected):
        return selected

    logger.info(
        f"Sampling {num} {candidate_type} candidates out of {len(selected)}"
    )
    return selected.sample(num)

## Duration

Definitions of duration
- Define $d^* = \max( | dB / dt | )$; the duration is then the time interval around this maximum that is bounded by the points where $| dB/dt |$ has decreased to $d^*/4$.

In [None]:
#| export
# Fraction of the peak |dB/dt| used as the threshold bounding an event.
THRESHOLD_RATIO = 1 / 4


def calc_duration(vec: xr.DataArray, threshold_ratio=THRESHOLD_RATIO) -> pandas.Series:
    """Locate the peak of |d vec/dt| and the times bounding it.

    The time derivative of `vec` is taken in seconds and its norm over the
    ``v_dim`` dimension is formed. ``d_star`` is the maximum of that norm,
    ``d_time`` the time at which it occurs, and ``threshold`` is
    ``d_star * threshold_ratio``. The bounding times come from
    `find_start_end_times`.

    Returns
    -------
    pandas.Series with entries ``d_star``, ``d_time``, ``threshold``,
    ``d_tstart`` and ``d_tstop``.

    Raises
    ------
    ValueError
        If the derivative magnitude consists only of NaN values.
    """
    # NOTE: gradient calculated at the edge is not reliable, so the first
    # and last samples are dropped.
    gradient = vec.differentiate("time", datetime_unit="s").isel(time=slice(1, -1))
    gradient_mag = linalg.norm(gradient, dims='v_dim')

    if gradient_mag.isnull().all():
        raise ValueError("The differentiated vector magnitude contains only NaN values. Cannot compute duration.")

    peak_index = gradient_mag.argmax(dim="time")
    d_star = gradient_mag[peak_index]
    d_time = gradient_mag.time[peak_index]
    threshold = d_star * threshold_ratio

    d_tstart, d_tstop = find_start_end_times(gradient_mag, d_time, threshold)

    return pandas.Series(
        {
            'd_star': d_star.item(),
            'd_time': d_time.values,
            'threshold': threshold.item(),
            'd_tstart': d_tstart,
            'd_tstop': d_tstop,
        }
    )

def calc_d_duration(vec: xr.DataArray, d_time, threshold) -> pd.Series:
    """Recompute only the bounding times for a known peak time and threshold.

    Unlike `calc_duration`, the edge samples of the derivative are kept and
    `d_time` / `threshold` are supplied by the caller.
    """
    gradient_mag = linalg.norm(
        vec.differentiate("time", datetime_unit="s"), dims='v_dim'
    )
    bounds = find_start_end_times(gradient_mag, d_time, threshold)
    return pandas.Series(dict(zip(('d_tstart', 'd_tstop'), bounds)))
 
def find_start_end_times(vec_diff_mag: xr.DataArray, d_time, threshold) -> tuple[pd.Timestamp, pd.Timestamp]:
    """Return the times bounding the peak at `d_time`.

    The start is the last time up to `d_time` at which the magnitude is
    below `threshold`; the stop is the first such time from `d_time` on.
    Either entry is ``None`` when no sample on that side is below the
    threshold.
    """
    before_peak = vec_diff_mag.sel(time=slice(None, d_time))
    after_peak = vec_diff_mag.sel(time=slice(d_time, None))
    return (
        get_time_from_condition(before_peak, threshold, "last_below"),
        get_time_from_condition(after_peak, threshold, "first_below"),
    )


def get_time_from_condition(vec: xr.DataArray, threshold, condition_type) -> pd.Timestamp:
    """Return the time of the first or last sample of `vec` below `threshold`.

    Parameters
    ----------
    vec : xr.DataArray
        Array with a ``time`` coordinate.
    threshold :
        Samples strictly smaller than this value qualify.
    condition_type : str
        ``"first_below"`` or ``"last_below"``.

    Returns
    -------
    The ``.values`` of the selected time coordinate, or ``None`` when no
    sample is below the threshold.

    Raises
    ------
    ValueError
        For any other `condition_type`.
    """
    positions = (("first_below", 0), ("last_below", -1))
    index_choice = next(
        (pos for name, pos in positions if condition_type == name), None
    )
    if index_choice is None:
        raise ValueError(f"Unknown condition_type: {condition_type}")

    below = np.where(vec < threshold)[0]
    if len(below) == 0:
        return None
    return vec.time[below[index_choice]].values

In [None]:
#| export
def calc_candidate_duration(candidate: pd.Series, data) -> pd.Series:
    """Compute the duration parameters of one candidate.

    The candidate's data window is fetched with `get_candidate_data` and
    handed to `calc_duration`. Any failure is reported on stdout and then
    re-raised.
    """
    try:
        return calc_duration(get_candidate_data(candidate, data))
    except Exception as err:
        # `print` is used instead of `logger.debug` because, per the original
        # note, the logger call "can not be serialized".
        print(f"Error for candidate {candidate} at {candidate['time']}: {str(err)}")
        raise err



## Minimum variance analysis (MVA) features

To ensure the accuracy of MVA, only when the ratio of the middle to the minimum eigenvalue (labeled QMVA for simplicity) is larger than 3 are the results used for further analysis.

In [None]:
#| exports
def minvar(data):
    """
    Minimum variance analysis; see `pyspedas.cotrans.minvar`.

    Computes the principal variance directions and variances of a vector
    quantity as well as the associated eigenvalues.

    Parameters
    -----------
    data:
        Vxyz, an (npoints, 3) array of data. NaN entries are replaced by
        zero before the means are taken.

    Returns
    -------
    vrot:
        an (npoints, 3) array containing the data rotated into the
        principal-axes system ijk:
        Vi (maximum variance direction) = vrot[:, 0]
        Vj (intermediate variance direction) = vrot[:, 1]
        Vk (minimum variance direction) = vrot[:, 2]
    v:
        a (3, 3) array whose columns are the principal axes, ordered by
        descending eigenvalue: Vi = v[:, 0], Vj = v[:, 1], Vk = v[:, 2]
    w:
        the absolute eigenvalues, in descending order
    """
    # Mean of each component (NaNs are zero-filled first).
    vecavg = np.nanmean(np.nan_to_num(data, nan=0.0), axis=0)

    # Variance matrix: <Bi Bj> - <Bi><Bj>
    mvamat = np.zeros((3, 3))
    for i in range(3):
        for j in range(3):
            mvamat[i, j] = np.nanmean(np.nan_to_num(data[:, i] * data[:, j], nan=0.0)) - vecavg[i] * vecavg[j]

    # Calculate eigenvalues and eigenvectors
    w, v = np.linalg.eigh(mvamat, UPLO='U')

    # Sorting to ensure descending order
    w = np.abs(w)
    idx = np.flip(np.argsort(w))

    # IDL compatibility: fixed ordering when all eigenvalues vanish
    if np.sum(w) == 0.0:
        idx = [0, 2, 1]

    w = w[idx]
    v = v[:, idx]

    # Rotate intermediate var direction if system is not Right Handed
    YcrossZdotX = v[0, 0] * (v[1, 1] * v[2, 2] - v[2, 1] * v[1, 2])
    if YcrossZdotX < 0:
        v[:, 1] = -v[:, 1]
        # v[:, 2] = -v[:, 2] # Should not it is being flipped at Z-axis?

    # Ensure minvar direction is along +Z (for FAC system)
    if v[2, 2] < 0:
        v[:, 2] = -v[:, 2]
        v[:, 1] = -v[:, 1]

    # Project every sample onto the principal axes in one matrix product
    # instead of a Python-level loop over rows.
    vrot = data @ v

    return vrot, v, w


In [None]:
#| export

def mva_features(data: np.ndarray):
    """
    Compute MVA features for the given data array.

    Parameters
    ----------
    data : np.ndarray
        Input passed unchanged to `minvar`.

    Returns
    -------
    results : list
        The feature values.
    output_names : list
        The feature names, in the same order as `results`.
    """
    vrot, v, w = minvar(data)

    # Columns of `v` used here: 0 = maximum, 2 = minimum variance direction.
    max_dir = v[:, 0]
    min_dir = v[:, 2]

    magnitude = np.linalg.norm(vrot, axis=1)
    mean_magnitude = np.mean(magnitude)
    mean_normal = np.mean(vrot[:, 2])

    # Component changes are first-minus-last, while the magnitude change is
    # last-minus-first.
    component_change = [vrot[0, k] - vrot[-1, k] for k in range(3)]
    magnitude_change = magnitude[-1] - magnitude[0]

    features = {
        "Vl_x": max_dir[0],
        "Vl_y": max_dir[1],
        "Vl_z": max_dir[2],
        "Vn_x": min_dir[0],
        "Vn_y": min_dir[1],
        "Vn_z": min_dir[2],
        "eig0": w[0],
        "eig1": w[1],
        "eig2": w[2],
        # ratio of the middle to the minimum eigenvalue
        'Q_mva': w[1] / w[2],
        'b_mag': mean_magnitude,
        'b_n': mean_normal,
        'db_mag': magnitude_change,
        'bn_over_b': mean_normal / mean_magnitude,
        'db_over_b': np.abs(magnitude_change / mean_magnitude),
        'db_over_b_max': (np.max(magnitude) - np.min(magnitude)) / mean_magnitude,
        'db_l': component_change[0],
        'db_m': component_change[1],
        'db_n': component_change[2],
    }

    return list(features.values()), list(features.keys())

In [None]:
# | export
def calc_candidate_mva_features(candidate, data: xr.DataArray):
    """Compute the MVA features of one candidate as a named `pandas.Series`.

    The data between the candidate's ``d_tstart`` and ``d_tstop`` is
    converted to a NumPy array and passed to `mva_features`.
    """
    window = slice(candidate["d_tstart"], candidate["d_tstop"])
    values, names = mva_features(data.sel(time=window).to_numpy())
    return pandas.Series(values, index=names)

##### Test

In [None]:
#| eval: false
from fastcore.test import *

# Smoke test for `mva_features` on reproducible random input.
# Generate synthetic data
np.random.seed(42)  # for reproducibility
data = np.random.rand(100, 3)  # 100 time points, 3-dimensional data
# Call the mva_features function
# NOTE(review): `mva_features` returns a `(results, output_names)` tuple, so
# `features` is that tuple rather than a flat list of values.
features = mva_features(data)
# NOTE(review): 16 reference values are listed here while `mva_features`
# defines 19 output names; this reference list looks stale. Confirm and
# regenerate before enabling the cell.
_features = [0.3631060892452051, 0.8978455426527485, -0.24905290500542857, 0.09753158579102299, 0.086943767300213, 0.07393142040422575, 1.1760056390752571, 0.9609421690770317, 0.6152039820297959, -0.5922397773398479, 0.6402091632847049, 0.61631157045453, 1.2956351134759623, 0.19091785005728523, 0.5182488424049534, 0.4957624347593598]
test_eq(features, _features)

## Field rotation angles
The PDF of the field rotation angles $\theta$ across the solar-wind IDs is well fitted by the exponential function $\exp(-\theta/\bar{\theta})$, where $\bar{\theta}$ is the characteristic angle of the fit...

In [None]:
#| export
def get_data_at_times(data: xr.DataArray, times) -> np.ndarray:
    """
    Return the values of `data` at the samples nearest to `times`.
    """
    nearest = data.sel(time=times, method="nearest")
    return nearest.to_numpy()

In [None]:
#| export
def calc_rotation_angle(v1, v2):
    """
    Return the angle, in degrees, between two vectors (or stacks of vectors).

    Parameters:
    - v1: The first vector(s); the vector components lie on the last axis.
    - v2: The second vector(s); must have the same shape as `v1`.

    Raises:
    - ValueError: if the two shapes differ.
    """
    if v1.shape != v2.shape:
        raise ValueError("Vectors must have the same shape.")

    def _unit(vec):
        # Scale each vector to unit length along the last axis.
        return vec / np.linalg.norm(vec, axis=-1, keepdims=True)

    # Dot product of the unit vectors; clipped so that floating point
    # round-off cannot push it outside the domain of arccos.
    cosine_angle = np.clip(np.sum(_unit(v1) * _unit(v2), axis=-1), -1, 1)

    return np.degrees(np.arccos(cosine_angle))


In [None]:
#| export
def calc_events_rotation_angle(events, data: xr.DataArray):
    """
    Rotation angle of `data` between each event's ``d_tstart`` and ``d_tstop``.
    """
    vecs_before, vecs_after = (
        get_data_at_times(data, events[column].to_numpy())
        for column in ('d_tstart', 'd_tstop')
    )
    return calc_rotation_angle(vecs_before, vecs_after)

## Normal direction

In [None]:
#| export
def calc_normal_direction(v1, v2, normalize=True) -> np.ndarray:
    """
    Computes the normal direction of two vectors.

    Parameters
    ----------
    v1 : array_like
        The first vector(s).
    v2 : array_like
        The second vector(s).
    normalize : bool
        When true (the default) the cross product is scaled to unit length
        along the last axis; otherwise the raw cross product is returned.

    Returns
    -------
    np.ndarray
        ``v1 x v2``, normalized unless `normalize` is false.
    """
    normal = np.cross(v1, v2)
    if not normalize:
        return normal
    return normal / np.linalg.norm(normal, axis=-1, keepdims=True)


In [2]:
# | export
def calc_events_normal_direction(events, data: xr.DataArray):
    """
    Normal direction from the `data` vectors at each event's ``d_tstart``
    and ``d_tstop``, returned as a list with one entry per event.
    """
    vecs_before, vecs_after = (
        get_data_at_times(data, events[column].to_numpy())
        for column in ('d_tstart', 'd_tstop')
    )
    # need to convert to list first, as only 1D array is supported
    return calc_normal_direction(vecs_before, vecs_after).tolist()


## Pipelines

patch `pdp.ApplyToRows` to work with `modin` and `xorbits` DataFrames

In [None]:
#| export
@patch
def _transform(self: pdp.ApplyToRows, X, verbose):
    """Apply the stage's function to every row of `X` and attach the result.

    A Series result is inserted as a single column named ``self._colname``;
    a DataFrame result contributes one column per result column, in sorted
    name order. New columns go right after ``self._follow_column`` when it
    is set, otherwise at the end.

    The type checks accept both plain pandas objects and `pd` objects, where
    `pd` is modin's pandas when modin is installed and plain pandas
    otherwise, so this also works when modin is missing.

    Raises
    ------
    TypeError
        If applying the function yields neither a Series nor a DataFrame.
    """
    new_cols = X.apply(self._func, axis=1)
    if isinstance(new_cols, (pd.Series, pandas.Series)):
        loc = len(X.columns)
        if self._follow_column:
            loc = X.columns.get_loc(self._follow_column) + 1
        return out_of_place_col_insert(
            X=X, series=new_cols, loc=loc, column_name=self._colname
        )
    # `pd` rather than `mpd`: `mpd` is unbound when modin failed to import.
    if isinstance(new_cols, (pd.DataFrame, pandas.DataFrame)):
        sorted_cols = sorted(new_cols.columns)
        new_cols = new_cols[sorted_cols]
        if self._follow_column:
            inter_X = X
            loc = X.columns.get_loc(self._follow_column) + 1
            for colname in new_cols.columns:
                inter_X = out_of_place_col_insert(
                    X=inter_X,
                    series=new_cols[colname],
                    loc=loc,
                    column_name=colname,
                )
                loc += 1
            return inter_X
        assign_map = {
            colname: new_cols[colname] for colname in new_cols.columns
        }
        return X.assign(**assign_map)
    raise TypeError(  # pragma: no cover
        "Unexpected type generated by applying a function to a DataFrame."
        " Only Series and DataFrame are allowed."
    )

`Pipelines` Class for processing IDs

In [None]:
# | export
class IDsPipeline:
    """Factory for the pdpipe stages that compute ID properties.

    Every method takes the data array `sat_fgm` and returns a pdpipe stage
    bound to it.
    """

    def __init__(self):
        pass

    def calc_duration(self, sat_fgm: xr.DataArray):
        """Row-wise stage wrapping `calc_candidate_duration`."""

        def _duration(candidate):
            return calc_candidate_duration(candidate, sat_fgm)

        return pdp.ApplyToRows(
            _duration,
            func_desc="calculating duration parameters",
        )

    def calc_mva_features(self, sat_fgm):
        """Row-wise stage wrapping `calc_candidate_mva_features`."""

        def _mva(candidate):
            return calc_candidate_mva_features(candidate, sat_fgm)

        return pdp.ApplyToRows(
            _mva,
            func_desc='calculating index "q_mva", "BnOverB" and "dBOverB"',
        )

    def calc_rotation_angle(self, sat_fgm):
        """Frame-wise stage producing the ``rotation_angle`` column."""

        def _rotation(df):
            return calc_events_rotation_angle(df, sat_fgm)

        return pdp.ColByFrameFunc(
            "rotation_angle",
            _rotation,
            func_desc="calculating rotation angle",
        )

    def calc_normal_direction(self, sat_fgm):
        """Frame-wise stage producing the ``normal_direction`` column."""

        def _normal(df):
            return calc_events_normal_direction(df, sat_fgm)

        return pdp.ColByFrameFunc(
            "normal_direction",
            _normal,
            func_desc="calculating normal direction",
        )

In [None]:
#| export
from ids_finder.utils.polars import convert_to_pd_dataframe, sort

In [None]:
#| export
def process_events(
    candidates_pl: pl.DataFrame,  # potential candidates DataFrame
    sat_fgm: xr.DataArray,  # satellite FGM data
    data_resolution: timedelta,  # time resolution of the data
    modin = True,
) -> pl.DataFrame:
    """Process candidates DataFrame

    Computes the duration parameters of every candidate, drops the rows
    that contain missing values, and then adds the MVA features, the
    rotation angle and the normal direction. `data_resolution` is accepted
    but not used by this function.
    """

    candidates = convert_to_pd_dataframe(candidates_pl, modin=modin)

    id_pipelines = IDsPipeline()
    candidates = id_pipelines.calc_duration(sat_fgm).apply(candidates)

    ids = (
        id_pipelines.calc_mva_features(sat_fgm)
        + id_pipelines.calc_rotation_angle(sat_fgm)
        + id_pipelines.calc_normal_direction(sat_fgm)
    ).apply(
        candidates.dropna()  # Remove candidates with NaN values
    )

    # Convert modin frames to plain pandas first. Probing for `_to_pandas`
    # avoids referencing `mpd`, which is unbound when modin is missing.
    if hasattr(ids, "_to_pandas"):
        ids = ids._to_pandas()

    # NOTE(review): the original carried this error message next to the
    # return, presumably the failure seen when skipping the conversion:
    # ValueError: Data type fixed_size_list[pyarrow] not supported by interchange protocol
    return pl.DataFrame(ids)

## Obsolete codes

This code is obsolete because the time windows now overlap, so there is no need to treat separately the magnetic discontinuities that fall on the boundary of a time window.

In [None]:
def calc_candidate_d_duration(candidate, data) -> pd.Series:
    """Fill in missing duration bounds of a candidate.

    When both ``d_tstart`` and ``d_tstop`` are present they are returned
    unchanged. Otherwise the bounds are recomputed with `calc_d_duration`
    on a window widened by one candidate duration on each side. Any failure
    is reported on stdout and then re-raised.
    """
    try:
        missing_bound = pd.isnull(candidate['d_tstart']) or pd.isnull(candidate['d_tstop'])
        if not missing_bound:
            return pandas.Series({
                'd_tstart': candidate['d_tstart'],
                'd_tstop': candidate['d_tstop'],
            })

        widened = get_candidate_data(candidate, data, neighbor=1)
        return calc_d_duration(widened, candidate['d_time'], candidate['threshold'])
    except Exception as err:
        # logger.debug(f"Error for candidate {candidate} at {candidate['time']}: {str(e)}")
        print(f"Error for candidate {candidate} at {candidate['time']}: {str(err)}")
        raise err


In [None]:
# pdp.ApplyToRows(
#     lambda candidate: calc_candidate_d_duration(candidate, sat_fgm),
#     func_desc="calculating duration parameters if needed"
# )

### Calibrates candidate duration

This calibration is based on the assumption that the magnetic discontinuity is symmetric around the center of time, which is not always true.

So instead of calibrating the duration, we drop the events. 
- Cons: Dropping events might bias the statistics of the occurrence rate.
- Pros: The properties derived for the remaining magnetic discontinuities are more robust.

In [None]:
def calibrate_candidate_duration(
    candidate: pd.Series, data:xr.DataArray, data_resolution, ratio = 3/4
):
    """
    Calibrates the candidate duration.
    - If only one of 'd_tstart' or 'd_tstop' is provided, the missing one is
      obtained by mirroring the provided one about 'd_time'.
    - If there are then not enough data points between 'd_tstart' and
      'd_tstop', returns None for both.

    Parameters
    ----------
    - candidate (pd.Series): The input candidate with potential missing 'd_tstart' or 'd_tstop'.
    - data (xr.DataArray): Data whose `time` coordinate is used to count the points in the interval.
    - data_resolution: Time resolution of the data; `duration / data_resolution` is the expected number of points.
    - ratio: The interval is rejected when the number of points is not larger than this fraction of the expected number.

    Returns
    -------
    - pd.Series: 'd_tstart' and 'd_tstop' of the calibrated candidate.
    """
    # Plain truthiness instead of `match` on True/False literals: literal
    # patterns compare by identity, so a non-`bool` truthy value (e.g.
    # `numpy.bool_`) would match no case and leave the bounds unbound.
    has_start = bool(pd.notnull(candidate['d_tstart']))
    has_stop = bool(pd.notnull(candidate['d_tstop']))

    if not (has_start or has_stop):
        return pandas.Series({
            'd_tstart': None,
            'd_tstop': None,
        })

    if has_start and has_stop:
        d_tstart = candidate['d_tstart']
        d_tstop = candidate['d_tstop']
    elif has_start:
        d_tstart = candidate['d_tstart']
        # mirror the start about d_time
        d_tstop = candidate['d_time'] - candidate['d_tstart'] + candidate['d_time']
    else:
        # mirror the stop about d_time
        d_tstart = candidate['d_time'] - candidate['d_tstop'] + candidate['d_time']
        d_tstop = candidate['d_tstop']

    duration = d_tstop - d_tstart
    num_of_points_between = data.time.sel(time=slice(d_tstart, d_tstop)).count().item()

    # Too few samples in the interval: discard both bounds.
    if num_of_points_between <= (duration/data_resolution) * ratio:
        d_tstart = None
        d_tstop = None

    return pandas.Series({
        'd_tstart': d_tstart,
        'd_tstop': d_tstop,
    })

In [None]:
def calibrate_candidates_duration(candidates, sat_fgm, data_resolution):
    """Calibrate the duration of candidates that lack 'd_tstart' or 'd_tstop'.

    Only the rows with a missing bound are passed through
    `calibrate_candidate_duration`; the results are written back into
    `candidates` in place with `update`, and `candidates` is returned.
    """
    calibrate_duration = pdp.ApplyToRows(
        lambda candidate: calibrate_candidate_duration(
            candidate, sat_fgm, data_resolution
        ),
        func_desc="calibrating duration parameters if needed",
    )

    temp_candidates = candidates.loc[
        lambda df: df["d_tstart"].isnull() | df["d_tstop"].isnull()
    ]  # temp_candidates = candidates.query('d_tstart.isnull() | d_tstop.isnull()') # not implemented in `modin`

    if not temp_candidates.empty:
        # `calibrate_duration` is already a stage bound to `sat_fgm` and
        # `data_resolution`; apply it to the rows directly.
        temp_candidates_updated = calibrate_duration.apply(temp_candidates)
        candidates.update(temp_candidates_updated)
    return candidates

## Test

### Test parallelization


Generally, `mapply` and `modin` are the fastest. `xorbits` was expected to be the fastest, but in practice it turned out to be the slowest.

```python
#| notest
sat = 'jno'
coord = 'se'
cols = ["BX", "BY", "BZ"]
tau = timedelta(seconds=60)
data_resolution = timedelta(seconds=1)

if True:
    year = 2012
    files = f'../data/{sat}_data_{year}.parquet'
    output = f'../data/{sat}_candidates_{year}_tau_{tau.seconds}.parquet'

    data = pl.scan_parquet(files).set_sorted('time').collect()

    indices = compute_indices(data, tau)
    # filter condition
    sparse_num = tau / data_resolution // 3
    filter_condition = filter_indices(sparse_num = sparse_num)

    candidates = indices.filter(filter_condition).with_columns(pl_format_time(tau)).sort('time')
    
    data_c = compress_data_by_cands(data, candidates, tau)
    sat_fgm = df2ts(data_c, cols, attrs={"units": "nT"})
```

In [None]:
#| notest
# Materialise the candidates as a pandas frame and as a modin frame built
# from it, so the same row-wise function can be run against each backend.
candidates_pd = candidates.to_pandas()
candidates_modin = mpd.DataFrame(candidates_pd)
# candidates_x = xpd.DataFrame(candidates_pd)

In [None]:
#| code-summary: Test different libraries to parallelize the computation
#| notest
# Scratch cell: builds a pdpipe row-wise stage around `calc_candidate_duration`
# and keeps, as comments, the author's notes on which parallel back-ends
# worked and how long each took.
if True:
    pdp_test = pdp.ApplyToRows(
        lambda candidate: calc_candidate_duration(candidate, sat_fgm),  # fast a little bit
        # lambda candidate: calc_duration(get_candidate_data_xr(candidate, sat_fgm)),
        # lambda candidate: calc_duration(sat_fgm.sel(time=slice(candidate['tstart'], candidate['tstop']))),
        func_desc="calculating duration parameters",
    )
    
    # process_events(candidates_modin, sat_fgm, sat_state, data_resolution)
    
    # ---
    # successful cases
    # ---
    # candidates_pd.mapply(lambda candidate: calc_candidate_duration(candidate, sat_fgm), axis=1) # this works, 4.2 secs
    # candidates_pd.mapply(calc_candidate_duration, axis=1, data=sat_fgm) # this works, but a little bit slower, 6.7 secs
    
    # candidates_pd.apply(calc_candidate_duration, axis=1, data=sat_fgm) # Standard case: 24+s secs
    # candidates_pd.swifter.apply(calc_candidate_duration, axis=1, data=sat_fgm) # this works with dask, 80 secs
    # candidates_pd.swifter.set_dask_scheduler(scheduler="threads").apply(calc_candidate_duration, axis=1, data=sat_fgm) # this works with dask, 60 secs
    # candidates_modin.apply(lambda candidate: calc_candidate_duration(candidate, sat_fgm), axis=1) # this works with ray, 6 secs # NOTE: can not work with dask
    # candidates_x.apply(calc_candidate_duration, axis=1, data=sat_fgm) # 30 seconds
    # pdp_test(candidates_modin) # this works, 8 secs
    
    # ---
    # failed cases
    # ---
    # candidates_modin.apply(calc_candidate_duration, axis=1, data=sat_fgm) # AttributeError: 'DataFrame' object has no attribute 'sel'

In [None]:
import timeit
from functools import partial


In [None]:
def benchmark(task_dict, number=1):
    """Time each task and return the mean seconds per run, keyed by name.

    `task_dict` maps a name to a ``(data, task)`` pair; ``task(data)`` is
    executed `number` times. When timing a task raises, the exception's
    message is stored in place of the timing.
    """
    timings = {}
    for label, (payload, job) in task_dict.items():
        try:
            total = timeit.Timer(lambda: job(payload)).timeit(number=number)
            timings[label] = total / number
        except Exception as err:
            timings[label] = str(err)
    return timings

In [None]:
#| notest

def benchmark_results(results, sat_fgm):
    """Benchmark row-wise `calc_candidate_duration` on the available backends.

    Uses the module-level `candidates_pd` and `candidates_modin` frames.
    The incoming `results` argument is not read; the timings produced by
    `benchmark` are returned.
    """
    row_func = partial(calc_candidate_duration, data=sat_fgm)

    def _apply(frame):
        return frame.apply(row_func, axis=1)

    def _mapply(frame):
        return frame.mapply(row_func, axis=1)

    return benchmark(
        {
            'pandas': (candidates_pd, _apply),
            'pandas-mapply': (candidates_pd, _mapply),
            'modin': (candidates_modin, _apply),
            # 'xorbits': (candidates_x, lambda _: _.apply(func, axis=1)),
        }
    )