---
title: "Datasets"
description: This notebook contains the code for handling the full set of candidates as well as individual candidates.
---

In [2]:
#| default_exp candidates

In [3]:
#| export
import polars as pl
import polars.selectors as cs
import pandas as pd
import pandas
import xarray as xr

from datetime import timedelta

#### `Kedro`

In [4]:
#| export
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline
from ids_finder.utils.basic import load_catalog

In [5]:
#| output: false
# Load the project data catalog from the parent directory and list the
# names of the registered datasets and parameters.
catalog = load_catalog("../")
catalog.list()



[1m[[0m
    [32m'jno.raw_mag_1s'[0m,
    [32m'jno.raw_state'[0m,
    [32m'model.raw_jno_ss_se_1min'[0m,
    [32m'model.preprocessed_jno_ss_se_1min'[0m,
    [32m'JNO_index'[0m,
    [32m'sta.raw_state_merged'[0m,
    [32m'thb.raw_state_sw'[0m,
    [32m'parameters'[0m,
    [32m'params:tau'[0m,
    [32m'params:jno_start_date'[0m,
    [32m'params:jno_end_date'[0m,
    [32m'params:jno_1s_params'[0m,
    [32m'params:jno_1s_params.bcols'[0m,
    [32m'params:jno_1s_params.data_resolution'[0m,
    [32m'params:jno.extract_params'[0m,
    [32m'params:jno.extract_params.bcols'[0m,
    [32m'params:jno.extract_params.data_resolution'[0m,
    [32m'params:sta.extract_params'[0m,
    [32m'params:sta.extract_params.bcols'[0m,
    [32m'params:sta.extract_params.data_resolution'[0m,
    [32m'params:thb'[0m,
    [32m'params:thb.mag'[0m,
    [32m'params:thb.mag.bcols'[0m,
    [32m'params:thb.mag.time_resolution'[0m,
    [32m'params:thb.mag.coords'[0m,
    

## Combining magnetic field data and state data

In [6]:
#| export
import polars as pl

from ids_finder.utils.basic import df2ts, pl_norm
import xarray as xr
from xarray_einstats import linalg

In [7]:
# | export
def combine(candidates: pl.LazyFrame, states_data: pl.LazyFrame):
    """As-of join the candidates with the state data on the `time` column.

    Both frames are sorted by `time` before joining (polars' default as-of
    strategy is used).

    Args:
        candidates: Frame that must contain the major eigenvector columns
            `Vl_x`, `Vl_y`, `Vl_z` and a `time` column.
        states_data: Frame that must contain the plasma velocity columns
            `v_x`, `v_y`, `v_z` and a `time` column.

    Returns:
        The joined lazy frame.

    Raises:
        ValueError: If either frame lacks its required vector columns.
    """
    vec_cols = ["v_x", "v_y", "v_z"]  # plasma velocity vector in any coordinate system
    b_vecL_cols = ["Vl_x", "Vl_y", "Vl_z"]  # major eigenvector in any coordinate system

    # The state data is validated first, then the candidates.
    for required, frame in ((vec_cols, states_data), (b_vecL_cols, candidates)):
        if not set(required) <= set(frame.columns):
            raise ValueError(f"Missing columns {required}")

    sorted_candidates = candidates.sort("time")
    sorted_states = states_data.sort("time")
    return sorted_candidates.join_asof(sorted_states, on="time")

### Calculating additional features for the combined dataset

In [8]:
#| export
import astropy.units as u
from astropy.constants import mu0, e
from plasmapy.formulary.lengths import inertial_length
from plasmapy.formulary.speeds import Alfven_speed

In [9]:
#| export
def vector_project(v1, v2, dim="v_dim"):
    """Return the scalar projection of `v1` onto `v2` along dimension `dim`.

    Computed as dot(v1, v2) / |v2|, both reduced over `dim`.
    """
    dot_product = xr.dot(v1, v2, dims=dim)
    v2_norm = linalg.norm(v2, dims=dim)
    return dot_product / v2_norm

def vector_project_pl(df: pl.DataFrame, v1_cols, v2_cols, name=None):
    """Add a column holding the scalar projection of one vector onto another.

    Args:
        df: Frame containing the component columns of both vectors.
        v1_cols: Column names of the vector being projected.
        v2_cols: Column names of the vector projected onto.
        name: Name of the new column; defaults to ``"v_proj"``.

    Returns:
        `df` with the projection appended as a new column.
    """

    def _as_vector(cols):
        # Label the component axis so both arrays align on `v_dim`.
        return df2ts(df, cols).assign_coords(v_dim=["r", "t", "n"])

    v1 = _as_vector(v1_cols)
    v2 = _as_vector(v2_cols)
    projection = vector_project(v1, v2, dim="v_dim")

    column_name = name or "v_proj"
    return df.with_columns(pl.Series(projection.data).alias(column_name))

In [10]:
# | export
def compute_inertial_length(ldf: pl.LazyFrame):
    """Append the H+ inertial length (in km) as `ion_inertial_length`.

    The frame is collected, `plasma_density` is interpreted as cm^-3, and the
    result is returned as a lazy frame again.
    """
    frame = ldf.collect()

    number_density = frame["plasma_density"].to_numpy() * u.cm ** (-3)
    length_km = inertial_length(number_density, "H+").to(u.km)

    return frame.with_columns(
        ion_inertial_length=pl.Series(length_km.value)
    ).lazy()


def compute_Alfven_speed(ldf: pl.LazyFrame):
    """Append the proton Alfven speed (in km/s) as `Alfven_speed`.

    The field magnitude is read from column `B` (interpreted as nT), falling
    back to `b_mag` when `B` is absent; `plasma_density` is interpreted as
    cm^-3. The frame is collected and returned as a lazy frame again.
    """
    frame = ldf.collect()

    # backwards compatibility: older frames name the magnitude column `b_mag`
    b_column = "B" if "B" in frame.columns else "b_mag"
    field = frame[b_column].to_numpy() * u.nT
    number_density = frame["plasma_density"].to_numpy() * u.cm ** (-3)

    speed = Alfven_speed(field, density=number_density, ion="p+").to(u.km / u.s)

    return frame.with_columns(Alfven_speed=pl.Series(speed.value)).lazy()


def compute_Alfven_current(ldf: pl.LazyFrame):
    """Append the Alfven current density (in nA/m^2) as `j_Alfven`.

    Computed as e * v_A * n, reading `Alfven_speed` as km/s and
    `plasma_density` as cm^-3. The frame is collected and returned as a lazy
    frame again.
    """
    frame = ldf.collect()

    v_alfven = frame["Alfven_speed"].to_numpy() * u.km / u.s
    number_density = frame["plasma_density"].to_numpy() * u.cm ** (-3)

    current_density = (e.si * v_alfven * number_density).to(u.nA / u.m**2)

    return frame.with_columns(j_Alfven=pl.Series(current_density.value)).lazy()

In [11]:
# | export
def calc_combined_features(df: pl.LazyFrame):
    """Derive the combined (magnetic field + plasma state) features.

    Columns added, in order:
        duration: `d_tstop - d_tstart`.
        v_l: projection of the plasma velocity (`v_x`, `v_y`, `v_z`) onto the
            major eigenvector (`Vl_x`, `Vl_y`, `Vl_z`).
        v_mn: sqrt(plasma_speed**2 - v_l**2).
        L_mn: `v_mn` times the duration in seconds.
        j0: `d_star / v_mn`, scaled to nA/m^2.
        ion_inertial_length, Alfven_speed, j_Alfven: see the `compute_*` helpers.
        L_mn_norm: `L_mn / ion_inertial_length`.
        j0_norm: `j0 / j_Alfven`.

    Args:
        df: Combined frame containing the columns referenced above.

    Returns:
        A lazy frame with the additional feature columns.
    """
    vec_cols = ["v_x", "v_y", "v_z"]  # plasma velocity vector in any coordinate system
    b_vecL_cols = ["Vl_x", "Vl_y", "Vl_z"]  # major eigenvector in any coordinate system

    # Conversion factor from (nT/s) / (mu0 * km/s) to nA/m^2.
    # NOTE(review): presumably `d_star` is in nT/s and `v_mn` in km/s -- confirm.
    j_factor = ((u.nT / u.s) * (1 / mu0 / (u.km / u.s))).to(u.nA / u.m**2)

    # NOTE(review): `vector_project_pl` is annotated for an eager DataFrame while
    # a lazy frame is piped in here -- confirm `df2ts` accepts lazy frames.
    result = (
        df.with_columns(
            duration=pl.col("d_tstop") - pl.col("d_tstart"),
        )
        .pipe(vector_project_pl, vec_cols, b_vecL_cols, name="v_l")
        .with_columns(v_mn=(pl.col("plasma_speed") ** 2 - pl.col("v_l") ** 2).sqrt())
        .with_columns(
            L_mn=pl.col("v_mn") * pl.col("duration").dt.nanoseconds() / 1e9,
            j0=pl.col("d_star") / pl.col("v_mn"),
        )
        .pipe(compute_inertial_length)
        .pipe(compute_Alfven_speed)
        .pipe(compute_Alfven_current)
        # `with_columns`, not `pipe`: `pipe` requires a callable as its first
        # argument, so passing only a keyword expression raised a TypeError.
        .with_columns(j0=pl.col("j0") * j_factor.value)
        .with_columns(
            L_mn_norm=pl.col("L_mn") / pl.col("ion_inertial_length"),
            j0_norm=pl.col("j0") / pl.col("j_Alfven"),
        )
    )
    return result

### Pipelines

In [12]:
#| export
def combine_features(candidates: pl.LazyFrame, states_data: pl.LazyFrame):
    """Join candidates with state data, derive the combined features, and collect.

    Returns:
        The materialized result of `calc_combined_features(combine(...))`.
    """
    combined = combine(candidates, states_data)
    return calc_combined_features(combined).collect()

In [13]:
#| export
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

In [14]:
# | export
def create_candidate_pipeline(
    sat_id, 
    tau: int = 60,
    ts_mag: int = 1,
    ts_state: str = "1h",
    **kwargs) -> Pipeline:
    """Build the pipeline that combines candidate features with state data.

    Args:
        sat_id: Satellite identifier used as the dataset-name prefix.
        tau: Value used in the `tau_{tau}s` part of the dataset names.
        ts_mag: Value used in the `ts_{ts_mag}s` part of the dataset names.
        ts_state: Suffix of the `primary_state_*` input dataset name.
        **kwargs: Accepted but unused.

    Returns:
        A pipeline with a single node running `combine_features`.
    """
    # Shared "ts_<n>s_tau_<m>s" suffix of the feature and candidate datasets.
    suffix = f"ts_{ts_mag}s_tau_{tau}s"

    feature_dataset = f"{sat_id}.feature_{suffix}"
    state_dataset = f"{sat_id}.primary_state_{ts_state}"
    candidates_dataset = f"candidates.{sat_id}_{suffix}"

    return pipeline(
        [
            node(
                combine_features,
                inputs=[feature_dataset, state_dataset],
                outputs=candidates_dataset,
            )
        ]
    )

## Datasets

In [15]:
#| export
from pydantic import BaseModel
from kedro.io import DataCatalog
from ids_finder.utils.basic import concat_partitions

Foundational Dataset Class

In [None]:
#| export
from ids_finder.utils.basic import df2ts
from ids_finder.utils.plot import plot_candidate

In [None]:
# | export
class IDsDataset(BaseModel):
    """Container for a satellite's candidates and the underlying time series.

    Fields:
        sat_id: Satellite identifier.
        tau: Time interval associated with the candidates.
        ts: Time resolution of the data (defaults to one second).
        candidates: Eager frame of candidates, or None if not loaded.
        data: Lazy frame of the time series, or None if not loaded.
    """

    class Config:
        arbitrary_types_allowed = True  # polars frames are not pydantic types
        extra = "allow"

    sat_id: str
    tau: timedelta
    ts: timedelta = timedelta(seconds=1)

    candidates: pl.DataFrame | None = None
    data: pl.LazyFrame | None = None # data is large, so we use `pl.LazyFrame` to save memory

    def plot_candidate(self, index = None, predicates = None):
        """Plot a single candidate together with the data in its time window.

        Args:
            index: Row index of the candidate in `self.candidates`. Takes
                precedence over `predicates` when both are given.
            predicates: Polars filter expression; the first matching row is used.

        Raises:
            ValueError: If neither `index` nor `predicates` is provided.
        """
        if index is not None:
            candidate = self.candidates.row(index, named=True)
        elif predicates is not None:
            candidate = self.candidates.filter(predicates).row(0, named=True)
        else:
            # Previously fell through and failed later with UnboundLocalError.
            raise ValueError("Either `index` or `predicates` must be provided")

        # Restrict the data to the candidate's [tstart, tstop] interval.
        _data = self.data.filter(
            pl.col("time").is_between(candidate["tstart"], candidate["tstop"])
        )
        sat_fgm = df2ts(_data, [ 'B_x', 'B_y', 'B_z'] )
        # Inside the method body this name resolves to the module-level
        # `plot_candidate` function, not to this method.
        plot_candidate(candidate, sat_fgm)

    def plot_candidates(self, **kwargs):
        """Placeholder; not implemented yet."""
        pass

Extended Dataset Class with support for `kedro`

In [None]:
#| export
class cIDsDataset(IDsDataset):
    """`IDsDataset` that loads its candidates and data from a data catalog.

    On construction, `candidates` and `data` are loaded from `catalog` unless
    they were passed in explicitly.
    """

    catalog: DataCatalog
    
    or_df: pl.DataFrame | None = None  # occurrence rate
    or_df_normalized: pl.DataFrame | None = None # normalized occurrence rate

    def __init__(self, **data):
        super().__init__(**data)
        
        # Use `total_seconds()` rather than `.seconds`: the latter is only the
        # seconds component of the timedelta and drops whole days, so an
        # interval of one day or more would yield a wrong dataset name.
        self._tau_str = f"tau_{int(self.tau.total_seconds())}s"
        self._ts_mag_str = f"ts_{int(self.ts.total_seconds())}s"
        
        if self.candidates is None:
            self.load_candidates()
        if self.data is None:
            self.load_data()

    def load_candidates(self):
        """Load the candidates dataset from the catalog into `self.candidates`.

        NaNs are replaced by nulls, float columns are cast to Float64, and a
        `sat` column holding `sat_id` is added before collecting.
        """

        candidates_format = f"candidates.{self.sat_id}_{self._ts_mag_str}_{self._tau_str}"

        self.candidates = self.catalog.load(candidates_format).fill_nan(None).with_columns(
            cs.float().cast(pl.Float64),
            sat=pl.lit(self.sat_id),
        ).collect()

    def load_data(self):
        """Load the magnetic field dataset from the catalog into `self.data`."""
        data_format = f"{self.sat_id}.primary_mag_{self._ts_mag_str}"
        # The loaded object is passed through `concat_partitions`.
        self.data = concat_partitions(self.catalog.load(data_format))

## Candidate class

In [None]:
#| export
from pprint import pprint

In [None]:
#| export
class CandidateID:
    """A single candidate, looked up in a candidates frame by its time.

    Attributes set by the constructor:
        time: The candidate time as a `pd.Timestamp`.
        data: The matching row of the frame as a dict (column name -> value).
    """

    def __init__(self, time, df: "pl.DataFrame") -> None:
        self.time = pd.Timestamp(time)
        # `row(by_predicate=...)` selects the row whose `time` equals `self.time`.
        self.data = df.row(
            by_predicate=(pl.col("time") == self.time), 
            named=True
        )

    def __repr__(self) -> str:
        # Return the pretty-printed row instead of printing it and returning an
        # empty string, so `repr()` has no side effects and a meaningful value.
        from pprint import pformat

        return pformat(self.data)
    
    def plot(self, sat_fgm, tau):
        """Plot this candidate against the time series `sat_fgm`."""
        # NOTE(review): `plot_candidate_xr` is not imported in the visible part
        # of this file -- confirm where it is defined.
        plot_candidate_xr(self.data, sat_fgm, tau)
        

In [None]:
# Candidates for each mission.
sta_candidates_1s: pl.DataFrame = catalog.load('candidates.sta_1s')
jno_candidates_1s = catalog.load('candidates.jno_1s')

# Magnetic field data for each mission.
sta_mag : pl.LazyFrame = catalog.load('sta.inter_mag_rtn_1s')
# Previously loaded the `sta.*` dataset here as well (copy-paste), so the JNO
# candidates were plotted against STA data.
# NOTE(review): dataset name inferred from the `sta.*` naming -- confirm it
# exists in the catalog.
jno_mag : pl.LazyFrame = catalog.load('jno.inter_mag_rtn_1s')

In [None]:
from ids_finder.utils.basic import df2ts, pmap
from fastcore.utils import *

In [None]:

def plot_candidate(candidate, mag_data: pl.LazyFrame, b_cols = ['BX', 'BY', 'BZ']):
    """Plot a candidate with the magnetic field data surrounding it.

    The data window spans the candidate's [tstart, tstop] interval extended by
    its own length on each side.

    Note: this definition shadows the `plot_candidate` imported from
    `ids_finder.utils.plot` earlier in the notebook.

    Args:
        candidate: Mapping with at least the `tstart` and `tstop` entries.
        mag_data: Lazy frame with a `time` column and the columns in `b_cols`.
        b_cols: Names of the magnetic field component columns.
    """
    tstart, tstop = candidate["tstart"], candidate["tstop"]
    tau = tstop - tstart

    in_window = pl.col("time").is_between(tstart - tau, tstop + tau)
    time_as_ns = pl.col("time").dt.cast_time_unit("ns")
    window = mag_data.filter(in_window).with_columns(time_as_ns).collect()

    # NOTE(review): `plot_candidate_xr` is not imported in the visible part of
    # this file -- confirm where it is defined.
    plot_candidate_xr(candidate, df2ts(window, b_cols), tau)

In [None]:
n = 3
# list(sta_candidates_1s.sample(n).iter_rows(named=True) | pmap(plot_candidate, mag_data=sta_mag))

# Draw `n` random JNO candidates and plot each one against the JNO data.
candidates = jno_candidates_1s.sample(n)
candidate_rows = candidates.iter_rows(named=True)
list(candidate_rows | pmap(plot_candidate, mag_data=jno_mag))
