# Loading all datasets from different sources

In [1]:
import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np

from loguru import logger

::: {#cell-1 .cell .hide execution_count=2}

``` python
%load_ext autoreload
%autoreload 2
```

:::

### Loading all datasets from different sources

In [3]:
from ids_finder.utils.basic import load_catalog

# Build the project data catalog once. Later cells hand it to every
# `cIDsDataset` and read named entries from it with `catalog.load(...)`.
catalog = load_catalog()

In [5]:
from ids_finder.datasets import cIDsDataset

# One dataset per `sat_id`, all created with the same `tau`/`ts` settings and
# the shared catalog. The generator is consumed in order, so the datasets are
# still constructed as STA, JNO, THB.
sta_dataset, jno_dataset, thb_dataset = (
    cIDsDataset(sat_id=sat_id, tau=60, ts=1, catalog=catalog)
    for sat_id in ("STA", "JNO", "THB")
)

In [6]:
from beforerr.basics import pmap
from ids_finder.utils.analysis import filter_tranges_ds


In [7]:
# Catalog entry annotated as a lazy frame; only its `start` and `end` columns
# are materialised.
thb_inter_state_sw: pl.LazyFrame = catalog.load('thb.inter_state_sw')

# `get_columns()` yields the collected columns as Series, in selection order.
start, end = (
    thb_inter_state_sw.select(['start', 'end']).collect().get_columns()
)

# Presumably restricts the THB dataset to the [start, end] time ranges
# (behaviour of `filter_tranges_ds` is defined in ids_finder -- TODO confirm).
thb_sw_dataset = filter_tranges_ds(thb_dataset, (start, end))

In [8]:
# Datasets combined in the following cells. Note that THB contributes its
# filtered variant (`thb_sw_dataset`), not the full `thb_dataset`.
all_datasets = [sta_dataset, jno_dataset, thb_sw_dataset]

In [None]:
# Level-0 table: the `candidates` of every dataset stacked into one frame.
# `how="diagonal"` lets the per-dataset tables differ in their columns.
all_candidates_l0: pl.DataFrame = pl.concat(
    all_datasets | pmap(lambda dataset: dataset.candidates), how="diagonal"
)

In [None]:
def combine_candidates(datasets):
    """Stack the `candidates` tables of `datasets` into a single frame.

    The `.candidates` attribute of each element is gathered through `pmap`
    and the results are joined with `pl.concat(..., how="diagonal")`.
    """
    candidate_frames = datasets | pmap(lambda dataset: dataset.candidates)
    return pl.concat(candidate_frames, how="diagonal")


## Processing datasets

Some extreme values are present in the data. We will remove them.

In [None]:

# Columns summarised by `check_candidates`.
NVARS = ['d_star', 'L_mn', 'L_mn_norm', 'j0', 'j0_norm', 'duration', 'v_mn']
# The same columns, preceded by the two identifying columns.
DISPLAY_VARS = ['time', 'sat', *NVARS]


def check_candidates(df):
    """Return the `describe()` summary of the NVARS columns of `df`."""
    selected = df[NVARS]
    return selected.describe()

# Last expression of the cell, so the notebook displays the summary of the raw
# (level-0) candidates.
check_candidates(all_candidates_l0)

In [None]:
from datetime import timedelta


def process_candidates_l1(raw_df: pl.DataFrame):
    """Clean the raw candidates and add derived columns.

    Rows are kept only when `d_star < 100`, `v_mn > 10` and `duration` is
    shorter than 60 seconds. Missing `radial_distance` values are then filled
    with 1 (AU), and three columns are added: `r_bin` (rounded
    `radial_distance`) and the base-10 logarithms of `j0_norm` and
    `L_mn_norm`.

    The number of surviving rows and their share of the input are logged. An
    empty `raw_df` is reported with a `nan` ratio instead of raising
    `ZeroDivisionError`.

    Args:
        raw_df: Candidate table with at least the columns `d_star`, `v_mn`,
            `duration`, `radial_distance`, `j0_norm` and `L_mn_norm`.

    Returns:
        The filtered frame with the additional columns.
    """
    df = raw_df.filter(
        pl.col("d_star") < 100,  # exclude JUNO extreme values
        pl.col('v_mn') > 10,
        pl.col('duration') < timedelta(seconds=60),
        # pl.col("j0") < 100
    ).with_columns(
        pl.col('radial_distance').fill_null(1)  # by default, fill with 1 AU
    ).with_columns(
        r_bin=pl.col('radial_distance').round(),
        j0_norm_log=pl.col('j0_norm').log10(),
        L_mn_norm_log=pl.col('L_mn_norm').log10(),
    )

    n_raw = len(raw_df)
    n_kept = len(df)
    # Guard the ratio: an empty input would otherwise divide by zero here.
    ratio = n_kept / n_raw if n_raw else float("nan")

    logger.info(
        f"candidates_l1: {n_kept}, with effective ratio: {ratio:.2%}"
    )

    return df

# Apply the level-1 cleaning to the combined raw candidates.
all_candidates_l1 = process_candidates_l1(all_candidates_l0)
# Pass the cleaned frame to the R session through the `%R` magic. `conv_pl` is
# not defined in this part of the notebook -- presumably a polars-to-R
# converter set up elsewhere (TODO confirm).
%R -i all_candidates_l1 -c conv_pl
# Last expression of the cell: display the summary after cleaning.
check_candidates(all_candidates_l1)


2023-11-08 14:11:23.225 | INFO     | __main__:process_candidates_l1:18 - candidates_l1: 180718, with effective ratio: 97.65%

In [None]:
# Subset of the cleaned candidates whose `sat` column equals 'JNO'.
jno_candidates_l1 = all_candidates_l1.filter(pl.col('sat') == 'JNO')
# Pass the subset to the R session; `conv_pl` is defined outside this part of
# the notebook (presumably a polars-to-R converter -- TODO confirm).
%R -i jno_candidates_l1 -c conv_pl

In [None]:
from ids_finder.utils.analysis import filter_before_jupiter
from ids_finder.utils.analysis import link_coord2dim

In [None]:
def process_candidates_l2(raw_df: pl.DataFrame, avg_window="30d"):
    """Average the candidates per `sat` over windows of length `avg_window`.

    Args:
        raw_df: Candidate table with a `time` column and a `sat` column.
        avg_window: Window length passed as `every=` to both
            `group_by_dynamic` and `upsample`.

    Returns:
        One row per (`sat`, window) holding the mean of every numeric and
        duration column plus `id_count`, the number of rows in that window.
        Windows with 50 rows or fewer are dropped before upsampling.
    """
    time_col = "time"

    # Window means per satellite, together with the row count of each window.
    windowed = (
        raw_df.sort(time_col)
        .group_by_dynamic(time_col, every=avg_window, by="sat")
        .agg(cs.numeric().mean(), cs.duration().mean(), id_count=pl.count())
    )

    # Keep only windows with more than 50 rows; the original note says this
    # filters out JUNO windows with extremely large thickness.
    well_sampled = windowed.filter(pl.col("id_count") > 50).sort(time_col)

    # Put each satellite back on a regular `avg_window` grid, then forward-fill
    # `sat` wherever it is null afterwards.
    regular = well_sampled.upsample(
        time_col, every=avg_window, by="sat", maintain_order=True
    )
    return regular.with_columns(pl.col("sat").forward_fill())

In [None]:
# Level-2 table, built inside-out: drop what `filter_before_jupiter` removes,
# average per window, then apply `link_coord2dim`.
all_candidates_l2: pl.DataFrame = link_coord2dim(
    process_candidates_l2(
        filter_before_jupiter(all_candidates_l1)
    )
)

In [None]:
# Summary statistics of the NVARS columns of the level-2 (window-averaged)
# table; the `describe()` result is displayed as the cell output.
inspect_df = all_candidates_l2[NVARS]
inspect_df.describe()

In [None]:
from ids_finder.utils.analysis import n2_normalize

# Normalise the NVARS columns of the level-2 table with the project helper
# `n2_normalize`; the exact normalisation is defined in ids_finder and is not
# visible here (TODO confirm what "n2" refers to).
all_candidates_l2_n2 = n2_normalize(all_candidates_l2, NVARS)