---
title: IDs from Juno
---

See the following notebooks for details:

- [State data](./state.ipynb)

## Setup

In [21]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
from discontinuitypy.datasets import IDsDataset
import polars as pl
from fastcore.utils import walk

from loguru import logger
from tqdm.auto import tqdm

from datetime import timedelta

In [3]:
# Mission tag embedded in the output file name.
mission = "JNO"
# `ts` and `tau` are forwarded to IDsDataset as its `ts` / `tau` arguments in the cells below.
ts = timedelta(seconds=1)
tau = timedelta(seconds=60)


data_dir = '../../../data'
# Directory walked for magnetic-field parquet files.
# `ts.seconds` is the integer seconds field of the timedelta, and this string is
# evaluated once here, so it does not follow later re-assignments of `ts`.
dir_path = f'{data_dir}/03_primary/JNO_MAG_ts_{ts.seconds}s'
# Parquet file scanned into `plasma_data` in the next cell.
juno_state_path = f'{data_dir}/03_primary/JNO_STATE_ts_3600s.parquet'
# Column names passed to IDsDataset as `vec_cols`.
vec_cols = ['v_x', 'v_y', 'v_z']

# NOTE(review): `format` shadows the Python builtin of the same name; the name is
# kept because a later cell reuses it to build a second file name.
format = 'arrow'
fname = f'events.{mission}.ts_{ts.total_seconds():.2f}s_tau_{tau.seconds}s.{format}'
# Destination handed to `IDsDataset.export` in the "Standard Process" cell.
output_path = f'{data_dir}/05_reporting/{fname}'

In [4]:
# Lazily scan the Juno state parquet file and order it by the 'time' column.
plasma_data = pl.scan_parquet(juno_state_path).sort('time')
# Log the available column names for reference.
logger.info(plasma_data.columns)

[32m2024-02-18 19:32:07.752[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1m['radial_distance', 'plasma_density', 'plasma_temperature', 'time', 'model_b_r', 'model_b_t', 'model_b_n', 'v_x', 'v_y', 'v_z', 'plasma_speed', 'B_background_x', 'B_background_y', 'B_background_z'][0m


## Standard Process

In [None]:
# Resolve the input files once, in sorted order, so that the per-file loop and
# the combined scan below see exactly the same files and the concatenated
# events come out in a reproducible order.
standard_mag_paths = sorted(walk(dir_path))

juno_events = []
for mag_path in tqdm(standard_mag_paths, desc="Finding events"):
    # Drop the 'X', 'Y', 'Z' columns and order each file by time before searching.
    mag_data = pl.scan_parquet(mag_path).drop('X', 'Y', 'Z').sort('time')

    file_events = (
        IDsDataset(
            mag_data=mag_data,
            plasma_data=plasma_data,
            tau=tau,
            ts=ts,
            vec_cols=vec_cols,
            density_col="plasma_density",
            speed_col="plasma_speed",
            temperature_col="plasma_temperature",
        )
        .find_events(return_best_fit=False)
        .update_candidates_with_plasma_data()
        .events
    )

    juno_events.append(file_events)

# Combine the per-file events with a single scan over all input files, then export.
juno_ids_dataset = IDsDataset(
    events=pl.concat(juno_events),
    mag_data=pl.scan_parquet(standard_mag_paths).drop('X', 'Y', 'Z').sort('time'),
).export(output_path)

## Check the discontinuity in Juno cruise phase

### Full time resolution data

0.03 s - 0.125 s time resolution

In [5]:
from space_analysis.missions.juno.fgm import download_data
from discontinuitypy.utils.basic import resample
from toolz import curry, compose
from pipe import select
from fastcore.utils import mkdir
import os

In [6]:
def preprocess(
    fp,
    every = timedelta(seconds = 0.125),
    dir_path = "../../../data/02_intermediate/JNO_MAG_8hz",
    update = False
):
    """Resample one IPC file and cache the result under ``dir_path``.

    Args:
        fp: Path (``str`` or ``os.PathLike``) of the input IPC file. Only its
            base name is used to name the cached output.
        every: Interval forwarded to ``resample`` as its ``every`` argument.
        dir_path: Directory that holds the cached, resampled files. It is
            created on demand.
        update: When true, recompute the output even if it already exists.

    Returns:
        The output path, ``f"{dir_path}/{basename(fp)}"``.

    Note:
        The cache is keyed on the file name only. Changing ``every`` does not
        invalidate an existing file; pass ``update=True`` to rebuild it.
    """
    # basename handles PathLike inputs and platform separators, unlike a manual split on '/'.
    fname = os.path.basename(fp)
    output_path = f"{dir_path}/{fname}"

    if update or not os.path.exists(output_path):
        mkdir(dir_path, parents=True, exist_ok=True)
        df = pl.scan_ipc(fp).sort('time').pipe(resample, every=every)
        # Write to a temporary name and rename afterwards, so an interrupted
        # run cannot leave a truncated file that later passes the exists() check.
        tmp_path = f"{output_path}.tmp"
        df.collect().write_ipc(tmp_path)
        os.replace(tmp_path, output_path)
    return output_path

@curry
def process(fp, ids_dataset: IDsDataset, sparse_num = 10, **kwargs):
    """Find events in the IPC file(s) ``fp`` using a shared ``ids_dataset``.

    Args:
        fp: Source passed to ``pl.scan_ipc`` (the notebook passes a batch of paths).
        ids_dataset: Dataset object whose ``data`` attribute is replaced by the
            scanned frame before the search runs.
        sparse_num: Forwarded to ``find_events``.
        **kwargs: Extra keyword arguments forwarded to ``find_events``.

    Returns:
        The ``events`` attribute of the dataset after ``find_events`` and
        ``update_candidates_with_plasma_data``.

    Side effects:
        ``ids_dataset.data`` is overwritten, so after the call it refers to
        this batch only.
    """
    # Deduplicate first and sort afterwards: `unique` does not preserve row
    # order unless `maintain_order=True`, so sorting before it gives no
    # guarantee that the result is time-ordered.
    df = pl.scan_ipc(fp).unique('time').sort('time')

    ids_dataset.data = df

    return (
        ids_dataset
        .find_events(return_best_fit=False, sparse_num=sparse_num, **kwargs)
        .update_candidates_with_plasma_data()
        .events
    )

In [7]:
def split_list(l, n):
    """Yield consecutive chunks of at most ``n`` items from ``l``.

    Args:
        l: A sliceable sequence (list, tuple, ...) or any other iterable.
            Iterables without a length (``map`` objects, generators, iterators)
            are materialised into a list first.
        n: Maximum chunk size; must be a positive integer.

    Yields:
        Slices ``l[i:i + n]``. For sequences the chunks keep the sequence's
        own type; for lazy iterables they are lists.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Anything without a length cannot be sliced by index; materialise it.
    if not hasattr(l, "__len__"):
        l = list(l)

    for start in range(0, len(l), n):
        yield l[start:start + n]

In [8]:
# Run `preprocess` on every item yielded by `download_data(datatype="FULL")`
# and keep the output paths it returns.
mag_paths = list(download_data(datatype="FULL") | select(preprocess))

In [9]:
# Settings for the full-resolution run. These re-assign the notebook-level
# `ts` and `tau`, so every later cell that reads them sees these values.
ts = timedelta(seconds=0.125)
tau = timedelta(seconds=20)
# Forwarded to `find_events` through `process(..., method=method)` in a later cell.
method = "derivative"
# method = "fit"

# Note: the `.2f` format renders 0.125 as "0.12" in the file name (see the logged path).
fname = f'events.{mission}.{method}.ts_{ts.total_seconds():.2f}s_tau_{tau.seconds}s.{format}'
output_path = f'{data_dir}/05_reporting/{fname}'
logger.info(output_path)

[32m2024-02-18 19:32:19.893[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m../../../data/05_reporting/events.JNO.derivative.ts_0.12s_tau_20s.arrow[0m


In [10]:
# Names of the plasma columns that IDsDataset should read from `plasma_data`.
plasma_column_options = dict(
    vec_cols=vec_cols,
    density_col="plasma_density",
    speed_col="plasma_speed",
    temperature_col="plasma_temperature",
)

# Dataset shell without magnetic-field data; the data is attached later.
ids_ds = IDsDataset(plasma_data=plasma_data, tau=tau, ts=ts, **plasma_column_options)

Splitting the data files into reasonably sized batches may speed up the processing.

In [12]:
# Process the files in batches of 100 paths each.
fps = split_list(mag_paths, n=100)

func = process(ids_dataset = ids_ds, sparse_num = 10, method = method)

# Find the events first: every `process` call re-points `ids_ds.data` at the
# batch it is working on, so the full data set has to be attached afterwards.
# Otherwise the exported dataset would only reference the last batch.
ids_ds.events = pl.concat(fps | select(func))
ids_ds.data = pl.scan_ipc(mag_paths)
ids_ds.export(output_path)




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?





Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?





Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?




Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

IDsDataset(name=None, events=shape: (149_383, 96)
┌─────────────┬────────────┬─────┬──────────┬───┬────────────┬───────────┬────────────┬────────────┐
│ time        ┆ index_diff ┆ len ┆ std      ┆ … ┆ v.ion.chan ┆ B.change  ┆ v.Alfven.c ┆ v.Alfven.c │
│ ---         ┆ ---        ┆ --- ┆ ---      ┆   ┆ ge.l       ┆ ---       ┆ hange      ┆ hange.l    │
│ datetime[ns ┆ f64        ┆ u32 ┆ f64      ┆   ┆ ---        ┆ f64       ┆ ---        ┆ ---        │
│ ]           ┆            ┆     ┆          ┆   ┆ f64        ┆           ┆ f64        ┆ f64        │
╞═════════════╪════════════╪═════╪══════════╪═══╪════════════╪═══════════╪════════════╪════════════╡
│ 2011-08-25  ┆ 1.705243   ┆ 160 ┆ 2.443084 ┆ … ┆ NaN        ┆ 0.001359  ┆ NaN        ┆ NaN        │
│ 15:25:30    ┆            ┆     ┆          ┆   ┆            ┆           ┆            ┆            │
│ 2011-08-25  ┆ 0.725398   ┆ 160 ┆ 1.261811 ┆ … ┆ NaN        ┆ -0.013523 ┆ NaN        ┆ NaN        │
│ 15:27:20    ┆            ┆     ┆       

### First year

In [None]:
# The files under `dir_path` are the 1 s resolution data, but `ts` was
# re-assigned to 0.125 s in the full-resolution section. Reset it so the
# sampling interval handed to IDsDataset matches the data being loaded.
ts = timedelta(seconds=1)
tau = timedelta(seconds=60)

# First file in sorted order.
mag_path = sorted(walk(dir_path))[0]
mag_data = pl.scan_parquet(mag_path).drop('X', 'Y', 'Z').sort('time')

ids_ds = (
    IDsDataset(
        mag_data=mag_data,
        plasma_data=plasma_data,
        tau=tau,
        ts=ts,
        vec_cols=vec_cols,
        density_col="plasma_density",
        speed_col="plasma_speed",
        temperature_col="plasma_temperature",
    )
    .find_events(return_best_fit=True)
    .update_candidates_with_plasma_data()
)

In [None]:
#| layout-ncol: 3
#| column: page
# Restrict the plotted candidates with a predicate on the 'fit.stat.rsquared' column.
rsquared_above_threshold = pl.col('fit.stat.rsquared') > 0.95
ids_ds.plot_candidates(
    num=20,
    plot_fit_data=True,
    predicates=rsquared_above_threshold,
)

### Last year

In [None]:
# The files under `dir_path` are the 1 s resolution data, but `ts` was
# re-assigned to 0.125 s in the full-resolution section. Reset it so the
# sampling interval handed to IDsDataset matches the data being loaded.
ts = timedelta(seconds=1)
tau = timedelta(seconds=300)

# Last file in sorted order.
mag_path = sorted(walk(dir_path))[-1]
mag_data = pl.scan_parquet(mag_path).drop('X', 'Y', 'Z').sort('time')

ids_ds = (
    IDsDataset(
        mag_data=mag_data,
        plasma_data=plasma_data,
        tau=tau,
        ts=ts,
        vec_cols=vec_cols,
        density_col="plasma_density",
        speed_col="plasma_speed",
        temperature_col="plasma_temperature",
    )
    .find_events(return_best_fit=True)
    .update_candidates_with_plasma_data()
)

In [None]:
#| layout-ncol: 3
#| column: page
# Restrict the plotted candidates with a predicate on the 'fit.stat.rsquared' column.
rsquared_above_threshold = pl.col('fit.stat.rsquared') > 0.95
ids_ds.plot_candidates(
    num=20,
    plot_fit_data=True,
    predicates=rsquared_above_threshold,
)


## Processing the whole data

## Obsolete

### Estimate

1 day of data resampled by 1 sec is about 12 MB.

So 1 year of data is about 4 GB, and 6 years of JUNO Cruise data is about 24 GB.

At a download rate of about 250 KB/s, 24 GB takes about 27 hours (a little over one day) to download.

In [None]:
# Back-of-the-envelope estimate. Sizes are in KB and rates in KB/s or files/s,
# so dividing by 1e6 gives GB and dividing by 60 twice gives hours.
num_of_files = 6*365              # one file per day for six years

jno_file_size = 12e3              # KB per daily file
thm_file_size = 40e3              # KB per daily file
files_size = jno_file_size + thm_file_size

downloading_rate = 250            # KB per second
processing_rate = 1/60            # files per second (one file per minute)

total_size = num_of_files * files_size

time_to_download = total_size / downloading_rate / 60 / 60
space_required = total_size / 1e6
time_to_process = num_of_files / processing_rate / 60 / 60

print(f"Time to download: {time_to_download:.2f} hours")
print(f"Disk space required: {space_required:.2f} GB")
print(f"Time to process: {time_to_process:.2f} hours")