---
title: IDs from ARTHEMIS
format:
  html:
    code-fold: true
output-file: artemis.html
---

## Background

ARTEMIS spacecrafts will be exposed in the solar wind at 1 AU during its orbits around the Moon. So it's very interesting to look into its data.

- For time inteval for THEMIS-B in solar wind, see [Link](https://omniweb.gsfc.nasa.gov/ftpbrowser/themis_b_sw.txt)
- For time inteval for THEMIS-C in solar wind, see [Link](https://omniweb.gsfc.nasa.gov/ftpbrowser/themis_c_sw.txt)


## Setup

Need to run command in shell first as `pipeline` is project-specific command

```{sh}
kedro pipeline create themis
```

To get candidates data, run `kedro run --from-inputs=jno.feature_1s --to-outputs=candidates.jno_1s`

In [1]:
#| hide
#| default_exp pipelines/themis/pipeline
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *


In [2]:
#| code-summary: import all the packages needed for the project
#| output: hide
#| export

from ids_finder.core import *
from fastcore.utils import *
from fastcore.test import *

import polars as pl
import pandas
import numpy as np
import xarray as xr


from datetime import timedelta
from loguru import logger


#### `Kerdo`

In [3]:
#| export
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

In [46]:
#| eval: false
from ids_finder.utils.basic import load_catalog

In [47]:
catalog = load_catalog()

jno_start_date = catalog.load('params:jno_start_date')
jno_end_date = catalog.load('params:jno_end_date')
trange = [jno_start_date, jno_end_date]

## Dataset Overview

In [None]:
artemis_probes = ["b", "c"]
probe = artemis_probes[0]

jno_start_date = "2011-08-25"
jno_end_date = "2016-06-30" 

trange = [jno_start_date, jno_end_date]
test_trange = ["2011-08-25", "2011-09-25"]

## Magnetic field data pipeline

- For convenience, we choose magnetic field data in **GSE** coordinate system
- The `fgs` data are in 3-4s resolution

### Downloading data 

In [None]:
# | export
def download_mag_data(
    start: str, end: str, probe: str = "b", datatype="fgs", coord="gse"
):
    import speasy as spz

    trange = [start, end]

    match probe:
        case "b":
            sat = "thb"

    product = f"cda/{sat.upper()}_L2_FGM/{sat}_{datatype}_{coord}"
    data = spz.get_data(product, trange, disable_proxy=True)

    return data

### Preprocessing data

In [None]:
#| export
def spz2parquet(raw_data):
    return pl.from_dataframe(raw_data.to_dataframe().reset_index()).rename({"index": "time"})


def preprocess_mag_data(
    raw_data,
    ts: str = None,  # time resolution
    coord: str = 'gse',
) -> pl.DataFrame:
    """
    Preprocess the raw dataset (only minor transformations)

    - Applying naming conventions for columns
    - Changing storing format to `parquet`
    """
    return spz2parquet(raw_data).rename({
        'Bx FGS-D': 'b_{coord}_x',
        'By FGS-D': 'b_{coord}_y',
        'Bz FGS-D': 'b_{coord}_z',
    })

### Processing data

In [None]:
# | export
from ids_finder.utils.basic import partition_data_by_year

In [None]:
#| export
def process_mag_data(
    raw_data: pl.DataFrame,
    ts: str = None,  # time resolution
) -> pl.DataFrame | Dict[str, pl.DataFrame]:
    """
    Partitioning data, for the sake of memory
    """
    return partition_data_by_year(raw_data)

### Pipeline

In [None]:
# | exports
def create_mag_data_pipeline(
    sat_id: str,  # satellite id, used for namespace
    ts: str = '4s',  # time resolution,
    tau: str = '60s',  # time window
    **kwargs,
) -> Pipeline:
    
    node_download_data = node(
        download_mag_data,
        inputs=dict(
            start="params:start_date",
            end="params:end_date",
        ),
        outputs=f"raw_mag",
        name=f"download_{sat_id.upper()}_magnetic_field_data",
    )

    node_preprocess_data = node(
        preprocess_mag_data,
        inputs=dict(
            raw_data=f"raw_mag",
        ),
        outputs=f"inter_mag_{ts}",
        name=f"preprocess_{sat_id.upper()}_magnetic_field_data",
    )

    node_process_data = node(
        process_mag_data,
        inputs=f"inter_mag_{ts}",
        outputs=f"primary_mag_{ts}",
        name=f"process_{sat_id.upper()}_magnetic_field_data",
    )

    node_extract_features = node(
        extract_features,
        inputs=[f"primary_mag_{ts}", "params:tau", "params:extract_params"],
        outputs=f"feature_tau_{tau}",
        name=f"extract_{sat_id}_features",
    )

    nodes = [
        node_download_data,
        node_preprocess_data,
        node_process_data,
        node_extract_features,
    ]

    pipelines = pipeline(
        nodes,
        namespace=sat_id,
        parameters={
            "params:start_date": "params:jno_start_date",
            "params:end_date": "params:jno_end_date",
            "params:tau": tau,
        },
    )

    return pipelines

## State data pipeline

We use low resolution [OMNI data](https://omniweb.gsfc.nasa.gov/ow.html) for plasma state data, see [details](https://spdf.gsfc.nasa.gov/pub/data/omni/low_res_omni/omni2.text).

- Data gaps were filled with dummy numbers for the missing hours or entire
  days to make all files of equal length.  The character '9' is used to
  fill all fields for missing data according to their format, e.g.
  ' 9999.9' for a field with the FORTRAN format F7.1. Note that format F7.1
  below really means (1X,F6.1),etc.


```
The flow OMNI "phi" angle is opposite GSE "phi" angle, threrfore, Flow-vector cartesian components in GSE coordinates may be derived from the given speed and angles as

Vx = - V * cos(theta) * cos(phi)
Vy = + V * cos(theta) * sin(phi)
Vz = + V * sin(theta)
and vise versa: two angles may be derived from the given speed and Vx,Vy,Vz comp. as  
          a_theta=vz/V
          theta=(180.*asin(a_theta))/!PI
         a_phi=Vy/(-Vx)
        phi=(180.*atan(a_phi))/!PI
```

```
   (*)   Quasi-GSE for the flow longitude angle means the angle increases from zero
         to positive values as the flow changes from being aligned along the -X(GSE)
         axis towards the +Y(GSE) axis.  The flow longitude angle is positive for 
         flow from west of the sun, towards +Y(GSE).
         The flow latitude angle is positive for flow from south of the sun, 
         towards +Z(GSE)
``````                  

### Downloading data

In [None]:
#| export
def download_state_data(
    start: str = None,
    end: str = None,
    ts: str = None,  # time resolution
):
    import pyspedas
    
    trange = [start, end]
    files = pyspedas.omni.data(trange=trange, datatype='hour', downloadonly=True)
    return files


### Preprocessing data

In [None]:
#| export
from ids_finder.utils.basic import cdf2pl, pmap

In [None]:
#| export
def preprocess_state_data(
    raw_data: List[str], # files
    vars: dict,
) -> pl.LazyFrame:
    """
    Preprocess the raw dataset (only minor transformations)

    - Applying naming conventions for columns
    - Extracting variables from `CDF` files, and convert them to DataFrame
    """
    
    columns_name_mapping = {key: value["COLNAME"] for key, value in vars.items()}
    df: pl.LazyFrame = pl.concat(raw_data | pmap(cdf2pl, var_names=list(vars)))

    return df.collect().rename(columns_name_mapping)

### Processing data

In [None]:
#| export
def flow2gse(df: pl.LazyFrame) -> pl.LazyFrame:
    """
    - Transforming data from `Quasi-GSE` coordinate to GSE coordinate system
    """
    sw_speed = pl.col("sw_speed")
    sw_theta = pl.col("sw_vel_theta")
    sw_phi = pl.col("sw_vel_phi")

    return df.with_columns(
        sw_vel_gse_x=-sw_speed * sw_theta.cos() * sw_phi.cos(),
        sw_vel_gse_y=+sw_speed * sw_theta.cos() * sw_phi.sin(),
        sw_vel_gse_z=+sw_speed * sw_theta.sin(),
    ).drop(["sw_theta", "sw_phi"])

def process_state_data(df: pl.LazyFrame) -> pl.LazyFrame:
    """
    - Transforming data to GSE coordinate system
    """

    return df.pipe(flow2gse)

### Pipelines

In [None]:
# | export
def create_state_data_pipeline(
    sat_id,
    ts: str = '1h',  # time resolution
    **kwargs
) -> Pipeline:
    
    node_download_data = node(
        download_state_data,
        inputs=dict(
            start="params:start_date",
            end="params:end_date",
        ),
        outputs=f"raw_state_files",
        name=f"download_{sat_id.upper()}_state_data",
    )

    node_preprocess_data = node(
        preprocess_state_data,
        inputs=dict(
            raw_data=f"raw_state_files",
            vars="params:omni_vars",
        ),
        outputs=f"inter_state_{ts}",
        name=f"preprocess_{sat_id.upper()}_state_data",
    )
    
    node_process_data = node(
        process_state_data,
        inputs=f"inter_state_{ts}",
        outputs=f"primary_state_{ts}",
        name=f"process_{sat_id.upper()}_state_data",
    )
    
    nodes = [
        node_download_data,
        node_preprocess_data,
        node_process_data,
    ]
    pipelines = pipeline(
        nodes,
        namespace=sat_id,
        parameters={
            "params:omni_vars": "params:omni_vars",
            "params:start_date": "params:jno_start_date",
            "params:end_date": "params:jno_end_date",
        },
    )

    return pipelines

## Processing the whole data

In [None]:
#| export
from ids_finder.candidates import create_candidate_pipeline

In [None]:
# | export
def create_pipeline(
    sat_id="thb",
    tau="60s",
    ts_state="1h",  # time resolution of state data
) -> Pipeline:
    return (
        create_mag_data_pipeline(sat_id, tau=tau)
        + create_state_data_pipeline(sat_id, ts=ts_state)
        + create_candidate_pipeline(sat_id, tau=tau, ts_state=ts_state)
    )

In [None]:
#| eval: false
catalog.load('thb.primary_state_1h').collect().describe()

describe,time,sw_density,sw_temperature,sw_speed,sw_vel_theta,sw_vel_phi,sw_vel_gse_x,sw_vel_gse_y,sw_vel_gse_z
str,str,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""43848""",43848.0,43848.0,43848.0,43848.0,43848.0,43848.0,43848.0,43848.0
"""null_count""","""0""",445.0,76.0,5.0,5.0,5.0,5.0,5.0,5.0
"""mean""",,6.110902,85933.265625,412.241852,-0.790534,-0.118984,-9.411859,-3.740302,-38.05085
"""std""",,4.929866,83964.515625,87.505905,2.413519,2.73037,210.468213,210.310608,295.848938
"""min""","""2011-07-01 00:…",0.1,3299.0,240.0,-14.9,-26.9,-784.168997,-724.554749,-773.737244
"""25%""",,3.1,35276.0,348.0,-2.2,-1.8,-154.582172,-148.638458,-299.854858
"""50%""",,4.8,63350.0,396.0,-0.8,-0.2,-6.642112,0.0,-72.535683
"""75%""",,7.5,107588.0,459.0,0.7,1.5,131.584089,137.39772,235.292801
"""max""","""2016-06-30 23:…",137.199997,1901074.0,878.0,21.0,24.6,753.76094,868.656555,770.671265


## Obsolete codes

### Check and preprocess the data

As we are only interested in the data when THEMIS is in the solar wind, for simplicity we will only keep the data when `X, SSE` and `X, GSE` is positive.

- State data time resolution is 1 minute...

- FGS data time resolution is 4 second...

In [None]:
def get_thm_state(sat):
    sat_pos_sse_files = f"../data/{sat}_pos_sse.parquet"
    sat_pos_sse = pl.scan_parquet(sat_pos_sse_files).set_sorted("time")
    sat_pos_gse_files = f"../data/{sat}_pos_gse.parquet"
    sat_pos_gse = pl.scan_parquet(sat_pos_gse_files).set_sorted("time")
    sat_state = sat_pos_sse.join(sat_pos_gse, on="time", how="inner")
    return sat_state

In [None]:
#| eval: false

def convert_thm_state_to_parquet(
    probe: str, trange
):
    file_name = f"./data/th{probe}_state.parquet"
    if os.path.exists(file_name):
        return file_name

    start = trange.start.to_string()
    end = trange.end.to_string()

    files = pyspedas.themis.state(
        probe=probe,
        trange=[start, end],
        downloadonly=True,
        no_update=True,
    )

    thm_pos_sse_Xs = []
    thm_pos_gse_Xs = []
    thm_state_times = []
    for file in files:
        thm_state = pycdfpp.load(file)
        epoch_dt64 = thm_state[
            f"time"
        ].values  #  CATDESC: "thm_state_time, UTC, in seconds since 01-Jan-1970 00:00:00"
        thm_pos_sse_Xs.append(thm_state[f"th{probe}_pos_sse"].values[:, 0])
        thm_pos_gse_Xs.append(thm_state[f"th{probe}_pos_gse"].values[:, 0])
        thm_state_times.append(epoch_dt64)

    thm_pos_sse_X = np.concatenate(thm_pos_sse_Xs)
    thm_pos_gse_X = np.concatenate(thm_pos_gse_Xs)
    thm_state_time = np.concatenate(thm_state_times)

    pl.DataFrame(
        {
            "thm_state_time": thm_state_time,
            "thm_pos_gse_X": thm_pos_gse_X,
            "thm_pos_sse_X": thm_pos_sse_X,
        }
    ).with_columns(
        pl.from_epoch(pl.col("thm_state_time"), time_unit="s")
    ).write_parquet(
        file_name
    )

    return file_name


def convert_thm_fgm_to_parquet(probe, trange):
    file_name = f"./data/th{probe}_fgm.parquet"
    if os.path.exists(file_name):
        return file_name

    start = trange.start.to_string()
    end = trange.end.to_string()
    
    files = pyspedas.themis.fgm(
        probe=probe,
        trange=[start, end],
        downloadonly=True,
        no_update=True,
    )

    thm_fgl_gses = []
    thm_fgl_btotals = []
    thm_fgl_times = []

    for file in files:
        cdf = pycdfpp.load(file)
        thm_fgl_gses.append(cdf[f"th{probe}_fgl_gse"].values)
        thm_fgl_btotals.append(cdf[f"th{probe}_fgl_btotal"].values)
        thm_fgl_times.append(cdf[f"th{probe}_fgl_time"].values)

    thm_fgl_gse = np.concatenate(thm_fgl_gses)
    thm_fgl_btotal = np.concatenate(thm_fgl_btotals)
    thm_fgl_time = np.concatenate(thm_fgl_times)

    pl.DataFrame(
        {
            "time": thm_fgl_time,
            "BX": thm_fgl_gse[:,0],
            "BY": thm_fgl_gse[:,1],
            "BZ": thm_fgl_gse[:,2],
            "B": thm_fgl_btotal,
        }
    ).with_columns(
        pl.from_epoch(pl.col("thm_fgl_time"), time_unit="s"),
    ).write_parquet(   
        file_name
    )
    
    return file_name

In [None]:
%%markdown
df = (
    sat_state_sw.upsample("time", every="1m")
    .group_by_dynamic("time", every="1d")
    .agg(pl.col("X, SSE").null_count().alias("null_count"))
    .with_columns(
        pl.when(pl.col("null_count") > 720).then(0).otherwise(1).alias("availablity")
    )
)

properties = {
    'width': 800,
}

chart1 = alt.Chart(df).mark_point().encode(
    x='time',
    y='null_count'
).properties(**properties)

chart2  = alt.Chart(df).mark_point().encode(
    x='time',
    y='availablity'
).properties(**properties)

(chart1 & chart2)

df = (
    sat_state_sw.upsample("time", every="1m")
    .group_by_dynamic("time", every="1d")
    .agg(pl.col("X, SSE").null_count().alias("null_count"))
    .with_columns(
        pl.when(pl.col("null_count") > 720).then(0).otherwise(1).alias("availablity")
    )
)

properties = {
    'width': 800,
}

chart1 = alt.Chart(df).mark_point().encode(
    x='time',
    y='null_count'
).properties(**properties)

chart2  = alt.Chart(df).mark_point().encode(
    x='time',
    y='availablity'
).properties(**properties)

(chart1 & chart2)
