---
title: THEMIS State data pipeline
---

We use low resolution [OMNI data](https://omniweb.gsfc.nasa.gov/ow.html) for plasma state data, see [details](https://spdf.gsfc.nasa.gov/pub/data/omni/low_res_omni/omni2.text).



- Data gaps were filled with dummy numbers for the missing hours or entire
  days to make all files of equal length.  The character '9' is used to
  fill all fields for missing data according to their format, e.g.
  ' 9999.9' for a field with the FORTRAN format F7.1. Note that format F7.1
  below really means (1X,F6.1),etc.

```
The OMNI flow "phi" angle is opposite to the GSE "phi" angle; therefore, the flow-vector Cartesian components in GSE coordinates may be derived from the given speed and angles as

Vx = - V * cos(theta) * cos(phi)
Vy = + V * cos(theta) * sin(phi)
Vz = + V * sin(theta)
and vice versa: the two angles may be derived from the given speed and the Vx, Vy, Vz components as
          a_theta=vz/V
          theta=(180.*asin(a_theta))/!PI
         a_phi=Vy/(-Vx)
        phi=(180.*atan(a_phi))/!PI
```

```
   (*)   Quasi-GSE for the flow longitude angle means the angle increases from zero
         to positive values as the flow changes from being aligned along the -X(GSE)
         axis towards the +Y(GSE) axis.  The flow longitude angle is positive for 
         flow from west of the sun, towards +Y(GSE).
         The flow latitude angle is positive for flow from south of the sun, 
         towards +Z(GSE)
```

In [None]:
#| export
import polars as pl
import pandas

from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

In [None]:
#| hide
#| default_exp pipelines/themis/state
%load_ext autoreload
%autoreload 2

## Downloading data

In [None]:
# | export
from ids_finder.utils.basic import cdf2pl, pmap

In [None]:
# | export


def download_state_data(
    start: str = None,
    end: str = None,
    ts: str = None,  # time resolution
    probe: str = None,
    coord: str = None,
):
    """
    Fetch the hourly OMNI files for the period from `start` to `end`.

    Only `start` and `end` are used; `ts`, `probe` and `coord` are accepted
    but ignored. The result of `pyspedas.omni.data(..., downloadonly=True)`
    is returned unchanged (presumably the downloaded file paths - confirm
    against the pyspedas documentation).
    """
    # Imported here so that `pyspedas` is only needed when a download is requested.
    import pyspedas

    time_range = [start, end]
    return pyspedas.omni.data(
        trange=time_range,
        datatype="hour",
        downloadonly=True,
    )


def load_data(
    start: str = None,
    end: str = None,
    ts: str = None,  # time resolution
    vars: dict = None,
):
    """
    Download the OMNI files for the requested period and read them into one frame.

    - `start`, `end`: bounds of the time range passed to `download_state_data`
    - `ts`: time resolution, forwarded to `download_state_data`
    - `vars`: mapping whose keys are the variable names extracted from each file

    Returns the concatenation of the per-file frames produced by `cdf2pl`.
    """
    # `vars` selects the variables to read; it is deliberately not passed to
    # `download_state_data`, whose fourth positional parameter is `probe`.
    files = download_state_data(start, end, ts)

    var_names = list(vars)
    df: pl.LazyFrame = pl.concat(files | pmap(cdf2pl, var_names=var_names))
    return df

## Preprocessing data

In [None]:
# | export
def preprocess_data(
    raw_data: pl.LazyFrame,
    vars: dict,
) -> pl.LazyFrame:
    """
    Rename the raw columns according to the project's naming convention.

    Every key of `vars` is treated as a column of `raw_data`; its new name is
    the `COLNAME` entry of the value stored under that key. No other
    transformation is applied.
    """
    renames = {raw_name: vars[raw_name]["COLNAME"] for raw_name in vars}
    return raw_data.rename(renames)

We also have an additional data file that indicates whether `THEMIS` is in the solar wind or not.

In [None]:
# | export
def preprocess_sw_state_data(
    raw_data: pandas.DataFrame,
) -> pl.LazyFrame:
    """
    Convert the solar-wind interval table to polars and parse its time bounds.

    The `start` and `end` columns are replaced by datetimes parsed with the
    format `%Y %j %H %M`.
    """

    def parse_bound(column: str) -> pl.Expr:
        # Note: polars requires hour and minute to be specified together (or
        # neither), so a " 00" minute field is appended before parsing.
        padded = pl.concat_str(pl.col(column), pl.lit(" 00"))
        return padded.str.to_datetime(format="%Y %j %H %M")

    table = pl.from_dataframe(raw_data)
    return table.with_columns(parse_bound("start"), parse_bound("end"))

## Processing data

In [None]:
# | export
def flow2gse(df: pl.LazyFrame) -> pl.LazyFrame:
    """
    - Transforming solar wind data from `Quasi-GSE` coordinate to GSE coordinate system

    Reads the columns `plasma_speed`, `sw_vel_theta` (flow latitude) and
    `sw_vel_phi` (flow longitude), adds `sw_vel_gse_x`, `sw_vel_gse_y` and
    `sw_vel_gse_z`, and drops the two angle columns.

    The angles are taken to be in degrees, as in the OMNI data set, and are
    converted to radians before the trigonometric functions are applied:

        Vx = - V * cos(theta) * cos(phi)
        Vy = + V * cos(theta) * sin(phi)
        Vz = + V * sin(theta)
    """
    import math

    deg2rad = math.pi / 180.0

    plasma_speed = pl.col("plasma_speed")
    sw_theta = pl.col("sw_vel_theta") * deg2rad
    sw_phi = pl.col("sw_vel_phi") * deg2rad
    cos_theta = sw_theta.cos()

    return df.with_columns(
        sw_vel_gse_x=-plasma_speed * cos_theta * sw_phi.cos(),
        sw_vel_gse_y=plasma_speed * cos_theta * sw_phi.sin(),
        sw_vel_gse_z=plasma_speed * sw_theta.sin(),
        # Drop the angle columns under the names they were read with.
    ).drop(["sw_vel_theta", "sw_vel_phi"])


def filter_tranges(time: pl.Series, tranges: tuple[list, list]):
    """
    - Filter data by time ranges, return the indices of the time that are in the time ranges

    - `time`: sorted series of timestamps (required by `search_sorted`)
    - `tranges`: pair `(starts, ends)` holding the range bounds, matched by position

    Returns a numpy array with the positions `i` for which
    `start <= time[i] <= end` holds for some range. Positions are emitted
    range by range, so overlapping ranges yield repeated positions. An empty
    array is returned when there are no ranges.
    """
    # numpy is imported locally because the module does not import it at the top.
    import numpy as np

    starts, ends = tranges[0], tranges[1]

    # First position whose time is >= start ...
    first_indices = time.search_sorted(starts, side="left")
    # ... and first position whose time is > end, so the half-open span
    # [first, stop) contains exactly the samples inside the closed range.
    stop_indices = time.search_sorted(ends, side="right")

    spans = [
        np.arange(first_index, stop_index)
        for first_index, stop_index in zip(first_indices, stop_indices)
    ]
    if not spans:
        # np.concatenate rejects an empty list of arrays.
        return np.arange(0)
    return np.concatenate(spans)


def add_state(l_df: pl.LazyFrame, l_state: pl.LazyFrame):
    """
    Add a `state` column marking the rows of `l_df` selected by the intervals in `l_state`.

    Both frames are collected. The `time` column of the data and the `start`
    and `end` columns of the intervals are passed to `filter_tranges`; rows
    whose position is among the returned indices get `state` = 1, all other
    rows get 0. The collected (eager) frame is returned.
    """
    intervals = l_state.collect()
    data = l_df.collect()

    bounds = (intervals.get_column("start"), intervals.get_column("end"))
    selected = filter_tranges(data.get_column("time"), bounds)

    # A temporary row number makes the positional indices usable in an expression.
    in_interval = pl.col("row_nr").is_in(selected)
    flagged = data.with_row_count().with_columns(
        state=pl.when(in_interval).then(1).otherwise(0)
    )
    return flagged.drop("row_nr")


def process_data(df: pl.LazyFrame, state: pl.LazyFrame = None) -> pl.LazyFrame:
    """
    - Transforming data to GSE coordinate system
    - Combine state data with additional plasma state data

    - `df`: plasma data, passed through `flow2gse`
    - `state`: optional interval table; when given, it is passed to
      `add_state` to add the `state` column. When it is `None`, that step is
      skipped and no `state` column is added.

    The velocity columns `sw_vel_gse_x/y/z` are renamed to `v_x`, `v_y`, `v_z`.
    """
    converted = df.pipe(flow2gse)

    # `add_state` collects the state frame, so it cannot be called with None.
    if state is not None:
        converted = converted.pipe(add_state, state)

    return converted.rename(
        {
            "sw_vel_gse_x": "v_x",
            "sw_vel_gse_y": "v_y",
            "sw_vel_gse_z": "v_z",
        }
    )

## Pipelines

In [None]:
# | export
def create_pipeline(
    sat_id,
    ts: str = "1h",  # time resolution
) -> Pipeline:
    """
    Build the state-data pipeline for one satellite.

    Four nodes are chained: load the raw state data, preprocess it,
    preprocess the solar-wind state table, and combine both in
    `process_data`. `sat_id` is used as the pipeline namespace and, in upper
    case, in the node names; `ts` is embedded in the names of the
    intermediate and primary datasets.
    """
    label = sat_id.upper()
    omni_vars = "params:omni_vars"
    inter_state = f"inter_state_{ts}"
    inter_state_sw = "inter_state_sw"

    nodes = [
        node(
            load_data,
            inputs={
                "start": "params:start_date",
                "end": "params:end_date",
                "vars": omni_vars,
            },
            outputs="raw_state",
            name=f"load_{label}_state_data",
        ),
        node(
            preprocess_data,
            inputs={"raw_data": "raw_state", "vars": omni_vars},
            outputs=inter_state,
            name=f"preprocess_{label}_state_data",
        ),
        node(
            preprocess_sw_state_data,
            inputs="raw_state_sw",
            outputs=inter_state_sw,
            name=f"preprocess_{label}_solar_wind_state_data",
        ),
        node(
            process_data,
            inputs=[inter_state, inter_state_sw],
            outputs=f"primary_state_{ts}",
            name=f"process_{label}_state_data",
        ),
    ]

    # NOTE(review): the date parameters are mapped to the `jno_*` keys -
    # confirm that this is the intended source for this pipeline.
    parameter_mapping = {
        "params:omni_vars": "params:omni_vars",
        "params:start_date": "params:jno_start_date",
        "params:end_date": "params:jno_end_date",
    }

    return pipeline(
        nodes,
        namespace=sat_id,
        parameters=parameter_mapping,
    )

In [None]:
# | eval: false
# catalog.load("thb.primary_state_1h").collect().describe()
# catalog.load('thb.feature_tau_60s').collect()