---
title: JUNO Magnetic field data pipeline
---

In [None]:
#| export
from datetime import timedelta
import polars as pl

from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

In [None]:
#| hide
#| default_exp pipelines/juno/mag
%load_ext autoreload
%autoreload 2

```txt
File Naming Convention                                                        
==============================================================================
Convention:                                                                   
   fgm_jno_LL_CCYYDDDxx_vVV.ext                                               
Where:                                                                        
   fgm - Fluxgate Magnetometer three character instrument abbreviation        
   jno - Juno                                                                 
    LL - CODMAC Data level, for example, l3 for level 3                       
    CC - The century portion of a date, 20                                    
    YY - The year of century portion of a date, 00-99                         
   DDD - The day of year, 001-366                                             
    xx - Coordinate system of data (se = Solar equatorial, ser = Solar        
         equatorial resampled, pc = Planetocentric, ss = Sun-State,           
         pl = Payload)                                                        
     v - separator to denote Version number                                   
    VV - version                                                              
   ext - file extension (sts = Standard Time Series (ASCII) file, lbl = Label 
         file)                                                                
Example:                                                                      
   fgm_jno_l3_2014055se_v00.sts    
```

## Downloading data 

In [None]:
#| export
import pooch
from pooch import Unzip
from ids_finder.utils.basic import load_lbl, concat_partitions
from pipe import select, filter

In [None]:
# | export
time_resolutions = ["1sec", "1min"]

def download_data(
    start=None,
    end=None,
    datatype: str = "1sec",  # time resolution
) -> list[str]:
    """
    Retrieve and unzip the JUNO cruise-phase FGM archive for one time resolution.

    `start` and `end` are accepted but not used: a single archive for the
    requested `datatype` is requested from the PDS download endpoint, stored
    under `../data/01_raw/` and unpacked into `jno_ss_se_{datatype}`.

    Returns the result of `pooch.retrieve` after the `Unzip` processor has run
    (annotated as a list of file paths).
    """
    archive_root = "https://pds-ppi.igpp.ucla.edu/ditdos/download?id=pds://PPI/JNO-SS-3-FGM-CAL-V1.0/DATA/CRUISE/SE"
    # The remote directory name is the upper-cased resolution
    archive_url = f"{archive_root}/{datatype.upper()}"
    unpacker = Unzip(extract_dir=f"jno_ss_se_{datatype}")

    return pooch.retrieve(
        url=archive_url,
        known_hash=None,
        path="../data/01_raw/",
        processor=unpacker,
    )


def load_data(
    start,
    end,
    datatype: str = "1sec",  # time resolution
) -> pl.DataFrame:
    """
    Download the archive and stack every `.lbl` product into a single frame.

    Each downloaded path ending in `.lbl` is read with `load_lbl`, converted
    with `pl.from_dataframe`, and the results are concatenated in file order.
    """
    frames = [
        pl.from_dataframe(load_lbl(path))
        for path in download_data(start, end, datatype)
        if path.endswith(".lbl")
    ]
    return pl.concat(frames)

## Preprocessing data

Convert all files from `lbl` format to `parquet` format for faster processing

In [None]:
#| export
from ids_finder.utils.basic import concat_partitions

In [None]:
# | export
def preprocess_data(raw_data: pl.DataFrame) -> pl.DataFrame:
    """
    Lightly clean the raw magnetometer table.

    - Parse the `SAMPLE UTC` strings into a `time` datetime column
    - Drop the `SAMPLE UTC`, `DECIMAL DAY` and `INSTRUMENT RANGE` columns
    - Sort the rows by `time`
    """
    # Fields: year, day of year, hour, minute, second, fractional second
    utc_format = "%Y %j %H %M %S %f"
    unused_columns = ["SAMPLE UTC", "DECIMAL DAY", "INSTRUMENT RANGE"]

    timestamp = pl.col("SAMPLE UTC").str.to_datetime(utc_format)

    # Build the whole query lazily and materialise it once at the end
    query = raw_data.lazy().with_columns(time=timestamp)
    query = query.drop(unused_columns).sort("time")
    return query.collect()

## Processing data

In [None]:
# | export
from ids_finder.utils.basic import partition_data_by_year

In [None]:
#| export
def process_data(
    raw_data: pl.DataFrame,
    ts: str = None,  # time resolution
    coord: str = None,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
    """
    Partition the data to keep memory usage manageable.

    Delegates to `partition_data_by_year`; `ts` and `coord` are accepted but
    not used by this function.
    """
    partitions = partition_data_by_year(raw_data)
    return partitions

## Pipeline

In [None]:
# | export
from ids_finder.core.pipeline import extract_features
from ids_finder.pipelines.default.data_mag import create_pipeline_template


def create_pipeline(sat_id="JNO", source="MAG"):
    """
    Build the JUNO magnetic-field pipeline from the shared pipeline template.

    The template is given this module's load / preprocess / process functions
    together with the common `extract_features` step.
    """
    stage_functions = dict(
        load_data_fn=load_data,
        preprocess_data_fn=preprocess_data,
        process_data_fn=process_data,
        extract_features_fn=extract_features,
    )
    return create_pipeline_template(sat_id=sat_id, source=source, **stage_functions)

In [None]:
| export
def create_pipeline(
    sat_id,
    ts: str = "1s",  # time resolution,
    tau: str = "60s",  # time window
    **kwargs,
) -> Pipeline:
    """
    Assemble the download -> preprocess -> process -> extract-features pipeline.

    The four nodes are wrapped in a modular pipeline namespaced by `sat_id`,
    with the start/end dates and extraction parameters mapped to JUNO-specific
    parameter names. `kwargs` is accepted but not used.
    """
    # NOTE(review): this reuses the name `create_pipeline`, which is already
    # defined earlier in this file with a different signature - confirm which
    # definition is meant to be kept.
    # NOTE(review): the download node writes `raw_mag_files_{ts}` while the
    # preprocess node reads `raw_mag_{ts}` - confirm both datasets exist in
    # the catalog.
    satellite = sat_id.upper()
    intermediate = f"inter_mag_{ts}"
    primary = f"primary_mag_rtn_{ts}"

    stages = [
        node(
            load_data,
            inputs={"start": "params:start_date", "end": "params:end_date"},
            outputs=f"raw_mag_files_{ts}",
            name=f"download_{satellite}_magnetic_field_data",
        ),
        node(
            preprocess_data,
            inputs={"raw_data": f"raw_mag_{ts}"},
            outputs=intermediate,
            name=f"preprocess_{satellite}_magnetic_field_data",
        ),
        node(
            process_data,
            inputs=intermediate,
            outputs=primary,
            name=f"process_{satellite}_magnetic_field_data",
        ),
        node(
            extract_features,
            inputs=[primary, "params:tau", "params:extract_params"],
            outputs=f"feature_tau_{tau}",
            # This node name uses `sat_id` as given (not upper-cased)
            name=f"extract_{sat_id}_features",
        ),
    ]

    # Map the generic parameter names used by the nodes to the JUNO entries
    parameter_map = {
        "params:tau": "params:tau",
        "params:extract_params": "params:jno_1s_params",
        "params:start_date": "params:jno_start_date",
        "params:end_date": "params:jno_end_date",
    }

    return pipeline(stages, namespace=sat_id, parameters=parameter_map)

## Dataset Overview

### Index

In [None]:
# Root URL of the PDS-PPI data tree
pds_dir = "https://pds-ppi.igpp.ucla.edu/data"

# Values that may appear in FGM product names (see the naming convention above):
# coordinate systems, file extensions and data rates.
possible_coords = ["se", "ser", "pc", "ss", "pl"]
possible_exts = ["sts", "lbl"]
possible_data_rates = ["1s", "1min", "1h"]

# Location of the index label inside each of the two FGM data sets
juno_ss_config = dict(
    DATA_SET_ID="JNO-SS-3-FGM-CAL-V1.0",
    FILE_SPECIFICATION_NAME="INDEX/INDEX.LBL",
)

juno_j_config = dict(
    DATA_SET_ID="JNO-J-3-FGM-CAL-V1.0",
    FILE_SPECIFICATION_NAME="INDEX/INDEX.LBL",
)

#### Process index

In [None]:
#| export
import pandas
import pdpipe as pdp

In [None]:
#| export
def process_jno_index(df: pandas.DataFrame):
    """
    Clean a raw JUNO FGM index table.

    - Remove spaces from the column labels
    - Drop `PRODUCT_ID`, `CR_DATE` and `PRODUCT_LABEL_MD5CHECKSUM`
    - Strip trailing whitespace from `SID` and `FILE_SPECIFICATION_NAME`
    - Parse `START_TIME` and `STOP_TIME` (year, day of year, `T`, time of day)
      into datetimes

    The input frame is left untouched; the cleaned frame is returned.
    """
    _index_time_format = "%Y-%jT%H:%M:%S.%f"

    # Relabel a new frame instead of assigning to `df.columns`, which would
    # rename the columns of the caller's DataFrame as a side effect.
    relabelled = df.rename(columns=lambda label: label.replace(" ", ""))

    jno_index_pipeline = pdp.PdPipeline(
        [
            pdp.ColDrop(["PRODUCT_ID", "CR_DATE", "PRODUCT_LABEL_MD5CHECKSUM"]),
            pdp.ApplyByCols("SID", str.rstrip),
            pdp.ApplyByCols("FILE_SPECIFICATION_NAME", str.rstrip),
            # Whole-column conversion: applying `pandas.to_datetime` through
            # `pdp.ApplyByCols` was noted to be slow.
            pdp.ColByFrameFunc(
                "START_TIME",
                lambda frame: pandas.to_datetime(frame["START_TIME"], format=_index_time_format),
            ),
            pdp.ColByFrameFunc(
                "STOP_TIME",
                lambda frame: pandas.to_datetime(frame["STOP_TIME"], format=_index_time_format),
            ),
        ]
    )

    return jno_index_pipeline(relabelled)


#### Pipeline

In [None]:
#| export
from kedro.pipeline import pipeline, node

In [None]:
#| export
def create_jno_index_pipeline():
    """
    Build the index pipeline: clean the JNO-SS and JNO-J raw indices with
    `process_jno_index`, then concatenate both into `JNO_index`.
    """
    clean_ss = node(process_jno_index, inputs="raw_JNO_SS_index", outputs="JNO_SS_index")
    clean_j = node(process_jno_index, inputs="raw_JNO_J_index", outputs="JNO_J_index")
    merge = node(
        lambda x1, x2: pandas.concat([x1, x2]),
        inputs=["JNO_SS_index", "JNO_J_index"],
        outputs="JNO_index",
    )
    return pipeline([clean_ss, clean_j, merge])

In [None]:
# Load the raw indices and the combined, processed index from the catalog
raw_JNO_SS_index = catalog.load("raw_JNO_SS_index")
raw_JNO_J_index = catalog.load("raw_JNO_J_index")
jno_index = catalog.load("JNO_index")

# Split the combined index back into its two data sets
jno_ss_index = jno_index[jno_index["DATA_SET_ID"] == "JNO-SS-3-FGM-CAL-V1.0"]
jno_j_index = jno_index[jno_index["DATA_SET_ID"] == "JNO-J-3-FGM-CAL-V1.0"]

#### Check the data

In [None]:
#| echo: false
# Print the first START_TIME date and last STOP_TIME date of each index.
# `starting_date` / `ending_date` are rebound on every pass, so after the
# loop they hold the JNO-J values.
for mission_label, mission_index in (("JNO-SS", jno_ss_index), ("JNO-J", jno_j_index)):
    starting_date = mission_index["START_TIME"].min().date()
    ending_date = mission_index["STOP_TIME"].max().date()
    print(f"{mission_label} Starting date: {starting_date}")
    print(f"{mission_label} Ending date: {ending_date}")

JNO-SS Starting date: 2011-08-25
JNO-SS Ending date: 2016-06-29
JNO-J Starting date: 2016-07-07
JNO-J Ending date: 2022-12-15


In [None]:
#| echo: false
# Look for calendar days not covered by any JNO-SS product.
# The day range is taken from the JNO-SS index itself: the shared
# `starting_date` / `ending_date` names hold the JNO-J bounds at this point,
# which would flag every day of the JNO-J period as missing.
ss_first_day = jno_ss_index["START_TIME"].min().date()
ss_last_day = jno_ss_index["STOP_TIME"].max().date()

available_dates = pandas.concat([jno_ss_index["START_TIME"].dt.date, jno_ss_index["STOP_TIME"].dt.date]).unique()
full_year_range = pandas.date_range(start=ss_first_day, end=ss_last_day)

missing_dates = full_year_range[~full_year_range.isin(available_dates)]

if missing_dates.empty:
    print("No days are missing.")
else:
    print("The following days are missing")
    print(coll_repr(missing_dates.map(lambda x: x.strftime("%Y-%m-%d"))))

The following days are missing
(#2353) ['2016-07-07','2016-07-08','2016-07-09','2016-07-10','2016-07-11','2016-07-12','2016-07-13','2016-07-14','2016-07-15','2016-07-16'...]
