---
title: JUNO Magnetic field data pipeline
---

JUNO magnetic field data can be downloaded from the [PDS](https://pds-ppi.igpp.ucla.edu/mission/JUNO/JNO/FGM) website.

In [3]:
# | export
from datetime import datetime

import polars as pl
import pandas as pd

from ids_finder import ROOT_DIR
from ids_finder.utils.pds import pds_download
from ids_finder.utils.lbl import load_lbl
from ids_finder.utils.basic import partition_data_by_year
from ids_finder.utils.polars import create_partitions
from ids_finder.pipelines.default.data_mag import create_pipeline_template


from pipe import select, filter

from typing import Literal

In [4]:
# | hide
# | default_exp pipelines/juno/mag
%load_ext autoreload
%autoreload 2

```txt
File Naming Convention                                                        
==============================================================================
Convention:                                                                   
   fgm_jno_LL_CCYYDDDxx_vVV.ext                                               
Where:                                                                        
   fgm - Fluxgate Magnetometer three character instrument abbreviation        
   jno - Juno                                                                 
    LL - CODMAC Data level, for example, l3 for level 3                       
    CC - The century portion of a date, 20                                    
    YY - The year of century portion of a date, 00-99                         
   DDD - The day of year, 001-366                                             
    xx - Coordinate system of data (se = Solar equatorial, ser = Solar        
         equatorial resampled, pc = Planetocentric, ss = Sun-State,           
         pl = Payload)                                                        
     v - separator to denote Version number                                   
    VV - version                                                              
   ext - file extension (sts = Standard Time Series (ASCII) file, lbl = Label 
         file)                                                                
Example:                                                                      
   fgm_jno_l3_2014055se_v00.sts    
```

## Downloading data 

In [5]:
#| code-summary: type definitions
#| export
# Mission phase; `download_data` forwards it to `pds_download` as `dataset`.
JunoPhases = Literal["CRUISE", "JUPITER"]
# Coordinate system of the FGM product
# (SE = solar equatorial, SS = Sun-State, PL = payload).
JunoFGMCoords = Literal["SE", "SS", "PL"]
# Time resolution of the FGM product.
JunoFGMTimeResolutions = Literal["1SEC", "1MIN"]

In [6]:
# | export
def download_data(
    start=None,
    end=None,
    phase: JunoPhases = "CRUISE",
    coord: JunoFGMCoords = "SE",
    datatype: JunoFGMTimeResolutions = "1SEC",  # time resolution
) -> list[str]:
    """
    Request Juno FGM files through `pds_download` and return the label files.

    `phase` is forwarded as `dataset`, while `coord` and `datatype` are passed
    through unchanged; the download is called with
    `path=ROOT_DIR / "data/01_raw/"`.

    NOTE(review): `start` and `end` are accepted but never used in this
    function, so no date filtering happens here.

    Returns the entries yielded by `pds_download` whose name ends with `.lbl`.
    """
    raw_dir = ROOT_DIR / "data/01_raw/"
    downloaded = pds_download(
        mission="Juno",
        instrument="FGM",
        dataset=phase,
        coord=coord,
        datatype=datatype,
        path=raw_dir,
    )
    # Only the label files are kept; every other entry is discarded.
    return [fp for fp in downloaded if fp.endswith(".lbl")]

In [7]:
# | export
def _load_func(file):
    """
    Load one label file into a lazy polars frame sorted by time.

    The file is read with `load_lbl`, converted to polars, and its
    `SAMPLE UTC` column is parsed (format `%Y %j %H %M %S %f`) into a new
    `time` column. The raw timestamp column and the other columns listed
    below are dropped before sorting by `time`.
    """
    dropped_columns = ["SAMPLE UTC", "DECIMAL DAY", "INSTRUMENT RANGE", "X", "Y", "Z"]
    parsed_time = pl.col("SAMPLE UTC").str.to_datetime("%Y %j %H %M %S %f")

    frame = pl.from_dataframe(load_lbl(file)).lazy()
    frame = frame.with_columns(time=parsed_time)
    frame = frame.drop(dropped_columns)
    return frame.sort("time")

def parse_fp(fp):
    """
    Extract the date encoded in an FGM file name.

    File names follow `fgm_jno_LL_CCYYDDDxx_vVV.ext`: the first seven
    characters of the fourth underscore-separated field are the four-digit
    year followed by the day of year, e.g. `fgm_jno_l3_2014055se_v00.sts`
    maps to 2014-02-24.

    `fp` may be a string or any path-like object; only its final path
    component is inspected.

    Returns a naive `datetime` at midnight of that day.
    Raises `IndexError` when the name has fewer than four `_`-separated
    fields and `ValueError` when the date field cannot be parsed.
    """
    from pathlib import PurePath

    # PurePath copes with path-like inputs and the platform's separators,
    # which a manual split on "/" does not.
    file_name = PurePath(fp).name
    date_field = file_name.split("_")[3][:7]
    return datetime.strptime(date_field, "%Y%j")


def load_data(
    start,
    end,
    datatype: str = "1SEC",  # time resolution
) -> pl.DataFrame:
    """
    Load the FGM label files whose file-name date lies in `[start, end)`.

    `start` and `end` are converted with `pd.Timestamp`. The candidate files
    come from `download_data`, each file's date is read from its name with
    `parse_fp`, and the selected files are loaded with `_load_func` and
    concatenated.

    NOTE(review): `_load_func` builds lazy frames, so the concatenated result
    appears to be a `pl.LazyFrame` despite the annotation; confirm.
    """
    window_start, window_end = pd.Timestamp(start), pd.Timestamp(end)
    candidates = download_data(window_start, window_end, datatype=datatype)

    # Map each file to the date encoded in its name.
    file_dates = dict(zip(candidates, map(parse_fp, candidates)))
    selected = [
        fp for fp, day in file_dates.items() if window_start <= day < window_end
    ]
    return pl.concat(map(_load_func, selected))

## Preprocessing data

Convert all files from `lbl` format to `parquet` format for faster processing.

In [8]:
# | export
def preprocess_data(raw_data: pl.DataFrame) -> pl.DataFrame:
    """
    Preprocess the raw dataset by handing it to `create_partitions` together
    with `_load_func` as the loader.

    Intended (minor) transformations of this step:

    - Applying naming conventions for columns
    - Parsing and typing data
    - Changing storing format (from `lbl` to `parquet`)
    - Dropping useless columns
    """
    partitions = create_partitions(raw_data, _load_func)
    return partitions

## Processing data

In [9]:
# | export
def process_data(
    raw_data: pl.DataFrame,
    ts: str = None,  # time resolution
) -> pl.DataFrame | dict[str, pl.DataFrame]:
    """
    Combine all partitions, deduplicate on `time`, and split the result by
    year (partitioning keeps memory usage manageable).

    NOTE(review): `raw_data` is used as a mapping whose values are callables
    returning frames, which does not match its `pl.DataFrame` annotation;
    confirm against the caller. `ts` is not used in this function.
    """
    frames = [load_partition() for load_partition in raw_data.values()]
    combined: pl.LazyFrame = pl.concat(frames)

    # issue: https://github.com/pola-rs/polars/issues/12023
    time_in_us = pl.col("time").dt.cast_time_unit("us")
    deduplicated = combined.with_columns(time_in_us).unique("time").sort("time")
    return partition_data_by_year(deduplicated)

## Pipeline

In [None]:
# | export
def create_pipeline(sat_id="JNO", source="MAG"):
    """
    Build the JUNO magnetic-field pipeline from the shared template.

    `sat_id` and `source` are forwarded to `create_pipeline_template`, along
    with this module's `download_data`, `preprocess_data` and `process_data`
    as the load / preprocess / process functions.
    """
    stage_functions = {
        "load_data_fn": download_data,
        "preprocess_data_fn": preprocess_data,
        "process_data_fn": process_data,
    }
    return create_pipeline_template(sat_id=sat_id, source=source, **stage_functions)

## Dataset Overview

### Index

In [None]:
# Base URL of the PDS-PPI data tree.
pds_dir = "https://pds-ppi.igpp.ucla.edu/data"

# Value sets for the file-name fields; coordinates and extensions follow the
# FGM file naming convention.
possible_coords = ["se", "ser", "pc", "ss", "pl"]
possible_exts = ["sts", "lbl"]
# NOTE(review): presumably the available sampling rates; confirm.
possible_data_rates = ["1s", "1min", "1h"]

# Index label location for the JNO-SS data set.
juno_ss_config = dict(
    DATA_SET_ID="JNO-SS-3-FGM-CAL-V1.0",
    FILE_SPECIFICATION_NAME="INDEX/INDEX.LBL",
)

# Index label location for the JNO-J data set.
juno_j_config = dict(
    DATA_SET_ID="JNO-J-3-FGM-CAL-V1.0",
    FILE_SPECIFICATION_NAME="INDEX/INDEX.LBL",
)

#### Process index

In [None]:
# | export
import pandas
import pdpipe as pdp

In [None]:
# | export
def process_jno_index(df: pandas.DataFrame):
    """
    Clean a raw JUNO index table and return the processed frame.

    Steps:

    - remove spaces from the column names;
    - drop `PRODUCT_ID`, `CR_DATE` and `PRODUCT_LABEL_MD5CHECKSUM`;
    - strip trailing whitespace from `SID` and `FILE_SPECIFICATION_NAME`;
    - parse `START_TIME` and `STOP_TIME` (format `%Y-%jT%H:%M:%S.%f`) into
      datetimes.

    The column labels of the input frame are left as they were: renaming
    happens on a relabelled frame rather than by assigning to `df.columns`.
    """
    _index_time_format = "%Y-%jT%H:%M:%S.%f"

    def _parse_time(column):
        # Parse the whole column in one `to_datetime` call; applying
        # `to_datetime` value by value through `ApplyByCols` was noted as slow.
        return pdp.ColByFrameFunc(
            column,
            lambda frame: pandas.to_datetime(
                frame[column], format=_index_time_format
            ),
        )

    # `set_axis` returns a new object, so the caller's DataFrame keeps its
    # original column labels.
    cleaned = df.set_axis(df.columns.str.replace(" ", ""), axis="columns")

    jno_index_pipeline = pdp.PdPipeline(
        [
            pdp.ColDrop(["PRODUCT_ID", "CR_DATE", "PRODUCT_LABEL_MD5CHECKSUM"]),
            pdp.ApplyByCols("SID", str.rstrip),
            pdp.ApplyByCols("FILE_SPECIFICATION_NAME", str.rstrip),
            _parse_time("START_TIME"),
            _parse_time("STOP_TIME"),
        ]
    )

    return jno_index_pipeline(cleaned)

#### Pipeline

In [None]:
# | export
from kedro.pipeline import pipeline, node

In [None]:
# | export
def create_jno_index_pipeline():
    """
    Assemble the Kedro pipeline that builds the combined JUNO index.

    Two nodes run `process_jno_index` on `raw_JNO_SS_index` and
    `raw_JNO_J_index`; a third node concatenates their outputs
    (`JNO_SS_index`, `JNO_J_index`) into `JNO_index`.
    """
    clean_ss = node(
        process_jno_index, inputs="raw_JNO_SS_index", outputs="JNO_SS_index"
    )
    clean_j = node(
        process_jno_index, inputs="raw_JNO_J_index", outputs="JNO_J_index"
    )
    merge = node(
        lambda x1, x2: pandas.concat([x1, x2]),
        inputs=["JNO_SS_index", "JNO_J_index"],
        outputs="JNO_index",
    )
    return pipeline([clean_ss, clean_j, merge])

In [None]:
# Load the raw and the combined index tables from `catalog`
# (NOTE(review): presumably the Kedro data catalog of the session; confirm).
raw_JNO_SS_index, raw_JNO_J_index, jno_index = (
    catalog.load(name)
    for name in ("raw_JNO_SS_index", "raw_JNO_J_index", "JNO_index")
)

# Split the combined index back into its two data sets.
jno_ss_index = jno_index.loc[jno_index["DATA_SET_ID"] == "JNO-SS-3-FGM-CAL-V1.0"]
jno_j_index = jno_index.loc[jno_index["DATA_SET_ID"] == "JNO-J-3-FGM-CAL-V1.0"]

#### Check the data

In [None]:
# | echo: false
# Date range covered by the JNO-SS index: earliest start and latest stop.
starting_date = jno_ss_index["START_TIME"].min().date()
ending_date = jno_ss_index["STOP_TIME"].max().date()

print(f"JNO-SS Starting date: {starting_date}")
print(f"JNO-SS Ending date: {ending_date}")

# Same for the JNO-J index. `starting_date` / `ending_date` are rebound here,
# so any later use of these names sees the JNO-J values.
starting_date = jno_j_index["START_TIME"].min().date()
ending_date = jno_j_index["STOP_TIME"].max().date()
print(f"JNO-J Starting date: {starting_date}")
print(f"JNO-J Ending date: {ending_date}")

In [None]:
# | echo: false
# Every calendar day that appears as a start or stop date in the JNO-SS index.
available_dates = pandas.concat(
    [jno_ss_index["START_TIME"].dt.date, jno_ss_index["STOP_TIME"].dt.date]
).unique()

# Build the calendar from the JNO-SS index itself. The globals
# `starting_date` / `ending_date` hold the JNO-J range at this point, which
# would not match the JNO-SS dates collected above.
full_year_range = pandas.date_range(
    start=jno_ss_index["START_TIME"].min().date(),
    end=jno_ss_index["STOP_TIME"].max().date(),
)

missing_dates = full_year_range[~full_year_range.isin(available_dates)]

if missing_dates.empty:
    print("No days are missing.")
else:
    print("The following days are missing")
    print(coll_repr(missing_dates.map(lambda x: x.strftime("%Y-%m-%d"))))