In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *


In [None]:
#| output: hide
#| code-summary: import all the packages needed for the project

from ids_finder.utils import *
from ids_finder.core import *
from fastcore.utils import *
from fastcore.test import *

import polars as pl
try:
    import modin.pandas as pd
    import modin.pandas as mpd
except ImportError:
    import pandas as pd

import pandas

from datetime import timedelta

from loguru import logger

import pdr


import pytplot
from pytplot import timebar
from pytplot import get_data, store_data, tplot, split_vec, join_vec, tplot_options, options, tlimit, highlight, degap

import pdpipe as pdp

from typing import Callable
from pandas import (
    DataFrame,
)

## Background

The magnetic field data used in this notebook are given in the Spacecraft-Solar Equatorial (SE) coordinate system.

### Coordinate System of Data


1. **SE (Solar Equatorial)**
    - Code: `se`
    - Resampling options: 
        - Number of seconds (1 or 60): `se_rN[N]s`
        - Resampled 1 hour: `se_r1h`

2. **PC (Planetocentric)**
    - Code: `pc`
    - Resampling options: 
        - Number of seconds (1 or 60): `pc_rN[N]s`
        
3. **SS (Sun-State)**
    - Code: `ss`
    - Resampling options: 
        - Number of seconds (1 or 60): `ss_rN[N]s`
        
4. **PL (Payload)**
    - Code: `pl`
    - Resampling options: 
        - Number of seconds (1 or 60): `pl_rN[N]s`


```txt
------------------------------------------------------------------------------
Juno Mission Phases                                                           
------------------------------------------------------------------------------
Start       Mission                                                           
Date        Phase                                                             
==============================================================================
2011-08-05  Launch                                                            
2011-08-08  Inner Cruise 1                                                    
2011-10-10  Inner Cruise 2                                                    
2013-05-28  Inner Cruise 3                                                    
2013-11-05  Quiet Cruise                                                      
2016-01-05  Jupiter Approach                                                  
2016-06-30  Jupiter Orbital Insertion                                         
2016-07-05  Capture Orbit                                                     
2016-10-19  Period Reduction Maneuver                                         
2016-10-20  Orbits 1-2                                                        
2016-11-09  Science Orbits                                                    
2017-10-11  Deorbit
```

```txt
File Naming Convention                                                        
==============================================================================
Convention:                                                                   
   fgm_jno_LL_CCYYDDDxx_vVV.ext                                               
Where:                                                                        
   fgm - Fluxgate Magnetometer three character instrument abbreviation        
   jno - Juno                                                                 
    LL - CODMAC Data level, for example, l3 for level 3                       
    CC - The century portion of a date, 20                                    
    YY - The year of century portion of a date, 00-99                         
   DDD - The day of year, 001-366                                             
    xx - Coordinate system of data (se = Solar equatorial, ser = Solar        
         equatorial resampled, pc = Planetocentric, ss = Sun-State,           
         pl = Payload)                                                        
     v - separator to denote Version number                                   
    VV - version                                                              
   ext - file extension (sts = Standard Time Series (ASCII) file, lbl = Label 
         file)                                                                
Example:                                                                      
   fgm_jno_l3_2014055se_v00.sts    
```

## Dataset Overview

In [None]:
# Base URL under which the PDS data sets are served.
pds_dir = "https://pds-ppi.igpp.ucla.edu/data"

# Values that can appear in FGM file names (see the naming convention above).
possible_coords = ["se", "ser", "pc", "ss", "pl"]
possible_exts = ["sts", "lbl"]
possible_data_rates = ["1s", "1min", "1h"]

# Each config names a data set and the label file of its index table.
juno_ss_config = dict(
    DATA_SET_ID="JNO-SS-3-FGM-CAL-V1.0",
    FILE_SPECIFICATION_NAME="INDEX/INDEX.LBL",
)
juno_j_config = dict(
    DATA_SET_ID="JNO-J-3-FGM-CAL-V1.0",
    FILE_SPECIFICATION_NAME="INDEX/INDEX.LBL",
)

In [None]:
#| hide
def _swap_extension(file_name: str, new_ext: str, match_case: bool = True) -> str:
    """Return `file_name` with only its trailing extension replaced by `new_ext`.

    Unlike `str.replace`, an "lbl" that happens to occur in a directory or
    base name is left untouched. With `match_case`, the new extension is
    upper-cased when the old extension is upper-case and lower-cased otherwise,
    so a companion file follows the casing of its label file.
    """
    stem, old_ext = os.path.splitext(file_name)
    if match_case:
        new_ext = new_ext.upper() if old_ext[1:].isupper() else new_ext.lower()
    return f"{stem}.{new_ext}"


def download_and_read_lbl_file(config, index_table=False):
    """Download the label file described by `config` and read its table.

    Parameters:
        config: Mapping-like object (a dict or an index row) providing
            "DATA_SET_ID" and "FILE_SPECIFICATION_NAME".
        index_table (bool, optional): If True, read the comma-separated `TAB`
            file that accompanies the label directly with pandas. If False,
            download the accompanying `sts` file and read the label with
            `pdr`. Defaults to False.

    Returns:
        DataFrame: The data read from the file. For data files, a `parquet`
        file already present next to the local label is read instead of
        downloading anything.
    """
    # NOTE: the index table is not formatted as its `lbl` file declares, so it
    # cannot be read with `pdr`; doing so fails with
    #   ValueError: time data "282T00:00:31.130,2019" doesn't match format
    #   "%Y-%jT%H:%M:%S.%f", at position 3553.
    # which is why the `TAB` file is parsed with `pandas.read_csv` below.

    # `expanduser` also resolves the home directory when $HOME is not set.
    local_dir = os.path.join(os.path.expanduser("~"), "juno", config["DATA_SET_ID"])
    base_url = f"{pds_dir}/{config['DATA_SET_ID']}"

    lbl_fn = config["FILE_SPECIFICATION_NAME"]

    if not index_table:
        # Local cache file: always a lower-case `.parquet` suffix.
        parquet_fp = os.path.join(
            local_dir, _swap_extension(lbl_fn, "parquet", match_case=False)
        )
        if os.path.exists(parquet_fp):
            return pandas.read_parquet(parquet_fp)

    # `download_file` is presumably returning the local path of the file — confirm.
    lbl_fp = download_file(f"{base_url}/{lbl_fn}", local_dir, lbl_fn)
    logger.debug(f"Reading {lbl_fp}")

    if index_table:
        tab_fn = _swap_extension(lbl_fn, "TAB")
        tab_fp = download_file(f"{base_url}/{tab_fn}", local_dir, tab_fn)
        tab_index = pandas.read_csv(tab_fp, delimiter=",", quotechar='"')
        # Header names contain padding spaces; strip them all.
        tab_index.columns = tab_index.columns.str.replace(" ", "")
        return tab_index

    # The label only describes the data; the `sts` file must sit next to it
    # before `pdr` can read the table.
    sts_fn = _swap_extension(lbl_fn, "sts")
    download_file(f"{base_url}/{sts_fn}", local_dir, sts_fn)
    return pdr.read(lbl_fp).TABLE

In [None]:
#| hide
# Raw index tables of the two data sets.
jno_ss_index = download_and_read_lbl_file(juno_ss_config, index_table=True)
jno_j_index = download_and_read_lbl_file(juno_j_config, index_table=True)

# Index timestamps: year, day of year, then time of day.
_index_time_format = "%Y-%jT%H:%M:%S.%f"


def _index_time_parser(column):
    """Build a frame function that parses `column` with `_index_time_format`."""
    return lambda df: pandas.to_datetime(df[column], format=_index_time_format)


# Stages, in order: drop unused columns, strip trailing whitespace from the
# text columns, parse the two time columns.
# NOTE: parsing whole columns through `ColByFrameFunc` is used because
# `pdp.ApplyByCols([...], pandas.to_datetime, format=...)` is slow.
jno_pipeline = pdp.PdPipeline(
    [pdp.ColDrop(["PRODUCT_ID", "CR_DATE", "PRODUCT_LABEL_MD5CHECKSUM"])]
    + [
        pdp.ApplyByCols(text_col, str.rstrip)
        for text_col in ("SID", "FILE_SPECIFICATION_NAME")
    ]
    + [
        pdp.ColByFrameFunc(time_col, _index_time_parser(time_col))
        for time_col in ("START_TIME", "STOP_TIME")
    ]
)

jno_ss_index, jno_j_index = (
    jno_pipeline(raw_index) for raw_index in (jno_ss_index, jno_j_index)
)

# Single lookup table spanning both data sets.
index_df = pandas.concat([jno_ss_index, jno_j_index], ignore_index=True)

### Check the data

In [None]:
#| echo: false
# First and last calendar day covered by the JNO-SS index.
starting_date = jno_ss_index["START_TIME"].min().date()
ending_date = jno_ss_index["STOP_TIME"].max().date()

print(f"Starting date: {starting_date}", f"Ending date: {ending_date}", sep="\n")

In [None]:
#| echo: false
# Every calendar day on which some indexed file starts or stops.
_start_days = jno_ss_index["START_TIME"].dt.date
_stop_days = jno_ss_index["STOP_TIME"].dt.date
available_dates = pandas.concat([_start_days, _stop_days]).unique()

# Every calendar day between the first and the last indexed day.
full_year_range = pandas.date_range(start=starting_date, end=ending_date)

_is_covered = full_year_range.isin(available_dates)
missing_dates = full_year_range[~_is_covered]

if len(missing_dates):
    print("The following days are missing")
    print(coll_repr(missing_dates.map(lambda day: day.strftime("%Y-%m-%d"))))
else:
    print("No days are missing.")

### Download all the files

In [None]:
# wget -r --no-parent --no-clobber 'https://pds-ppi.igpp.ucla.edu/data/JNO-SS-3-FGM-CAL-V1.0/DATA/CRUISE/SE/1SEC/'
# aria2c -x 16 -s 16 'https://pds-ppi.igpp.ucla.edu/ditdos/download?id=pds://PPI/JNO-SS-3-FGM-CAL-V1.0/DATA/CRUISE/SE/1SEC'

One day of data resampled to 1-second resolution is about 12 MB.

So 1 year of data is about 4 GB, and 6 years of JUNO Cruise data is about 24 GB.

The download rate is about 250 KB/s, so downloading all 24 GB takes roughly 27 hours (a little over one day).

In [None]:
# Back-of-the-envelope estimate. Sizes are per daily file in KB (12e3 KB is
# the 12 MB quoted above); `thm_file_size` is presumably the matching THEMIS
# file size — confirm.
num_of_files = 6 * 365  # six years of daily files
jno_file_size = 12e3
thm_file_size = 40e3
files_size = jno_file_size + thm_file_size
downloading_rate = 250  # KB/s
processing_rate = 1 / 60  # presumably files per second, i.e. one file a minute

_total_size = num_of_files * files_size

time_to_download = _total_size / downloading_rate / 60 / 60  # seconds -> hours
space_required = _total_size / 1e6  # KB -> GB
time_to_process = num_of_files / processing_rate / 60 / 60  # seconds -> hours

print(
    f"Time to download: {time_to_download:.2f} hours",
    f"Disk space required: {space_required:.2f} GB",
    f"Time to process: {time_to_process:.2f} hours",
    sep="\n",
)

### Convert all files from `lbl` to `parquet` for faster processing

In [None]:
import pdr

In [None]:
#| code-summary: Convert data from `lbl` to `parquet` format
def lbl2parquet(src: Path, dest: Path) -> None:
    """Read the table described by the label file `src` with `pdr` and write it to `dest` as parquet."""
    pdr.read(src).TABLE.to_parquet(dest)


def convert_file(
    file_path: Path, target_format: str, conversion_func: Callable, check_exist=True
) -> bool:
    """Convert `file_path` into a sibling file with the `target_format` suffix.

    Parameters:
        file_path (Path): File to convert.
        target_format (str): Suffix of the converted file, with or without the
            leading dot (e.g. "parquet" or ".parquet").
        conversion_func (Callable): Called as `conversion_func(src, dest)` to
            perform the conversion.
        check_exist (bool, optional): If True, skip the conversion when the
            target file already exists. Defaults to True.

    Returns:
        bool: True if the target file already existed or the conversion
        succeeded, False if the conversion raised (the error is logged).
    """
    target_suffix = (
        target_format if target_format.startswith(".") else f".{target_format}"
    )
    target_file = file_path.with_suffix(target_suffix)

    if check_exist and target_file.exists():
        return True

    try:
        conversion_func(file_path, target_file)
    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch.
        logger.error(f"Error converting {file_path} to {target_file}: {e}")
        return False
    # Report success explicitly instead of falling through to a falsy `None`.
    return True

@startthread
def convert_files():
    """Convert every `lbl` file found recursively under ``~/data/juno`` to `parquet`.

    NOTE(review): `startthread` is presumably fastcore's decorator that starts
    the function in a background thread as soon as it is defined — confirm.
    """
    format_from, format_to = "lbl", "parquet"
    local_dir = Path.home() / "data/juno"
    for label_path in local_dir.glob(f"**/*.{format_from}"):
        convert_file(label_path, format_to, lbl2parquet)
    logger.info("Done converting files")

In [None]:
# delete all files with extension
# find . -type f -name '*.parquet' -delete
# find . -type f -name '*.orc' -delete
# find . -type f -name '*.lbl' -delete

In [None]:
def _batch_pre_process(year, force=False):
    """Gather one year of JNO-SS parquet data into a single sorted, de-duplicated file.

    Parameters:
        year (int): Year to extract.
        force (bool, optional): Rebuild the output even if it already exists.
            Defaults to False.

    Returns:
        Path: Location of the yearly parquet file, ``../data/jno_data_{year}.parquet``.
    """
    # The window reaches one hour into the next year, so yearly files overlap.
    trange = [f"{year}-01-01", f"{year+1}-01-01T01"]
    data = Path.home() / "data/juno/JNO-SS-3-FGM-CAL-V1.0/" / "**/*.parquet"

    output = Path(f"../data/jno_data_{year}.parquet")
    output.parent.mkdir(parents=True, exist_ok=True)
    if not force and os.path.exists(output):
        logger.info(f"File {output} exists. Skipping")
        return output
    logger.info(f"Preprocessing data for year {year}")

    time = pl.col("time")
    in_window = time.is_between(pd.Timestamp(trange[0]), pd.Timestamp(trange[1]))

    yearly = (
        pl.scan_parquet(data)
        .filter(in_window)
        # `compute_index_std` relies on `group_by_dynamic`, which needs sorted data
        .sort("time")
        # duplicate time values break data selection in xarray; dropping them
        # significantly slows down the computation
        .filter(time.is_first_distinct())
        .rename({"BX SE": "BX", "BY SE": "BY", "BZ SE": "BZ"})
    )
    yearly.collect().write_parquet(output)
    return output

@startthread
def batch_pre_process():
    """Run `_batch_pre_process` for every year from `starting_date` to `ending_date`, inclusive."""
    years = range(starting_date.year, ending_date.year + 1)
    for year in years:
        _batch_pre_process(year)

## Processing the whole data

In [None]:
from tqdm import tqdm

In [None]:
#| eval: false
# Parameters of the candidate search.
sat = "jno"
coord = "se"
tau = timedelta(seconds=60)  # window length handed to `compute_indices`
data_resolution = timedelta(seconds=1)  # time resolution of the input data


def batch_process():
    """Search each yearly data file for candidates and store the result per year.

    Years whose output file already exists are skipped.
    """
    first_year, last_year = starting_date.year, ending_date.year
    for year in tqdm(range(first_year, last_year + 1)):
        yearly_data_path = f"../data/{sat}_data_{year}.parquet"
        candidates_path = f"../data/{sat}_candidates_{year}_tau_{tau.seconds}.parquet"

        if os.path.exists(candidates_path):
            logger.info(f"Skipping {year} as the output file already exists.")
            continue

        # the yearly files are written sorted by time, so only flag them as such
        data = pl.scan_parquet(yearly_data_path).set_sorted("time").collect()
        sat_fgm = df2ts(
            data,
            ["BX", "BY", "BZ"],
            attrs={"coordinate_system": coord, "units": "nT"},
        )

        indices = compute_indices(data, tau)

        # filter condition
        sparse_num = tau / data_resolution // 3
        filter_condition = get_ID_filter_condition(sparse_num=sparse_num)

        selected = indices.filter(filter_condition).with_columns(pl_format_time(tau))
        candidates = convert_to_dataframe(selected)

        ids = process_candidates(candidates, sat_fgm, data, data_resolution)
        ids.write_parquet(candidates_path)

In [None]:
#| eval: false
# Run the candidate search for every year of data.
batch_process()

In [None]:
#| code-summary: Combine all the files into one and remove duplicates
#| eval: false
# Glob over the per-year candidate files, and the combined file to write.
files = f"../data/{sat}_candidates_*_tau_{tau.seconds}.parquet"
output = f"../data/{sat}_candidates_tau_{tau.seconds}.parquet"

# Rows sharing the same three times are kept once (the yearly inputs overlap).
ids = (
    pl.scan_parquet(files)
    .unique(subset=["d_time", "d_tstart", "d_tstop"])
    .collect()
)
ids.write_parquet(output)

## Obsolete code

#### Download and read file from the server (one by one)

In [None]:
#| eval: false
def juno_load_fgm(trange: list, coord="se", data_rate="1s") -> DataFrame:
    """
    Load the FGM data for a given time range and coordinate system.

    Every file listed in `index_df` that matches the requested coordinate
    system and data rate and overlaps `trange` is downloaded and read one by
    one; the results are concatenated and passed through `pdp_process_juno_df`.

    Parameters:
        trange (list): The time range, as `[start, stop]`.
        coord (str, optional): The coordinate. Defaults to 'se'.
        data_rate (str, optional): The data rate. Defaults to '1s'.

    Returns:
        pandas.DataFrame: The dataframe for the given time range and coordinate.

    Raises:
        ValueError: If `trange` does not have exactly two elements, if there is
            no SID for the `coord`/`data_rate` combination, or if no indexed
            file overlaps the requested time range.
    """

    if len(trange) != 2:
        raise ValueError(
            "Expected trange to have exactly 2 elements: start and stop time."
        )

    start_time = pandas.Timestamp(trange[0])
    stop_time = pandas.Timestamp(trange[1])

    # `get_sid` returns None for unsupported combinations; fail with a clear
    # message instead of silently matching no file.
    sid = get_sid(coord, data_rate)
    if sid is None:
        raise ValueError(
            f"No SID known for coord={coord!r} with data_rate={data_rate!r}."
        )

    # Files of the requested kind whose [START_TIME, STOP_TIME] span overlaps
    # the requested range.
    overlaps = (
        (index_df["SID"] == sid)
        & (index_df["STOP_TIME"] > start_time)
        & (index_df["START_TIME"] < stop_time)
    )
    relevant_files = index_df[overlaps]

    # `pandas.concat` on an empty list raises an opaque ValueError; report
    # what was actually searched for.
    if relevant_files.empty:
        raise ValueError(
            f"No index entries with SID {sid!r} overlap {start_time} - {stop_time}."
        )

    # Each index row doubles as the `config` of `download_and_read_lbl_file`
    # (it carries DATA_SET_ID and FILE_SPECIFICATION_NAME). Files are fetched
    # sequentially; a thread pool over the rows would be the parallel variant.
    dataframes = [
        download_and_read_lbl_file(row) for _, row in relevant_files.iterrows()
    ]

    combined_data = pandas.concat(dataframes)

    return pdp_process_juno_df(combined_data)

def get_sid(coord, data_rate):
    """Look up the index `SID` value for a coordinate system and data rate.

    An empty `data_rate` selects the entry without a resampling rate. Returns
    None when the combination is not listed.
    """
    sid_by_key = {
        ("pc", "1s"): "PC 1 SECOND",
        ("pc", "1min"): "PC 1 MINUTE",
        ("pc", ""): "PCENTRIC",
        ("pl", "1s"): "PAYLOAD 1 SECOND",
        ("pl", ""): "PAYLOAD",
        ("ss", "1s"): "SS 1 SECOND",
        ("ss", "1min"): "SS 1 MINUTE",
        ("ss", ""): "SUNSTATE",
        ("se", "1s"): "SE 1 SECOND",
        ("se", "1min"): "SE 1 MINUTE",
        ("se", ""): "SE",
    }
    return sid_by_key.get((coord, data_rate))

# Columns present in a freshly read table; when any is missing, the time
# conversion and the column drop below are skipped.
_juno_raw_cols = ["SAMPLE UTC", "DECIMAL DAY", "INSTRUMENT RANGE"]
_skip_cond = ~pdp.cond.HasAllColumns(_juno_raw_cols)


def _sample_utc_to_datetime(df):
    """Parse the whole "SAMPLE UTC" column (year, day of year, time) at once."""
    return pandas.to_datetime(df["SAMPLE UTC"], format="%Y %j %H %M %S %f")


# Alternatives that were tried for the time conversion:
#   pdp.AggByCols('SAMPLE UTC', func=..., func_desc='Convert time to datetime')  -> quite slow
#   pdp.df['time'] << pandas.to_datetime(pdp.df['SAMPLE UTC'], format=...)       -> does not work
pdp_process_juno_df = pdp.PdPipeline(
    [
        pdp.ColByFrameFunc("time", _sample_utc_to_datetime, skip=_skip_cond),
        pdp.ColDrop(list(_juno_raw_cols), skip=_skip_cond),
        pdp.df.set_index("time"),
        pdp.ColRename(col_renamer),
    ],
)