In [1]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *


In [2]:
import os
# Point pyspedas at a per-user data directory. expanduser('~') resolves the
# home directory even when the HOME variable is not set (e.g. on Windows),
# and yields the same path as before whenever HOME is defined.
os.environ['SPEDAS_DATA_DIR'] = f"{os.path.expanduser('~')}/data"

In [3]:
#| code-summary: import all the packages needed for the project
#| output: hide

from fastcore.utils import *
from fastcore.test import *


from ids_finder.utils import *
from ids_finder.core import *

import polars as pl
try:
    import modin.pandas as pd
    import modin.pandas as mpd
except ImportError:
    import pandas as pd

import pandas
import numpy as np
import xarray as xr


from datetime import timedelta
from loguru import logger
import speasy as spz
from multipledispatch import dispatch

import altair as alt
from tqdm import tqdm

## Dataset Overview

In [4]:
# Available probe identifiers; the first one is used by default.
stereo_probes = ["a", "b"]
probe = stereo_probes[0]

# Full time span to process, plus a shorter span for quick tests.
jno_start_date, jno_end_date = "2011-08-25", "2016-06-30"
trange = [jno_start_date, jno_end_date]
test_trange = ["2011-08-25", "2012-08-26"]

In [5]:
# Satellite tag (used in data file names) and coordinate system label (stored in the time-series attributes).
sat, coord = 'STA', 'rtn'

In [6]:
# Browse the CDAWeb inventory exposed by speasy and select the
# STEREO Ahead IMPACT/MAG L1 RTN product.
cda_tree: spz.SpeasyIndex = spz.inventories.tree.cda
product = cda_tree.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN

# Log the product metadata: description, product ID, the BFIELD variable
# description and the BFIELD speasy uid.
logger.info(product.description)
logger.info(product.ID)
logger.info(product.BFIELD.CATDESC)
logger.info(product.BFIELD.spz_uid())

# Alternative inventory paths kept for interactive exploration:
# spz.inventories.data_tree.cda.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN.
# spz.inventories.data_tree.cda.STEREO.STEREOA.IMPACT_MAG.STA_LB_MAG_RTN.description
# spz.inventories.data_tree.cda.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN.MAGFLAGUC.CATDESC
# Last expression of the cell: displayed as the cell output (the recorded output
# shows the same description string that was logged for product.BFIELD.CATDESC).
spz.inventories.data_tree.cda.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN.BFIELD.CATDESC
# spz.inventories.data_tree.cda.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN.BFIELD.

[32m2023-09-29 23:58:56.568[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 4>[0m:[36m4[0m - [1mSTEREO Ahead IMPACT/MAG Magnetic Field Vectors (RTN) - J. Luhmann (UCB/SSL)[0m
[32m2023-09-29 23:58:56.568[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 5>[0m:[36m5[0m - [1msta_l1_mag_rtn_cdaweb[0m
[32m2023-09-29 23:58:56.569[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 6>[0m:[36m6[0m - [1mMagnetic field vector in RTN coordinates from the IMPACT/MAG instrument.[0m
[32m2023-09-29 23:58:56.569[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 7>[0m:[36m7[0m - [1mSTA_L1_MAG_RTN/BFIELD[0m


'Magnetic field vector in RTN coordinates from the IMPACT/MAG instrument.'

### Download all the files

Download data using `pyspedas`, but load it using `pycdfpp` (using `pyspedas` to load the data directly into `xarray` is very slow)

Using `wget` does not work.

`wget --recursive --no-parent --no-clobber http://sprg.ssl.berkeley.edu/data/misc/stereo/impact/level1/ahead/mag/RTN/2014`

In [7]:
import pyspedas
import pycdfpp
from sunpy.time import TimeRange
from pipe import select, take, where


The files are too large to process at once, so split the time range into multiple intervals.

In [10]:
def st_df(trange, save=False):
    """Load STEREO magnetic field data for a time range into a polars DataFrame.

    The CDF files are fetched with `pyspedas.stereo.mag(..., downloadonly=True)`
    and read with `pycdfpp`; the "Epoch" and "BFIELD" variables of every file
    are concatenated in the order the files are returned.

    Parameters
    ----------
    trange :
        Either a `sunpy.time.TimeRange` or a two-element sequence of
        "YYYY-MM-DD" strings (start, end). A `TimeRange` is converted to
        date strings, so sub-day precision is dropped.
    save : bool, default False
        When True, write the resulting frame to the parquet cache file.

    Returns
    -------
    pl.DataFrame
        Columns "time", "BX", "BY", "BZ" and "B", taken from "Epoch" and from
        columns 0-3 of "BFIELD" respectively.
    """
    if isinstance(trange, TimeRange):
        trange = [trange.start.strftime("%Y-%m-%d"), trange.end.strftime("%Y-%m-%d")]

    # NOTE(review): the cache file name only contains the start date, so two
    # ranges sharing a start date but with different end dates share one cache.
    output = f"../data/{sat}_data_{trange[0]}.parquet"
    if Path(output).exists():
        logger.info(f"Data exists. Reading {output}")
        return pl.read_parquet(output)

    # Download only; the files are parsed below with pycdfpp.
    files = pyspedas.stereo.mag(trange, downloadonly=True)

    cdfs = [pycdfpp.load(file) for file in files]
    times = [pycdfpp.to_datetime64(cdf["Epoch"]) for cdf in cdfs]
    BFIELDs = [cdf["BFIELD"].values for cdf in cdfs]

    time = np.concatenate(times)
    BFIELD = np.concatenate(BFIELDs)

    df = pl.DataFrame(
        {
            "time": time,
            "BX": BFIELD[:, 0],
            "BY": BFIELD[:, 1],
            "BZ": BFIELD[:, 2],
            # presumably the field magnitude stored as the 4th component — TODO confirm
            "B": BFIELD[:, 3],
        }
    )

    if save:
        # polars frames are written with write_parquet (to_parquet is the pandas name).
        df.write_parquet(output)
    return df


In [11]:
def st_downsample(
    data: pl.DataFrame, every: timedelta, period: timedelta
) -> pl.DataFrame:
    """Average the magnetic field columns over dynamic time windows.

    The frame is sorted by "time", grouped into windows that start every
    `every` and span `period`, and the mean of "BX", "BY", "BZ" and "B" is
    taken per window. The window timestamp is then shifted forward by half a
    `period` (presumably to label each window by its centre — confirm against
    the `group_by_dynamic` labelling) and cast to nanosecond resolution.
    """
    field_columns = ["BX", "BY", "BZ", "B"]
    half_period = period / 2

    windows = data.sort("time").group_by_dynamic("time", every=every, period=period)
    averaged = windows.agg(pl.col(field_columns).mean())
    shifted = averaged.with_columns(pl.col("time") + half_period)
    return shifted.with_columns(pl.col("time").dt.cast_time_unit("ns"))

In [12]:
# Windows start every second and span two seconds.
every = timedelta(seconds=1)
period = 2 * every

output = f"../data/{sat}_data_downsampled.parquet"

if not Path(output).exists():
    # Process the full range in 10 sub-intervals so that only one chunk of
    # raw data is loaded at a time, then store the concatenated result.
    df_downsampled = pl.concat(
        st_downsample(st_df(sub_range), every, period)
        for sub_range in TimeRange(trange).split(10)
    )
    df_downsampled.write_parquet(output)
else:
    # NOTE(review): nothing is loaded in this branch despite the message, so
    # `df_downsampled` stays undefined when the cached file already exists.
    logger.info(f"Data exists. Reading {output}")

[32m2023-09-29 23:58:56.934[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 6>[0m:[36m7[0m - [1mData exists. Reading ../data/STA_data_downsampled.parquet[0m


NOTE: one can also use `speasy` to download data, however this is slower for `STEREO` data.

In [13]:
%%markdown
sat_fgm_product = cda_tree.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN.BFIELD
sat_fgm_product = 'cda/STA_L1_MAG_RTN/BFIELD'
products = [sat_fgm_product]

dataset = spz.get_data(products, test_trange, disable_proxy=True)
sat_fgm_data  = dataset[0]
data_preview(sat_fgm_data)

sat_fgm_product = cda_tree.STEREO.Ahead.IMPACT_MAG.STA_L1_MAG_RTN.BFIELD
sat_fgm_product = 'cda/STA_L1_MAG_RTN/BFIELD'
products = [sat_fgm_product]

dataset = spz.get_data(products, test_trange, disable_proxy=True)
sat_fgm_data  = dataset[0]
data_preview(sat_fgm_data)


Download data in a background thread

In [14]:
%%markdown

@threaded
def download_data(products, trange):
    logger.info("Downloading data")
    spz.get_data(products, trange, disable_proxy=True)
    logger.info("Data downloaded")
    
download_data(products, trange)


@threaded
def download_data(products, trange):
    logger.info("Downloading data")
    spz.get_data(products, trange, disable_proxy=True)
    logger.info("Data downloaded")
    
download_data(products, trange)


### Check and preprocess the data

As we are only interested in the data when THEMIS is in the solar wind, for simplicity we will only keep the data when `X, SSE` and `X, GSE` are positive.

- State data time resolution is 1 minute...

- FGS data time resolution is 4 seconds...

In [15]:
%%markdown
df = (
    sat_state_sw.upsample("time", every="1m")
    .group_by_dynamic("time", every="1d")
    .agg(pl.col("X, SSE").null_count().alias("null_count"))
    .with_columns(
        pl.when(pl.col("null_count") > 720).then(0).otherwise(1).alias("availablity")
    )
)

properties = {
    'width': 800,
}

chart1 = alt.Chart(df).mark_point().encode(
    x='time',
    y='null_count'
).properties(**properties)

chart2  = alt.Chart(df).mark_point().encode(
    x='time',
    y='availablity'
).properties(**properties)

(chart1 & chart2)

df = (
    sat_state_sw.upsample("time", every="1m")
    .group_by_dynamic("time", every="1d")
    .agg(pl.col("X, SSE").null_count().alias("null_count"))
    .with_columns(
        pl.when(pl.col("null_count") > 720).then(0).otherwise(1).alias("availablity")
    )
)

properties = {
    'width': 800,
}

chart1 = alt.Chart(df).mark_point().encode(
    x='time',
    y='null_count'
).properties(**properties)

chart2  = alt.Chart(df).mark_point().encode(
    x='time',
    y='availablity'
).properties(**properties)

(chart1 & chart2)


## Processing the whole data

In [16]:
#| eval: false
# NOTE(review): `sat` is lower-case here, so `files` points to
# "../data/sta_data_downsampled.parquet" — on a case-sensitive file system make
# sure this is the same file as the downsampled parquet produced earlier.
sat = "sta"
tau = timedelta(seconds=60)
data_resolution = timedelta(seconds=1)
files = f"../data/{sat}_data_downsampled.parquet"
output = f'../data/{sat}_candidates_tau_{tau.seconds}.parquet'

data = pl.scan_parquet(files).collect()
# Flag the frame as sorted when it already is; otherwise sort it by time.
data = (
    data.set_sorted('time')
    if data.get_column('time').is_sorted()
    else data.sort('time')
)

In [17]:
# Filter condition: the threshold is one third of the number of samples
# contained in one `tau` interval (floor division, float result).
sparse_num = tau / data_resolution // 3
filter_condition = get_ID_filter_condition(sparse_num = sparse_num)

# Compute the indices, keep the rows passing the filter and add the
# time columns produced by pl_format_time.
indices = compute_indices(data, tau)
candidates = indices.filter(filter_condition).with_columns(pl_format_time(tau))

# The unfiltered indices are no longer needed.
del indices

> Note that missing data are marked with fill values: a blank followed by 9's, which together constitute the format of the field.

In [19]:
# Newline-separated column names, one per column of the yearly merged ASCII
# files, in file order (split on "\n" to obtain the list of names).
parameters = "\n".join(
    [
        "Year",
        "DOY",
        "Hour",
        "Radial Distance, AU",
        "HGI Lat. of the S/C",
        "HGI Long. of the S/C",
        "IMF BR, nT (RTN)",
        "IMF BT, nT (RTN)",
        "IMF BN, nT (RTN)",
        "IMF B Scalar, nT",
        "SW Plasma Speed, km/s",
        "SW Lat. Angle RTN, deg.",
        "SW Long. Angle RTN, deg.",
        "SW Plasma Density, N/cm^3",
        "SW Plasma Temperature, K",
        "1.8-3.6 MeV H flux,LET",
        "4.0-6.0 MeV H flux,LET",
        "6.0-10.0 MeV H flux, LET",
        "10.0-12.0 MeV H flux,LET",
        "13.6-15.1 MeV H flux, HET",
        "14.9-17.1 MeV H flux, HET",
        "17.0-19.3 MeV H flux, HET",
        "20.8-23.8 MeV H flux, HET",
        "23.8-26.4 MeV H flux, HET",
        "26.3-29.7 MeV H flux, HET",
        "29.5-33.4 MeV H flux, HET",
        "33.4-35.8 MeV H flux, HET",
        "35.5-40.5 MeV H flux, HET",
        "40.0-60.0 MeV H flux, HET",
        "60.0-100.0 MeV H flux, HET",
        "0.320-0.452 MeV H flux, SIT",
        "0.452-0.64 MeV H flux, SIT",
        "0.640-0.905 MeV H flux, SIT",
        "0.905-1.28 MeV H flux, SIT",
        "1.280-1.81 MeV H flux, SIT",
        "1.810-2.56 MeV H flux, SIT",
        "2.560-3.62 MeV H flux, SIT",
    ]
)

def stereo_load_state(trange):
    """Load the STEREO-A position data covering the years spanned by `trange`.

    One merged ASCII file per calendar year is downloaded from SPDF with
    `FastDownload` (rooted at '../', using the 'data' directory) and parsed
    as a whitespace-separated table whose column names come from the
    module-level `parameters` string.

    Parameters
    ----------
    trange :
        Two-element sequence (start, end) accepted by `pandas.Timestamp`.
        Only the years are used: every row of each yearly file from the start
        year to the end year (inclusive) is returned, not just rows inside
        `trange`.

    Returns
    -------
    pl.DataFrame
        Columns 'Radial Distance, AU', 'HGI Lat. of the S/C',
        'HGI Long. of the S/C' and a nanosecond-resolution "time" column
        built from the "Year", "DOY" and "Hour" columns.
    """
    from fastdownload import FastDownload

    d = FastDownload(base='../', archive='data', data='data')

    # Use plain pandas here, as for the parsing below, so the function does
    # not depend on which module `pd` is bound to.
    start_time = pandas.Timestamp(trange[0])
    end_time = pandas.Timestamp(trange[1])

    url = "https://spdf.gsfc.nasa.gov/pub/data/stereo/ahead/l2/merged/stereoa{year}.asc"
    columns = parameters.split("\n")

    # `sep=r"\s+"` replaces the deprecated `delim_whitespace=True` keyword.
    df = pandas.concat(
        pandas.read_csv(d.download(url.format(year=year)), sep=r"\s+", names=columns)
        for year in range(start_time.year, end_time.year + 1)
    )

    data = pl.DataFrame(df).select(
        pl.col(['Radial Distance, AU', 'HGI Lat. of the S/C', 'HGI Long. of the S/C']),
        # Day-of-year is 1-based, hence the "- 1" when offsetting from January 1st.
        (pl.datetime(pl.col("Year"), month=1, day=1)
        + pl.duration(days=pl.col("DOY") - 1, hours=pl.col("Hour"))).dt.cast_time_unit("ns").alias("time"),
    )

    return data



In [21]:
# Log the memory footprint of the full frame before reducing it
# (the recorded run reports 2.8 GB for the DataFrame).
get_memory_usage(data)
# Reduce the data with compress_data_by_cands (presumably keeping only samples
# around the candidates — confirm in ids_finder) and convert the BX/BY/BZ
# columns with df2ts, attaching the coordinate system and units as attributes.
sat_fgm = df2ts(
    compress_data_by_cands(data, candidates), ["BX", "BY", "BZ"], attrs={"coordinate_system": coord, "units": "nT"}
)
# Footprint after the reduction (227.1 MB DataArray in the recorded run).
get_memory_usage(sat_fgm)

# Position data for the years covered by `trange`.
sat_state = stereo_load_state(trange)

# Drop the reference to the large frame; only `sat_fgm` is used from here on.
del data

[32m2023-09-29 23:59:23.582[0m | [1mINFO    [0m | [36mids_finder.utils[0m:[36mget_memory_usage[0m:[36m60[0m - [1m2.8 GB (DataFrame)[0m
[32m2023-09-29 23:59:28.705[0m | [1mINFO    [0m | [36mids_finder.utils[0m:[36mget_memory_usage[0m:[36m60[0m - [1m227.1 MB (DataArray)[0m


In [22]:
#| eval: false
# Process the candidates, drop rows duplicated on the three time columns,
# and store the result in the candidates parquet file.
ids = process_candidates(candidates, sat_fgm, sat_state, data_resolution).unique(
    ["d_time", "d_tstart", "d_tstop"]
)
ids.write_parquet(output)


    import ray
    ray.init()


29-Sep-23 23:59:47: Unable to poll TPU GCE metadata: HTTPConnectionPool(host='metadata.google.internal', port=80): Max retries exceeded with url: /computeMetadata/v1/instance/attributes/accelerator-type (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x28f6b5120>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
29-Sep-23 23:59:47: Failed to detect number of TPUs: [Errno 2] No such file or directory: '/dev/vfio'
2023-09-29 23:59:47,591	INFO worker.py:1642 -- Started a local Ray instance.

Distributing Dataframe: 100%██████████ Elapsed time: 00:00, estimated remaining time: 00:00
Estimated completion of line 17: 100%██████████ Elapsed time: 00:00, estimated remaining time: 00:00
Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?


In [23]:
#| eval: false

# Sanity check: de-duplicating on "d_time" alone must give the same shape as
# de-duplicating on all three time columns, i.e. no two distinct
# (d_time, d_tstart, d_tstop) rows share a "d_time".
test_eq(ids.unique(["d_time", "d_tstart", "d_tstop"]).shape, ids.unique("d_time").shape)