In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *


In [None]:
#| code-summary: import all the packages needed for the project
#| output: hide

from ids_finder.utils import *
from ids_finder.core import *
from fastcore.utils import *
from fastcore.test import *

import polars as pl
try:
    import modin.pandas as pd
    import modin.pandas as mpd
except ImportError:
    import pandas as pd

import pandas
import numpy as np
import xarray as xr


from datetime import timedelta
from loguru import logger
import speasy as spz
from multipledispatch import dispatch

## Utils

In [None]:
from speasy.products import SpeasyVariable
from humanize import naturalsize

In [None]:
@threaded
def download_data(products, trange):
    """Request `products` over `trange` from speasy, logging before and after.

    The value returned by `spz.get_data` is discarded here; the call is
    presumably made only to populate speasy's local cache (TODO confirm).
    The function is wrapped with `@threaded`, which the notebook describes
    as downloading in a background thread.

    Parameters
    ----------
    products
        Product identifiers forwarded unchanged to `spz.get_data`.
    trange
        Time range forwarded unchanged to `spz.get_data`.
    """
    logger.info("Downloading data")
    spz.get_data(products, trange, progress=True, disable_proxy=True)
    logger.info("Data downloaded")

In [None]:


def data_preview(data: SpeasyVariable):
    """Print a plain-text summary of a speasy variable.

    The summary lists the variable's name, columns, unit, memory footprint
    (formatted with `naturalsize`), axes labels and metadata, followed by the
    first three entries of its time axis and of its values.

    Parameters
    ----------
    data : SpeasyVariable
        Variable to summarise. Only the attributes printed below are read.
    """
    heavy_rule = "==========================================="
    light_rule = "-------------------------------------------"

    def emit(label, value):
        # Labels carry their own padding so every value starts in the same column.
        print(f"{label}{value}")

    print(heavy_rule)
    emit("Name:         ", data.name)
    emit("Columns:      ", data.columns)
    emit("Values Unit:  ", data.unit)
    emit("Memory usage: ", naturalsize(data.nbytes))
    emit("Axes Labels:  ", data.axes_labels)
    print(light_rule)
    emit("Meta-data:    ", data.meta)
    print(light_rule)
    emit("Time Axis:    ", data.time[:3])
    print(light_rule)
    emit("Values:       ", data.values[:3])
    print(heavy_rule)


## Dataset Overview

In [None]:
# Probe letters considered in this notebook; the first entry is selected as `probe`.
artemis_probes = ["b", "c"]
probe = artemis_probes[0]

# Start and end dates of the full period, and the list form used as `trange`.
jno_start_date, jno_end_date = "2011-08-25", "2016-06-30"
trange = [jno_start_date, jno_end_date]

# Shorter one-month window starting on the same date (named for testing).
test_trange = ["2011-08-25", "2011-09-25"]

### Download all the files

In [None]:
# Selectors for the spacecraft and the magnetic-field product to request.
sat = 'thb'
coord = 'gse'
datatype = 'fgs'

# Product identifiers handed to `spz.get_data`.
# The FGM path is built from `datatype` and `coord` so that changing the
# selectors above actually changes the requested product (it used to be
# hard-coded to "fgs_gse" regardless of them).
sat_fgm_product = f'cda/{sat.upper()}_L2_FGM/{sat}_{datatype}_{coord}'
sat_pos_sse_product = f'cda/{sat.upper()}_L1_STATE/{sat}_pos_sse'
sat_pos_gse_product = f'cda/{sat.upper()}_L1_STATE/{sat}_pos_gse'

# Order matters to callers that iterate the returned dataset: field first, then positions.
products = [
    sat_fgm_product,
    sat_pos_sse_product,
    sat_pos_gse_product,
]

Download data in a background thread

In [None]:
%%markdown
#| eval: false
download_data(products, trange)

#| eval: false
download_data(products, trange)


### Convert data to `parquet` for faster processing

In [None]:
def spz2parquet(data, force=False):
    """Write a speasy variable to `../data/<data.name>.parquet`.

    Parameters
    ----------
    data
        Object exposing `name`, `values`, `time` and `columns`; these become
        the file name, the table body, the index (named "time") and the
        column labels respectively.
    force : bool, default False
        When false and the target file already exists, nothing is written.
        When true, the file is (re)written unconditionally.
    """
    output = f"../data/{data.name}.parquet"

    # Skip the conversion when a previous run already produced the file.
    if not force and Path(output).exists():
        logger.info("Data already converted to parquet")
        return

    time_index = pandas.Series(data.time, name="time")
    frame = pandas.DataFrame(data.values, index=time_index, columns=data.columns)
    frame.to_parquet(output)
    logger.info("Data converted to parquet successfully")

In [None]:
%%markdown

dataset = spz.get_data(products, trange)

for data in dataset:
    spz2parquet(data, force=False)


dataset = spz.get_data(products, trange)

for data in dataset:
    spz2parquet(data, force=False)


In [None]:
def thm_rename_col(col: str):
    """Reduce a column label to its upper-cased leading token.

    Everything from the first comma onward is discarded, then the first
    whitespace-separated word of what remains is returned in upper case
    (e.g. ``"X, SSE"`` -> ``"X"``, ``"Bx FGS-D"`` -> ``"BX"``).

    Raises
    ------
    IndexError
        If nothing but whitespace precedes the first comma.
    """
    head, _, _ = col.partition(",")
    return head.split()[0].upper()

### Check and preprocess the data

As we are only interested in the data when THEMIS is in the solar wind, for simplicity we will only keep the data when `X, SSE` and `X, GSE` are both positive.

- State data time resolution is 1 minute...

- FGS data time resolution is 4 seconds...

In [None]:
def get_thm_state(sat):
    """Build a lazy table of SSE and GSE positions for `sat`, joined on `time`.

    Both `../data/{sat}_pos_sse.parquet` and `../data/{sat}_pos_gse.parquet`
    are scanned lazily and flagged as sorted on `time`; the result is their
    inner join on that column. Nothing is read until the caller collects.

    Returns
    -------
    pl.LazyFrame
        The joined (still lazy) state table.
    """
    pos_sse, pos_gse = (
        pl.scan_parquet(f"../data/{sat}_pos_{frame}.parquet").set_sorted("time")
        for frame in ("sse", "gse")
    )
    return pos_sse.join(pos_gse, on="time", how="inner")

In [None]:
@dispatch(pl.DataFrame)
def calc_time_diff(data: pl.DataFrame) -> pl.Series:
    """Return the distinct spacings between consecutive `time` values, sorted.

    The `time` column is differenced row to row (the leading null is dropped),
    then reduced to its unique values in ascending order.
    """
    return data.get_column("time").diff(null_behavior="drop").unique().sort()


@dispatch(pl.LazyFrame)
def calc_time_diff(data: pl.LazyFrame) -> pl.Series:
    """Lazy-frame overload: collect `time` and delegate to the eager overload."""
    # Only `time` is needed, so project it before collecting instead of
    # materialising every column of the frame.
    return calc_time_diff(data.select("time").collect())


# calc_time_diff(sat_state)
# calc_time_diff(data)

In [None]:
# Selectors for the parquet file holding the magnetic-field data.
sat = "thb"
coord = "gse"
datatype = "fgs"
files = f"../data/{sat}_{datatype}_{coord}.parquet"
# Raw column labels -> short component names.
rename_mapping = {
    "Bx FGS-D": "BX",
    "By FGS-D": "BY",
    "Bz FGS-D": "BZ",
}


output = f"../data/{sat}_data_sw.parquet"
# The merged file doubles as a cache: the join below only runs when it is missing.
if not Path(output).exists():
    sat_state = get_thm_state(sat).collect()
    # Keep the state samples where both X components are non-negative.
    sat_state_sw = sat_state.filter((pl.col("X, SSE") >= 0) & (pl.col("X, GSE") >= 0))
    # De-duplicate on `time` and sort, as required for the as-of join.
    data = pl.scan_parquet(files).rename(rename_mapping).unique("time").sort("time")
    # `data` is a LazyFrame and `LazyFrame.join_asof` only accepts another
    # LazyFrame, so the eager state table is made lazy for the join.
    # Field samples without a state sample within one minute come out with
    # nulls and are dropped.
    data_sw = (
        data.join_asof(sat_state_sw.lazy(), on="time", tolerance="1m")
        .drop_nulls()
        .collect()
    )
    data_sw.write_parquet(output)

In [None]:
%%markdown
df = (
    sat_state_sw.upsample("time", every="1m")
    .group_by_dynamic("time", every="1d")
    .agg(pl.col("X, SSE").null_count().alias("null_count"))
    .with_columns(
        pl.when(pl.col("null_count") > 720).then(0).otherwise(1).alias("availablity")
    )
)

properties = {
    'width': 800,
}

chart1 = alt.Chart(df).mark_point().encode(
    x='time',
    y='null_count'
).properties(**properties)

chart2  = alt.Chart(df).mark_point().encode(
    x='time',
    y='availablity'
).properties(**properties)

(chart1 & chart2)

df = (
    sat_state_sw.upsample("time", every="1m")
    .group_by_dynamic("time", every="1d")
    .agg(pl.col("X, SSE").null_count().alias("null_count"))
    .with_columns(
        pl.when(pl.col("null_count") > 720).then(0).otherwise(1).alias("availablity")
    )
)

properties = {
    'width': 800,
}

chart1 = alt.Chart(df).mark_point().encode(
    x='time',
    y='null_count'
).properties(**properties)

chart2  = alt.Chart(df).mark_point().encode(
    x='time',
    y='availablity'
).properties(**properties)

(chart1 & chart2)


## Processing the whole data

In [None]:
#| eval: false
# Run configuration: probe, field components, window length and sampling step.
sat = "thb"
bcols = ["BX", "BY", "BZ"]
tau = timedelta(seconds=60)
data_resolution = timedelta(seconds=4)

# Input is the merged solar-wind file; the output name records the window length in seconds.
files = f"../data/{sat}_data_sw.parquet"
output = f'../data/{sat}_candidates_sw_tau_{tau.seconds}.parquet'

# Load everything eagerly and build the time-series view of the field components.
data = pl.scan_parquet(files).set_sorted('time').collect()
sat_fgm = df2ts(
    data,
    bcols,
    attrs={"coordinate_system": coord, "units": "nT"},
)
get_memory_usage(data)
get_memory_usage(sat_fgm)

indices = compute_indices(data, tau)

# Filter threshold: one third (floored) of the number of samples per window.
sparse_num = (tau / data_resolution) // 3
filter_condition = get_ID_filter_condition(sparse_num=sparse_num)

candidates_pl = indices.filter(filter_condition).with_columns(pl_format_time(tau))
candidates = convert_to_dataframe(candidates_pl)
get_memory_usage(candidates)
# del indices
# del indices

[32m2023-09-27 11:57:07.031[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_memory_usage[0m:[36m11[0m - [1m741.8 MB (DataFrame)[0m
[32m2023-09-27 11:57:07.031[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_memory_usage[0m:[36m11[0m - [1m222.6 MB (DataArray)[0m

    import ray
    ray.init()


27-Sep-23 11:57:11: Unable to poll TPU GCE metadata: HTTPConnectionPool(host='metadata.google.internal', port=80): Max retries exceeded with url: /computeMetadata/v1/instance/attributes/accelerator-type (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
27-Sep-23 11:57:11: Failed to detect number of TPUs: [Errno 2] No such file or directory: '/dev/vfio'
2023-09-27 11:57:12,367	INFO worker.py:1642 -- Started a local Ray instance.

Distributing Dataframe: 100%██████████ Elapsed time: 00:00, estimated remaining time: 00:00
Estimated completion of line 17: 100%██

6335828

In [None]:
#| eval: false
# Run `process_candidates`, keep one row per (d_time, d_tstart, d_tstop), then persist.
ids = process_candidates(candidates, sat_fgm, data, data_resolution).unique(
    ["d_time", "d_tstart", "d_tstop"]
)
ids.write_parquet(output)

Estimated completion of line 17: 100%██████████ Elapsed time: 00:00, estimated remaining time: 00:00


In [None]:
#| eval: false

# De-duplicating on `d_time` alone must keep as many rows as de-duplicating on
# the full (d_time, d_tstart, d_tstop) key, i.e. `d_time` already identifies a row.
test_eq(
    ids.unique(["d_time", "d_tstart", "d_tstop"]).shape,
    ids.unique("d_time").shape,
)

## Obsolete code

In [None]:
#| eval: false
import pycdfpp
import pyspedas

In [None]:
#| eval: false

def convert_thm_state_to_parquet(probe: str, trange):
    """Gather the X position components from THEMIS state CDF files into one parquet file.

    Parameters
    ----------
    probe : str
        Probe letter; used to build variable names such as ``th{probe}_pos_sse``.
    trange
        Object exposing ``start`` and ``end`` attributes, each providing a
        ``to_string()`` method.

    Returns
    -------
    str
        ``./data/th{probe}_state.parquet``. When that file already exists it is
        returned immediately and no CDF file is read.
    """
    file_name = f"./data/th{probe}_state.parquet"
    if os.path.exists(file_name):
        return file_name

    time_bounds = [trange.start.to_string(), trange.end.to_string()]
    cdf_paths = pyspedas.themis.state(
        probe=probe,
        trange=time_bounds,
        downloadonly=True,
        no_update=True,
    )

    sse_x_chunks, gse_x_chunks, time_chunks = [], [], []
    for cdf_path in cdf_paths:
        cdf = pycdfpp.load(cdf_path)
        # CATDESC: "thm_state_time, UTC, in seconds since 01-Jan-1970 00:00:00"
        epoch_seconds = cdf["time"].values
        # Column 0 of each position array is kept as the X component.
        sse_x_chunks.append(cdf[f"th{probe}_pos_sse"].values[:, 0])
        gse_x_chunks.append(cdf[f"th{probe}_pos_gse"].values[:, 0])
        time_chunks.append(epoch_seconds)

    thm_pos_sse_X = np.concatenate(sse_x_chunks)
    thm_pos_gse_X = np.concatenate(gse_x_chunks)
    thm_state_time = np.concatenate(time_chunks)

    state_frame = pl.DataFrame(
        {
            "thm_state_time": thm_state_time,
            "thm_pos_gse_X": thm_pos_gse_X,
            "thm_pos_sse_X": thm_pos_sse_X,
        }
    )
    # Interpret the time column as seconds since the Unix epoch.
    state_frame = state_frame.with_columns(
        pl.from_epoch(pl.col("thm_state_time"), time_unit="s")
    )
    state_frame.write_parquet(file_name)

    return file_name


def convert_thm_fgm_to_parquet(probe, trange):
    """Gather THEMIS FGM (fgl) CDF files for one probe into a single parquet file.

    Parameters
    ----------
    probe : str
        Probe letter; used to build variable names such as ``th{probe}_fgl_gse``.
    trange
        Object exposing ``start`` and ``end`` attributes, each providing a
        ``to_string()`` method.

    Returns
    -------
    str
        ``./data/th{probe}_fgm.parquet``. When that file already exists it is
        returned immediately and no CDF file is read.

    Notes
    -----
    The written table has the columns ``time``, ``BX``, ``BY``, ``BZ`` and ``B``.
    """
    file_name = f"./data/th{probe}_fgm.parquet"
    if os.path.exists(file_name):
        return file_name

    start = trange.start.to_string()
    end = trange.end.to_string()

    files = pyspedas.themis.fgm(
        probe=probe,
        trange=[start, end],
        downloadonly=True,
        no_update=True,
    )

    thm_fgl_gses = []
    thm_fgl_btotals = []
    thm_fgl_times = []

    for file in files:
        cdf = pycdfpp.load(file)
        thm_fgl_gses.append(cdf[f"th{probe}_fgl_gse"].values)
        thm_fgl_btotals.append(cdf[f"th{probe}_fgl_btotal"].values)
        thm_fgl_times.append(cdf[f"th{probe}_fgl_time"].values)

    thm_fgl_gse = np.concatenate(thm_fgl_gses)
    thm_fgl_btotal = np.concatenate(thm_fgl_btotals)
    thm_fgl_time = np.concatenate(thm_fgl_times)

    pl.DataFrame(
        {
            "time": thm_fgl_time,
            # Columns 0..2 of the field array become the three components.
            "BX": thm_fgl_gse[:, 0],
            "BY": thm_fgl_gse[:, 1],
            "BZ": thm_fgl_gse[:, 2],
            "B": thm_fgl_btotal,
        }
    ).with_columns(
        # The epoch values live in the "time" column of this frame; there is
        # no "thm_fgl_time" column to convert. They are interpreted as seconds
        # since the Unix epoch and the column keeps its name.
        pl.from_epoch(pl.col("time"), time_unit="s"),
    ).write_parquet(
        file_name
    )

    return file_name

In [None]:
#| eval: false
# Convert the state and FGM CDF files of the selected probe to parquet.
# NOTE(review): where `trange` is defined in this notebook it is a plain list of
# date strings, while both converters read `trange.start` / `trange.end` —
# confirm the expected type before re-enabling this cell.
convert_thm_state_to_parquet(probe, trange)
convert_thm_fgm_to_parquet(probe, trange)