In [1]:
#| default_exp utils/speasy
#| export
import speasy as spz
import polars as pl

from pydantic import model_validator
from space_analysis.core import Variables as Vs

from speasy.core.dataprovider import DataProvider
from speasy import SpeasyVariable
from speasy.core.inventory import DatasetIndex, ParameterIndex

from fastcore.all import patch

In [2]:
#| export
def spzvar2pldf(var: SpeasyVariable):
    """Convert a single SpeasyVariable into a lazy polars frame.

    The variable's values become the data columns (named after
    ``var.columns``) and its time axis is appended as a ``time`` column.
    ``replace_fillval_by_nan()`` is applied to the variable first.

    See also ``SpeasyVariable.to_dataframe``.
    """
    cleaned = var.replace_fillval_by_nan()
    values_frame = pl.DataFrame(cleaned.values, schema=cleaned.columns)
    with_time = values_frame.with_columns(time=pl.Series(cleaned.time))
    # Switch to lazy mode only at the very end; doing it earlier led to
    # "ShapeError: unable to add a column of length xxxx to a DataFrame of height yyyy".
    return with_time.lazy()
    
    
def spzvars2pldf(vars: list[SpeasyVariable]):
    """Convert several SpeasyVariables into one lazy polars frame.

    Each variable is converted with ``spzvar2pldf``; the resulting frames are
    combined with ``pl.concat(..., how='align')`` (joined on their shared
    columns, i.e. the ``time`` column). A single-element list skips the
    concat and returns that variable's frame directly.
    """
    if len(vars) != 1:
        lazy_frames = [spzvar2pldf(item) for item in vars]
        return pl.concat(lazy_frames, how='align')
    return spzvar2pldf(vars[0])

In [3]:
#| export
@patch
def time_resolutions(self: SpeasyVariable):
    """Return summary statistics of the steps between consecutive time stamps.

    The time axis is wrapped in a polars Series, differenced, and summarised
    with ``describe()``.
    """
    timestamps = pl.Series(self.time)
    steps = timestamps.diff()
    return steps.describe()
        

In [4]:
# | export
def get_provider(v: str) -> DataProvider:
    """Map a provider name to the corresponding speasy data provider.

    ``"cda"`` selects ``spz.cda``; every other value falls back to ``spz.amda``.
    """
    return spz.cda if v == "cda" else spz.amda


def get_dataset_index(v: str, provider: str) -> DatasetIndex:
    """Look up dataset ``v`` in the flat inventory of the named provider.

    The provider object is resolved with ``get_provider(provider)``.
    """
    data_provider = get_provider(provider)
    datasets = data_provider.flat_inventory.datasets
    return datasets[v]


class Variables(Vs):
    """Variable set whose data is fetched through speasy.

    Extends the base ``Variables`` model with speasy product identifiers,
    the retrieved ``SpeasyVariable`` objects, and helpers to download the
    data and convert it to polars.
    """

    # Speasy products to request: either "provider/dataset/parameter" strings
    # or inventory ParameterIndex entries. Filled in by `check_products` when
    # left as None.
    products: list[str | ParameterIndex] = None

    # Variables returned by `spz.get_data`; set by `retrieve_data`.
    data: list[SpeasyVariable] = None

    # Forwarded as `disable_proxy` to `spz.get_data` for non-local providers.
    _disable_proxy: bool = True

    # Initialize products from provider and dataset if not provided
    @model_validator(mode="after")
    def check_products(self):
        """Derive ``products`` (and, if needed, ``parameters``) from the dataset.

        Runs only when ``products`` is None and ``dataset`` is set:

        * with ``parameters`` given, products are the strings
          ``"<provider>/<dataset>/<parameter>"``;
        * otherwise every ``ParameterIndex`` attribute of the dataset's
          inventory entry becomes a product, and ``parameters`` is filled
          with their ``spz_name()`` values.

        Returns the validated instance, as pydantic requires of
        ``mode="after"`` model validators.
        """
        if self.products is None and self.dataset:
            if self.parameters:
                self.products = [
                    f"{self.provider}/{self.dataset}/{var}" for var in self.parameters
                ]

            else:
                dataset_index = get_dataset_index(self.dataset, self.provider)
                # Read `__dict__` directly instead of calling the `vars`
                # builtin: a module/notebook-level variable named `vars`
                # would shadow the builtin and break this lookup.
                self.products = [
                    member
                    for member in dataset_index.__dict__.values()
                    if isinstance(member, ParameterIndex)
                ]
                self.parameters = [member.spz_name() for member in self.products]

        # An "after" validator must hand the instance back; returning None
        # can make `model_validate` yield None and triggers warnings in
        # recent pydantic releases.
        return self

    def retrieve_data(self):
        """Download ``products`` over ``timerange`` and store them in ``data``.

        For providers whose name contains ``'local'`` the request is made
        without the ``disable_proxy`` argument; otherwise ``_disable_proxy``
        is forwarded. Returns ``self`` so calls can be chained.
        """
        if 'local' in self.provider:
            self.data = spz.get_data(
                self.products, self.timerange
            )
        else:
            self.data = spz.get_data(
                self.products, self.timerange, disable_proxy=self._disable_proxy
            )
        return self

    @property
    def time_resolutions(self) -> pl.DataFrame:
        """Time-step summary of the first variable returned by ``get_data()``.

        NOTE(review): ``get_data`` is not defined in this class; presumably
        it is inherited from the base ``Variables`` model — confirm.
        """
        return self.get_data()[0].time_resolutions()

    def to_polars(self):
        """Return all variables from ``get_data()`` as one lazy polars frame."""
        return spzvars2pldf(self.get_data())

### Test

In [8]:
# Eleven-hour request window on 2019-04-07.
timerange = ['2019-04-07T01:00', '2019-04-07T12:00']
# NOTE(review): the name `vars` shadows the Python builtin in this notebook's
# namespace; it is kept because the following cell refers to it.
vars = Variables(
    timerange=timerange,
    parameters=["psp_fld_l2_mag_RTN"],
    dataset="PSP_FLD_L2_MAG_RTN",
).retrieve_data()



In [12]:
# Show the summary from the patched SpeasyVariable method next to the one
# exposed by the Variables.time_resolutions property.
(
    vars.data[0].time_resolutions(),
    vars.time_resolutions,
)

(shape: (8, 2)
 ┌────────────┬────────────────┐
 │ statistic  ┆ value          │
 │ ---        ┆ ---            │
 │ str        ┆ str            │
 ╞════════════╪════════════════╡
 │ count      ┆ 5800765        │
 │ null_count ┆ 1              │
 │ mean       ┆ 0:00:00.006826 │
 │ min        ┆ 0:00:00.006690 │
 │ 25%        ┆ 0:00:00.006826 │
 │ 50%        ┆ 0:00:00.006826 │
 │ 75%        ┆ 0:00:00.006826 │
 │ max        ┆ 0:00:00.006935 │
 └────────────┴────────────────┘,
 shape: (8, 2)
 ┌────────────┬────────────────┐
 │ statistic  ┆ value          │
 │ ---        ┆ ---            │
 │ str        ┆ str            │
 ╞════════════╪════════════════╡
 │ count      ┆ 5800765        │
 │ null_count ┆ 1              │
 │ mean       ┆ 0:00:00.006826 │
 │ min        ┆ 0:00:00.006690 │
 │ 25%        ┆ 0:00:00.006826 │
 │ 50%        ┆ 0:00:00.006826 │
 │ 75%        ┆ 0:00:00.006826 │
 │ max        ┆ 0:00:00.006935 │
 └────────────┴────────────────┘)

In [53]:
def data_provider_summary(data_provider: DataProvider = spz.cda):
    """Print a short summary of a speasy data provider.

    Shows the provider's name followed by the number of datasets,
    parameters and catalogs in its flat inventory.
    """
    inventory = data_provider.flat_inventory
    print("Data Provider:", data_provider.provider_name)
    # One line per inventory section, in the order datasets, parameters, catalogs.
    sections = (
        ("Datasets:", "datasets"),
        ("Parameters:", "parameters"),
        ("Catalogs:", "catalogs"),
    )
    for label, attribute in sections:
        print(label, len(getattr(inventory, attribute)))

# data_provider_summary(spz.cda)
# data_provider_summary(spz.amda)
# data_provider_summary(spz.csa)

Data Provider: cda
Datasets: 2608
Parameters: 58510
Catalogs: 0
Data Provider: amda
Datasets: 1074
Parameters: 5397
Catalogs: 24
Data Provider: csa
Datasets: 912
Parameters: 1993
Catalogs: 0


In [None]:
from fastcore.utils import patch
from speasy.products import SpeasyVariable
from humanize import naturalsize

In [None]:
@patch
def preview(self: SpeasyVariable):
    """Print a framed, human-readable overview of this variable.

    The overview lists the name, columns, unit, memory usage and axes
    labels, then the meta-data, then the first three entries of the time
    axis and of the values, separated by horizontal rules.
    """
    outer_rule = "==========================================="
    inner_rule = "-------------------------------------------"

    print(outer_rule)
    print(f"Name:         {self.name}")
    print(f"Columns:      {self.columns}")
    print(f"Values Unit:  {self.unit}")
    # naturalsize renders the byte count in a human-friendly form.
    print(f"Memory usage: {naturalsize(self.nbytes)}")
    print(f"Axes Labels:  {self.axes_labels}")

    print(inner_rule)
    print(f"Meta-data:    {self.meta}")

    print(inner_rule)
    head_of_time = self.time[:3]
    print(f"Time Axis:    {head_of_time}")

    print(inner_rule)
    head_of_values = self.values[:3]
    print(f"Values:       {head_of_values}")
    print(outer_rule)
