# Opening data with open_dataset() via:
1) add_dataset_path(URL without the file name) -> open_dataset(file name without the .zarr extension)
2) _open_dataset(URL including the file name, i.e. ending in filename.zarr)

#### Findings:

1) When trying to __open a Zarr stored in GS by passing either the GS URL or the HTTPS URL__
   
- JSON decoding error:

File ~/anemoi/gh_most_uptodate/troubleshooting/anemoi-datasets/src/anemoi/datasets/data/stores.py:171, in open_zarr(path, dont_fail, cache)
    168         store = zarr.LRUStoreCache(store, max_size=cache)
    169         print("44")
--> 171     __return zarr.convenience.open(store, "r")__
    172 except zarr.errors.PathNotFoundError:
    173     if not dont_fail:

- Missing 'data' feature variable within the Zarr store
   - *To get around error, create a 'data' variable within the Zarr store
     
2) When trying to __open zarr stored in S3 via declaring S3 URL__
   
- JSON decoding error:

File ~/anemoi/gh_most_uptodate/troubleshooting/anemoi-datasets/src/anemoi/datasets/data/stores.py:171, in open_zarr(path, dont_fail, cache)
    168         store = zarr.LRUStoreCache(store, max_size=cache)
    169         print("44")
--> 171     __return zarr.convenience.open(store, "r")__
    172 except zarr.errors.PathNotFoundError:
    173     if not dont_fail:

- Missing 'data' feature variable within the Zarr store
   - To get around error, create a 'data' variable within the Zarr store
     
3) When trying to __open zarr stored in S3 via declaring HTTPS URL__

- open_dataset() returns an output once the workaround marked with * below is applied, BUT errors occur when getting:
   - metadata from zarr, as attributes are empty
   - dates from zarr and thus, frequency from zarr
   - statistics from zarr
   - resolution from zarr
   - name_to_index from zarr and thus, variables from zarr
     
- Missing 'data' feature variable within the Zarr store
   - *To get around error, create a 'data' variable within the Zarr store 

4) When trying to __open zarr stored on local disk__

- open_dataset() returns an output once the workaround marked with * below is applied, BUT errors occur when getting:
   - metadata from zarr, as attributes are empty
   - dates from zarr and thus, frequency from zarr
   - statistics from zarr
   - resolution from zarr
   - name_to_index from zarr and thus, variables from zarr
     
- Missing 'data' feature variable within the Zarr store
   - *To get around error, create a 'data' variable within the Zarr store 

In [None]:
# Zarr hosted on Google Cloud Storage (GS).
from __init__ import add_dataset_path, open_dataset

# Register the directory URL (no file name); the dataset is then opened by
# name, without the .zarr extension.
# NOTE(review): this is the Cloud Console *browser* address, which presumably
# serves an HTML page rather than the store's objects and may explain the JSON
# decoding error -- confirm whether
# https://storage.googleapis.com/gcp-public-data-arco-era5/ar/ was intended.
add_dataset_path("https://console.cloud.google.com/storage/browser/gcp-public-data-arco-era5/ar/")
# Alternative using the native GS scheme (disabled):
# add_dataset_path("gs://gcp-public-data-arco-era5/ar/")

# Open the entire dataset, without any filter.
ds = open_dataset("1959-2022-1h-360x181_equiangular_with_poles_conservative")
ds  # last expression of the cell, so the notebook displays it

In [None]:
# Zarr hosted on Amazon S3.
# list_dataset_names is imported for convenience but not called in this cell.
from __init__ import add_dataset_path, open_dataset, list_dataset_names

# Register the bucket's HTTPS directory URL (no file name).
add_dataset_path("https://noaa-ufs-gdas-pds.s3.amazonaws.com/")
# Alternative using the native S3 scheme (disabled):
# add_dataset_path("s3://noaa-ufs-gdas-pds/")

# Open the entire dataset, without any filter.
ds = open_dataset("test_ar")
ds  # last expression of the cell, so the notebook displays it

In [None]:
# Zarr hosted on the ECMWF object store.
from __init__ import add_dataset_path, open_dataset

# Register the directory URL (no file name).
add_dataset_path("https://object-store.os-api.cci1.ecmwf.int/ml-examples/")

# Open the entire dataset, without any filter.
ds = open_dataset("an-oper-2023-2023-2p5-6h-v1")

# Subsetting means filtering the dataset along its first dimension (dates).
# The bare string below is only a note block; it has no effect when run.
'''
** start & end: **
The following are equivalent ways of describing start or end:

2020 and "2020"
202306, "202306" and "2023-06"
20200301, "20200301" and "2020-03-01"

Note: 
- start="2020" is equivalent to start="2020-01-01" while end="2020" is equivalent to end="2020-12-31".
- Frequency of the dataset will change how the end option is interpreted: - end="2020" with a frequency of one hour 
is equivalent to end="2020-12-31 23:00:00" - end="2020" with a frequency of 6 hours is equivalent to end="2020-12-31 18:00:00"

** frequency **
- The new frequency must be a multiple of the original frequency.
- To artificially increase the frequency, you can use the "interpolate_frequency" option, which will create new dates in the 
dataset by linearly interpolating the data values between the original dates. Ex: ds = open_dataset(dataset, interpolate_frequency="10m")

'''
# Subsetting examples (all disabled). According to ds.dates, this ECMWF ERA5
# file's metadata covers 2023-01-01T00:00:00 to 2023-12-31T18:00:00.
#
# Explicit start and end dates:
# ds = open_dataset("an-oper-2023-2023-2p5-6h-v1", start="2023-1-01", end="2023-12-30")
#
# Year shorthand for start and end:
# ds = open_dataset("an-oper-2023-2023-2p5-6h-v1", start="2023", end="2023")
#
# Explicit frequency; it must be a multiple of the file's 6h step (6h, 12h,
# etc.), otherwise an error is received:
# ds = open_dataset("an-oper-2023-2023-2p5-6h-v1", frequency = "6h", start="2023", end="2023")
#
# Interpolating to a finer frequency:
# ds = open_dataset("an-oper-2023-2023-2p5-6h-v1", interpolate_frequency="1h", start="2023", end="2023")
#
# Passing the same dataset name twice (thinning=2 was left commented out):
# ds = open_dataset("an-oper-2023-2023-2p5-6h-v1", "an-oper-2023-2023-2p5-6h-v1")#, thinning=2)
ds  # last expression of the cell, so the notebook displays it


In [None]:
# Zarr stored on the local disk.
import os

# list_dataset_names is imported for convenience but not called in this cell.
from __init__ import add_dataset_path, open_dataset, list_dataset_names

# Register the current working directory as the place to look up datasets.
add_dataset_path(os.getcwd())

# Open the entire dataset, without any filter.
ds = open_dataset("gcp_ar_era5_subset")
ds  # last expression of the cell, so the notebook displays it

# Read output info of open_dataset(ZARR_FILE)

In [None]:
# Demo: methods of the object returned by open_dataset.
#
# `ds` is whatever the most recently executed cell above assigned to it.
# Only ds.metadata() is live in this cell; every other call is left commented
# out as a reference and can be enabled one at a time.

# The following methods and attributes are available on the objects returned
# by open_dataset.

# Return the dataset’s metadata (the only live statement, so it is what the
# notebook displays for this cell).
ds.metadata()

# Return the dataset’s provenance information.
#ds.provenance

# For debugging. Given the index of a variable, this will return from which
# Zarr store it will be loaded. This is useful to debug combining datasets
# with join.
#ds.source

# For debugging. Return the dataset’s internal tree structure.
#ds.tree()

# Demo: attributes of the object returned by open_dataset.
# Background on building a dataset:
# https://anemoi-datasets.readthedocs.io/en/latest/building/introduction.html

# shape: A tuple of the dataset’s dimensions.
#ds.shape

# field_shape: The original shape of a single field, either 1D or 2D. When
# building datasets, the fields are flattened to 1D.
#ds.field_shape

# dtype: The dataset’s NumPy data type.
#ds.dtype

# dates: The dataset’s dates, as a NumPy vector of datetime64 objects.
#ds.dates

# frequency: The dataset’s frequency (i.e. the delta between two consecutive
# dates) in hours.
#ds.frequency

# latitudes: The dataset’s latitudes as a NumPy vector.
#ds.latitudes

# longitudes: The dataset’s longitudes as a NumPy vector.
#ds.longitudes

# statistics: The dataset’s statistics. This is a dictionary whose entries
# include "mean" and "stdev" (both used in the example below).
# ds.statistics

# Example: each entry is a NumPy vector with the same length as the number of
# variables, each element corresponding to a variable. You can therefore use
# it like:
#values = ds[0]
#normalized = (values - ds.statistics["mean"])/ ds.statistics["stdev"]


# resolution: The dataset’s resolution.
#ds.resolution

# name_to_index: A dictionary mapping variable names to their indices.
#ds.name_to_index["2t"]

# variables: A list of the dataset’s variable names, in the order they appear
# in the dataset.
#ds.variables

# missing: The set of indices of the missing dates.
#ds.missing

# grids: A tuple of number of grid points for each dataset that is combined
# with the grids method.
#ds.grids

# Demo: slicing & indexing a dataset opened via open_dataset.
#ds[0]
#ds[-1]
#ds[0:10]
#ds[0:10:2]
#ds[0, 1, :]

# TODO: (10/10/24) left off in the guide at
# https://anemoi-datasets.readthedocs.io/en/latest/using/combining.html


# Create a subset of the ERA5 AR Zarr stored in GS & save it to local disk

In [None]:
import xarray as xr
import gcsfs

## Option 1 (active): load the Zarr via open_zarr(direct gs:// URL).
# Attributes are not extracted UNLESS you select a variable first and then
# read <dataset>[VARIABLE_NAME].attrs.
# NOTE(review): slice(-15, -1) stops before the final time step, so 14 steps
# are kept -- confirm that dropping the last step is intended.
gcp_ar_era5_subset = xr.open_zarr(
    'gs://gcp-public-data-arco-era5/ar/1959-2022-1h-360x181_equiangular_with_poles_conservative.zarr',
    chunks={'time': 48},
    consolidated=True,
).isel(time=slice(-15, -1))
gcp_ar_era5_subset  # last live expression of the cell, so the notebook displays it


## Option 2 (disabled): load the Zarr via open_zarr(fs.get_mapper(gs:// URL)).
# bucket_name = "gcp-public-data-arco-era5"
# fs = gcsfs.GCSFileSystem(project=f'{bucket_name}_fs')

# print(f"Data Categories (higher-level prefix):\n{fs.ls(bucket_name)}")
# mapper = fs.get_mapper(f"gs://{bucket_name}/ar/1959-2022-1h-360x181_equiangular_with_poles_conservative.zarr")

# ds = xr.open_zarr(mapper)
# gcp_ar_era5_subset = ds.isel(time=slice(-10, -5))
# gcp_ar_era5_subset

# Save the subset to local disk (disabled; enable to write the store).
#gcp_ar_era5_subset.to_zarr('gcp_ar_era5_subset.zarr')

# Verify the generated anemoi-formatted Zarr

In [None]:
# import xarray as xr
# import gcsfs

# ## Load Zarr
# vars = ['2m_temperature',
#         '10m_u_component_of_wind',
#         'geopotential',
#         '10m_v_component_of_wind',
#         'surface_pressure']
# gcp_ar_era5_subset = xr.open_zarr("test_s3_zarr.zarr")
# gcp_ar_era5_subset 