In [None]:
import contextlib
import math
from pathlib import Path
# import time

from ipyfilechooser import FileChooser

import numpy as np
import s3fs
import xarray as xr
import zarr

import opensarlab_lib as osl

import matplotlib.pyplot as plt

**Create aliases for the Jupyter `%timeit` magic that run 1, 10, 100, and 1000 iterations**

These aliases can be used to time your code. 

`elapsed_time = %t100 foo(bar)`

The above code will time running `foo(bar)` over 100 iterations.
    

In [None]:
# Each alias wraps %timeit with fixed options: -n sets the number of loops,
# -r 1 performs a single run, -t selects time.time as the timer, -q suppresses
# the printed summary, and -o returns the result object so it can be assigned
# (e.g. `result = %t100 foo(bar)`).
%alias_magic t1 timeit -p "-t -q -r 1 -n 1 -o" --line
%alias_magic t10 timeit -p "-t -q -r 1 -n 10 -o" --line
%alias_magic t100 timeit -p "-t -q -r 1 -n 100 -o" --line
%alias_magic t1000 timeit -p "-t -q -r 1 -n 1000 -o" --line

**Write a function to plot bar charts comparing Zarr store access times**

In [None]:
def plot_zarr_access_time_compare(times, units, title, figsize=(10, 10)):
    """Draw a bar chart comparing Zarr store access times.

    The fastest entry is drawn in green and every other entry in red. Each
    bar is labelled with its value and ``units``.

    Parameters
    ----------
    times : dict
        Maps a store label (e.g. ``'Temporally Optimized'``) to its access
        time. Insertion order sets the left-to-right bar order.
    units : str
        Unit string used in the y-axis label and the bar labels (e.g. ``'ms'``).
    title : str
        Figure title.
    figsize : tuple, optional
        Figure size in inches, forwarded to ``plt.subplots``.

    Raises
    ------
    ValueError
        If ``times`` is empty.
    """
    if not times:
        raise ValueError("times must contain at least one measurement")

    columns = list(times.keys())
    data = [float(t) for t in times.values()]

    # Leave 25% headroom above the tallest bar and place five evenly spaced
    # ticks. Scaling from the raw maximum (rather than its floor) keeps the
    # tick step non-zero when every measurement is below 1 unit.
    upper = max(data) * 1.25
    if not upper > 0:  # all-zero (or NaN) input: fall back to a unit axis
        upper = 1.0
    y_ticks = np.linspace(0.0, upper, 5)

    # Green marks the fastest store (first one wins a tie); red marks the rest.
    fastest = data.index(min(data))
    edge_colors = ['mediumseagreen' if i == fastest else 'tomato' for i in range(len(data))]
    face_colors = ['honeydew' if i == fastest else 'lightsalmon' for i in range(len(data))]

    bar_width = 0.4
    positions = [i * 0.45 for i in range(len(data))]

    fig, ax = plt.subplots(figsize=figsize)
    bars = ax.bar(positions, data, bar_width, linewidth=3, color=face_colors, edgecolor=edge_colors)
    ax.bar_label(bars, labels=[f"{t:.2f} {units}" for t in data], label_type='center')

    ax.set_ylabel(f"Zarr Store Access Time in {units}")
    ax.set_yticks(y_ticks)
    ax.set_xticks(positions)
    ax.set_xticklabels([f"{c}\nZarr Store" for c in columns])

    ax.set_title(title)
    plt.show()

**Are you accessing a local or remote Zarr store in S3?**

In [None]:
# Offer the two store locations through opensarlab_lib's select_parameter
# helper and display the returned object; its `.value` is read in the next cell.
store_type = osl.select_parameter(['local Zarr store', 'remote zarr store in S3'])
display(store_type)

In [None]:
# True when the 'local Zarr store' option is selected; gates the local vs. S3 cells below.
local = store_type.value == 'local Zarr store'

**Select a space optimized zarr store RTC stack and open it with xarray**

The handle of the open spatially optimized data set is `space_op`

In [None]:
# if local:
#     fc_1 = FileChooser(Path.cwd())
#     display(fc_1)

In [None]:
if local:
    # store_1 = Path(fc_1.selected_path)
    # NOTE(review): hard-coded absolute path, presumably specific to the
    # author's environment — confirm, or restore the FileChooser selection.
    space_store = "/home/jovyan/notebooks/SAR_Training/English/Master/mekong/stack_space_optimized_100MB/"
    # Open the store using its consolidated metadata; `space_op` is the handle used below.
    space_op = xr.open_zarr(store=space_store, consolidated=True)

**Select a time optimized zarr store RTC stack and open it with xarray**

The handle of the open temporally optimized data set is `time_op`

In [None]:
# if local:
#     fc_2 = FileChooser(Path.cwd())
#     display(fc_2)

In [None]:
if local:
    # store_2 = Path(fc_2.selected_path)
    # NOTE(review): hard-coded absolute path, presumably specific to the
    # author's environment — confirm, or restore the FileChooser selection.
    time_store = "/home/jovyan/notebooks/SAR_Training/English/Master/mekong/stack_time_optimized_100MB/"  
    # Open the store using its consolidated metadata; `time_op` is the handle used below.
    time_op = xr.open_zarr(store=time_store, consolidated=True)

**If using remote Zarr stores, this assumes you are working with 2 groups in the same store**

In [None]:
if not local:
    # s3_path = input("Enter the S3 bucket path to your Zarr store")
    s3_path = "s3://asf-jupyter-data-west/zarr_test/mekong"
    # Anonymous (unsigned) S3 access; both groups are read through one mapping rooted at s3_path.
    s3 = s3fs.S3FileSystem(anon=True)
    store = s3fs.S3Map(root=s3_path, s3=s3, check=False)

    # space_group = input("Enter the name of the group containing a spatially optimized zarr store stack")
    space_group = 'stack_space_optimized_100MB_chunks'
    # Same handle name as the local branch so the cells below work for either source.
    space_op = xr.open_zarr(store=store, consolidated=True, group=space_group)                 

    # time_group = input("Enter the name of the group containing a temporally optimized zarr store stack")
    time_group = 'stack_time_optimized_100MB_chunks'
    time_op = xr.open_zarr(store=store, consolidated=True, group=time_group)

**Explore the Datasets and compare their chunking**

In [None]:
# Display the spatially optimized Dataset.
space_op

In [None]:
# Display the temporally optimized Dataset.
time_op

In [None]:
# Print each Dataset's chunk layout so the two chunking schemes can be compared.
print("Time optimized chunking:", time_op.chunks, sep="\n")
print("\nSpatially optimized chunking:", space_op.chunks, sep="\n")

---
# 1: Speed test accessing arrays of pixels in various dimensions using time vs. spatially optimized zarr stores using xarray

### 1.1: Time creating a point time series xarray.Dataset (containing vv and vh backscatter xarray.DataArrays) from temporally-optimized vs spatially-optimized zarr stores

In [None]:
# Time 1000 loops of selecting one x/y location (all time steps) from the temporally optimized store.
# NOTE(review): only .sel() is timed; whether chunk data is actually read here
# depends on how the Dataset is backed — confirm this measures what is intended.
time_xarray_from_time_op_time = %t1000 time_op.sel(x=slice(675090., 675090.), y=slice(1022430., 1022430.))
# Display the returned timing result.
time_xarray_from_time_op_time

In [None]:
# Time 1000 loops of the same x/y point selection, this time from the spatially optimized store.
time_array_from_space_op_time = %t1000 space_op.sel(x=slice(675090., 675090.), y=slice(1022430., 1022430.))
# Display the returned timing result.
time_array_from_space_op_time

In [None]:
# Mean time-optimized run time as a percentage of the mean space-optimized run time.
percentage = (
    100 * time_xarray_from_time_op_time.average
    / time_array_from_space_op_time.average
)
print(
    "\nAccessing the data from a temporally optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a spatially optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Scale the mean run times by 1000 so they match the 'ms' label used in the plot.
times = {
    label: result.average * 1000
    for label, result in (
        ('Temporally Optimized', time_xarray_from_time_op_time),
        ('Spatially Optimized', time_array_from_space_op_time),
    )
}
title = (
    'Point Time Series Zarr Store Access:\n'
    'Temporally Optimized store vs Spatially Optimized store\n'
    ' using xarray functions'
)
plot_zarr_access_time_compare(times, units='ms', title=title)

---

### 1.2: Time creating an xarray.Dataset (containing vv and vh backscatter xarray.DataArrays) across all x coords for a given time and y coord from temporally-optimized vs spatially-optimized zarr stores

In [None]:
# Time 1000 loops of selecting one time step and one y value (all x) from the spatially optimized store.
x_array_from_space_optimized_time = %t1000 space_op.sel(time=slice('20180106T224528', '20180106T224528'), y=slice(1022430., 1022430.))
# Display the returned timing result.
x_array_from_space_optimized_time

In [None]:
# Time 1000 loops of the same time/y selection (all x) from the temporally optimized store.
x_array_from_time_optimized_time = %t1000 time_op.sel(time=slice('20180106T224528', '20180106T224528'), y=slice(1022430., 1022430.))
# Display the returned timing result.
x_array_from_time_optimized_time

In [None]:
# Mean space-optimized run time as a percentage of the mean time-optimized run time.
percentage = (
    100 * x_array_from_space_optimized_time.average
    / x_array_from_time_optimized_time.average
)
print(
    "\nAccessing the data from a spatially optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a temporally optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Scale the mean run times by 1000 so they match the 'ms' label used in the plot.
times = {
    label: result.average * 1000
    for label, result in (
        ('Temporally Optimized', x_array_from_time_optimized_time),
        ('Spatially Optimized', x_array_from_space_optimized_time),
    )
}
title = (
    'x coord spatial Zarr Store Access:\n'
    'Temporally Optimized store vs Spatially Optimized store\n'
    ' using xarray functions'
)
plot_zarr_access_time_compare(times, units='ms', title=title)

---
### 1.3: Time creating an xarray.Dataset (containing vv and vh backscatter xarray.DataArrays) across all y coords for a given time and x coord from temporally-optimized vs spatially-optimized zarr stores

In [None]:
# Time 1000 loops of selecting one time step and one x value (all y) from the spatially optimized store.
y_array_from_space_optimized_time = %t1000 space_op.sel(time=slice('20180106T224528', '20180106T224528'), x=slice(675090., 675090.))
# Display the returned timing result.
y_array_from_space_optimized_time

In [None]:
# Time 1000 loops of the same time/x selection (all y) from the temporally optimized store.
y_array_from_time_optimized_time = %t1000 time_op.sel(time=slice('20180106T224528', '20180106T224528'), x=slice(675090., 675090.))
# Display the returned timing result.
y_array_from_time_optimized_time

In [None]:
# Mean space-optimized run time as a percentage of the mean time-optimized run time.
percentage = (
    100 * y_array_from_space_optimized_time.average
    / y_array_from_time_optimized_time.average
)
print(
    "\nAccessing the data from a spatially optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a temporally optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Scale the mean run times by 1000 for plotting, as in sections 1.1 and 1.2.
times = {
    'Temporally Optimized': y_array_from_time_optimized_time.average*1000,
    'Spatially Optimized': y_array_from_space_optimized_time.average*1000
}
title = 'y coord spatial Zarr Store Access:\nTemporally Optimized store vs Spatially Optimized store\n using xarray functions'
# The values above are scaled by 1000, so the unit label must be 'ms'
# (it was 's', which mislabelled the bars and axis by a factor of 1000).
plot_zarr_access_time_compare(times, 'ms', title)

---
---
# 2: Speed test creating similar arrays as above but convert them to numpy.ndarrays

- Uses xarray functions to access the desired data and then converts them to numpy.ndarrays

### 2.1: Time creating a point time series xarray.Dataset from temporally-optimized vs spatially-optimized zarr stores and converting the VV and VH backscatter xarray.DataArrays to numpy.ndarrays

In [None]:
# Time 10 loops on the temporally optimized store; each loop runs both `;`-separated
# statements: select the point time series and convert it to NumPy for vh, then for vv.
time_ndarray_from_time_optimized_time = %t10 time_op.vh_backscatter.sel(x=slice(675090., 675090.), y=slice(1022430., 1022430.)).to_numpy(); time_op.vv_backscatter.sel(x=slice(675090., 675090.), y=slice(1022430., 1022430.)).to_numpy()
# Display the returned timing result.
time_ndarray_from_time_optimized_time

In [None]:
# Time 10 loops of the same vh + vv point time-series conversion on the spatially optimized store.
time_ndarray_from_space_optimized_time = %t10 space_op.vh_backscatter.sel(x=slice(675090., 675090.), y=slice(1022430., 1022430.)).to_numpy(); space_op.vv_backscatter.sel(x=slice(675090., 675090.), y=slice(1022430., 1022430.)).to_numpy();
# Display the returned timing result.
time_ndarray_from_space_optimized_time

In [None]:
# Mean time-optimized run time as a percentage of the mean space-optimized run time.
percentage = (
    100 * time_ndarray_from_time_optimized_time.average
    / time_ndarray_from_space_optimized_time.average
)
print(
    "\nAccessing the data from a temporally optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a spatially optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Mean run time per store, passed through unscaled and labelled 's'.
times = {
    label: result.average
    for label, result in (
        ('Temporally Optimized', time_ndarray_from_time_optimized_time),
        ('Spatially Optimized', time_ndarray_from_space_optimized_time),
    )
}
title = (
    'Point Time Series Zarr Store Access and Conversion to Numpy.ndarrays:\n'
    'Temporally Optimized store vs Spatially Optimized store'
)
plot_zarr_access_time_compare(times, units='s', title=title)

---
### 2.2: Time creating an xarray.Dataset across all x coords for a given time and y coord from temporally-optimized vs spatially-optimized zarr stores and converting the VV and VH backscatter xarray.DataArrays to numpy.ndarrays

In [None]:
# Time 10 loops on the spatially optimized store; each loop selects one time step and
# one y value (all x) and converts to NumPy for vh, then for vv.
x_ndarray_from_space_optimized_time = %t10 space_op.vh_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), y=slice(1022430., 1022430.)).to_numpy(); space_op.vv_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), y=slice(1022430., 1022430.)).to_numpy()
# Display the returned timing result.
x_ndarray_from_space_optimized_time

In [None]:
# Time 10 loops of the same vh + vv time/y selection and NumPy conversion on the temporally optimized store.
x_ndarray_from_time_optimized_time = %t10 time_op.vh_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), y=slice(1022430., 1022430.)).to_numpy(); time_op.vv_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), y=slice(1022430., 1022430.)).to_numpy()
# Display the returned timing result.
x_ndarray_from_time_optimized_time

In [None]:
# Mean space-optimized run time as a percentage of the mean time-optimized run time.
percentage = (
    100 * x_ndarray_from_space_optimized_time.average
    / x_ndarray_from_time_optimized_time.average
)
print(
    "\nAccessing the data from a spatially optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a temporally optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Mean run time per store, passed through unscaled and labelled 's'.
times = {
    label: result.average
    for label, result in (
        ('Temporally Optimized', x_ndarray_from_time_optimized_time),
        ('Spatially Optimized', x_ndarray_from_space_optimized_time),
    )
}
title = (
    'x coord spatial Zarr Store Access and conversion to numpy.ndarrays:\n'
    'Temporally Optimized store vs Spatially Optimized store'
)
plot_zarr_access_time_compare(times, units='s', title=title)

---
### 2.3: Time creating an xarray.Dataset across all y coords for a given time and x coord from temporally-optimized vs spatially-optimized zarr stores and converting the VV and VH backscatter xarray.DataArrays to numpy.ndarrays

In [None]:
# Time 10 loops on the spatially optimized store; each loop selects one time step and
# one x value (all y) and converts to NumPy for vh, then for vv.
y_ndarray_from_space_optimized_time = %t10 space_op.vh_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), x=slice(675090., 675090.)).to_numpy(); space_op.vv_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), x=slice(675090., 675090.)).to_numpy()
# Display the returned timing result.
y_ndarray_from_space_optimized_time

In [None]:
# Time 10 loops of the same vh + vv time/x selection and NumPy conversion on the temporally optimized store.
y_ndarray_from_time_optimized_time = %t10 time_op.vh_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), x=slice(675090., 675090.)).to_numpy(); time_op.vv_backscatter.sel(time=slice('20180106T224528', '20180106T224528'), x=slice(675090., 675090.)).to_numpy()
# Display the returned timing result.
y_ndarray_from_time_optimized_time

In [None]:
# Mean space-optimized run time as a percentage of the mean time-optimized run time.
percentage = (
    100 * y_ndarray_from_space_optimized_time.average
    / y_ndarray_from_time_optimized_time.average
)
print(
    "\nAccessing the data from a spatially optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a temporally optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Mean run time per store, passed through unscaled and labelled 's'.
times = {
    label: result.average
    for label, result in (
        ('Temporally Optimized', y_ndarray_from_time_optimized_time),
        ('Spatially Optimized', y_ndarray_from_space_optimized_time),
    )
}
title = (
    'y coord spatial Zarr Store Access and conversion to numpy.ndarrays:\n'
    'Temporally Optimized store vs Spatially Optimized store'
)
plot_zarr_access_time_compare(times, units='s', title=title)

---
---
# 3: Speed test creating the arrays as Python lists of a single polarization (1/2 the data of the Datasets created above)

- This computes the Dataset values, converting them to numpy floats, and assembles them into a Python list
- This is very slow, so we will only iterate once by timing with the `%t1` alias
- This demonstrates why it is important to use xarray functions to access zarr store data
    - Converting to numpy data types and assembling lists is orders of magnitude slower than accessing data with xarray

**Time creating a point time series Python list of the vh polarization from temporally-optimized vs spatially-optimized zarr stores**

In [None]:
# Time one pass that builds a Python list by computing vh_backscatter[i][1000][1000]
# for every index along `time`, on the temporally optimized store.
time_list_from_time_optimized_time = %t1 [time_op.vh_backscatter[i][1000][1000].data.compute() for i in range(0, len(time_op.time))]
# Display the returned timing result.
time_list_from_time_optimized_time

In [None]:
# Time one pass of the same per-time-index list construction on the spatially optimized store.
time_list_from_space_optimized_time = %t1 [space_op.vh_backscatter[i][1000][1000].data.compute() for i in range(0, len(space_op.time))]
# Display the returned timing result.
time_list_from_space_optimized_time

In [None]:
# Mean time-optimized run time as a percentage of the mean space-optimized run time.
percentage = (
    100 * time_list_from_time_optimized_time.average
    / time_list_from_space_optimized_time.average
)
print(
    "\nAccessing the data from a temporally optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a spatially optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Mean run time per store, passed through unscaled and labelled 's'.
times = {
    label: result.average
    for label, result in (
        ('Temporally Optimized', time_list_from_time_optimized_time),
        ('Spatially Optimized', time_list_from_space_optimized_time),
    )
}
title = (
    'Point Time Series Zarr Store Access, Computation of Values, and Conversion to Python lists:\n'
    'Temporally Optimized store vs Spatially Optimized store'
)
plot_zarr_access_time_compare(times, units='s', title=title)

---
**Time creating a Python list of vh backscatter values across `y` coords for a given time and `x` coord from temporally-optimized vs spatially-optimized zarr stores**

- This is so slow, we will only access data for the first 1% of `y` coords (`len(y) // 100` values)

In [None]:
# Time one pass that builds a Python list by computing vh_backscatter[0][i][1000]
# for the first len(space_op.y)//100 values of i, on the spatially optimized store.
y_list_from_space_optimized_time = %t1 [space_op.vh_backscatter[0][i][1000].data.compute() for i in range(0, len(space_op.y)//100)]
# Display the returned timing result.
y_list_from_space_optimized_time

In [None]:
# Time one pass of the same vh_backscatter[0][i][1000] list construction on the temporally optimized store.
y_list_from_time_optimized_time = %t1 [time_op.vh_backscatter[0][i][1000].data.compute() for i in range(0, len(time_op.y)//100)]
# Display the returned timing result.
y_list_from_time_optimized_time

In [None]:
# Mean space-optimized run time as a percentage of the mean time-optimized run time.
percentage = (
    100 * y_list_from_space_optimized_time.average
    / y_list_from_time_optimized_time.average
)
print(
    "\nAccessing the data from a spatially optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a temporally optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Mean run time per store, passed through unscaled and labelled 's'.
times = {
    'Temporally Optimized': y_list_from_time_optimized_time.average,
    'Spatially Optimized': y_list_from_space_optimized_time.average
}
# The timed lists iterate over `y` indices (range over len(...y)), so the title
# names the y axis; it previously said "x coord".
# NOTE(review): assumes vh_backscatter dims are ordered (time, y, x) — confirm.
title = 'y coord spatial Zarr Store Access, Computation of Values, and Conversion to Python lists:\nTemporally Optimized store vs Spatially Optimized store'
plot_zarr_access_time_compare(times, 's', title)

---
**Time creating a Python list of vh backscatter values across `x` coords for a given time and `y` coord from temporally-optimized vs spatially-optimized zarr stores**

- This is so slow, we will only access data for the first 1% of `x` coords (`len(x) // 100` values)

In [None]:
# Time one pass that builds a Python list by computing vh_backscatter[0][1000][i]
# for the first len(space_op.x)//100 values of i, on the spatially optimized store.
x_list_from_space_optimized_time = %t1 [space_op.vh_backscatter[0][1000][i].data.compute() for i in range(0, len(space_op.x)//100)]
# Display the returned timing result.
x_list_from_space_optimized_time

In [None]:
# Time one pass of the same vh_backscatter[0][1000][i] list construction on the temporally optimized store.
x_list_from_time_optimized_time = %t1 [time_op.vh_backscatter[0][1000][i].data.compute() for i in range(0, len(time_op.x)//100)]
# Display the returned timing result.
x_list_from_time_optimized_time

In [None]:
# Mean space-optimized run time as a percentage of the mean time-optimized run time.
percentage = (
    100 * x_list_from_space_optimized_time.average
    / x_list_from_time_optimized_time.average
)
print(
    "\nAccessing the data from a spatially optimized zarr store took "
    f"{abs(percentage):.2f}% of the time it took from a temporally optimized zarr store"
)
print("\n{:.2f}%".format(abs(percentage)))

In [None]:
# Mean run time per store, passed through unscaled and labelled 's'.
times = {
    'Temporally Optimized': x_list_from_time_optimized_time.average,
    'Spatially Optimized': x_list_from_space_optimized_time.average
}
# The timed lists iterate over `x` indices (range over len(...x)), so the title
# names the x axis; it previously said "y coord".
# NOTE(review): assumes vh_backscatter dims are ordered (time, y, x) — confirm.
title = 'x coord spatial Zarr Store Access, Computation of Values, and Conversion to Python lists:\nTemporally Optimized store vs Spatially Optimized store'
plot_zarr_access_time_compare(times, 's', title)