# Usage of chunkindex with a slow network connection simulated with lighttpd
This notebook shows how to use the chunkindex module to access data in a compressed netCDF file served by an HTTP server.

In [1]:
# Append the root of the project to PYTHONPATH
import sys
sys.path.append('..')

# Define some parameters used throughout this notebook

from pathlib import Path
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is loaded (it only works when another module happens to
# have imported it already).
import urllib.parse

# Dataset filename
dataset_file = "SWOT_L2_LR_PreCalSSH_Expert_002_086_20230814T031152_20230814T040129_PIA1_01.nc"

# Define the dataset path: an absolute directory, created if it is missing,
# which is also the directory handed to the HTTP server as its document root
dataset_dir = Path('data') / 'www'
dataset_dir = dataset_dir.resolve()
dataset_dir.mkdir(parents=True, exist_ok=True)
dataset_path = dataset_dir / dataset_file

# Data URL
base_url = 'http://127.0.0.1:8000'
dataset_url = urllib.parse.urljoin(base_url, dataset_file)

# HTTP server speed (value passed to lighttpd as LIGHTTPD_KBPS)
kbps = 100

## Setup the HTTP server

Create a configuration file for the HTTP server: [lighttpd.conf](lighttpd.conf).

In [2]:
%%writefile lighttpd.conf
# Directory served by the server, taken from the LIGHTTPD_DOC_ROOT environment variable
server.document-root = env.LIGHTTPD_DOC_ROOT
# Transfer speed limit, taken from the LIGHTTPD_KBPS environment variable
server.kbytes-per-second = env.LIGHTTPD_KBPS
# Port the server listens on (must match the port used in base_url)
server.port = 8000
# Show the list of files when a directory is requested
dir-listing.activate = "enable" 

Overwriting lighttpd.conf


Start the HTTP server

In [3]:
%%bash --bg -s "$dataset_dir" "$kbps"

# Positional arguments (forwarded by the `-s` option of the cell magic):
#   $1: directory served by the HTTP server (dataset_dir)
#   $2: speed limit of the HTTP server (kbps)

# Kill a previous lighttpd server
pkill lighttpd

# Run an HTTP server: the document root and the speed limit are handed over
# through the environment variables that lighttpd.conf reads
LIGHTTPD_DOC_ROOT=${1} LIGHTTPD_KBPS=${2} lighttpd -Df lighttpd.conf

# Wait three seconds to give the server time to start
# NOTE(review): with -D lighttpd presumably stays in the foreground, in which
# case this sleep only runs once the server has exited, and the cell itself is
# run in the background (--bg) so it does not delay the next cell - confirm.
sleep 3

Check that the HTTP server is running: show the content of the data folder served by the HTTP server we have just started: http://127.0.0.1:8000

In [4]:
import time
import urllib.error
import urllib.request

from IPython.display import IFrame

# The HTTP server is launched in the background by the previous cell, so it may
# not be listening yet when this cell runs: poll it for a few seconds before
# embedding the directory listing, and fail loudly if it never comes up.
deadline = time.monotonic() + 10
while True:
    try:
        urllib.request.urlopen(base_url, timeout=2).close()
        break
    except urllib.error.HTTPError:
        # The server answered (even with an error status): it is running
        break
    except OSError:
        # Connection refused or timed out: the server is not ready yet
        if time.monotonic() >= deadline:
            raise RuntimeError('HTTP server not reachable at %s' % base_url)
        time.sleep(0.2)

# Show the directory listing served at base_url
IFrame(base_url, width=800, height=150)

## Create an index with chunkindex

Chunkindex creates a zran index that provides decompression starting points within the chunks.

File location: data/www/&lt;dataset name&gt;_indexchunk.nc (next to the dataset, so that it is also served by the HTTP server)

In [5]:
import chunkindex
import contextlib
import os
import urllib.parse
import netCDF4
from pathlib import Path
import xarray as xr
import numpy as np

# Define the index path: the index is written next to the dataset, i.e. in the
# directory served by the HTTP server, so that it is reachable through index_url
index_filename = dataset_path.stem + '_indexchunk.nc'
index_path = dataset_path.parent.joinpath(index_filename)
# index URL
index_url = urllib.parse.urljoin(base_url, index_filename)

# Remove it if it already exists
with contextlib.suppress(FileNotFoundError):
    os.remove(index_path)

# Create the zran index for all variables and chunks of the dataset and write it to the netcdf4 file at index_path
chunkindex.create_index(index_path, dataset_path)

# Display the resulting index for one chunk.
# The group is loaded in memory and the file is closed right away, so that no
# handle stays open on the index file (it is removed when this cell is re-run
# and opened again by the cells reading through the index).
with xr.open_dataset(index_path, group='ssh_karin/0.0') as index_ds:
    index_x00 = index_ds.load()
index_x00



## Time to access the data

We define the slice of data that we will read. It spans two chunks in the 3rd dimension.

In [6]:
# Window of the variable read by every access method in this notebook:
# rows 6000 to 7999 and columns 0 to 68.
# (A much smaller window, e.g. rows 6000 to 6009, can be used for a quick run.)
row_window = slice(6000, 8000)
column_window = slice(0, 69)
slice1 = (row_window, column_window)

# Name of the variable to read
variable = 'ssh_karin'

### Direct (local file) access to the data __without__ the index
#### With xarray

In [9]:
%%time
# Read the netcdf dataset without the use of the index (xarray and #mode=bytes style)

import time
import xarray as xr

start_time = time.time()

# Open the netCDF dataset
with xr.open_dataset(dataset_path) as ds:
    data = ds[variable][slice1]
    print(data.max().values)

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data.values)
ref_data = data.values

61.284000000000006
Elapsed time: 142.41ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 114 ms, sys: 15.9 ms, total: 130 ms
Wall time: 144 ms


#### With netCDF4 Dataset

In [10]:
%%time
# Read the netcdf dataset without the use of the index and with netCDF4 and #mode=bytes style)

import time
from netCDF4 import Dataset

start_time = time.time()

# Open the netCDF dataset
with Dataset(dataset_path) as dataset:
    data = dataset[variable][slice1]
    print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 68.82ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 74 ms, sys: 137 µs, total: 74.2 ms
Wall time: 77.1 ms


### Time to access the data over HTTP __without__ the index

#### With xarray

In [12]:
%%time
# Read the netcdf dataset without the use of the index (xarray and #mode=bytes style)

import time
import xarray as xr

start_time = time.time()

# Open the netCDF dataset
with xr.open_dataset(dataset_url + "#mode=bytes") as ds:
    data = ds[variable][slice1]
    print(data.max().values)

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data.values)
assert(np.allclose(data.values, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 11324.06ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 202 ms, sys: 8.45 ms, total: 211 ms
Wall time: 11.3 s


#### With netCDF4 Dataset

In [11]:
%%time
# Read the netcdf dataset without the use of the index and with netCDF4 and #mode=bytes style)

from netCDF4 import Dataset

start_time = time.time()

# Open the netCDF dataset
with Dataset(dataset_url + "#mode=bytes") as dataset:
    data = dataset[variable][slice1]
    print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))


61.284000000000006
Elapsed time: 11251.83ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 151 ms, sys: 16.2 ms, total: 167 ms
Wall time: 11.3 s


#### With xarray and fsspec

In [13]:
%%time
# Read the netcdf dataset without the use of the index (fsspec and xarray style)

import time
import xarray as xr
import fsspec

start_time = time.time()

# Open the netCDF dataset
with fsspec.open(dataset_url, 'rb', block_size=32*2**10) as f:
    with xr.open_dataset(f) as ds:
        data = ds[variable][slice1]
        print(data.max().values)
        
end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data.values)
assert(np.allclose(data.values, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 173628.14ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 2.73 s, sys: 137 ms, total: 2.87 s
Wall time: 2min 53s


#### With h5py and fsspec

In [14]:
%%time
# Read the netcdf dataset without the use of the index (fsspec and h5py style)

import time
import h5py as h5

start_time = time.time()

# Open the netCDF dataset
with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with h5.File(f) as ds:
        # Access to h5py low-level API to have a direct access to the compressed data
        data = ds[variable][slice1]
        data = np.where(data==2147483647, np.nan, data)*0.0001
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 10068.72ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 52.5 ms, sys: 7.86 ms, total: 60.3 ms
Wall time: 10.1 s


### Time to access the data over HTTP __with__ the index

#### With a local index

In [15]:
%%time
# Read the netcdf dataset with the use of the local index

import time

start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with open(index_path, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        data = np.where(data==2147483647, np.nan, data)*0.0001
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 7324.52ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 257 ms, sys: 28.3 ms, total: 285 ms
Wall time: 7.33 s


#### With a remote index

*We need to improve this case!*

In [16]:
%%time
# Read the netcdf dataset with the use of the local index

import time

start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with fsspec.open(index_url, block_size=32*2**10) as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        data = np.where(data==2147483647, np.nan, data)*0.0001
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 37638.76ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 447 ms, sys: 51 ms, total: 498 ms
Wall time: 37.6 s
