# Usage of chunkindex with a slow network connection simulated with lighttpd
This notebook shows how to use the chunkindex module to access data in a compressed netCDF file through an HTTP server.

In [1]:
# Append the root of the project to PYTHONPATH
import sys
sys.path.append('..')

# Define some parameters used throughout this notebook

from pathlib import Path
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is available as an attribute of the package.
import urllib.parse

# Dataset filename
dataset_file = "SWOT_L2_LR_PreCalSSH_Expert_002_086_20230814T031152_20230814T040129_PIA1_01.nc"

# Define the dataset path: an absolute directory that is created if missing.
# The same directory is later passed to the HTTP server as its document root.
dataset_dir = Path('data') / 'www'
dataset_dir = dataset_dir.resolve()
dataset_dir.mkdir(parents=True, exist_ok=True)
dataset_path = dataset_dir / dataset_file

# Data URL (the port must match `server.port` in lighttpd.conf)
base_url = 'http://127.0.0.1:8000'
dataset_url = urllib.parse.urljoin(base_url, dataset_file)

# HTTP server speed, forwarded to lighttpd's `server.kbytes-per-second` setting
kbps = 100

## Setup the HTTP server

Create a configuration file for the HTTP server: [lighttpd.conf](lighttpd.conf).

In [2]:
%%writefile lighttpd.conf
# Directory served by the HTTP server, read from the LIGHTTPD_DOC_ROOT environment variable
server.document-root = env.LIGHTTPD_DOC_ROOT
# Throughput setting in kilobytes per second, read from the LIGHTTPD_KBPS environment variable
server.kbytes-per-second = env.LIGHTTPD_KBPS
# Listening port (must match the port used in the notebook's base URL)
server.port = 8000
# Show a directory listing so that the served files can be checked in a browser
dir-listing.activate = "enable" 

Overwriting lighttpd.conf


Start the HTTP server

In [3]:
%%bash --bg -s "$dataset_dir" "$kbps"

# Positional arguments (supplied by the cell magic line above):
#   $1  document root served by lighttpd
#   $2  throughput limit in kilobytes per second

# Kill a previous lighttpd server (does nothing if none is running)
pkill lighttpd

# Give the previous server a moment to exit and release the port
sleep 1

# Run the HTTP server. With -D lighttpd stays in the foreground of this
# background cell, so nothing placed after this line runs while it is serving.
# The arguments are quoted so that a document root containing spaces is passed intact.
LIGHTTPD_DOC_ROOT="${1}" LIGHTTPD_KBPS="${2}" lighttpd -Df lighttpd.conf

Check that the HTTP server is running: show the content of the data folder that is served by the HTTP server we have just started: http://127.0.0.1:8000

In [4]:
from IPython.display import IFrame

# Embed the page served at base_url; the cell's last expression is what gets displayed
IFrame(src=base_url, width=800, height=150)

## Create an index with chunkindex

Chunkindex creates a zran index that provides decompression starting points within the chunks.

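File location: data/www/SWOT_L2_LR_PreCalSSH_Expert_002_086_20230814T031152_20230814T040129_PIA1_01_indexchunk.nc (next to the dataset, so it is also served by the HTTP server)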

In [5]:
import chunkindex
import contextlib
import os
import netCDF4
from pathlib import Path
import xarray as xr
import numpy as np

# The index file sits next to the dataset and is named "<dataset stem>_indexchunk.nc"
index_filename = f"{dataset_path.stem}_indexchunk.nc"
index_path = dataset_path.parent / index_filename
# URL of the same index file as exposed by the HTTP server
index_url = urllib.parse.urljoin(base_url, index_filename)

# Start from a clean state: drop an index left over from a previous run, if any
with contextlib.suppress(FileNotFoundError):
    index_path.unlink()

# Build the zran index for all variables and chunks of the dataset
# and write it to the netCDF4 file at index_path
chunkindex.create_index(index_path, dataset_path)

# Display the resulting index for one chunk
index_x00 = xr.open_dataset(index_path, group='ssh_karin/0.0')
index_x00



## Time to access the data

We define the slice of data that we will read. It spans two chunks in the 3rd dimension.

In [6]:
# Variable to read and the region of interest: rows 6000 to 7999, columns 0 to 68.
# A much smaller alternative for quick tests is (slice(6000, 6010), slice(0, 69)).
variable = 'ssh_karin'
slice1 = (slice(6000, 8000), slice(0, 69))

### Direct access to the data __without__ the index 
#### with xarray

In [7]:
%%time
# Read the netcdf dataset without the use of the index (xarray and #mode=bytes style)

import time
import xarray as xr

start_time = time.time()

# Open the netCDF dataset
with xr.open_dataset(dataset_path) as ds:
    data = ds[variable][slice1]
    print(data.max().values)

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data.values)
ref_data = data.values

61.284000000000006
Elapsed time: 198.29ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 158 ms, sys: 11.6 ms, total: 170 ms
Wall time: 199 ms


#### With netCDF4 Dataset

In [8]:
%%time
# Read the netcdf dataset without the use of the index and with netCDF4 and #mode=bytes style)

import time
from netCDF4 import Dataset

start_time = time.time()

# Open the netCDF dataset
with Dataset(dataset_path) as dataset:
    data = dataset[variable][slice1]
    print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 106.17ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 79.9 ms, sys: 7.74 ms, total: 87.7 ms
Wall time: 118 ms


### Time to access the data over HTTP __without__ the index

#### With xarray

In [9]:
%%time
# Read the netcdf dataset without the use of the index (xarray and #mode=bytes style)

import time
import xarray as xr

start_time = time.time()

# Open the netCDF dataset
with xr.open_dataset(dataset_url + "#mode=bytes") as ds:
    data = ds[variable][slice1]
    print(data.max().values)

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data.values)
assert(np.allclose(data.values, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 10254.50ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 158 ms, sys: 40.7 ms, total: 198 ms
Wall time: 10.3 s


#### With netCDF4 Dataset

In [10]:
%%time
# Read the netcdf dataset without the use of the index and with netCDF4 and #mode=bytes style)

from netCDF4 import Dataset

start_time = time.time()

# Open the netCDF dataset
with Dataset(dataset_url + "#mode=bytes") as dataset:
    data = dataset[variable][slice1]
    print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))


61.284000000000006
Elapsed time: 12212.42ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 139 ms, sys: 20.5 ms, total: 160 ms
Wall time: 12.2 s


#### With xarray and fsspec

In [22]:
%%time
# Read the netcdf dataset without the use of the index (fsspec and xarray style)

import time
import xarray as xr
import fsspec

start_time = time.time()

# Open the netCDF dataset
with fsspec.open(dataset_url, 'rb', block_size=32*2**10) as f:
    #with xr.open_dataset(f) as ds: # old 2min 53s
    # Aucune différence entre engine netcdf4 et h5netcdf
    with xr.open_dataset(f, engine="h5netcdf") as ds:
        data = ds[variable][slice1]
        print(data.max().values)
        
end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data.values)
assert(np.allclose(data.values, ref_data, equal_nan=True))

60.8926
Elapsed time: 174790.02ms
[[     nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan]
 [ 17.578   17.547   17.5189  17.4954  17.4569  17.407   17.3685  17.3344
   17.3037  17.271   17.2358  17.2026  17.1669  17.1345  17.0962  17.0646
   17.0328  17.0056  16.9816  16.9548  16.938   16.9301      nan      nan
       nan      nan  16.7418  16.7195  16.6823  16.6433  16.6128  16.5778
   16.5431  16.508   16.4749  16.4392  16.4067  16.3732  16.3383  16.3039
   16.2697  16.2387  16.2092  16.1783  16.1456  16.1134  16.0837  16.0486
   16.0282  16.0052]
 [  7.5852   7.634    7.69     7.758

#### With h5py and fsspec

In [12]:
%%time
# Read the netcdf dataset without the use of the index (fsspec and h5py style)

import time
import h5py as h5

start_time = time.time()

# Open the netCDF dataset
with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with h5.File(f) as ds:
        # Access to h5py low-level API to have a direct access to the compressed data
        data = ds[variable][slice1]
        liste_att = ds[variable].attrs.keys()
        if '_FillValue' in liste_att:
            fillvalue = ds[variable].attrs['_FillValue'][0]
        else:
            fillvalue = False
        if 'scale_factor' in liste_att:
            scale_factor = ds[variable].attrs['scale_factor'][0]
        else:
            scale_factor = 1
        if 'offset' in liste_att:
            offset = ds[variable].attrs['offset'][0]
        else:
            offset = 0
        if fillvalue:
            data = np.where(data==fillvalue, np.nan, data)*scale_factor + offset
        else:
            data = data*scale_factor + offset
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 12086.81ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 62.1 ms, sys: 12.7 ms, total: 74.9 ms
Wall time: 12.1 s


### Time to access the data over HTTP __with__ the index

#### With the index locally

In [13]:
%%time
# Read the netcdf dataset with the use of the local index

import time
import fsspec
start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with open(index_path, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 8314.36ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 262 ms, sys: 28 ms, total: 290 ms
Wall time: 8.32 s


#### With the remote index

*We need to improve this case!*

In [14]:
%%time
# Read the netcdf dataset with the use of the local index

import time

start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with fsspec.open(index_url, block_size=32*2**10) as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.284000000000006
Elapsed time: 38814.83ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 572 ms, sys: 37 ms, total: 609 ms
Wall time: 38.8 s


### Change the size of data

In [15]:
# Read one row (index 6000) of 69 values (columns 0 to 68)
variable = 'ssh_karin'
slice1 = (slice(6000, 6001), slice(0, 69))

#### With netCDF4 Dataset

In [16]:
%%time
# Read the netcdf dataset without the use of the index and with netCDF4 and #mode=bytes style)

from netCDF4 import Dataset

start_time = time.time()

# Open the netCDF dataset
with Dataset(dataset_url + "#mode=bytes") as dataset:
    data = dataset[variable][slice1]
    print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
ref_data = data

61.177400000000006
Elapsed time: 10329.20ms
[[-- -- 61.177400000000006 61.1726 61.138400000000004 61.1182
  61.096000000000004 61.0627 61.013400000000004 60.964400000000005
  60.8926 60.8076 60.7075 60.601200000000006 60.481700000000004 60.3643
  60.2432 60.1171 59.9923 59.856500000000004 59.6863 59.4883 59.2714
  59.0563 58.851200000000006 58.6533 58.4634 58.282900000000005
  58.113600000000005 57.9555 57.815000000000005 57.6953 57.64 -- -- -- --
  57.3218 57.316 57.308800000000005 57.3234 57.3374 57.357200000000006
  57.370400000000004 57.382600000000004 57.3982 57.4117 57.4264 57.4425
  57.453700000000005 57.4679 57.489900000000006 57.5052 57.5139 57.5225
  57.530300000000004 57.5268 57.5231 57.505 57.4771 57.4528 57.4485
  57.4559 57.461600000000004 57.471900000000005 57.4716 57.4727
  57.476800000000004 --]]
CPU times: user 179 ms, sys: 34.7 ms, total: 214 ms
Wall time: 10.3 s


#### With the index locally

In [17]:
%%time
# Read the netcdf dataset with the use of the local index

import time
import fsspec
start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with open(index_path, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

61.177400000000006
Elapsed time: 7346.60ms
[[-- -- 61.177400000000006 61.1726 61.138400000000004 61.1182
  61.096000000000004 61.0627 61.013400000000004 60.964400000000005
  60.8926 60.8076 60.7075 60.601200000000006 60.481700000000004 60.3643
  60.2432 60.1171 59.9923 59.856500000000004 59.6863 59.4883 59.2714
  59.0563 58.851200000000006 58.6533 58.4634 58.282900000000005
  58.113600000000005 57.9555 57.815000000000005 57.6953 57.64 -- -- -- --
  57.3218 57.316 57.308800000000005 57.3234 57.3374 57.357200000000006
  57.370400000000004 57.382600000000004 57.3982 57.4117 57.4264 57.4425
  57.453700000000005 57.4679 57.489900000000006 57.5052 57.5139 57.5225
  57.530300000000004 57.5268 57.5231 57.505 57.4771 57.4528 57.4485
  57.4559 57.461600000000004 57.471900000000005 57.4716 57.4727
  57.476800000000004 --]]
CPU times: user 316 ms, sys: 6.14 ms, total: 322 ms
Wall time: 7.35 s


### With multiple accesses to small pieces of data

In [18]:
# Read 9 rows (1000, 2000, ..., 9000) of 50 values each (columns 10 to 59)
slice1 = (slice(1000, 10000, 1000), slice(10, 60))

#### With netCDF4 Dataset

In [19]:
%%time
# Read the netcdf dataset without the use of the index and with netCDF4 and #mode=bytes style)

from netCDF4 import Dataset

start_time = time.time()

# Open the netCDF dataset
with Dataset(dataset_url + "#mode=bytes") as dataset:
    data = dataset[variable][slice1]
    print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
ref_data = data

60.8926
Elapsed time: 11270.17ms
[[-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- --]
 [17.578 17.547 17.518900000000002 17.4954 17.4569 17.407 17.3685
  17.334400000000002 17.3037 17.271 17.2358 17.2026 17.166900000000002
  17.1345 17.0962 17.064600000000002 17.0328 17.0056 16.9816
  16.954800000000002 16.938000000000002 16.9301 -- -- -- -- 16.7418
  16.7195 16.6823 16.6433 16.6128 16.5778 16.5431 16.508 16.4749 16.4392
  16.4067 16.3732 16.3383 16.303900000000002 16.2697 16.2387 16.2092
  16.1783 16.1456 16.113400000000002 16.0837 16.0486 16.028200000000002
  16.005200000000002]
 [7.5852 7.634 7.69 7.758100000000001 7.819800000000001
  7.8668000000000005 7.915100000000001 7.9759 8.0205 8.0579 8.1052 8.1499
  8.1922 8.243500000000001 8.2827 8.3183 8.3609 8.3991 8.4396 8.4794
  8.5091 8.5279 -- -- -- -- 8.7219 8.7268 8.741900000000001 8.7843
  8.810500000000001 8.8312 8.8614 8.8887 8

#### With the index locally

In [20]:
%%time
# Read the netcdf dataset with the use of the local index

import time
import fsspec
start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with open(index_path, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

60.8926
Elapsed time: 12419.27ms
[[-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- --]
 [17.578 17.547 17.518900000000002 17.4954 17.4569 17.407 17.3685
  17.334400000000002 17.3037 17.271 17.2358 17.2026 17.166900000000002
  17.1345 17.0962 17.064600000000002 17.0328 17.0056 16.9816
  16.954800000000002 16.938000000000002 16.9301 -- -- -- -- 16.7418
  16.7195 16.6823 16.6433 16.6128 16.5778 16.5431 16.508 16.4749 16.4392
  16.4067 16.3732 16.3383 16.303900000000002 16.2697 16.2387 16.2092
  16.1783 16.1456 16.113400000000002 16.0837 16.0486 16.028200000000002
  16.005200000000002]
 [7.5852 7.634 7.69 7.758100000000001 7.819800000000001
  7.8668000000000005 7.915100000000001 7.9759 8.0205 8.0579 8.1052 8.1499
  8.1922 8.243500000000001 8.2827 8.3183 8.3609 8.3991 8.4396 8.4794
  8.5091 8.5279 -- -- -- -- 8.7219 8.7268 8.741900000000001 8.7843
  8.810500000000001 8.8312 8.8614 8.8887 8

#### With the index locally __and__ one call to read_slice per slice

In [21]:
%%time
# Read the netcdf dataset with the use of the local index

import time
import fsspec
start_time = time.time()

with fsspec.open(dataset_url, block_size=32*2**10) as f:
    with open(index_path, mode='rb') as index:
        data = None
        for indice in range(1000, 10000, 1000):
            slice_tmp = ((slice(indice, indice+1), slice(10, 60)))
            data_tmp = chunkindex.read_slice(f, index, variable, slice_tmp)
            if data is not None:
                data = np.ma.append(data, data_tmp, axis = 0)
            else:
                data = data_tmp
        print(np.nanmax(data))

end_time = time.time() - start_time
print('Elapsed time: %.2fms' % (end_time*1000))

# Check the data decompressed
print(data)
assert(np.allclose(data, ref_data, equal_nan=True))

60.8926
Elapsed time: 46566.30ms
[[-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- --]
 [17.578 17.547 17.518900000000002 17.4954 17.4569 17.407 17.3685
  17.334400000000002 17.3037 17.271 17.2358 17.2026 17.166900000000002
  17.1345 17.0962 17.064600000000002 17.0328 17.0056 16.9816
  16.954800000000002 16.938000000000002 16.9301 -- -- -- -- 16.7418
  16.7195 16.6823 16.6433 16.6128 16.5778 16.5431 16.508 16.4749 16.4392
  16.4067 16.3732 16.3383 16.303900000000002 16.2697 16.2387 16.2092
  16.1783 16.1456 16.113400000000002 16.0837 16.0486 16.028200000000002
  16.005200000000002]
 [7.5852 7.634 7.69 7.758100000000001 7.819800000000001
  7.8668000000000005 7.915100000000001 7.9759 8.0205 8.0579 8.1052 8.1499
  8.1922 8.243500000000001 8.2827 8.3183 8.3609 8.3991 8.4396 8.4794
  8.5091 8.5279 -- -- -- -- 8.7219 8.7268 8.741900000000001 8.7843
  8.810500000000001 8.8312 8.8614 8.8887 8