# Test S3 connection

## Set up the S3 connection

In [1]:
%%time
import time
from pathlib import Path

import chunkindex
import fsspec
import h5py as h5
import numpy
import s3fs
import xarray

# Name of the SWOT netCDF product used throughout the notebook
dataset_file = "SWOT_L2_LR_PreCalSSH_Expert_002_086_20230814T031152_20230814T040129_PIA1_01.nc"

# Absolute path of the local copy of the dataset
dataset_dir = (Path('data') / 'www').resolve()
dataset_path = dataset_dir / dataset_file

# The chunk index is stored next to the dataset as "<dataset stem>_indexchunk.nc"
index_filename = dataset_path.stem + '_indexchunk.nc'
index_path = dataset_path.parent / index_filename

# Anonymous access to the local S3 endpoint.
# For authenticated access, pass `key=` and `secret=` instead of `anon=True`,
# reading both from environment variables rather than hardcoding them here
# (the authenticated setup previously used
# endpoint_url='http://localhost:9444/s3/data').
fs_s3 = s3fs.S3FileSystem(
    anon=True,
    endpoint_url='http://localhost:9444/s3/',
)

# S3 URLs of the dataset and of its index on the ninja server, derived from
# the names above so the long file name is written only once
s3_url = f's3://data/{dataset_file}'
s3_url_index = f's3://data/{index_filename}'

# Selection used by the first benchmark: 2000 lines (6000..7999) of 69 pixels.
# To read a single line instead, use slice(6000, 6001) as the first slice.
variable = 'ssh_karin'
slice1 = (slice(6000, 8000), slice(0, 69))


CPU times: user 742 ms, sys: 123 ms, total: 864 ms
Wall time: 1.05 s


## Direct access to data via S3 and xarray

In [2]:
%%time
# Reference read: open the remote netCDF file through s3fs and let xarray
# (h5netcdf engine) return the requested slice of `variable`.
with fs_s3.open(s3_url, mode='rb') as f:
    with xarray.open_dataset(f, engine='h5netcdf') as dataset:
        data = dataset[variable][slice1]
        print(data.max().values)
        # Extract the reference values while the dataset and the remote file
        # are still open, so the later comparisons do not depend on the values
        # having been cached before the file was closed.
        ref_data = data.values

print(data)


61.284000000000006




<xarray.DataArray 'ssh_karin' (num_lines: 2000, num_pixels: 69)>
array([[     nan,      nan,  61.1774, ...,  57.4727,  57.4768,      nan],
       [     nan,      nan,  61.2194, ...,  57.4146,  57.4277,      nan],
       [     nan,      nan,  61.2322, ...,  57.3577,  57.3452,      nan],
       ...,
       [     nan,      nan,      nan, ..., -29.3178, -29.3235,      nan],
       [     nan,      nan,      nan, ..., -29.455 , -29.4612,      nan],
       [     nan,      nan,      nan, ..., -29.6098, -29.6084,      nan]])
Coordinates:
    latitude         (num_lines, num_pixels) float64 ...
    longitude        (num_lines, num_pixels) float64 ...
    latitude_nadir   (num_lines) float64 ...
    longitude_nadir  (num_lines) float64 ...
Dimensions without coordinates: num_lines, num_pixels
Attributes:
    long_name:      sea surface height
    standard_name:  sea surface height above reference ellipsoid
    units:          m
    quality_flag:   ssh_karin_qual
    valid_min:      -15000000
    

## Direct access to data via S3 and h5py

In [3]:
%%time
# Read the netCDF dataset without the index (fsspec + h5py), applying the
# packing attributes (_FillValue, scale_factor, add_offset) by hand.
# `variable` and `slice1` are the ones defined in the setup cell, so the result
# is compared against a reference read with the very same selection.

start_time = time.perf_counter()

# Open the netCDF dataset
with fs_s3.open(s3_url, mode='rb') as f:
    with h5.File(f, mode='r') as ds:
        h5_var = ds[variable]
        # Values as stored in the file; fill value, scale and offset are
        # applied manually below.
        data = h5_var[slice1]
        attrs = h5_var.attrs
        # None (not False) marks "no fill value", so that a fill value of 0
        # is still honoured.
        fillvalue = attrs['_FillValue'][0] if '_FillValue' in attrs else None
        scale_factor = attrs['scale_factor'][0] if 'scale_factor' in attrs else 1
        # The CF packing attribute is named 'add_offset' (not 'offset').
        offset = attrs['add_offset'][0] if 'add_offset' in attrs else 0
        if fillvalue is not None:
            # Replace fill values by NaN before unpacking
            data = numpy.where(data == fillvalue, numpy.nan, data)
        data = data * scale_factor + offset
        print(numpy.nanmax(data))

# perf_counter is a monotonic clock intended for measuring intervals
elapsed = time.perf_counter() - start_time
print('Elapsed time: %.2fms' % (elapsed * 1000))

# Check the unpacked data against the xarray reference
print(data)
assert numpy.allclose(data, ref_data, equal_nan=True)

61.284000000000006
Elapsed time: 188.63ms
[[     nan      nan  61.1774 ...  57.4727  57.4768      nan]
 [     nan      nan  61.2194 ...  57.4146  57.4277      nan]
 [     nan      nan  61.2322 ...  57.3577  57.3452      nan]
 ...
 [     nan      nan      nan ... -29.3178 -29.3235      nan]
 [     nan      nan      nan ... -29.455  -29.4612      nan]
 [     nan      nan      nan ... -29.6098 -29.6084      nan]]
CPU times: user 71.3 ms, sys: 6.27 ms, total: 77.6 ms
Wall time: 206 ms


## With the index locally

In [4]:
%%time

# Read the netCDF dataset from S3 using the chunk index stored on the local disk

start_time = time.perf_counter()

with fs_s3.open(s3_url, mode='rb') as f:
    with open(index_path, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(numpy.nanmax(data))

# perf_counter is a monotonic clock intended for measuring intervals
elapsed = time.perf_counter() - start_time
print('Elapsed time: %.2fms' % (elapsed * 1000))

# Check the data against the xarray reference.
# NOTE(review): the printed result shows '--' for missing entries, which
# suggests read_slice returns a masked array — confirm that allclose against
# the NaN-filled reference checks those entries as intended.
print(data)
assert numpy.allclose(data, ref_data, equal_nan=True)

61.284000000000006
Elapsed time: 367.27ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 310 ms, sys: 56.3 ms, total: 366 ms
Wall time: 480 ms


## With the index on s3 server

In [5]:
%%time

# Read the netCDF dataset from S3 using the chunk index also stored on S3

start_time = time.perf_counter()

with fs_s3.open(s3_url, mode='rb') as f:
    with fs_s3.open(s3_url_index, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(numpy.nanmax(data))

# perf_counter is a monotonic clock intended for measuring intervals
elapsed = time.perf_counter() - start_time
print('Elapsed time: %.2fms' % (elapsed * 1000))

# Check the data against the xarray reference.
# NOTE(review): the printed result shows '--' for missing entries, which
# suggests read_slice returns a masked array — confirm that allclose against
# the NaN-filled reference checks those entries as intended.
print(data)
assert numpy.allclose(data, ref_data, equal_nan=True)

61.284000000000006
Elapsed time: 472.19ms
[[-- -- 61.177400000000006 ... 57.4727 57.476800000000004 --]
 [-- -- 61.2194 ... 57.4146 57.4277 --]
 [-- -- 61.232200000000006 ... 57.3577 57.345200000000006 --]
 ...
 [-- -- -- ... -29.317800000000002 -29.323500000000003 --]
 [-- -- -- ... -29.455000000000002 -29.4612 --]
 [-- -- -- ... -29.6098 -29.608400000000003 --]]
CPU times: user 318 ms, sys: 61.6 ms, total: 380 ms
Wall time: 480 ms


# Access to multiple small pieces of data

In [6]:
# Sparse selection spread over the file: every 1000th line from 1000 up to
# (but excluding) 10000, i.e. 9 lines, each restricted to pixels 10..59
# (50 values).
line_selection = slice(1000, 10000, 1000)
pixel_selection = slice(10, 60)
slice1 = (line_selection, pixel_selection)

## Direct access to data via S3 and xarray

In [7]:
%%time
# Reference read for the sparse selection: open the remote netCDF file through
# s3fs and let xarray (h5netcdf engine) return the requested slice.
with fs_s3.open(s3_url, mode='rb') as f:
    with xarray.open_dataset(f, engine='h5netcdf') as dataset:
        data = dataset[variable][slice1]
        print(data.max().values)
        # Extract the reference values while the dataset and the remote file
        # are still open, so the later comparisons do not depend on the values
        # having been cached before the file was closed.
        ref_data = data.values

print(data)


60.8926
<xarray.DataArray 'ssh_karin' (num_lines: 9, num_pixels: 50)>
array([[     nan,      nan,      nan, ...,      nan,      nan,      nan],
       [ 17.578 ,  17.547 ,  17.5189, ...,  16.0486,  16.0282,  16.0052],
       [  7.5852,   7.634 ,   7.69  , ...,   9.2819,   9.3052,   9.3291],
       ...,
       [ 23.899 ,  23.8568,  23.8226, ...,  22.4377,  22.4164,  22.3957],
       [-29.7489, -29.7581, -29.7571, ..., -29.6473, -29.6553, -29.6588],
       [     nan,      nan,      nan, ...,      nan,      nan,      nan]])
Coordinates:
    latitude         (num_lines, num_pixels) float64 ...
    longitude        (num_lines, num_pixels) float64 ...
    latitude_nadir   (num_lines) float64 ...
    longitude_nadir  (num_lines) float64 ...
Dimensions without coordinates: num_lines, num_pixels
Attributes:
    long_name:      sea surface height
    standard_name:  sea surface height above reference ellipsoid
    units:          m
    quality_flag:   ssh_karin_qual
    valid_min:      -15000000

## Direct access to data via S3 and h5py

In [8]:
%%time
# Read the netCDF dataset without the index (fsspec + h5py), applying the
# packing attributes (_FillValue, scale_factor, add_offset) by hand.

start_time = time.perf_counter()

# Open the netCDF dataset
with fs_s3.open(s3_url, mode='rb') as f:
    with h5.File(f, mode='r') as ds:
        h5_var = ds[variable]
        # Values as stored in the file; fill value, scale and offset are
        # applied manually below.
        data = h5_var[slice1]
        attrs = h5_var.attrs
        # None (not False) marks "no fill value", so that a fill value of 0
        # is still honoured.
        fillvalue = attrs['_FillValue'][0] if '_FillValue' in attrs else None
        scale_factor = attrs['scale_factor'][0] if 'scale_factor' in attrs else 1
        # The CF packing attribute is named 'add_offset' (not 'offset').
        offset = attrs['add_offset'][0] if 'add_offset' in attrs else 0
        if fillvalue is not None:
            # Replace fill values by NaN before unpacking
            data = numpy.where(data == fillvalue, numpy.nan, data)
        data = data * scale_factor + offset
        print(numpy.nanmax(data))

# perf_counter is a monotonic clock intended for measuring intervals
elapsed = time.perf_counter() - start_time
print('Elapsed time: %.2fms' % (elapsed * 1000))

# Check the unpacked data against the xarray reference
print(data)
assert numpy.allclose(data, ref_data, equal_nan=True)

60.8926
Elapsed time: 97.05ms
[[     nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan      nan      nan      nan      nan      nan      nan
       nan      nan]
 [ 17.578   17.547   17.5189  17.4954  17.4569  17.407   17.3685  17.3344
   17.3037  17.271   17.2358  17.2026  17.1669  17.1345  17.0962  17.0646
   17.0328  17.0056  16.9816  16.9548  16.938   16.9301      nan      nan
       nan      nan  16.7418  16.7195  16.6823  16.6433  16.6128  16.5778
   16.5431  16.508   16.4749  16.4392  16.4067  16.3732  16.3383  16.3039
   16.2697  16.2387  16.2092  16.1783  16.1456  16.1134  16.0837  16.0486
   16.0282  16.0052]
 [  7.5852   7.634    7.69     7.7581   

## With the index locally

In [9]:
%%time

# Read the netCDF dataset from S3 using the chunk index stored on the local disk

start_time = time.perf_counter()

with fs_s3.open(s3_url, mode='rb') as f:
    with open(index_path, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(numpy.nanmax(data))

# perf_counter is a monotonic clock intended for measuring intervals
elapsed = time.perf_counter() - start_time
print('Elapsed time: %.2fms' % (elapsed * 1000))

# Check the data against the xarray reference.
# NOTE(review): the printed result shows '--' for missing entries, which
# suggests read_slice returns a masked array — confirm that allclose against
# the NaN-filled reference checks those entries as intended.
print(data)
assert numpy.allclose(data, ref_data, equal_nan=True)

60.8926
Elapsed time: 361.56ms
[[-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- --]
 [17.578 17.547 17.518900000000002 17.4954 17.4569 17.407 17.3685
  17.334400000000002 17.3037 17.271 17.2358 17.2026 17.166900000000002
  17.1345 17.0962 17.064600000000002 17.0328 17.0056 16.9816
  16.954800000000002 16.938000000000002 16.9301 -- -- -- -- 16.7418
  16.7195 16.6823 16.6433 16.6128 16.5778 16.5431 16.508 16.4749 16.4392
  16.4067 16.3732 16.3383 16.303900000000002 16.2697 16.2387 16.2092
  16.1783 16.1456 16.113400000000002 16.0837 16.0486 16.028200000000002
  16.005200000000002]
 [7.5852 7.634 7.69 7.758100000000001 7.819800000000001
  7.8668000000000005 7.915100000000001 7.9759 8.0205 8.0579 8.1052 8.1499
  8.1922 8.243500000000001 8.2827 8.3183 8.3609 8.3991 8.4396 8.4794
  8.5091 8.5279 -- -- -- -- 8.7219 8.7268 8.741900000000001 8.7843
  8.810500000000001 8.8312 8.8614 8.8887 8.9

## With the index on the S3 server

In [10]:
%%time

# Read the netCDF dataset from S3 using the chunk index also stored on S3

start_time = time.perf_counter()

with fs_s3.open(s3_url, mode='rb') as f:
    with fs_s3.open(s3_url_index, mode='rb') as index:
        data = chunkindex.read_slice(f, index, variable, slice1)
        print(numpy.nanmax(data))

# perf_counter is a monotonic clock intended for measuring intervals
elapsed = time.perf_counter() - start_time
print('Elapsed time: %.2fms' % (elapsed * 1000))

# Check the data against the xarray reference.
# NOTE(review): the printed result shows '--' for missing entries, which
# suggests read_slice returns a masked array — confirm that allclose against
# the NaN-filled reference checks those entries as intended.
print(data)
assert numpy.allclose(data, ref_data, equal_nan=True)

60.8926
Elapsed time: 576.08ms
[[-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  -- --]
 [17.578 17.547 17.518900000000002 17.4954 17.4569 17.407 17.3685
  17.334400000000002 17.3037 17.271 17.2358 17.2026 17.166900000000002
  17.1345 17.0962 17.064600000000002 17.0328 17.0056 16.9816
  16.954800000000002 16.938000000000002 16.9301 -- -- -- -- 16.7418
  16.7195 16.6823 16.6433 16.6128 16.5778 16.5431 16.508 16.4749 16.4392
  16.4067 16.3732 16.3383 16.303900000000002 16.2697 16.2387 16.2092
  16.1783 16.1456 16.113400000000002 16.0837 16.0486 16.028200000000002
  16.005200000000002]
 [7.5852 7.634 7.69 7.758100000000001 7.819800000000001
  7.8668000000000005 7.915100000000001 7.9759 8.0205 8.0579 8.1052 8.1499
  8.1922 8.243500000000001 8.2827 8.3183 8.3609 8.3991 8.4396 8.4794
  8.5091 8.5279 -- -- -- -- 8.7219 8.7268 8.741900000000001 8.7843
  8.810500000000001 8.8312 8.8614 8.8887 8.9