In [1]:
import h5py
import time
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
# Update this example with air quality from Madrid.
# Motivation for HDF5 by comparing with Numpy
# example with numpy
import numpy as np
import datetime
temperature = np.random.random(1024)
station =15
start_time = 1669100000 #unix time in seconds
dt = 10 #in seconds

np.savez('weather',data=temperature,start_time=start_time,station=station,frequency=dt)

In [3]:
out = np.load('weather.npz')

In [4]:
list(out.keys())

['data', 'start_time', 'station', 'frequency']

In [5]:
out['data']

array([0.6748173 , 0.30240691, 0.92504552, ..., 0.69201875, 0.00400845,
       0.17074045])

## Creating HDF5 Files

In [6]:
#creates new hdf5 file in the write mode overwriting if the file already exists
f = h5py.File('big_data.h5','w')
f.close()

In [7]:
# open an existing file in the read mode, error if file does not exist
f = h5py.File('big_data.hdf5')
f.close()

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'big_data.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [8]:
# open an existing file in the read and write mode
f = h5py.File('big_data.h5','r+') #default mode is read mode
f.close()

In [9]:
# open an existing file in the read and write mode. If the file does not exits it will first create it!
f = h5py.File('big_data2.hdf5','a')
f.close()

In [11]:
with open('big_data.hdf5','w') as f:
    f.write('Hello')

In [12]:
f.close()

## HDF5 Datasets

In [14]:
f = h5py.File('big_data.h5','r+')
np_arr = np.random.random((100,100))
f.create_dataset('dataset1',data=np_arr)


<HDF5 dataset "dataset1": shape (100, 100), type "<f8">

Where is the numpy array ```np_arr```?

In [15]:
f

<HDF5 file "big_data.h5" (mode r+)>

In [16]:
f['dataset1']

<HDF5 dataset "dataset1": shape (100, 100), type "<f8">

In [17]:
dset1 = f['dataset1']

In [18]:
dset1.shape

(100, 100)

In [19]:
#data type is inherited from numpy array float64 (a.k.a <f8)
dset1.dtype

dtype('<f8')

In [20]:
np_arr.dtype

dtype('float64')

In [21]:
#how do we see the content? By slicing
dset1[:,:]

array([[0.25288079, 0.03691909, 0.54506046, ..., 0.26312733, 0.13821613,
        0.39636085],
       [0.10899647, 0.29659176, 0.20539434, ..., 0.54614031, 0.88923355,
        0.86662054],
       [0.48943253, 0.54981945, 0.69585297, ..., 0.73082061, 0.77340327,
        0.02595559],
       ...,
       [0.10987585, 0.00787696, 0.99706144, ..., 0.44676023, 0.72863452,
        0.38570012],
       [0.64983059, 0.8729022 , 0.17447264, ..., 0.57917194, 0.48564616,
        0.81835254],
       [0.34862517, 0.29879997, 0.48151847, ..., 0.96570328, 0.33159829,
        0.93720287]])

Differences with the numpy arrays: The data lives in the storage and only the sliced data is brought to the memory.

In [22]:
# we create a 100,100 dataframe of floats. By default, it is filled with 0 of type float32 (a.k.a <f4)
f.create_dataset('dataset2',(100,100))
dset2 = f['dataset2']
dset2[:,:]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
# we create a 100,100 dataframe of floats filled with 2.3.
f.create_dataset('dataset3',(100,100),fillvalue=2.3)
dset3 = f['dataset3']
dset3[:,:]

array([[2.3, 2.3, 2.3, ..., 2.3, 2.3, 2.3],
       [2.3, 2.3, 2.3, ..., 2.3, 2.3, 2.3],
       [2.3, 2.3, 2.3, ..., 2.3, 2.3, 2.3],
       ...,
       [2.3, 2.3, 2.3, ..., 2.3, 2.3, 2.3],
       [2.3, 2.3, 2.3, ..., 2.3, 2.3, 2.3],
       [2.3, 2.3, 2.3, ..., 2.3, 2.3, 2.3]], dtype=float32)

In [24]:
#another way of creating datasets in hdf file
f.create_dataset('dataset4',data=np.random.random((10,10)))


<HDF5 dataset "dataset4": shape (10, 10), type "<f8">

In [25]:
#summary: what has been done so far? we can see the datasets that are created
f.keys()

<KeysViewHDF5 ['dataset1', 'dataset2', 'dataset3', 'dataset4']>

Difference with numpy arrays: We can create empty HDF5 datasets.

In [26]:
#empty dataset. observe that the file size does not change.
f.create_dataset('empty_dataset',dtype=np.uint8)

<HDF5 dataset "empty_dataset": shape None, type "|u1">

In [27]:
f['empty_dataset'][:,:]

ValueError: Empty datasets cannot be sliced

In [28]:
f['empty_dataset'] = 42

OSError: Unable to create link (name already exists)

In [29]:
#numpy arrays are not really empty.
np.empty((2,2))

array([[1.22233370e-311, 1.22226958e-311],
       [4.36754031e-321, 1.35807731e-312]])

In [30]:
t1 = time.time()
arr = np.zeros((1000,1000))
t2 = time.time()
print(t2-t1)

0.0


In [31]:
#np.empty is simply the fastest way of initializing an array of certain size. But it is not really empty.
t1 = time.time()
arr = np.empty((1000,1000))
t2 = time.time()
print(t2-t1)

0.0


### Indexing and Slicing of HDF5 Datasets

Slicing and indexing of HDF5 datasets are similar to numpy arrays.

In [32]:
#partial output
partial_out = dset1[:5,1:5]
partial_out

array([[0.03691909, 0.54506046, 0.63388669, 0.27092217],
       [0.29659176, 0.20539434, 0.28071654, 0.2342386 ],
       [0.54981945, 0.69585297, 0.52450474, 0.39928347],
       [0.22715108, 0.13081968, 0.12397939, 0.85400469],
       [0.35287768, 0.74717552, 0.91609854, 0.95002601]])

In [33]:
dset2[:10,:]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.

In [34]:
#boolean indexing
t = np.random.random(1)
dset4 = f['dataset4']
print(t)
dset4[dset4>t]

[0.48280023]


array([0.97540802, 0.93148953, 0.50447757, 0.53335381, 0.74150548,
       0.91295566, 0.91903894, 0.57625104, 0.70802657, 0.77945721,
       0.9561224 , 0.81550761, 0.6186489 , 0.88521464, 0.72131619,
       0.50155858, 0.55450777, 0.72853309, 0.98393849, 0.99780505,
       0.69440188, 0.65358388, 0.95619076, 0.64748424, 0.64758228,
       0.51289418, 0.98653657, 0.73384441, 0.88286531, 0.76949916,
       0.86968889, 0.94772043, 0.66583016, 0.64164462, 0.99570757,
       0.6488536 , 0.87180646, 0.95895354, 0.75093306, 0.68156174,
       0.65702353, 0.76860118, 0.81733747, 0.698754  , 0.64236254,
       0.99429058, 0.64962797])

### Broadcasting

Braodcasting works for HDF5 datasets as well and it is (almost) same as numpy arrays.

In [35]:
£dset4 +2

TypeError: unsupported operand type(s) for +: 'Dataset' and 'int'

In [36]:
#dset4 has sahape (10,10)
dset4+np.array(2)

array([[2.18980807, 2.23282748, 2.20877485, 2.97540802, 2.93148953,
        2.05760374, 2.50447757, 2.10763068, 2.05634658, 2.53335381],
       [2.74150548, 2.07373344, 2.06436458, 2.91295566, 2.91903894,
        2.57625104, 2.04666196, 2.70802657, 2.77945721, 2.9561224 ],
       [2.81550761, 2.6186489 , 2.33761999, 2.05159611, 2.23645219,
        2.24552389, 2.88521464, 2.72131619, 2.50155858, 2.55450777],
       [2.04154207, 2.2123209 , 2.26443133, 2.36577835, 2.05748413,
        2.0605134 , 2.72853309, 2.98393849, 2.00239295, 2.46806673],
       [2.45282216, 2.99780505, 2.04420173, 2.69440188, 2.20159721,
        2.65358388, 2.35458005, 2.95619076, 2.64748424, 2.08095916],
       [2.15041676, 2.64758228, 2.01461796, 2.34526097, 2.51289418,
        2.98653657, 2.73384441, 2.88286531, 2.12858377, 2.23413496],
       [2.41785093, 2.05997907, 2.47176288, 2.76949916, 2.18538542,
        2.03370401, 2.86968889, 2.14176148, 2.12422623, 2.24280815],
       [2.94772043, 2.66583016, 2.6416446

In [37]:
dset4[:,:] +2

array([[2.18980807, 2.23282748, 2.20877485, 2.97540802, 2.93148953,
        2.05760374, 2.50447757, 2.10763068, 2.05634658, 2.53335381],
       [2.74150548, 2.07373344, 2.06436458, 2.91295566, 2.91903894,
        2.57625104, 2.04666196, 2.70802657, 2.77945721, 2.9561224 ],
       [2.81550761, 2.6186489 , 2.33761999, 2.05159611, 2.23645219,
        2.24552389, 2.88521464, 2.72131619, 2.50155858, 2.55450777],
       [2.04154207, 2.2123209 , 2.26443133, 2.36577835, 2.05748413,
        2.0605134 , 2.72853309, 2.98393849, 2.00239295, 2.46806673],
       [2.45282216, 2.99780505, 2.04420173, 2.69440188, 2.20159721,
        2.65358388, 2.35458005, 2.95619076, 2.64748424, 2.08095916],
       [2.15041676, 2.64758228, 2.01461796, 2.34526097, 2.51289418,
        2.98653657, 2.73384441, 2.88286531, 2.12858377, 2.23413496],
       [2.41785093, 2.05997907, 2.47176288, 2.76949916, 2.18538542,
        2.03370401, 2.86968889, 2.14176148, 2.12422623, 2.24280815],
       [2.94772043, 2.66583016, 2.6416446

In [38]:
f.create_dataset('dataset5',shape=(50,50,3),data=np.random.randint(-5,5,7500))
dset5 = f['dataset5']

In [39]:
dset5 + np.random.random((50,3))

array([[[-2.29448105,  4.95810782, -4.32991346],
        [-2.91430082,  2.08657161, -1.63224746],
        [ 1.31705756,  3.22822767, -1.16125999],
        ...,
        [-3.12930507, -4.1337993 ,  1.00689122],
        [-1.07447335,  4.70881822, -0.8283249 ],
        [ 0.78527712, -4.80220851,  2.45937394]],

       [[ 2.70551895, -4.04189218, -3.32991346],
        [ 1.08569918,  3.08657161, -2.63224746],
        [ 4.31705756, -0.77177233, -1.16125999],
        ...,
        [ 1.87069493, -2.1337993 ,  4.00689122],
        [ 0.92552665,  2.70881822, -0.8283249 ],
        [-0.21472288, -3.80220851, -3.54062606]],

       [[ 4.70551895,  3.95810782,  4.67008654],
        [-0.91430082,  2.08657161, -0.63224746],
        [ 0.31705756,  2.22822767, -2.16125999],
        ...,
        [ 4.87069493,  1.8662007 ,  3.00689122],
        [ 1.92552665,  3.70881822, -4.8283249 ],
        [-3.21472288, -2.80220851, -4.54062606]],

       ...,

       [[-0.29448105, -3.04189218,  3.67008654],
        [ 0

In [40]:
dset5 + np.random.random(3)

array([[[-2.99124727,  4.05576823, -4.9765106 ],
        [-2.99124727,  2.05576823, -1.9765106 ],
        [ 1.00875273,  3.05576823, -1.9765106 ],
        ...,
        [-3.99124727, -4.94423177,  1.0234894 ],
        [-1.99124727,  4.05576823, -0.9765106 ],
        [ 0.00875273, -4.94423177,  2.0234894 ]],

       [[ 2.00875273, -4.94423177, -3.9765106 ],
        [ 1.00875273,  3.05576823, -2.9765106 ],
        [ 4.00875273, -0.94423177, -1.9765106 ],
        ...,
        [ 1.00875273, -2.94423177,  4.0234894 ],
        [ 0.00875273,  2.05576823, -0.9765106 ],
        [-0.99124727, -3.94423177, -3.9765106 ]],

       [[ 4.00875273,  3.05576823,  4.0234894 ],
        [-0.99124727,  2.05576823, -0.9765106 ],
        [ 0.00875273,  2.05576823, -2.9765106 ],
        ...,
        [ 4.00875273,  1.05576823,  3.0234894 ],
        [ 1.00875273,  3.05576823, -4.9765106 ],
        [-3.99124727, -2.94423177, -4.9765106 ]],

       ...,

       [[-0.99124727, -3.94423177,  3.0234894 ],
        [ 0

In [41]:
#add an image
pokemon = Image.open('/home/atatar/Documents/CIT/Teaching/UCG/DataScience/week1/Data/pokemon/dataset/Abra/2eb2a528f9a247358452b3c740df69a0.jpg')
pokemon_arr = np.array(pokemon)
f.create_dataset('pokemon',data=pokemon_arr)

FileNotFoundError: [Errno 2] No such file or directory: '/home/atatar/Documents/CIT/Teaching/UCG/DataScience/week1/Data/pokemon/dataset/Abra/2eb2a528f9a247358452b3c740df69a0.jpg'

In [None]:
f.close()

## Groups: Hiererchical Structure in HDF

 Storing all datasets as hf['dataset'] is similar to saving all your files in the same directory on your computer.

In [43]:
f = h5py.File('big_data.h5','r+')

In [44]:
f.keys()

<KeysViewHDF5 ['dataset1', 'dataset2', 'dataset3', 'dataset4', 'dataset5', 'empty_dataset']>

In [45]:
f.create_group('week1') #it is as if we have created a new folder called week1

<HDF5 group "/week1" (0 members)>

In [46]:
#populate the folder week1
w1 = f['week1']
w1.create_dataset('dataset1',data=np.random.random(5))
w1.create_dataset('dataset2',data=np.random.randint(-5,5,(10,10)))
w1.create_group('Data')

<HDF5 group "/week1/Data" (0 members)>

In [47]:

print(w1.keys())
print('-----------')
print(w1)

<KeysViewHDF5 ['Data', 'dataset1', 'dataset2']>
-----------
<HDF5 group "/week1" (3 members)>


In [48]:
#access dataset1 in week1 using dictionary style
f['week1']['dataset1'][:]

array([0.95460048, 0.92717096, 0.93513563, 0.88954966, 0.59106595])

In [49]:
#access dataset1 in week1 using path style
f['/week1/dataset1'][:]

array([0.95460048, 0.92717096, 0.93513563, 0.88954966, 0.59106595])

In [50]:
f.close()

## Attributes: Storing Metadata
Attributes make files self-descripting. They work like dictionaries. They can store any kind of data.

In [52]:
f = h5py.File('big_data.h5','r+')

In [53]:
f.attrs #attributes object

<Attributes of HDF5 object at 2474412140528>

In [54]:
f.attrs.keys()

<KeysViewHDF5 []>

In [55]:
dset1 = f['dataset1']
dset1.attrs

<Attributes of HDF5 object at 2474407093536>

In [57]:
pokemon = f['pokemon']
pokemon.attrs['title'] = 'This is a pokemon color image.'
pokemon.attrs['source'] = 'https://www.kaggle.com/datasets/vishalsubbiah/pokemon-images-and-types'

KeyError: "Unable to open object (object 'pokemon' doesn't exist)"

In [None]:
pokemon.attrs.keys()

In [None]:
for a in pokemon.attrs:
    print(pokemon.attrs.get(a))

In [None]:
f.close()

## Explore an HDF5 File
Download the Spectrometer data from the [link](https://ndownloader.figshare.com/files/7024271) into your working directory.

### Using a Viewer
* https://support.hdfgroup.org/products/java/release/download.html
* https://www.neonscience.org/resources/learning-hub/tutorials/setup-qgis-h5view

### Using Python

In [58]:
spectrometer = h5py.File('NEONDSImagingSpectrometerData.h5','r+')

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'NEONDSImagingSpectrometerData.h5', errno = 2, error message = 'No such file or directory', flags = 1, o_flags = 2)

#### Dictionary Style Iteration
Iterate over the keys at every layer.

In [None]:
for k in spectrometer:
    print('Key:',k)
    print('Value:',spectrometer[k]) #hdf5 object: group or dataset
    print('---------')

In [None]:
groups = []
datasets = []

for k,v in spectrometer.items():
    if isinstance(v, h5py.Dataset):
        print('Type: Dataset')
        print(v.name)
        print(v.shape)
        print(v.ndim)
        print(v.nbytes)
        print(v.dtype)
        datasets.append(v)
    if isinstance(v, h5py.Group):
        print('Type: Group')
        print(v.name)
        print('number of members:', len(v.keys()))
        groups.append(v)
    print('-----------')

In [None]:
#access one of the datasets at layer 0
reflectance = spectrometer['Reflectance']
reflectance[:,:,:]

In [None]:
#explore some attributes
spectrometer.attrs.keys()

In [None]:
spectrometer.attrs.get('SITE')

In [None]:
#see all the attributes
for k in spectrometer.attrs:
    print(spectrometer.attrs[k])

In [None]:
#see all the attributes
for k in spectrometer.attrs.values():
    print(k)

In [None]:
for i in spectrometer.attrs.items():
    print(i)

In [None]:
groups

In [None]:
#access a dataset in layer 1
spectrometer['spatialInfo/']

In [None]:
# add a random dataset under spatialInfo
arr = np.random.randint(-5,5,(10,10))
spectrometer['spatialInfo'].create_dataset('random',data=arr)


In [None]:
# add attributes
spectrometer['/spatialInfo/random'].attrs['Description'] = 'Random numpy array of integers between -5 and 5.'

In [None]:
groups = []
datasets = []

for k,v in spectrometer.items():
    if isinstance(v, h5py.Dataset):
        print('Type: Dataset')
        print(v.name)
        print(v.shape)
        print(v.ndim)
        print(v.nbytes)
        print(v.dtype)
        datasets.append(v)
    if isinstance(v, h5py.Group):
        print('Type: Group')
        print(v.name)
        print('number of members:', len(v.keys()))
        groups.append(v)
    print('-----------')

In [None]:
groups

In [None]:
# cannot find the random dataset.
datasets

#### Multilayer Iteration

We cover all layers at once.

In [None]:
def ls_nodes(node):
    print(node)

In [None]:
spectrometer.visit(ls_nodes)

In [None]:
def ls_nodes(node):
    print(spectrometer[node])

In [None]:
spectrometer.visit(ls_nodes)

In [None]:
#let's get fancy
spectrometer.visit(lambda x: print(spectrometer[x]))

In [None]:
spectrometer.close()