# Basic Datatypes in HDF5

In [110]:
import numpy as np

## Homogeneous datatypes

In [111]:
# Sample 1-D payload: the integers 0..9 held as 1-byte signed ints.
arr_to_store = np.arange(0, 10, 1, dtype=np.int8)

### Using h5py

In [112]:
import h5py

In [113]:
f = h5py.File("homogeneous1.h5", "w")

In [114]:
f.create_dataset(data=arr_to_store, name="mydata", dtype=np.int8)

<HDF5 dataset "mydata": shape (10,), type "|i1">

In [115]:
f['/mydata'][:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int8)

In [116]:
f

<HDF5 file "homogeneous1.h5" (mode r+)>

In [117]:
f.close()

In [120]:
!h5ls -rv homogeneous1.h5

Opened "homogeneous1.h5" with sec2 driver.
/                        Group
    Location:  1:96
    Links:     1
/mydata                  Dataset {10/10}
    Location:  1:800
    Links:     1
    Storage:   10 logical bytes, 10 allocated bytes, 100.00% utilization
    Type:      native signed char


In [121]:
!ls -l homogeneous1.h5

-rw-r--r--  1 faltet  staff  2154 May  8 12:43 homogeneous1.h5


### Using PyTables

In [122]:
import tables

In [123]:
f2 = tables.open_file("homogeneous2.h5", "w")

In [124]:
f2.create_array(f2.root, name="mydata", obj=arr_to_store)

/mydata (Array(10,)) ''
  atom := Int8Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := None

In [125]:
f2.root.mydata[:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int8)

In [126]:
f2

File(filename=homogeneous2.h5, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/mydata (Array(10,)) ''
  atom := Int8Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := None

In [127]:
f2.close()

In [128]:
!h5ls -v homogeneous2.h5/mydata

Opened "homogeneous2.h5" with sec2 driver.
mydata                   Dataset {10/10}
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "ARRAY"
    Attribute: FLAVOR scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "numpy"
    Attribute: TITLE null
        Type:      1-byte null-terminated UTF-8 string

    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
        Data:  "2.4"
    Location:  1:1024
    Links:     1
    Storage:   10 logical bytes, 10 allocated bytes, 100.00% utilization
    Type:      native signed char
H5tools-DIAG: Error detected in HDF5:tools (1.8.17) thread 140736151184320:
  #000: h5tools_dump.c line 1836 in h5tools_dump_mem(): H5Sis_simple failed
    major: Failure in tools library
    minor: error in function


In [129]:
!ls -l homogeneous*.h5

-rw-r--r--  1 faltet  staff  2154 May  8 12:43 homogeneous1.h5
-rw-r--r--  1 faltet  staff  2154 May  8 12:44 homogeneous2.h5


## Compound Datatypes

In [130]:
# Sample structured array: 10 rows of (n, n squared, decimal text of n cubed),
# stored as an int32, a float64 and a 4-byte string field.
table_to_store = np.fromiter(
    ((n, n**2, "%s" % (n**3)) for n in range(10)),
    dtype="i4,f8,S4",
)

In [131]:
table_to_store

array([(0,   0., b'0'), (1,   1., b'1'), (2,   4., b'8'), (3,   9., b'27'),
       (4,  16., b'64'), (5,  25., b'125'), (6,  36., b'216'),
       (7,  49., b'343'), (8,  64., b'512'), (9,  81., b'729')], 
      dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', 'S4')])

### Using h5py

In [132]:
f = h5py.File("compound1.h5", "w")

In [133]:
f.create_dataset(data=table_to_store, name="mydata")

<HDF5 dataset "mydata": shape (10,), type "|V16">

In [134]:
f['mydata'].dtype

dtype([('f0', '<i4'), ('f1', '<f8'), ('f2', 'S4')])

In [135]:
f['mydata'][:]

array([(0,   0., b'0'), (1,   1., b'1'), (2,   4., b'8'), (3,   9., b'27'),
       (4,  16., b'64'), (5,  25., b'125'), (6,  36., b'216'),
       (7,  49., b'343'), (8,  64., b'512'), (9,  81., b'729')], 
      dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', 'S4')])

In [136]:
f.close()

In [137]:
!h5ls -v compound1.h5

Opened "compound1.h5" with sec2 driver.
mydata                   Dataset {10/10}
    Location:  1:800
    Links:     1
    Storage:   160 logical bytes, 160 allocated bytes, 100.00% utilization
    Type:      struct {
                   "f0"               +0    native int
                   "f1"               +4    native double
                   "f2"               +12   4-byte null-padded ASCII string
               } 16 bytes


### Using PyTables (simple way)

In [138]:
f2 = tables.open_file("compound2.h5", "w")

In [139]:
f2.create_table(f2.root, name="mydata", obj=table_to_store)

/mydata (Table(10,)) ''
  description := {
  "f0": Int32Col(shape=(), dflt=0, pos=0),
  "f1": Float64Col(shape=(), dflt=0.0, pos=1),
  "f2": StringCol(itemsize=4, shape=(), dflt=b'', pos=2)}
  byteorder := 'little'
  chunkshape := (4096,)

In [140]:
f2.close()

### Using PyTables (description way)

In [141]:
# Explicit PyTables description of the same three-field record layout as
# `table_to_store` (dtype "i4,f8,S4").  The field names match NumPy's default
# f0/f1/f2 names, so the structured array can be appended to the table as is.
class MyTable(tables.IsDescription):
    f0 = tables.Int32Col()             # 32-bit signed integer ("i4")
    f1 = tables.Float64Col()           # 64-bit float ("f8")
    f2 = tables.StringCol(itemsize=4)  # fixed-width 4-byte string ("S4")

In [142]:
f3 = tables.open_file("compound3.h5", "w")

In [143]:
t = f3.create_table(f3.root, "mydata", MyTable)

In [144]:
t.append(table_to_store)

In [145]:
f3.close()

In [146]:
!h5ls -v compound2.h5/mydata

Opened "compound2.h5" with sec2 driver.
mydata                   Dataset {10/Inf}
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "TABLE"
    Attribute: FIELD_0_FILL scalar
        Type:      native int
        Data:  0
    Attribute: FIELD_0_NAME scalar
        Type:      2-byte null-terminated UTF-8 string
        Data:  "f0"
    Attribute: FIELD_1_FILL scalar
        Type:      native double
        Data:  0
    Attribute: FIELD_1_NAME scalar
        Type:      2-byte null-terminated UTF-8 string
        Data:  "f1"
    Attribute: FIELD_2_FILL scalar
        Type:      1-byte null-terminated ASCII string
        Data:  ""
    Attribute: FIELD_2_NAME scalar
        Type:      2-byte null-terminated UTF-8 string
        Data:  "f2"
    Attribute: NROWS scalar
        Type:      native long
        Data:  10
    Attribute: TITLE null
        Type:      1-byte null-terminated UTF-8 string

    Attribute: VERS

In [147]:
ls -lh compound*.h5

-rw-r--r--  1 faltet  staff   2.3K May  8 12:44 compound1.h5
-rw-r--r--  1 faltet  staff    68K May  8 12:45 compound2.h5
-rw-r--r--  1 faltet  staff    68K May  8 12:45 compound3.h5


Hmm, the PyTables files are much larger than the h5py one (68K vs. 2.3K).  Why?  Let's look inside the files:

In [102]:
!h5ls compound1.h5

mydata                   Dataset {10}


In [103]:
!h5ls compound2.h5

mydata                   Dataset {10/Inf}


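We see that the dimensionality of the table created with PyTables is `{10/Inf}`: its current length is 10 but its maximum length is unlimited, which HDF5 only allows for chunked datasets.  The one created with h5py is just `{10}`, a fixed-size dataset that is stored without chunking.  An uncompressed chunk is written to disk in full, so the PyTables table occupies a whole chunk of 4096 rows × 16 bytes = 64 KB (see `chunkshape := (4096,)` above) even though only 10 rows are stored.  This is the main reason why the PyTables files are larger.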

PyTables tables are chunked by default because they can be enlarged and compressed, and HDF5 requires chunked storage for both.  More on chunking later.