# Creating datasets

Creating datasets with PyTables via NumPy arrays is easy:

In [None]:
import numpy as np
import tables as tb

# Create a new file
f = tb.open_file("atest.h5", "w")

# Create a NumPy array
a = np.arange(100).reshape(20,5)

# Save the array
f.create_array(f.root, "array1", a)

In [None]:
# Peek data
f.root.array1[:]

In [None]:
# Slice and dice (only these slices are loaded into memory)
ta = f.root.array1
ta[1:10:3,2:5]

In [None]:
# Make sure that the data read back is the same as the original
np.allclose(ta[1:10:3,2:5], a[1:10:3,2:5])

In [None]:
# Create another array
ta2 = f.create_array(f.root, "array2", np.arange(10))

In [None]:
# Let's have a look at the size of the underlying file
!ls -l atest.h5

In [None]:
# Flush data to the file (very important to keep all your data safe!)
f.flush()

In [None]:
!ls -l atest.h5

In [None]:
f.close()  # close access to file

In [None]:
# Look at its contents by using `ptdump` utility
! ptdump atest.h5

In [None]:
# Reopen the file and revisit the datasets there
f = tb.open_file("atest.h5", mode="r")  # note the 'r'ead mode

In [None]:
# Get the summary of the contents
f

In [None]:
f.root.array1

In [None]:
f.root.array1[:]

In [None]:
# Always close your files when you are done (or use contexts)
f.close()

## Exercise

Create a new HDF5 file with 2 arrays in it.  One should be 2-dimensional and the other the result of summing the first one along its 2nd dimension (.sum(axis=1)).  Use a context manager (a `with` block) so that you don't have to close the file explicitly.

### Solution

# Playing with the object tree

In [None]:
# Re-open the existing file in 'a'ppend mode
f = tb.open_file("atest.h5", "a")

In [None]:
f

In [None]:
# We can get a shortened view too:
print(f)

In [None]:
# Add a new group
f.create_group(f.root, 'group1', 'Title for group1')

In [None]:
f

In [None]:
f.move_node(f.root.array1, f.root.group1)

In [None]:
f

In [None]:
# Create a very nested group (note the `createparents` parameter)
f.create_group('/g1/g2/g3/g4', 'g5', createparents=True)

In [None]:
print(f)

In [None]:
# Add an array in the 'very nested' group
f.create_array(f.root.g1.g2.g3.g4.g5, 'array2', np.arange(10))

In [None]:
print(f)

In [None]:
# Removing nodes is very easy
f.remove_node(f.root.g1.g2.g3.g4.g5.array2)

In [None]:
print(f)

In [None]:
# Show the PyTables File object working as an iterator
for n in f:
    print(n)

In [None]:
# The `File.walk_nodes` method offers more flexibility
for n in f.walk_nodes():
    print(n)

In [None]:
# Get info from a certain point of the hierarchy on
for n in f.walk_nodes(f.root.group1):
    print(n)

In [None]:
# `walk_nodes` can also restrict the iteration to nodes of a specific class
for n in f.walk_nodes(f.root.group1, classname="Array"):
    print(n[:2])

In [None]:
f.close()

## Exercise

Use the file that you created in the previous exercise and create a new group called 'reduced' and titled 'My Reduced data' and move the 1-dimensional array there.  Look at the final contents with the ptdump utility.  

### Solution

# HDF5 attributes

In [None]:
# Re-open the file
f = tb.open_file("atest.h5", "a")

In [None]:
print(f)

In [None]:
# Print the attrs in /array2
f.root.array2.attrs

In [None]:
# Add a new attribute to /array2
f.root.array2.attrs.myattr = "Hello World!"

In [None]:
f.root.array2.attrs

In [None]:
# Has the modification reached the disk yet?
!ptdump -a atest.h5:/array2  # note the -a flag and node specification

In [None]:
# Nope, so force a flush
f.flush()

In [None]:
!ptdump -a atest.h5:/array2

In [None]:
# Attributes can also be general arrays
f.root.array2.attrs.myarray = np.arange(10)
f.flush()

In [None]:
!ptdump -a atest.h5:/array2

In [None]:
# Get a shortcut to the attribute handler
attrs = f.root.array2.attrs

In [None]:
attrs

In [None]:
# Removing an attribute
del attrs.myarray
attrs

In [None]:
# Overwrite an existing one (be careful with this feature!)
attrs.myattr = 12.3
attrs

In [None]:
# Print the attributes for all the arrays hanging from /group1
for n in f.walk_nodes(f.root.group1, classname="Array"):
    print(repr(n.attrs))

In [None]:
f.close()

## Exercise

Use the file in the previous exercise and add an attribute to the 1-dimensional array specifying the mean and the standard deviation. Use ptdump -a to check that the attributes are there.

### Solution

# Chunked datasets

In [None]:
f = tb.open_file('ctest.h5', 'w')

In [None]:
# Create an un-initialized CArray (Compressible Array)
f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000))

In [None]:
# Flush everything to disk
f.flush()

In [None]:
# The container is there, but not the data (yet)
!ls -lh ctest.h5

In [None]:
# Push some data into this carray container
ca = f.root.carray
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
%time ca[:] = na

In [None]:
# Flush (we can specify which node should be flushed)
ca.flush()

In [None]:
!ls -lh ctest.h5

In [None]:
np.prod(ca.shape) * ca.dtype.itemsize / 2**20.

In [None]:
# Retrieve only part of the data
ca[:10,::2]

In [None]:
f.close()

## Using compression with chunked arrays

In [None]:
f = tb.open_file('ctest-zlib.h5', 'w')

In [None]:
# Create a CArray (Compressible Array) using the zlib compressor
filters = tb.Filters(complib='zlib', complevel=1)
ca = f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000),
                     filters=filters)

In [None]:
# Push some data on this carray container
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
%time ca[:] = na

In [None]:
# Flush the carray container only
ca.flush()
!ls -lh ctest-zlib.h5

In [None]:
np.prod(ca.shape) * ca.dtype.itemsize / 2**20.

In [None]:
f.close()

In [None]:
# Look at the file with a native HDF5 tool
!h5ls -v ctest-zlib.h5

## Using compression (Blosc)

In [None]:
f = tb.open_file('ctest-blosc.h5', 'w')

In [None]:
# Create a CArray (Compressible Array) using the Blosc compressor
filters = tb.Filters(complib='blosc:lz4', complevel=9)
ca = f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000),
                     filters=filters)

In [None]:
# Push some data on this carray container
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
%time ca[:] = na

**Note how writing a compressed carray is faster in this case than both the uncompressed case above (~500 ms) and the zlib case (~750 ms).**

In [None]:
f.close()
!ls -lh ctest-blosc.h5

As you can see, the compression ratio is about the same as with zlib.

## Exercise

PyTables comes with support for different compressors, namely 'zlib' (the default), 'bzip2' and 'blosc:X', where X is a codec that can be one of 'blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib' (and 'zstd' from PyTables 3.3 on).  Based on the example above, do a small study on which ones work best.

* Which one compresses best?
* Which one compresses faster?
* Which one shows the best balance?

### Solution

# Specifying chunk size

When creating a chunked dataset, a chunksize is chosen automatically based on some heuristics.  However, you may want to specify your own chunksize and see which one works best for you:

In [None]:
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
for nrows in range(10, 210, 30):
    with tb.open_file("chunk_study.h5", "w") as f:
        chunkshape = (nrows, 1000)
        print("chunkshape:", chunkshape)
        filters = tb.Filters(complib="blosc:lz4", complevel=9)
        ca = f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000),
                            filters=filters, chunkshape=chunkshape)
        %time ca[:] = na
    !ls -lh chunk_study.h5

# Using ptrepack

You can use the 'ptrepack' utility to copy whole HDF5 files (or only parts of them) and change different parameters during the copy process.

In [None]:
!ptrepack -o --complib zlib --complevel 1 ctest.h5 ctest-repacked-zlib.h5

In [None]:
!ptrepack -o --complib blosc:lz4 --complevel 9 ctest.h5 ctest-repacked-blosc.h5

In [None]:
!ptrepack -o --complib blosc:lz4 --complevel 9 --chunkshape '(1000,1000)' ctest.h5 ctest-repacked-blosc-chunkshape.h5

# Queries in Table objects

In [None]:
# The description for the tabular data
class TabularData(tb.IsDescription):
    """Row layout of the example table: one string, one integer and one float column."""

    # Fixed-size string column; each value takes 200 bytes
    col1 = tb.StringCol(itemsize=200)
    # Integer column with PyTables' default integer size
    col2 = tb.IntCol()
    # Floating point column with PyTables' default float size
    col3 = tb.FloatCol()

In [None]:
# Open a file and create the Table container
f = tb.open_file('atable.h5', 'w')
t = f.create_table(f.root, 'table', TabularData, 'table title',
                   filters=tb.Filters(9, 'blosc'))

In [None]:
t

In [None]:
%%time
# Fill the table with 1 million rows
r = t.row
for i in range(1000*1000):
    r['col1'] = str(i)
    r['col2'] = i + 1
    r['col3'] = i * (i + 1)
    r.append()
t.flush()

In [None]:
t

In [None]:
# Size on disk
!ls -lh atable.h5

In [None]:
# Real size
np.prod(t.shape) * t.dtype.itemsize / 2**20.

In [None]:
# Do a query (regular)
%time [r['col1'] for r in t if r['col2'] < 5]

In [None]:
# Repeat the query, but using in-kernel method
%time [r['col1'] for r in t.where('col2 < 5')]

In [None]:
# Performing complex conditions (regular query)
%time [r['col1'] for r in t if r['col2'] < 5 and r['col3'] < 10]

In [None]:
# Complex, in-kernel queries
%time [r['col1'] for r in t.where('(col2 < 5) & (col3 < 10)')]

In [None]:
# Get a structured array out of disk
sa = t[:]
sa

In [None]:
# Perform the query in-memory using pure NumPy machinery 
%time sa[((sa['col2'] < 5) & (sa['col3'] < 10))]['col1']

In [None]:
# Create an index for the on-disk table
%time t.cols.col2.create_csindex()

In [None]:
# Repeat the complex query (indexed)
%time [r['col1'] for r in t.where('(col2 < 5) & (col3 < 10)')]

Indexing normally offers the best speed for doing queries.

In [None]:
f.close()

# Exercise

Open the 'ic_dst...' file in the data/ directory:

In [None]:
f = tb.open_file("../data/ic_dst_NEXT_v0_08_02_Kr_ACTIVE_0_0_5bar_MCRD_10000.root.h5")

In [None]:
print(f)


* Determine the chunksize of the /MLR/mau and /RD/pmtrwf datasets

* Copy them to another (new) HDF5 file using different chunksizes and compressors.  Determine the ones that offer the best ratio and speed (use ptrepack).

* Use the /TWF/TWF and /Sensors/DataSiPM datasets to do some small analysis (e.g. plotting the times for TWF, or querying them based on some conditions that make sense).

### Solution