There are 5 pieces of data per frame:
three arrays (coordinates, velocities, forces),
plus the frame number and the time stamp of the frame.



Come up with an example jupyter notebook where you read some random data (say 1000 particles worth) into a zarr group containing the three arrays.

Load and save from disk.

How big is the resulting zarr object? How will this scale with the number of particles? Can you compare with other formats?



In [5]:
import zarr
import numpy as np
import MDAnalysis as mda

In [5]:
# Create the top-level zarr group; no store argument is passed here.
root = zarr.group()
# Bare last expression so the notebook displays the group's repr.
root

<zarr.hierarchy.Group '/'>

In [None]:
# Build a small nested hierarchy (/foo/bar) under the root group.
# require_group returns the sub-group when it already exists, so re-running
# this cell does not raise the way create_group does on an existing name.
foo = root.require_group('foo')
bar = foo.require_group('bar')
# Attach a demo key/value attribute to the root group.
root.attrs['foo'] = 'bar'

In [None]:
# Wrap the integers 0..9 in a zarr array.
z = zarr.array(np.arange(10))
# Pick out the elements at indices 1 and 4 (recorded output: array([1, 4])).
z.get_coordinate_selection([1, 4])

array([1, 4])

Notation:
        (name) is an HDF5 group that the reader recognizes
        {name} is an HDF5 group with arbitrary name
        [variable] is an HDF5 dataset
        <dtype> is dataset datatype
        +-- is an attribute of a group or dataset

        H5MD root
         \-- (h5md)
            +-- version <int>
            \-- author
                +-- name <str>, author's name
                +-- email <str>, optional email address
            \-- creator
                +-- name <str>, file that created .h5md file
                +-- version
         \-- (particles)
            \-- {group1}
                \-- (box)
                    +-- dimension : <int>, number of spatial dimensions
                    +-- boundary : <str>, boundary conditions of unit cell
                    \-- (edges)
                        \-- [step] <int>, gives frame
                        \-- [value] <float>, gives box dimensions
                            +-- unit <str>
                \-- (position)
                    \-- [step] <int>, gives frame
                    \-- [time] <float>, gives time
                        +-- unit <str>
                    \-- [value] <float>, gives numpy array of positions
                                         with shape (n_atoms, 3)
                        +-- unit <str>
                \-- (velocity)
                    \-- [step] <int>, gives frame
                    \-- [time] <float>, gives time
                        +-- unit <str>
                    \-- [value] <float>, gives numpy array of velocities
                                         with shape (n_atoms, 3)
                        +-- unit <str>
                \-- (force)
                    \-- [step] <int>, gives frame
                    \-- [time] <float>, gives time
                        +-- unit <str>
                    \-- [value] <float>, gives numpy array of forces
                                         with shape (n_atoms, 3)
                        +-- unit <str>

In [None]:
n_atoms = 1000
# Placeholder coordinates: the integers 0 .. 3*n_atoms-1 arranged as (n_atoms, 3).
pos = np.arange(3 * n_atoms).reshape(n_atoms, 3)
orig_box = np.array([81.1, 82.2, 83.3, 75, 80, 85], dtype=np.float32)

# NOTE(review): z_group is created but never used below; the layout is built
# under `root` instead -- confirm which group was intended.
z_group = zarr.group()

# Mirror the H5MD layout: particles/<group name>/{box, position, velocity, force}.
# require_group (rather than create_group) keeps the cell re-runnable: it
# returns the sub-group if it already exists instead of raising.
particles = root.require_group('particles')
g_name = particles.require_group('g_name')

# needs: frame num, timestamp

box = g_name.require_group('box')

position = g_name.require_group('position')

velocity = g_name.require_group('velocity')

force = g_name.require_group('force')


root.attrs['foo'] = 'bar'

Write a trajectory to H5MD to get access to the underlying HDF5 object

In [28]:
# PSF/DCD are the test files actually loaded below; the previous import of
# PDB/XTC left PSF and DCD undefined on a fresh kernel (NameError).
from MDAnalysis.tests.datafiles import PSF, DCD

u = mda.Universe(PSF, DCD)

# Write every frame of the trajectory to an H5MD file so its layout can be
# inspected with h5py.
ag = u.atoms
ag.write('h5md_view.h5md', frames='all')

In [40]:
import h5py

# Open the H5MD file read-only inside a context manager so the handle is
# closed when the block exits; a handle left open keeps the file locked and
# can prevent the file from being rewritten later in the session.
with h5py.File('h5md_view.h5md', 'r') as file:
    data = file['particles/trajectory']
    pos = data['position']
    # np.array(...) copies the dataset into memory, so pos_frame stays usable
    # after the file is closed (file, data and pos do not).
    pos_frame = np.array(pos['value'])

print(pos_frame)

# Shape of position array is (frame num, atom, 3 spatial dim values)

[[[ 11.736044    8.500797  -10.445281 ]
  [ 12.365119    7.839936  -10.834842 ]
  [ 12.0919485   9.441535  -10.724611 ]
  ...
  [  6.512604   18.447018   -7.134053 ]
  [  6.300186   19.363485   -7.935916 ]
  [  5.5854015  17.589624   -6.9656615]]

 [[ 11.505546    8.062977  -10.38611  ]
  [ 12.054723    7.151329  -10.616048 ]
  [ 11.8052025   8.942828  -10.862341 ]
  ...
  [  6.643505   17.84961    -7.008922 ]
  [  6.6989756  18.616297   -8.0264   ]
  [  5.682343   17.086544   -6.8337812]]

 [[ 11.694641    8.390831  -10.681395 ]
  [ 12.40489     7.7260346 -11.133236 ]
  [ 11.936471    9.270585  -11.150342 ]
  ...
  [  6.854948   17.816687   -7.032191 ]
  [  6.6823397  18.81354    -7.775057 ]
  [  6.0196676  16.883717   -6.9835215]]

 ...

 [[ 16.297781    6.8397956  -7.622989 ]
  [ 16.822018    6.566309   -6.7072215]
  [ 16.760832    7.6656146  -7.9530683]
  ...
  [ 12.63667    15.566869   -6.1185045]
  [ 12.8278     16.214436   -7.167255 ]
  [ 11.55105    14.879154   -5.940134 ]]

 [

In [23]:
from MDAnalysis.tests.datafiles import PDB, XTC

u = mda.Universe(PDB, XTC)

# Count the frames by walking the trajectory once; i stays 0 when the
# trajectory yields nothing, otherwise it ends as the number of frames seen.
i = 0
for i, ts in enumerate(u.trajectory, start=1):
    pass
print(i)

10




Saving a zarr file to disk

In [45]:
import zarr

# Open the on-disk store in read/write mode.
root = zarr.open('test_zip.zarr', mode='a')

# Create the zarr group layout. require_group returns an existing sub-group
# instead of raising, so the cell can be re-run against the same store.
particles = root.require_group('particles')
g_name = particles.require_group('g_name')
box = g_name.require_group('box')
position = g_name.require_group('position')
velocity = g_name.require_group('velocity')
force = g_name.require_group('force')

# Generate atom box
n_atoms = 1000
n_frames = 5
# Generate an array of vals from 0 to 3 * n_atoms - 1 and
# turn it into an array of n_atoms (x, y, z) coordinates
pos = np.arange(3 * n_atoms).reshape(n_atoms, 3)
orig_box = np.array([81.1, 82.2, 83.3, 75, 80, 85], dtype=np.float32)


# Shape of position array is (frame num, atom, 3 spatial dim values)
positions = np.empty((n_frames, n_atoms, 3))
for i in range(n_frames):
    # Frame i holds the base coordinates scaled by 2**i
    positions[i] = 2 ** i * pos

# Insert the array into the zarr group; overwrite=True replaces the array
# left behind by a previous run instead of raising.
position.create_dataset('value', data=positions, overwrite=True)

# Look the array up again through the group hierarchy to confirm it is there
print(g_name['position/value'])



<zarr.core.Array '/particles/g_name/position/value' (5, 1000, 3) float64>


In [None]:
import zarr

# Re-open the store written earlier in the notebook for inspection only.
# mode='r' fails loudly if the path is missing, whereas mode='a' would
# silently create a new empty group at that path.
open_test = zarr.open('test_zip.zarr', mode='r')

# Display the group/array hierarchy stored on disk.
open_test.tree()