# How to edit a nexus file

Example of how to read a NeXus file and create a new one based on its content.  
Here we take a GIXD scan with sensors and 2D Pilatus images. We create a new NeXus file containing only a subset of the measurements. We also add compression to the data, which is very effective at reducing the size of the file.

In [1]:
import h5py
import os
import numpy as np

# Source scan to read from, and the path of the reduced copy to be written.
original_file_name = 'original_nexus_files/SIRIUS_2020_03_12_0756.nxs'
new_file_name = 'modified_nexus_files/reduced_SIRIUS_2020_03_12_0756.nxs'

# Drop any output left over from a previous run so we start from scratch.
if os.path.exists(new_file_name):
    os.remove(new_file_name)

## Read info from the original file

In [2]:
# Print the tree of the original file, down to four levels.
# Each level is indented with one more tab and shown with its full path.
# Only groups have children, so anything that is not a group (e.g. a dataset)
# is printed and then skipped instead of relying on a catch-all `except`.
# The `with` block guarantees the file is closed even if an error occurs.
with h5py.File(original_file_name, 'r') as f:
    for k0, v0 in f.items():
        print('%s' % k0)
        if not isinstance(v0, h5py.Group):
            continue
        for k1, v1 in v0.items():
            print('\t%s/%s' % (k0, k1))
            if not isinstance(v1, h5py.Group):
                continue
            for k2, v2 in v1.items():
                print('\t\t%s/%s/%s' % (k0, k1, k2))
                if not isinstance(v2, h5py.Group):
                    continue
                # Deepest level shown: only the names are needed here.
                for k3 in v2:
                    print('\t\t\t%s/%s/%s/%s' % (k0, k1, k2, k3))

root.spyc.DiffractoScanConfig
	root.spyc.DiffractoScanConfig/SIRIUS
		root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy
			root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy/energy
			root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy/gap
			root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy/harmonic
			root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy/mode
			root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy/phase
			root.spyc.DiffractoScanConfig/SIRIUS/ans-c15-ei-c-hu36_energy/polarisation
		root.spyc.DiffractoScanConfig/SIRIUS/ans-ca-machinestatus
			root.spyc.DiffractoScanConfig/SIRIUS/ans-ca-machinestatus/current
			root.spyc.DiffractoScanConfig/SIRIUS/ans-ca-machinestatus/function_mode
			root.spyc.DiffractoScanConfig/SIRIUS/ans-ca-machinestatus/life_time
			root.spyc.DiffractoScanConfig/SIRIUS/ans-ca-machinestatus/name
			root.spyc.DiffractoScanConfig/SIRIUS/ans-ca-machinestatus/probe
			root.spyc.DiffractoScan

Extract the names of the relevant leaves. Print their shape and type.

In [3]:
# Collect the path of every node stored directly inside a group named
# 'scan_data' (layout: <top-level entry>/scan_data/<leaf>), then print the
# label, shape and dtype of each one.
# The `with` block guarantees the file is closed even if an error occurs.
with h5py.File(original_file_name, 'r') as f:
    leaf_names = []
    for k0, v0 in f.items():
        if not isinstance(v0, h5py.Group):
            continue
        for k1, v1 in v0.items():
            # Test the group name once per group rather than once per child.
            if k1 == 'scan_data' and isinstance(v1, h5py.Group):
                leaf_names.extend('%s/%s/%s' % (k0, k1, k2) for k2 in v1)

    for leaf in leaf_names:
        dset = f[leaf]
        # Prefer the 'alias' attribute; fall back to 'long_name' when the
        # leaf has no alias.
        label = 'alias' if 'alias' in dset.attrs else 'long_name'
        print('%s\n%s: %s\nshape: %s\ntype: %s\n'
              % (leaf, label, dset.attrs[label], dset.shape, dset.dtype))



root.spyc.DiffractoScanConfig/scan_data/data_01
alias: b'delta'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_02
alias: b'zs'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_03
alias: b'gamma'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_04
alias: b'hu36energy'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_05
alias: b'xs'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_06
alias: b'energydcm'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_07
alias: b'current'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_08
alias: b'mon2'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_09
alias: b'surfacepressure'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_10
alias: b'areapermolecule'
shape: (101,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_1

## Write the new file

Create a new file with only the interesting data (keep only a subset)

In [4]:
# limits of the subset to keep (half-open range: indices imin .. imax-1)
imin = 20
imax = 100

# Copy the selected range of every leaf, with its attributes, into a new file.
with h5py.File(original_file_name, "r") as r, h5py.File(new_file_name, "w") as w:

    for leaf in leaf_names:
        source = r[leaf]

        # create the subset: slicing the dataset itself reads only the
        # requested range from disk, instead of loading the whole dataset
        # (e.g. a full stack of images) into memory first
        new_data = source[imin:imax]

        # gzip compression is very efficient at reducing the file size;
        # drop the `compression` argument to write the data uncompressed
        target = w.create_dataset(leaf, new_data.shape, dtype=new_data.dtype,
                                  data=new_data, compression='gzip')

        # carry over the metadata (alias, long_name, ...) of the leaf
        for key, value in source.attrs.items():
            target.attrs[key] = value
    

## Read info from the new file

Check the format of the new file.

In [5]:
# Print the tree of the new file, down to four levels.
# Each level is indented with one more tab and shown with its full path.
# Only groups have children, so anything that is not a group (e.g. a dataset)
# is printed and then skipped instead of relying on a catch-all `except`.
# The `with` block guarantees the file is closed even if an error occurs.
with h5py.File(new_file_name, 'r') as f:
    for k0, v0 in f.items():
        print('%s' % k0)
        if not isinstance(v0, h5py.Group):
            continue
        for k1, v1 in v0.items():
            print('\t%s/%s' % (k0, k1))
            if not isinstance(v1, h5py.Group):
                continue
            for k2, v2 in v1.items():
                print('\t\t%s/%s/%s' % (k0, k1, k2))
                if not isinstance(v2, h5py.Group):
                    continue
                # Deepest level shown: only the names are needed here.
                for k3 in v2:
                    print('\t\t\t%s/%s/%s/%s' % (k0, k1, k2, k3))

root.spyc.DiffractoScanConfig
	root.spyc.DiffractoScanConfig/scan_data
		root.spyc.DiffractoScanConfig/scan_data/data_01
		root.spyc.DiffractoScanConfig/scan_data/data_02
		root.spyc.DiffractoScanConfig/scan_data/data_03
		root.spyc.DiffractoScanConfig/scan_data/data_04
		root.spyc.DiffractoScanConfig/scan_data/data_05
		root.spyc.DiffractoScanConfig/scan_data/data_06
		root.spyc.DiffractoScanConfig/scan_data/data_07
		root.spyc.DiffractoScanConfig/scan_data/data_08
		root.spyc.DiffractoScanConfig/scan_data/data_09
		root.spyc.DiffractoScanConfig/scan_data/data_10
		root.spyc.DiffractoScanConfig/scan_data/data_11
		root.spyc.DiffractoScanConfig/scan_data/data_12
		root.spyc.DiffractoScanConfig/scan_data/data_13
		root.spyc.DiffractoScanConfig/scan_data/integration_times
		root.spyc.DiffractoScanConfig/scan_data/sensors_rel_timestamps
		root.spyc.DiffractoScanConfig/scan_data/sensors_timestamps


In [6]:
# Collect the path of every node stored directly inside a group named
# 'scan_data' in the new file (layout: <top-level entry>/scan_data/<leaf>),
# then print the label, shape and dtype of each one.
# The `with` block guarantees the file is closed even if an error occurs.
with h5py.File(new_file_name, 'r') as f:
    leaf_names = []
    for k0, v0 in f.items():
        if not isinstance(v0, h5py.Group):
            continue
        for k1, v1 in v0.items():
            # Test the group name once per group rather than once per child.
            if k1 == 'scan_data' and isinstance(v1, h5py.Group):
                leaf_names.extend('%s/%s/%s' % (k0, k1, k2) for k2 in v1)

    for leaf in leaf_names:
        dset = f[leaf]
        # Prefer the 'alias' attribute; fall back to 'long_name' when the
        # leaf has no alias.
        label = 'alias' if 'alias' in dset.attrs else 'long_name'
        print('%s\n%s: %s\nshape: %s\ntype: %s\n'
              % (leaf, label, dset.attrs[label], dset.shape, dset.dtype))


root.spyc.DiffractoScanConfig/scan_data/data_01
alias: b'delta'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_02
alias: b'zs'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_03
alias: b'gamma'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_04
alias: b'hu36energy'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_05
alias: b'xs'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_06
alias: b'energydcm'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_07
alias: b'current'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_08
alias: b'mon2'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_09
alias: b'surfacepressure'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_10
alias: b'areapermolecule'
shape: (80,)
type: float64

root.spyc.DiffractoScanConfig/scan_data/data_11
alias: b

# Batch

We now process a whole series of NeXus files in a batch, adding compression to the data.

In [7]:
folder_to_compress = 'original_nexus_files'
folder_destination = 'modified_nexus_files'

# exist_ok avoids the separate "does it exist?" check
os.makedirs(folder_destination, exist_ok=True)

# Keep only names that really end in '.nxs' (a plain substring test would
# also pick up names such as 'scan.nxs.bak').
files = [name for name in os.listdir(folder_to_compress) if name.endswith('.nxs')]

for file in files:
    # os.listdir returns bare file names, so a simple join gives both paths
    old_file_path = os.path.join(folder_to_compress, file)
    new_file_path = os.path.join(folder_destination, file)

    # Open the source once; the `with` block closes it even on error.
    with h5py.File(old_file_path, "r") as r:

        # Paths of the nodes stored directly inside a 'scan_data' group
        # (layout: <top-level entry>/scan_data/<leaf>)
        leaf_names = []
        for k0, v0 in r.items():
            if not isinstance(v0, h5py.Group):
                continue
            for k1, v1 in v0.items():
                if k1 == 'scan_data' and isinstance(v1, h5py.Group):
                    leaf_names.extend('%s/%s/%s' % (k0, k1, k2) for k2 in v1)

        with h5py.File(new_file_path, "w") as w:

            for leaf in leaf_names:
                source = r[leaf]

                # read the full content of the leaf (no subset here)
                new_data = source[...]

                # to add compression (very efficient!)
                target = w.create_dataset(leaf, new_data.shape, dtype=new_data.dtype,
                                          data=new_data, compression='gzip')

                # carry over the metadata of the leaf
                for key, value in source.attrs.items():
                    target.attrs[key] = value
 