In [13]:
import h5py
import re
import pandas as pd
import numpy as np
import pathlib

# Examples how to extract information from ilastik projects

This shows how to:
- Export a csv that can be loaded as metata csv in cellprofiler to reproduce the 'random' cropping
- How to dump the labels used for training


Further useful scripts how to use ilastik from the command line, e.g. how to train it with pre-define labels, can be found here: https://github.com/ilastik/ilastik/tree/master/bin

By
vito.zanotelli@gmail.com

In [17]:
class config:
    fn_ilastik = './exampledata/example_training.ilp'
    fn_crop_list ='~/tmp/crop_metadata.csv' # This can be used within cellprofiler to reproduce the croping identically to the training data
    fol_out_labels = pathlib.Path('/home/vitoz/tmp')
C=config

class variables:
    re_crop = re.compile('(?P<basename>.*)(_s(P<scale>[0-9]+))?_x(?P<x>[0-9]+)_y(?P<y>[0-9]+)_w(?P<w>[0-9]+)_h(?P<h>[0-9]+).*')
    COL_FN = 'filename_training'
    SUFFIX_LABEL = '_label.npy'
V=variables

Helper function to dump a `hdf5` content nicely formated from:
https://stackoverflow.com/a/53340677

In [3]:
def descend_obj(obj,sep='\t'):
    """
    Iterate through groups in a HDF5 file and prints the groups and datasets names and datasets attributes
    """
    if type(obj) in [h5py._hl.group.Group,h5py._hl.files.File]:
        for key in obj.keys():
            print(sep,'-',key,':',obj[key])
            descend_obj(obj[key],sep=sep+'\t')
    elif type(obj)==h5py._hl.dataset.Dataset:
        for key in obj.attrs.keys():
            print(sep+'\t','-',key,':',obj.attrs[key])

def h5dump(path,group='/'):
    """
    print HDF5 file metadata

    group: you can give a specific group, defaults to the root group
    """
    with h5py.File(path,'r') as f:
         descend_obj(f[group])

## Dump a metadata csv to reproduce the random crops

Make a metadata file from the ilasik project filenames, that can be used to reproduce the crops by using this csv as metadta file.
This file can be loaded in cellprofiler as metadata. The `crop bb` module from ImcPluginsCP (https://github.com/BodenmillerGroup/ImcPluginsCP) can use metadata as parameters to specify were to crop.

In [4]:
with h5py.File(C.fn_ilastik, 'r') as f:
    lanes = f['Input Data']['infos'].values()
    fns_training = [lane['Raw Data/nickname'][()].decode('UTF-8') for lane in lanes ]

In [5]:
def split_names(x, re_comp):
    c = re_comp
    m = c.match(x)
    g = m.groups() 
    return pd.Series({l: g[i-1] for l, i in c.groupindex.items()}, name=x.index)

In [6]:
dat_training = pd.DataFrame({V.COL_FN: fns_training})
dat_training = dat_training.join(dat_training[V.COL_FN].apply(split_names, re_comp=V.re_crop))

In [7]:
dat_training

Unnamed: 0,filename_training,basename,x,y,w,h
0,20170906_FluidigmONfinal_SE_s0_p1_r0_a0_ac_ila...,20170906_FluidigmONfinal_SE_s0_p1_r0_a0_ac_ila...,27,482,500,500
1,20170906_FluidigmONfinal_SE_s0_p1_r1_a1_ac_ila...,20170906_FluidigmONfinal_SE_s0_p1_r1_a1_ac_ila...,190,1059,500,500
2,20170906_FluidigmONfinal_SE_s0_p2_r2_a2_ac_ila...,20170906_FluidigmONfinal_SE_s0_p2_r2_a2_ac_ila...,101,240,500,500
3,20170906_FluidigmONfinal_SE_s0_p2_r3_a3_ac_ila...,20170906_FluidigmONfinal_SE_s0_p2_r3_a3_ac_ila...,373,915,500,500
4,20170906_FluidigmONfinal_SE_s0_p3_r6_a6_ac_ila...,20170906_FluidigmONfinal_SE_s0_p3_r6_a6_ac_ila...,198,1037,500,500
5,20170906_FluidigmONfinal_SE_s0_p3_r7_a7_ac_ila...,20170906_FluidigmONfinal_SE_s0_p3_r7_a7_ac_ila...,848,742,500,500
6,20170906_FluidigmONfinal_SE_s0_p4_r4_a4_ac_ila...,20170906_FluidigmONfinal_SE_s0_p4_r4_a4_ac_ila...,394,52,500,500
7,20170906_FluidigmONfinal_SE_s0_p4_r5_a5_ac_ila...,20170906_FluidigmONfinal_SE_s0_p4_r5_a5_ac_ila...,860,0,500,500


In [8]:
dat_training.to_csv(C.fn_crop_list, index=False)

## Extract ilastik training labels

These labels will be saved in the label output folder as a `.npy` array.

This is usefull e.g. to combine classifiers.


In [18]:
with h5py.File(C.fn_ilastik, 'r') as f:
    labels = f['/PixelClassification/LabelSets']
    lanes = f['Input Data']['infos']
    for label, lane in zip(labels.values(),lanes.values()):
        name = lane['Raw Data/nickname'][()].decode('UTF-8')
        for val in label.values():
            print(name)
            np.save(C.fol_out_labels / (name+V.SUFFIX_LABEL), val[:])

20170906_FluidigmONfinal_SE_s0_p1_r0_a0_ac_ilastik_s2_x27_y482_w500_h500
20170906_FluidigmONfinal_SE_s0_p1_r0_a0_ac_ilastik_s2_x27_y482_w500_h500
20170906_FluidigmONfinal_SE_s0_p1_r1_a1_ac_ilastik_s2_x190_y1059_w500_h500
20170906_FluidigmONfinal_SE_s0_p2_r2_a2_ac_ilastik_s2_x101_y240_w500_h500
20170906_FluidigmONfinal_SE_s0_p2_r2_a2_ac_ilastik_s2_x101_y240_w500_h500
20170906_FluidigmONfinal_SE_s0_p2_r2_a2_ac_ilastik_s2_x101_y240_w500_h500
20170906_FluidigmONfinal_SE_s0_p2_r2_a2_ac_ilastik_s2_x101_y240_w500_h500


# Environment

In [19]:
import sys
!conda env export -p {sys.prefix}

name: null
channels:
  - r
  - bioconda
  - pyviz
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=1_llvm
  - backcall=0.1.0=py_0
  - blas=2.16=openblas
  - ca-certificates=2019.11.28=hecc5488_0
  - certifi=2019.11.28=py37hc8dfbb8_1
  - decorator=4.4.2=py_0
  - entrypoints=0.3=py37hc8dfbb8_1001
  - h5py=2.10.0=nompi_py37h513d04c_102
  - hdf5=1.10.5=nompi_h3c11f04_1104
  - ipykernel=5.2.0=py37h43977f1_0
  - ipython=7.13.0=py37hc8dfbb8_2
  - ipython_genutils=0.2.0=py_1
  - jedi=0.16.0=py37hc8dfbb8_1
  - jupyter_client=6.1.0=py_0
  - jupyter_core=4.6.3=py37hc8dfbb8_1
  - ld_impl_linux-64=2.34=h53a641e_0
  - libblas=3.8.0=16_openblas
  - libcblas=3.8.0=16_openblas
  - libffi=3.2.1=he1b5a44_1007
  - libgcc-ng=9.2.0=h24d8f2e_2
  - libgfortran-ng=7.3.0=hdf63c60_5
  - liblapack=3.8.0=16_openblas
  - liblapacke=3.8.0=16_openblas
  - libopenblas=0.3.9=h5ec1e0e_0
  - libsodium=1.0.17=h516909a_0
  - libstdcxx-ng=9.2.0=hdf63c60_2
  - llvm-openmp=9.0