# XNAT unique values extractor
This notebook walks through all objects in an XNAT database to extract all possible fields and all their possible values.
Please edit login.cfg with your credentials before executing this script.

### Init and helper functions

In [None]:
#
# Creation: 07/2017 by Stephen Larroque
#
%load_ext autoreload
%autoreload 2

import os
import lxml
import xml.etree.ElementTree as ET
import pyxnat
import re

#try:
import ujson as json # fast json lib
#except ImportError:
#    import json # native json lib

# For out-of-core computing (ie, to store the dict on disk and thus avoid MemoryOverflow error)
from libs.chest import Chest

In [None]:
#### HELPER FUNCTIONS
from copy import deepcopy
from libs.xmlpp import get_pprint as xml_pprint
def get_raw_xml(elements_list):
    '''Get the source xml of a list of lxml elements or pyxnat objects.

    Parameters
    ----------
    elements_list : object or list of objects
        A single object is wrapped into a one-item list. lxml elements are
        serialized after their comments are stripped, pyxnat EObjects
        contribute the result of their .get() method, and any other object
        is rendered with repr().

    Returns
    -------
    The concatenation of all renderings, each one preceded by a
    '=== Element <index>' header line (an empty string for an empty list).
    '''
    # Convert to a list of elements if it's a single element (to ease looping)
    if not isinstance(elements_list, list):
        elements_list = [elements_list]

    out = ''
    for i, element in enumerate(elements_list):
        out += '\n=== Element %i\n' % i
        # If this is an XML element
        if isinstance(element, lxml.etree._Element):
            # Make a copy of the element because we will modify it
            e = deepcopy(element)
            # Strip comments, else lxml does not know how to print the XML
            lxml.etree.strip_tags(e, lxml.etree.Comment)
            # Add the XML of this element to the output
            out += xml_pprint(lxml.etree.tostring(e, pretty_print=True))
        # pyxnat object, we just fetch the xml from the server
        # (elif, not if: otherwise the fallback below would also append the
        # repr() of every lxml element right after its XML)
        elif isinstance(element, pyxnat.core.resources.EObject):
            out += element.get()
        # Print differently if this is any other type
        else:
            out += repr(element)
    return out

def pprint_xml(obj):
    '''Print the pretty-printed raw XML of an object or a list of objects.'''
    # Gather the raw XML first, then run the whole text through the
    # pretty-printer before displaying it
    raw_xml = get_raw_xml(obj)
    pretty_xml = xml_pprint(raw_xml)
    print(pretty_xml)

#### HELPER GLOBALS
# Mapping of namespace prefix -> namespace URI for the XNAT schemas
# (to pass as the namespaces of lxml xpath queries)
xnatns = {
    'arc': 'http://nrg.wustl.edu/arc',
    'cat': 'http://nrg.wustl.edu/catalog',
    'ext': 'http://nrg.wustl.edu/ext',
    'pipe': 'http://nrg.wustl.edu/pipe',
    'prov': 'http://www.nbirn.net/prov',
    'scr': 'http://nrg.wustl.edu/scr',
    'val': 'http://nrg.wustl.edu/val',
    'wrk': 'http://nrg.wustl.edu/workflow',
    'xdat': 'http://nrg.wustl.edu/security',
    'xnat': 'http://nrg.wustl.edu/xnat',
    'xnat_a': 'http://nrg.wustl.edu/xnat_assessments',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

### Connection to XNAT

In [None]:
# Read the credentials from login.cfg in the current working directory
# (parsed as JSON; the 'username' and 'password' entries are used below)
cfgpath = os.path.join(os.getcwd(), 'login.cfg')
with open(cfgpath) as f:
    login_infos = json.load(f)

# Open the connection to the XNAT database
central = pyxnat.Interface(
    server="http://tbixnat.incf.org:8080",
    user=login_infos['username'],
    password=login_infos['password'],
    cachedir='/tmp'
)
# Register the XNAT schema (allows to use .attrs() to get list of attributes)
central.manage.schemas.add('xnat/xnat.xsd')

# Fetch and display the list of all centers (XNAT projects)
centers = central.select.projects()
print(centers.get())

# Select center (constraining to one center for the moment)
# TODO: loop over all centers
#cULgData_Liege_project = central.select.project('LIE')

# Display the structure of the database
central.inspect.structure()

In [None]:
# Keep the first project of the collection for interactive inspection
obj = centers[0]

In [None]:
# Count the projects by iterating over the whole collection
count = sum(1 for _ in centers)

In [None]:
# Print the raw XML of the inspected project, as assembled by get_raw_xml
print(get_raw_xml(obj))

In [None]:
from libs.tqdm import tqdm_notebook

def rec_merge(a, b, path=None, robust=False):
    """Recursively merge dict b into dict a, in place, and return a.

    Kudos to Andrew Cooke: https://stackoverflow.com/a/7205107/1121352

    For each key of b:
    * key missing from a: b's value is stored in a (by reference, no copy);
    * both values are dicts: they are merged recursively;
    * both values are sets: a's set is updated with b's set;
    * both values are equal: nothing to do;
    * anything else is a conflict, see `robust`.

    Parameters
    ----------
    a : dict
        Destination, modified in place.
    b : dict
        Source of the new values.
    path : list of str or None
        Keys traversed so far, only used to build the conflict error message.
    robust : bool
        If False, a conflict raises an Exception. If True, the conflicting
        value of b is kept in the list a[key + '_conflicts']: it is merged
        into a previous conflict of the same type when possible (dicts
        recursively, anything with an update() method through update()),
        skipped if an equal value is already recorded, else appended.

    Returns
    -------
    a, the merged dict.

    Raises
    ------
    Exception
        On a conflict when robust is False.
    """
    if path is None:
        path = []
    for key in b:
        if key not in a:
            a[key] = b[key]
            continue
        left, right = a[key], b[key]
        if isinstance(left, dict) and isinstance(right, dict):
            rec_merge(left, right, path + [str(key)], robust)
        elif isinstance(left, set) and isinstance(right, set):
            left.update(right)
        elif left == right:
            pass  # same leaf value
        elif not robust:
            # Conflict and not robust: show the offending values, then fail
            print(key, left, right, type(left), type(right))
            raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
        else:
            # Conflict and robust: record it under key+'_conflicts'
            ckey = key + '_conflicts'
            if ckey not in a:
                # Never had any conflict for this key, create the conflicts list
                a[ckey] = []
            conflicts = a[ckey]
            # Try to find a previous conflict of the same type to merge with
            for previous in conflicts:
                if type(previous) != type(right):
                    continue
                if isinstance(previous, dict):
                    rec_merge(previous, right, path + [str(ckey)], robust)
                elif hasattr(previous, 'update'):
                    previous.update(right)
                elif previous != right:
                    # Same type but not mergeable (str, int, None...) and a
                    # different value: keep looking
                    continue
                break
            else:
                # Nothing compatible was recorded yet, add a new conflict entry
                conflicts.append(right)
    return a

def walkthrough(xnatobjlist, namespaces=None, outofcore=False, firstonly=False, level=0, progress_maxlevel=2, debug=False):
    """Recursively collect the unique values found in a list of XNAT objects.

    For every object, the ids, labels, datatypes, texts, XML attributes and
    XML subelements are gathered into sets (so each value is stored once).
    The function calls itself on every subelement that has children or
    attributes, and on every collection listed by the object's children().

    Parameters
    ----------
    xnatobjlist : iterable or single object
        Objects to walk through; anything without __iter__ is wrapped in a list.
    namespaces : dict or None
        Mapping prefix -> namespace URI. When provided, the '{uri}' part of
        attribute names is replaced by 'prefix:'.
    outofcore : bool
        If True, the top-level result is stored in a Chest created in the
        current working directory and flushed after each top-level object.
    firstonly : int or False
        If truthy, at most that many objects are processed at each level.
    level : int
        Recursion depth, 0 for the initial call.
    progress_maxlevel : int
        Progress bars are displayed only for levels strictly below this
        value; <= 0 disables them entirely.
    debug : bool
        If True, print every visited object.

    Returns
    -------
    At level 0, the container (dict or Chest) whose 'projects' entry holds the
    result; at deeper levels, the result dict itself. A result dict has the
    keys 'id', 'label', 'datatype', 'text' (sets of values), 'attributes'
    (name -> set of values), 'subelements' (name -> set of text values for
    leaves, nested result dict otherwise) and 'children' (collection name ->
    nested result dict). When a name is seen with both shapes, rec_merge
    stores the later ones under name + '_conflicts'.
    """
    if not hasattr(xnatobjlist, '__iter__'):
        xnatobjlist = [xnatobjlist]
    # Out of core computing using chest to store on disk rather than in-memory (to avoid MemoryOverflow error)
    if outofcore and level == 0:
        chest = Chest(path=os.path.join(os.getcwd(), 'uniquevalschest_level%i' % level))
        chest['projects'] = {}
        out = chest['projects']
        out['id'] = set()
        out['label'] = set()
        out['datatype'] = set()
        out['text'] = set()
        out['attributes'] = {}
        out['subelements'] = {}
        out['children'] = {}
    else:
        if level == 0:
            chest = {}
            chest['projects'] = {}
            out = chest['projects']
        else:
            out = {}
        out.update({'id': set(),
                    'label': set(),
                    'datatype': set(),
                    'text': set(),
                    'attributes': {},
                    'subelements': {},
                    'children': {}
                   })
    # prepare namespaces for attribute search: '{uri}' -> 'prefix:'
    if namespaces:
        namespaces_filt = dict()
        for prefix, uri in namespaces.items():
            namespaces_filt['{'+uri+'}'] = prefix+':'

    # prepare for progress display
    # count total number of items (to predict time and display progress)
    count = sum(1 for _ in xnatobjlist)
    if firstonly and count > firstonly:
        count = firstonly
    # current object type name (used as the progress bar description)
    try:
        if hasattr(xnatobjlist, 'tag') and isinstance(xnatobjlist.tag, str):
            curtype = re.sub('{.*}', '', xnatobjlist.tag)
        else:
            obj = str(type(xnatobjlist[0]))
            curtype = obj[obj.rfind('.')+1:obj.rfind("'")]
    except (StopIteration, IndexError):
        # Empty collection (a plain empty list raises IndexError): no type name
        curtype = ''

    # Limit the progress display, because there is currently a memory leak of ipywidgets, old widgets stay in memory
    if progress_maxlevel > 0 and level < progress_maxlevel:
        xnatobjlist_iterator = tqdm_notebook(xnatobjlist, total=count, desc=curtype, position=level, leave=False)
    else:
        xnatobjlist_iterator = xnatobjlist

    # Main loop: for each item in the list
    for i, obj in enumerate(xnatobjlist_iterator):
        # Stop once firstonly objects have been processed
        if firstonly and i >= firstonly:
            break
        # Debug print
        if debug: print(obj)
        # add id and label
        if hasattr(obj, 'id'):
            out['id'].add(obj.id())
        if hasattr(obj, 'label'):
            out['label'].add(obj.label())
        if hasattr(obj, 'datatype'):
            out['datatype'].add(obj.datatype())
        if hasattr(obj, 'text') and obj.text:
            out['text'].add(obj.text.strip())
        # Only if not a resource nor a file, else it's only about files so no xml content
        if not isinstance(obj, (pyxnat.core.resources.Resource, pyxnat.core.resources.File)):
            # Attributes
            for attr in obj.xpath('@*'):
                # Get attribute's xml name and value
                attrname = attr.attrname
                if namespaces: # if namespaces is provided, we can use that to replace the prefix (else attributes don't provide the prefix, only subelements do)
                    for key, val in namespaces_filt.items():
                        attrname = attrname.replace(key, val)
                attrval = str(attr)
                # Create a unique set for this attribute
                if attrname not in out['attributes']:
                    out['attributes'][attrname] = set()
                # Add this value (the set will make sure it is unique)
                out['attributes'][attrname].add(attrval)
            # Subelements
            for subelt in obj.xpath('*'):
                # Get subelement's xml name
                if subelt.prefix:
                    prefix = (subelt.prefix + ':')
                else:
                    prefix = ''
                subeltname = prefix + re.sub('{.*}', '', subelt.tag)
                # Get subelement's value
                if hasattr(subelt, 'text'):
                    subeltval = subelt.text
                else:
                    subeltval = str(subelt)
                if subeltval: # remove useless chars at the start and end
                    subeltval = subeltval.strip()
                # Recursive call if it has children
                if subelt.getchildren() or subelt.xpath('@*'): # subelt.getchildren() == subelt.xpath('*')
                    newval = walkthrough([subelt], namespaces=namespaces, firstonly=firstonly, level=level+1, progress_maxlevel=progress_maxlevel, debug=debug)
                else:
                    # Leaf: a set holding its text (the set will make sure values stay unique)
                    newval = set([subeltval])
                # Merge at the parent level, so that a name seen both as a leaf (set)
                # and with children (dict) goes through rec_merge's conflict handling
                # instead of crashing on a set/dict mismatch
                rec_merge(out['subelements'], {subeltname: newval}, robust=True)
        # Children
        if hasattr(obj, 'children'):
            for childname in obj.children():
                # Call the method to retrieve children from child name
                child = getattr(obj, childname)()
                # Recursive call
                cres = walkthrough(child, namespaces=namespaces, firstonly=firstonly, level=level+1, progress_maxlevel=progress_maxlevel, debug=debug)
                # Merge with our dict
                if childname not in out['children']:
                    out['children'][childname] = {}
                rec_merge(out['children'][childname], cres, robust=True)
        # Flush to disk
        if outofcore and level == 0:
            chest.flush()
    if level == 0:
        return chest
    else:
        return out


In [None]:
# Run the extraction over all centers, limited to the first 2 objects at each level
out = walkthrough(centers, namespaces=xnatns, firstonly=2, outofcore=False, debug=False)
print('All Done!')
# TODO: MAYBE: if we get memory errors again, note that chest, shelve and similar dbs are useless here: they all pickle, so they only work on the first level and any 2nd level will need to be loaded fully in memory. Also chest cannot store changed values (hence update not working).
# So we need to flatten the dict: create a custom dict that uses shelve internally, that does a proper assignment on .append (by using setitem instead), and that uses xpath-like paths 'item1/item2' instead of ['item1']['item2'] to flatten the whole dict and allow easy insertion in shelve or any other db like hdf5 etc.
# for inspiration to flatten dict, see:
# * https://github.com/gmr/flatdict
# * https://github.com/bunbun/nested-dict
# for db insertion, see:
# * shelve
# * sqlite_object
# *hdf5pys

In [None]:
# Just a little sanity check: more than one distinct 'x' voxel resolution
# should have been collected from the scans parameters.
# Explicit tests are used instead of assert (which is stripped under -O), and
# a missing key is reported instead of raising an uncaught KeyError.
try:
    scan_parameters = out['projects']['children']['subjects']['children']['experiments']['children']['scans']['subelements']['xnat:parameters']
    voxelres_x = scan_parameters['subelements']['xnat:voxelRes']['attributes']['x']
except KeyError as exc:
    print('ERROR: sanity check failed, key %s is missing from the result' % exc)
else:
    if len(voxelres_x) > 1:
        print('Sanity check OK!')
    else:
        print('ERROR: xnat:parameters has only 1 sample set of values, probably something went wrong (or you used firstonly=1)')
        # Show what was collected (a bare expression is not displayed from inside a block)
        print(scan_parameters)


In [None]:
# Dump the result to db_unique_values.json
import pickle as pk
if not isinstance(out, Chest):
    # In-memory result: serialize it directly
    with open('db_unique_values.json', 'w') as f:
        json.dump(out, f, ensure_ascii=False, indent=4, sort_keys=True)
else:
    # Chest result: unpickle the file chest uses for the 'projects' key and
    # serialize its content under the same key
    with open(out.key_to_filename('projects'), 'rb') as e, open('db_unique_values.json', 'w') as f:
        json.dump({'projects': pk.load(e)}, f, ensure_ascii=False, indent=4, sort_keys=True)

In [None]:
def dict_xpath(obj, path):
    """Return the value reached in nested containers by a '/'-separated path.

    Each segment of path is used in turn as a subscript, starting from obj:
    dict_xpath(d, 'a/b') is d['a']['b']. A failing lookup raises whatever the
    subscripted container raises (KeyError for dicts).
    """
    node = obj
    remaining = path
    while True:
        # Peel the next segment off the front of the path
        segment, separator, remaining = remaining.partition('/')
        node = node[segment]
        if not separator:
            # No more '/' left: that was the last segment
            return node

In [None]:
# Look up a nested entry of the result with a '/'-separated path
dict_xpath(out, 'projects/children/subjects/subelements/xnat:experiments/subelements/xnat:experiment/subelements')

In [None]:
# Show the whole result (the notebook renders the value of a trailing bare expression)
out

-------------------------
### Unused code

In [None]:
from collections import OrderedDict

class SortedDict(OrderedDict):
    """OrderedDict whose items are inserted in sorted key order, recursively.

    The constructor accepts the same arguments as dict(): an optional mapping
    or iterable of pairs, and/or keyword arguments. Every value that is a
    dict is converted to a SortedDict as well.

    A positional mapping is accepted (and used for nested dicts) so that keys
    which are not strings are supported: those cannot be passed as keyword
    arguments.
    """

    def __init__(self, *args, **kwargs):
        super(SortedDict, self).__init__()

        # Let dict() do the argument handling, then insert in sorted key order
        items = dict(*args, **kwargs)
        for key in sorted(items):
            value = items[key]
            if isinstance(value, dict):
                self[key] = SortedDict(value)
            else:
                self[key] = value

# NOTE(review): `a` is not assigned anywhere above this cell; presumably it is
# one of the scratch dicts assigned in later cells — confirm before running
sorted_dict = SortedDict(**a)
sorted_dict

In [None]:
# Scratch: look at the text of the first subelement of obj
b = obj.xpath('*')
b2 = b[0]
# text is read as a plain attribute (as walkthrough does with subelt.text),
# calling it would fail
b2.text

In [None]:
# Scratch: check that str.replace swaps a '{uri}' namespace for its 'prefix:'
# form (the substitution walkthrough applies to attribute names)
key, val = ('{http://www.w3.org/2001/XMLSchema-instance}', 'xsi:')
s = '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'
s.replace(key, val)

In [None]:
# Scratch: call id() on the object held in obj
obj.id()

In [None]:
# Scratch: drill down to a single file through the
# subject/experiment/scan/resource chain, then inspect it
g = obj.subject('CTBI_S00638').experiment('CTBI_E02818').scan('2').resource('3144').file('DTI.bval')
g.attributes()
type(g)

In [None]:
# Scratch: select the scans of one experiment and keep the first one
scans = central.select.project('HEL').subject('CTBI_S00800').experiment('CTBI_E04816').scans()
scan = scans[0]

In [None]:
# Scratch: pretty-print the XML of the object held in scan
pprint_xml((scan))

In [None]:
# Scratch: keep the last subelement of the scan
# (presumably its xnat:parameters element — TODO confirm)
param = scan.xpath('*')[-1]

In [None]:
# Scratch: list the direct children of param
param_children = param.getchildren()
param_children

In [None]:
# Scratch: the same listing through xpath (walkthrough notes that
# getchildren() == xpath('*'))
param.xpath('*')

In [None]:
# Scratch: run the extraction on param alone
walkthrough(param)

In [None]:
# Scratch: pretty-print the XML of param
pprint_xml(param)

In [None]:
# Scratch: parse the raw XML text of param with xmltodict, handing it the
# xnatns mapping for namespace processing
from libs.xmltodict import xmltodict
xmltodict.parse(get_raw_xml(param).encode('utf-8'), process_namespaces=True, namespaces=xnatns)

In [None]:


# Scratch: merge two small nested dicts of sets.
# The merge function of this notebook is rec_merge (no `merge` is defined);
# it merges b into a in place and returns a, so c is the same object as a.
a = {'a': set([1, 2]), 'b': {1, 2}, 'c': {'d': set([1, 2])}}
b = {'a': set([1, 4]), 'b': {1, 3}, 'c': {'d': set([1, 3, 4])}}
c = rec_merge(a, b)
c

In [None]:
# Scratch: add values to the sets stored in c, then display it
c['a'].add(4)
c['b'].add(3)
c['b'].add(4)
c

In [None]:
# Scratch: update a set with another overlapping set, then display it
e = set([1, 2])
e.update(set([1, 3]))
e

In [None]:
# Scratch: try storing sets in a SqliteDict backed by test.sqlite3,
# including an entry read back from the store and assigned to another key
from libs.sqlite_object import SqliteDict
a = SqliteDict(filename='test.sqlite3', persist=True)
a['label'] = set([1,2,3])
a['id'] = a['label']
a