# XNAT database extractor
This notebook walks through all objects in an XNAT database to extract all possible fields and all of their values. It does not just extract unique values but the whole database, so expect it to take a while and to need quite some storage space on big databases.

Please edit login.cfg with your credentials before executing this script.

Before (re-)running this script, please clear the output, shut down and relaunch the kernel, close and reopen your browser, and then (re-)launch all the cells! Otherwise the memory is not correctly freed and you will very quickly get a MemoryError (this is a bug in ipywidgets or Jupyter notebook).

### Init and helper functions

In [None]:
#
# Creation: 08/2017 by Stephen Larroque
#
%load_ext autoreload
%autoreload 2

import os
import lxml
import xml.etree.ElementTree as ET
import pyxnat
import re
import traceback

from time import gmtime, strftime
from libs.tqdm import tqdm_notebook

try:
    import cPickle as pickle
except ImportError as exc:
    import pickle

#try:
import ujson as json # fast json lib
#except ImportError:
#    import json # native json lib

# For out-of-core computing (ie, to store the dict on disk and thus avoid MemoryOverflow error)
from fdict import sfdict

In [None]:
#### HELPER FUNCTIONS
from copy import deepcopy
from libs.xmlpp import get_pprint as xml_pprint
def get_raw_xml(elements_list):
    '''Get the source xml of a list of lxml elements or pyxnat objects

    elements_list: a single item or a list of items. Anything that is not a
        list is wrapped into a one-item list.

    Returns one string made of a '=== Element <index>' header per item followed by:
      - the pretty-printed XML for an lxml element (comments stripped),
      - the result of .get() for a pyxnat EObject,
      - repr(item) for any other type.
    An empty list returns an empty string.
    '''
    # Convert to a list of elements if it's a single element (to ease looping)
    if not isinstance(elements_list, list):
        elements_list = [elements_list]

    out = ''
    for i, element in enumerate(elements_list):
        out += '\n=== Element %i\n' % i
        # If this is an XML element
        if isinstance(element, lxml.etree._Element):
            # Make a copy of the element because we will modify it
            e = deepcopy(element)
            # Strip comments, else lxml does not know how to print the XML
            lxml.etree.strip_tags(e, lxml.etree.Comment)
            # Add the XML of this element to the output
            out += xml_pprint(lxml.etree.tostring(e, pretty_print=True))
        # pyxnat object, we just fetch the xml from the server
        # (elif, not if: otherwise an lxml element would also fall into the
        # final else branch and get its repr() appended after its XML)
        elif isinstance(element, pyxnat.core.resources.EObject):
            out += element.get()
        # Print differently if this is any other type
        else:
            out += repr(element)
    return out

def pprint_xml(obj):
    '''Print the pretty-printed XML source of obj (a single item or a list, see get_raw_xml)'''
    raw_xml = get_raw_xml(obj)
    pretty = xml_pprint(raw_xml)
    print(pretty)

#### HELPER GLOBALS
# XNAT namespace (to use with lxml xpath queries)
# Mapping of namespace prefix -> namespace URI
xnatns = {
    # nrg.wustl.edu namespaces
    'arc': 'http://nrg.wustl.edu/arc',
    'cat': 'http://nrg.wustl.edu/catalog',
    'ext': 'http://nrg.wustl.edu/ext',
    'pipe': 'http://nrg.wustl.edu/pipe',
    'scr': 'http://nrg.wustl.edu/scr',
    'val': 'http://nrg.wustl.edu/val',
    'wrk': 'http://nrg.wustl.edu/workflow',
    'xdat': 'http://nrg.wustl.edu/security',
    'xnat': 'http://nrg.wustl.edu/xnat',
    'xnat_a': 'http://nrg.wustl.edu/xnat_assessments',
    # Other namespaces
    'prov': 'http://www.nbirn.net/prov',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

### Connection to XNAT

In [None]:
# Loading login infos
# login.cfg is looked up in the current working directory and parsed as JSON;
# it must provide the 'username' and 'password' keys read below.
cfgpath = os.path.join(os.getcwd(), 'login.cfg')
with open(cfgpath) as f:
    login_infos = json.load(f)

# Connect to XNAT db
# NOTE(review): the server URL is plain http, so the credentials are presumably sent unencrypted - confirm this is acceptable.
central = pyxnat.Interface(server="http://tbixnat.incf.org:8080", user=login_infos['username'], password=login_infos['password'], cachedir='/tmp')
# Add schemas (allows to use .attrs() to get list of attributes)
central.manage.schemas.add('xnat/xnat.xsd')

# Get list of all centers (the projects collection is what the later cells walk through)
centers = central.select.projects()
print(centers.get())

# Select center (constraining to one center for the moment)
# TODO: loop over all centers
#cULgData_Liege_project = central.select.project('LIE')

# Show structure of project
central.inspect.structure()

In [None]:
# Count the projects by iterating over the whole collection (one unit per item)
count = sum(1 for _ in centers)
print('Total number of centers/projects: %i' % count)

In [None]:
# Quick smoke test: fetch the XML of the first project.
# get_raw_xml always emits at least the '=== Element' header for a single object,
# so this assert only fails if fetching/serializing raises.
obj = centers[0]
assert get_raw_xml(obj)

In [None]:
def walkthrough_data(xnatobjlist, namespaces=None, outofcore=False, firstonly=False, level=0, progress_maxlevel=2, initial=None, debug=False):
    '''Recursively walk XNAT objects and extract their attributes, subelements and children into nested dicts.

    xnatobjlist: iterable of objects to walk (an object without __iter__ is wrapped into a one-item list).
        It is iterated twice (once to count, once to process).
    namespaces: optional dict of prefix -> namespace URI, used to replace '{uri}' in attribute names by 'prefix:'.
    outofcore: if True, the level 0 container is a sfdict stored in 'xnat_db_extractor.db' in the current
        working directory instead of an in-memory dict. Not forwarded to recursive calls.
    firstonly: if truthy, maximum number of items processed at each level.
    level: recursion depth, 0 for the initial call.
    progress_maxlevel: a tqdm_notebook progress bar is shown only for levels strictly below this value
        (a value <= 0 disables progress bars).
    initial: state to resume from (only used at level 0): either a dict with the keys 'object' and 'lastproject'
        as saved by the exception handler below, or the filename of a pickle of such a dict.
    debug: if True, print each object before processing it.

    Returns, at level 0, the top container holding the extracted data under the 'projects' key;
    at deeper levels, the dict built for the current list of objects (keyed by label, or by index when
    the object has no label).

    On any exception (including KeyboardInterrupt) at level 0, the current state is pickled to
    'xnat_data_extractor_dump_<UTC time>.pickle' and 'dbdata_lastdump.pickle', then the exception is re-raised.

    Known limitation: sibling subelements sharing the same tag name overwrite each other in the output.
    '''
    if not hasattr(xnatobjlist, '__iter__'):
        xnatobjlist = [xnatobjlist]
    # Initialization of the out object
    passuntilproject = None
    if initial is not None and level == 0:
        # Restart from previous state (ie, after a bug etc)
        if isinstance(initial, str):
            # WARNING: unpickling can execute arbitrary code, only load dump files you trust
            with open(initial, 'rb') as f:
                initial = pickle.load(f)
        topshelf = initial['object']
        out = topshelf['projects']
        passuntilproject = initial['lastproject']
    elif level == 0:
        # Initialization from scratch
        if outofcore:
            # Out of core computing using fdict/shelve to store on disk rather than in-memory (to avoid MemoryOverflow error)
            topshelf = sfdict(filename=os.path.join(os.getcwd(), 'xnat_db_extractor.db'), nodel=True)
        else:
            # Store in memory: fake a topshelf to be compatible with out-of-core
            topshelf = {}
        topshelf['projects'] = {}
        out = topshelf['projects']
    else:
        # Deeper levels always build a plain in-memory dict
        out = {}

    # prepare namespaces for attribute search: '{uri}' -> 'prefix:'
    if namespaces:
        namespaces_filt = dict()
        for prefix, uri in namespaces.items():
            namespaces_filt['{'+uri+'}'] = prefix+':'

    # prepare for progress display
    # count
    count = sum(1 for _ in xnatobjlist) # count total number of items (to predict time and display progress)
    if firstonly and count > firstonly:
        count = firstonly
    # current object type name (only used as the progress bar description)
    curtype = ''
    try:
        if hasattr(xnatobjlist, 'tag') and isinstance(xnatobjlist.tag, str):
            curtype = re.sub('{.*}', '', xnatobjlist.tag)
        else:
            typename = str(type(xnatobjlist[0]))
            curtype = typename[typename.rfind('.')+1:typename.rfind("'")]
    except (StopIteration, IndexError):
        # Empty collection (a plain empty list raises IndexError): no type name to show
        curtype = ''

    # Limit the progress display, because there is currently a memory leak of ipywidgets, old widgets stay in memory
    if progress_maxlevel > 0 and level < progress_maxlevel:
        xnatobjlist_iterator = tqdm_notebook(xnatobjlist, total=count, desc=curtype, position=level, leave=False)
    else:
        xnatobjlist_iterator = xnatobjlist

    i = 0
    lastproj = None
    # Main loop: for each item in the list
    try:
        for obj in xnatobjlist_iterator:
            # Continue to the next object?
            if firstonly:
                if i >= firstonly:
                    break
            # Debug print
            if debug: print(obj)
            # Find label to set as entries of the dict
            if hasattr(obj, 'label'):
                # Use label as key to access this object if available
                objlabel = obj.label()
                if level == 0:
                    lastproj = objlabel
            else:
                # Else we just use a number
                objlabel = str(i)
            # Skip if we want to restart on a specific project.
            # This must happen BEFORE the entry is (re)created below, else the data
            # already extracted for the previous projects would be overwritten by a stub.
            if passuntilproject is not None and level == 0:
                if lastproj != passuntilproject:
                    i += 1
                    continue
                else:
                    # We reached the last project, we disable the "continue" flag
                    passuntilproject = None
            out[objlabel] = {}
            outelt = out[objlabel]
            # Add label, id, datatype and content
            outelt['label'] = objlabel
            if hasattr(obj, 'id'):
                outelt['id'] = obj.id()
            if hasattr(obj, 'datatype'):
                outelt['datatype'] = obj.datatype()
            if hasattr(obj, 'text') and obj.text:
                outelt['text'] = obj.text.strip()
            # Only if not a resource nor a file, else it's only about files so no xml content
            if not isinstance(obj, (pyxnat.core.resources.Resource, pyxnat.core.resources.File)):
                # Attributes
                attrs = obj.xpath('@*')
                for attr in attrs:
                    # Get attribute's xml name and value
                    attrname = attr.attrname
                    if namespaces: # if namespaces is provided, we can use that to replace the prefix (else attributes don't provide the prefix, only subelements do)
                        for key, val in namespaces_filt.items():
                            attrname = attrname.replace(key, val)
                    attrval = str(attr)
                    # Add this value
                    attrnamedict = '@'+attrname
                    outelt[attrnamedict] = attrval
                # Subelements
                subelts = obj.xpath('*')
                for subelt in subelts:
                    # Get subelement's xml name
                    if subelt.prefix:
                        prefix = (subelt.prefix + ':')
                    else:
                        prefix = ''
                    subeltname = prefix + re.sub('{.*}', '', subelt.tag)
                    # Get subelement's value
                    if hasattr(subelt, 'text'):
                        subeltval = subelt.text
                    else:
                        subeltval = str(subelt)
                    if subeltval: # remove useless chars at the start and end
                        subeltval = subeltval.strip()
                    # Recursive call if it has children
                    if subelt.getchildren() or subelt.xpath('@*') or (subeltname in outelt and isinstance(outelt[subeltname], dict)): # subelt.getchildren() == subelt.xpath('*')
                        # Create a dict for this subelement
                        # NOTE(review): this tests membership in out (the parent level) rather than outelt;
                        # the entry is reassigned below in any case - verify the intent before changing it
                        if not subeltname in out:
                            outelt[subeltname] = {}
                        # A non-empty text value conflicts with the dict we would build: we still add the value.
                        # Whitespace-only text (stripped to '' above) is NOT a value: without the truthiness test
                        # such elements would be stored as '' and never be walked.
                        if subeltval and isinstance(subeltval, (str, list, set)):
                            cval = subeltval
                        else:
                            cval = walkthrough_data([subelt], namespaces=namespaces, firstonly=firstonly, level=level+1, progress_maxlevel=progress_maxlevel, debug=debug)
                        # Merge with our dict
                        outelt[subeltname] = cval
                    else:
                        # Add this value since it is a singleton
                        try:
                            outelt[subeltname] = subeltval
                        except AttributeError as exc:
                            print(subeltname, subeltval, type(outelt[subeltname]), outelt[subeltname])
                            raise
            # Children
            if hasattr(obj, 'children'):
                for childname in obj.children():
                    # Call the method to retrieve children from child name
                    child = getattr(obj, childname)()
                    # Recursive call
                    cres = walkthrough_data(child, namespaces=namespaces, firstonly=firstonly, level=level+1, progress_maxlevel=progress_maxlevel, debug=debug)
                    # Store the children under their collection name
                    outelt[childname] = cres
            # Flush to disk to unload from memory. outofcore is not forwarded to the recursive calls,
            # so in practice this runs once per top-level (project) iteration.
            if outofcore and level <= 1:
                topshelf.sync()
            # Increment counter
            i += 1

        # Return the dict for this level
        if level == 0:
            return topshelf
        else:
            return out
    except (Exception, KeyboardInterrupt) as exc:
        if level == 0:
            # Level is 0, we save the current state before stopping
            curstate = {'lastproject': lastproj, 'object': topshelf, 'exception': exc, 'trace': traceback.format_exc()}
            curtime = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
            dumpfilename = 'xnat_data_extractor_dump_%s.pickle' % curtime
            with open(dumpfilename, 'wb') as f:
                pickle.dump(curstate, f)
            with open('dbdata_lastdump.pickle', 'wb') as f2:
                pickle.dump(curstate, f2)
            print('Dump saved in dbdata_lastdump.pickle and %s' % dumpfilename)
        # Propagate the exception in any case
        raise
# If you modify the functions above, please restart kernel and clear output before relaunching a walkthrough_data, else the code changes might not take effect.


In [None]:
reload_last = 'dbdata_lastdump.pickle' # if you want to restart from a previous state, put here the dump filename as a string
# NOTE: whenever this file exists the extraction resumes from it; delete or rename it to force a run from scratch.
if not os.path.isfile(reload_last):
    print('The supplied reload dump file does not exist! Start from scratch...')
    reload_last = None
# The value returned at level 0 is the top container holding the data under the 'projects' key
out = walkthrough_data(centers, namespaces=xnatns, firstonly=2, outofcore=True, progress_maxlevel=3, initial=reload_last, debug=False)
print('All Done!')
# TIP: when testing on your database, do a first run with a small firstonly (e.g. firstonly=2 as above: only the first 2 elements at any level are processed) just to quickly see if everything runs alright (you should also check the generated file). Then to go to production mode, set firstonly=None.
# TIP2: set progress_maxlevel=1 when using in production to minimize memory overhead but still get to see progress updates.


In [None]:
# Save the result into a json file
# (pk is only referenced by the commented-out Chest branch below)
import pickle as pk

filename = 'xnat_data_extract.json'

# A sfdict (out-of-core result) is first converted with to_dict_nested(); anything else is dumped as is
if isinstance(out, sfdict):
    with open(filename, 'w') as f:
        json.dump(out.to_dict_nested(), f, ensure_ascii=False, indent=4, sort_keys=True)
#elif isinstance(out, Chest):
#    with open(out.key_to_filename('projects'), 'rb') as e:
#        with open('db_unique_values.json', 'w') as f:
#            json.dump({'projects': pk.load(e)}, f, ensure_ascii=False, indent=4, sort_keys=True)
else:
    with open(filename, 'w') as f:
        json.dump(out, f, ensure_ascii=False, indent=4, sort_keys=True)

print('Results saved in %s.' % filename)

In [None]:
# Just a little sanity check
#try:
#    assert len(out['projects']['children']['subjects']['children']['experiments']['children']['scans']['subelements']['xnat:parameters']['subelements']['xnat:voxelRes']['attributes']['x']) > 1
#    print('Sanity check OK!')
#except AssertionError as exc:
#    print('ERROR: xnat:parameters has only 1 sample set of values, probably something went wrong (or you used firstonly=1)')
#    out['projects']['children']['subjects']['children']['experiments']['children']['scans']['subelements']['xnat:parameters']


------------------
### Exploration

In [None]:
def dict_xpath(obj, path):
    '''Follow a '/'-separated path of keys through nested containers and return the value found.

    Each path segment (empty segments included) is used as is as a key; a missing key raises
    whatever the container raises on lookup.
    '''
    node = obj
    remaining = path
    while True:
        # Peel off the next segment, keep the rest for the following iterations
        key, sep, remaining = remaining.partition('/')
        node = node[key]
        if not sep:
            # No more separator: this was the last segment
            return node

In [None]:
# Drill down into the extracted data along a '/'-separated key path.
# NOTE(review): walkthrough_data keys its entries by label / collection name / tag name; the generic
# 'children' and 'subelements' keys used here look like they match a different layout - verify.
dict_xpath(out, 'projects/children/subjects/subelements/xnat:experiments/subelements/xnat:experiment/subelements')

In [None]:
# Show the result (out is assigned by the extraction cell, or by the dump-loading cell)
out

In [None]:
# Load results from last dump (if there was a bug)
# The dump written by walkthrough_data is the state dict ('lastproject', 'object', 'exception', 'trace'):
# the extracted data itself is under the 'object' key.
# WARNING: unpickling can execute arbitrary code, only load dump files you trust.
with open('dbdata_lastdump.pickle', 'rb') as f:
    out = pickle.load(f)
out

-------------------------
### Unused code

In [None]:
from collections import OrderedDict

class SortedDict(OrderedDict):
    '''OrderedDict whose items are inserted in sorted key order, recursively.

    Accepts the same arguments as dict(): an optional mapping (or iterable of pairs)
    and/or keyword arguments. Any value that is a dict is itself converted into a SortedDict.
    Keys of a same level must be comparable with each other (sorted() raises TypeError otherwise).
    Items added after construction are appended in insertion order, as for any OrderedDict.
    '''

    def __init__(self, *args, **kwargs):
        super(SortedDict, self).__init__()

        # Collect the items the same way dict() would, so that positional mappings are supported too
        items = dict(*args, **kwargs)
        for key, value in sorted(items.items()):
            if isinstance(value, dict):
                # Pass the nested dict positionally: ** unpacking would fail on non-string keys
                self[key] = SortedDict(value)
            else:
                self[key] = value

# Build a recursively key-sorted copy of a and display it.
# NOTE(review): a must already be defined by another cell (it is not assigned above in the notebook),
# and it must be a mapping with string keys since it is unpacked as keyword arguments.
sorted_dict = SortedDict(**a)
sorted_dict

In [None]:
# Take the first direct subelement of the current object and show its text
b = obj.xpath('*')
b2 = b[0]
# text is read as an attribute (as walkthrough_data does on xpath('*') results), not called as a method
b2.text

In [None]:
# Check of the '{namespace uri}' -> 'prefix:' replacement applied to attribute names in walkthrough_data
key, val = ('{http://www.w3.org/2001/XMLSchema-instance}', 'xsi:')
s = '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'
# Evaluates to 'xsi:schemaLocation'
s.replace(key, val)

In [None]:
# Show the id of the current object (obj is assigned in an earlier cell)
obj.id()

In [None]:
# Select one specific file by chaining subject / experiment / scan / resource / file selectors on obj
g = obj.subject('CTBI_S00638').experiment('CTBI_E02818').scan('2').resource('3144').file('DTI.bval')
# The result of attributes() is not stored; only the last expression of the cell (the type) is its value
g.attributes()
type(g)

In [None]:
# Select the scans of one given experiment and keep the first one for the exploration cells below
scans = central.select.project('HEL').subject('CTBI_S00800').experiment('CTBI_E04816').scans()
scan = scans[0]

In [None]:
# Print the XML of the selected scan (the inner parentheses are redundant: a single object is passed)
pprint_xml((scan))

In [None]:
# Keep the last direct subelement of the scan
param = scan.xpath('*')[-1]

In [None]:
# List the direct children of param
param_children = param.getchildren()
param_children

In [None]:
# Same listing through xpath (walkthrough_data notes that getchildren() == xpath('*'))
param.xpath('*')

In [None]:
# Extract param with the default options (in-memory, no namespaces).
# NOTE(review): if param is itself iterable (as an XML element presumably is), walkthrough_data does not
# wrap it into a list and would iterate over its children instead - verify this is what is wanted.
walkthrough_data(param)

In [None]:
# Print the XML of param
pprint_xml(param)

In [None]:
# Alternative conversion using the bundled xmltodict, with the XNAT namespaces map.
# NOTE(review): get_raw_xml prepends a '=== Element' header line to the XML, which the parser
# may not accept - verify.
from libs.xmltodict import xmltodict
xmltodict.parse(get_raw_xml(param).encode('utf-8'), process_namespaces=True, namespaces=xnatns)

In [None]:


# Two nested dicts of sets to try a merge on
a = {'a': set([1, 2]), 'b': {1, 2}, 'c': {'d': set([1, 2])}}
b = {'a': set([1, 4]), 'b': {1, 3}, 'c': {'d': set([1, 3, 4])}}
# NOTE(review): merge is neither defined nor imported in the visible part of this notebook - it must come from elsewhere
c = merge(a, b)
c

In [None]:
# Add values in place to the sets stored in c (c comes from the merge cell) and show the result
c['a'].add(4)
c['b'].add(3)
c['b'].add(4)
c

In [None]:
# In-place union of two sets: e ends up as {1, 2, 3}
e = set([1, 2])
e.update(set([1, 3]))
e

In [None]:
# Try out the bundled SqliteDict, created on the file 'test.sqlite3' with persist=True
from libs.sqlite_object import SqliteDict
a = SqliteDict(filename='test.sqlite3', persist=True)
# Store a set, then copy the value read back under a second key
a['label'] = set([1,2,3])
a['id'] = a['label']
# NOTE: this rebinds the global a used by other scratch cells
a