# XNAT nifti downloader
Download the scans (NIFTI images) of all the subjects listed in a CSV file.

Please edit login.cfg with your credentials before executing this script.

Before (re-)running this script, please clear the output, shut down and relaunch the kernel, close and reopen your browser, and then (re-)run all the cells! Otherwise the memory is not correctly freed (this is a bug in ipywidgets or Jupyter Notebook).


In [None]:
%load_ext autoreload
%autoreload 2
# BEWARE: autoreload works on functions and on general code, but NOT on new class methods:
# if you add or change the name of a method, you have to reload the kernel!
# also it will fail if you use super() calls in the classes you change

# Profilers:
# http://pynash.org/2013/03/06/timing-and-profiling/
# http://mortada.net/easily-profile-python-code-in-jupyter.html
# use %lprun -m module func(*args, **kwargs)
try:
    %load_ext line_profiler
    %load_ext memory_profiler
    from fdict import fdict
except ImportError as exc:
    pass

import json
import pandas as pd
import os
import pyxnat
from tqdm import tqdm_notebook

# Set up some display options for pandas.
# The fully qualified 'display.*' keys are used on purpose: short names are
# resolved by pattern matching and become ambiguous as soon as another option
# with the same suffix exists.
pd.set_option('display.max_columns', 400)
pd.set_option('display.expand_frame_repr', False)

## Parameters

In [None]:
# Edit the filepath to the csv file with the subjects you want to download here
csv_filepath = 'xnat_data_extract_rs-fMRI-only.csv'
# Edit the subjects and projects column names
subjectcol = 'subject.id'
projectcol = 'project.id'
# Edit the folder path where the NIFTI images will be saved
#nifti_path = os.path.join(os.getcwd(), 'niftis')
# Raw string, so that the Windows backslash can never be read as an escape sequence
nifti_path = r'F:\ctbi_rest_niftis'
# Download each NIFTI file separately (True) or directly the whole experiment as a zip file (False)?
# The latter is faster and you don't risk missing any file.
dlmanualmode = False
# Max retries to download nifti files before failing
maxretries = 10

## Auxiliary functions

In [None]:
#### HELPER FUNCTIONS
from copy import deepcopy
from libs.xmlpp import get_pprint as xml_pprint
def get_raw_xml(elements_list):
    '''Get the source xml of a list of lxml elements or pyxnat objects

    elements_list may be a single object or a list of objects. Each entry is
    rendered under a "=== Element <index>" header:
      * lxml elements are copied, stripped of their comments and pretty-printed,
      * pyxnat EObject instances are rendered with the result of their .get(),
      * anything else is rendered with repr().

    Returns the concatenation of all rendered entries as one string.
    '''
    # Imported here because nothing else in this file brings lxml into scope
    import lxml.etree

    # Convert to a list of elements if it's a single element (to ease looping)
    if not isinstance(elements_list, list):
        elements_list = [elements_list]

    parts = []
    for i, element in enumerate(elements_list):
        parts.append('\n=== Element %i\n' % i)
        if isinstance(element, lxml.etree._Element):
            # Work on a copy because stripping the comments modifies the element
            e = deepcopy(element)
            # Strip comments, else lxml does not know how to print the XML
            lxml.etree.strip_tags(e, lxml.etree.Comment)
            # NOTE(review): tostring() returns bytes on Python 3 -- confirm that
            # xml_pprint accepts them
            parts.append(xml_pprint(lxml.etree.tostring(e, pretty_print=True)))
        elif isinstance(element, pyxnat.core.resources.EObject):
            # pyxnat object, we just fetch the xml from the server
            parts.append(element.get())
        else:
            # Any other type: fall back on its repr (exactly one branch per
            # element, so an XML element is not followed by its repr anymore)
            parts.append(repr(element))
    return ''.join(parts)

def pprint_xml(obj):
    '''Print the pretty-printed source xml of obj (one object or a list of objects)'''
    raw_xml = get_raw_xml(obj)
    pretty_xml = xml_pprint(raw_xml)
    print(pretty_xml)

#### HELPER GLOBALS
# Mapping of namespace prefix -> namespace URI for XNAT documents
# (pass it as the namespaces argument of lxml xpath queries)
xnatns = {
    'arc': 'http://nrg.wustl.edu/arc',
    'cat': 'http://nrg.wustl.edu/catalog',
    'ext': 'http://nrg.wustl.edu/ext',
    'pipe': 'http://nrg.wustl.edu/pipe',
    'prov': 'http://www.nbirn.net/prov',
    'scr': 'http://nrg.wustl.edu/scr',
    'val': 'http://nrg.wustl.edu/val',
    'wrk': 'http://nrg.wustl.edu/workflow',
    'xdat': 'http://nrg.wustl.edu/security',
    'xnat': 'http://nrg.wustl.edu/xnat',
    'xnat_a': 'http://nrg.wustl.edu/xnat_assessments',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

## Connect to XNAT server

In [None]:
def xnat_connect(server="http://tbixnat.incf.org:8080", cfgpath=None, cachedir='/tmp'):
    '''Open a pyxnat connection and list the projects (centers) of the server

    server: URL of the XNAT server to connect to.
    cfgpath: path to a JSON file holding the 'username' and 'password' keys.
        Defaults to 'login.cfg' in the current working directory.
    cachedir: directory handed to pyxnat as its cache directory.

    Returns a (central, centers) tuple: the pyxnat Interface and the result of
    central.select.projects().

    Raises IOError/OSError if the config file cannot be opened, and KeyError if
    it lacks the 'username' or 'password' key.
    '''
    # Loading login infos
    if cfgpath is None:
        cfgpath = os.path.join(os.getcwd(), 'login.cfg')
    with open(cfgpath) as f:
        login_infos = json.load(f)

    # Connect to XNAT db
    central = pyxnat.Interface(
        server=server,
        user=login_infos['username'],
        password=login_infos['password'],
        cachedir=cachedir,
    )
    # Add schemas (allows to use .attrs() to get list of attributes)
    central.manage.schemas.add('xnat/xnat.xsd')

    # Get list of all centers
    centers = central.select.projects()
    return central, centers

# TODO: loop over all centers (for the moment the selection below is disabled)
#cULgData_Liege_project = central.select.project('LIE')

# Open the connection and fetch the projects (centers) collection
central, centers = xnat_connect()

# Print the list of all centers
center_listing = centers.get()
print(center_listing)

# Display the structure of the projects
central.inspect.structure()

## Load csv file as pandas dataframe

In [None]:
# Parse the semicolon-separated subjects file into a dataframe
df = pd.read_csv(
    csv_filepath,
    sep=';',
    index_col=False,
    encoding='utf-8',
    escapechar='\\',
)
# Last expression of the cell: displays the dataframe in the notebook
df

## Download dicoms

In [None]:
# Group by project and subject so that each subject appears only once (we don't
# want to download the same subject twice).
# Only the two key columns are kept, so count() has no value column left: the
# unique (project, subject) pairs live in the index and reset_index() turns them
# back into regular columns.
# The keys are given as a list: .ix no longer exists in pandas and a tuple is
# not read as a list of grouping keys anymore.
subject_keys = [projectcol, subjectcol]
df_subjects = df.loc[:, subject_keys].groupby(subject_keys).count().reset_index()
df_subjects

In [None]:
import re
import unicodedata
def slugify(value, allow_unicode=False):
    """
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    From Django and this excellent SO answer: https://stackoverflow.com/a/295466/1121352

    value: any object; byte strings are decoded as UTF-8 (undecodable bytes are
        dropped), everything else is converted to text.
    allow_unicode: keep non-ASCII word characters (NFKC normalization) instead
        of transliterating/dropping them (NFKD normalization + ASCII encoding).

    Returns the slug as a text string. Works on both Python 2 and Python 3.
    """
    # 'unicode' only exists on Python 2; on Python 3 the text type is str
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if isinstance(value, bytes):
        value = value.decode('utf-8', 'ignore')
    else:
        value = text_type(value)

    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        # Decompose accented characters, then drop whatever is not ASCII
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    # re.UNICODE makes \w and \s Unicode-aware on Python 2 (already the default on Python 3)
    value = re.sub(r'[^\w\s-]', '', value, flags=re.UNICODE).strip().lower()
    return re.sub(r'[-\s]+', '-', value, flags=re.UNICODE)

In [None]:
def _scan_dirname(scan):
    """Build the filesystem-safe directory name '<ID>_<type>_<UID>' of a scan."""
    # do NOT use scan.attrs.get('@UID'), pyxnat v1.0 is not reliable when using that!
    # Prefer to use xpath to get attributes.
    scantype = scan.xpath('@type')[0].strip() or 'blank'
    scanid = scan.xpath('@ID')[0]
    try:
        scanuid = scan.xpath('@UID')[0]
    except IndexError:
        # No UID attribute on this scan
        scanuid = '0'
    # clean up path to always have only valid path characters
    return slugify('%s_%s_%s' % (scanid, scantype, scanuid))


def _download_experiment_archive(experiment, experimentpath):
    """Download all the scans of an experiment as one zip archive into experimentpath."""
    if not experiment.scans().get():
        return
    try:
        experiment.scans().download(experimentpath)
    except Exception as exc:
        # Sometimes the experiment's scans contain no real file and no real
        # acquisition: the archive is then not a zip file and we skip it
        if 'BadZipfile' not in str(type(exc)) and 'not a zip file' not in str(exc):
            raise


def _download_subject(projectid, subjectid):
    """Download the metadata and the NIFTI files of every scan of one subject.

    Reads the notebook globals central, nifti_path and dlmanualmode.

    Returns True when the subject was processed completely, and False when the
    server handed back an empty (None) experiment or scan, in which case the
    caller is expected to try again.
    """
    experiments = central.select("/project/%s/subject/%s" % (projectid, subjectid)).experiments()
    # sometimes experiment/scans is empty, so we ask for a restart until we can extract something
    if experiments is None:
        return False
    for experiment in experiments:
        # For each experiment (acquisition session) of this subject
        if experiment is None:
            return False
        experimentid = experiment.id()
        scans = experiment.scans()
        if scans is None:
            return False
        experimentpath = os.path.join(nifti_path, projectid, subjectid, experimentid)
        for scan in scans:
            # For each scan of this experiment
            if scan is None:
                return False
            # Build the directory where this scan is stored, creating it recursively
            scanfullpath = os.path.join(experimentpath, _scan_dirname(scan))
            if not os.path.exists(scanfullpath):
                os.makedirs(scanfullpath)
            # Save meta-infos
            scan_metadata = scan.get()
            with open(os.path.join(scanfullpath, 'metadata.xml'), 'w') as f:
                f.write(scan_metadata)
            # Manual mode: we download each file of each scan separately, this
            # allows finer grained control over naming etc
            if dlmanualmode:
                for nfile in scan.resource('NIFTI').files():
                    # TODO: remove from cachemanager as soon as it gets downloaded, else we
                    # will overbloat our cache for nothing
                    nfile.get_copy(dest=os.path.join(scanfullpath, nfile.label()))
        # Auto mode: download the whole archive of all scans for this experiment
        # directly as a zip file (only when manual mode is off, so that files are
        # not downloaded twice)
        if not dlmanualmode:
            _download_experiment_archive(experiment, experimentpath)
    return True


# Prepare progress bar
tbar = tqdm_notebook(total=len(df_subjects), desc='DOWNLD', unit='subject')

# Always try at least once, even if maxretries was set to 0 or less
retries = max(1, maxretries)

# Main download loop: one iteration per project & subject in the provided csv
for _, row in df_subjects.iterrows():
    projectid = row[projectcol]
    subjectid = row[subjectcol]
    for attempt in range(1, retries + 1):
        try:
            if _download_subject(projectid, subjectid):
                # Everything was downloaded, go to the next subject
                break
        except Exception as exc:
            # Connection errors are recognized by class name, as the exact
            # exception class is not imported in this file. They are retried;
            # anything else, or a connection error on the last attempt, is fatal
            if 'ConnectionError' not in str(type(exc)) or attempt == retries:
                raise
    else:
        # Every attempt ended on an empty listing: report it and move on
        print('Warning: nothing could be extracted for project %s subject %s after %i attempts, skipping.'
              % (projectid, subjectid, retries))
    # Update progress bar
    tbar.update()

tbar.close()
print('All done!')