# Center-TBI statistics (from full dump via XNAT REST API/pyxnat)
Short description to add here...

Please edit login.cfg with your credentials before executing this script.

Before (re-)running this script, please clear the output, shut down and relaunch the kernel, close and reopen your browser, and then (re-)launch all the cells! Otherwise the memory is not correctly freed (this is a bug in ipywidgets or Jupyter notebook).


In [None]:
%load_ext autoreload
%autoreload 2
# BEWARE: autoreload works on functions and on general code, but NOT on new class methods:
# if you add or change the name of a method, you have to reload the kernel!
# also it will fail if you use super() calls in the classes you change

# Profilers:
# http://pynash.org/2013/03/06/timing-and-profiling/
# http://mortada.net/easily-profile-python-code-in-jupyter.html
# use %lprun -m module func(*args, **kwargs)
%load_ext line_profiler
%load_ext memory_profiler

import json
import pandas as pd
from fdict import fdict

# Setup some display options for pandas.
# The fully-qualified option names are used on purpose: short names are
# resolved by pattern matching and raise an OptionError as soon as more than
# one registered option matches (e.g. 'max_columns' on pandas versions that
# also register 'styler.render.max_columns').
pd.set_option('display.max_columns', 400)
pd.set_option('display.expand_frame_repr', False)

## Parameters

In [None]:
# Edit the filepath to the json file here.
# This path is the input of the whole notebook: it is opened and parsed by the loading cells below.
json_filepath = 'xnat_data_extract.json'

## Extracting pertinent database fields into a Dataframe
The original structure is deeply nested. Here we extract only the relevant fields and arrange them into a 2D table (one row per scan, one column per field).

In [None]:
def flatten_json(y):
    """Flatten a nested JSON-like structure into a single-level dict.

    Keys of nested dicts and positions of list items are joined with '_' to
    build each flat key, e.g. {'a': {'b': [5]}} gives {'a_b_0': 5}.
    Empty dicts and empty lists contribute no entry to the result.

    Adapted from https://medium.com/@amirziai/flattening-json-objects-in-python-f5343c794b10
    """
    flat = {}

    def _walk(node, prefix=''):
        kind = type(node)
        if kind is dict:
            for key in node:
                _walk(node[key], prefix + key + '_')
            return
        if kind is list:
            for index, item in enumerate(node):
                _walk(item, prefix + str(index) + '_')
            return
        # Leaf value: drop the trailing '_' separator from the accumulated prefix
        flat[prefix[:-1]] = node

    _walk(y)
    return flat

In [None]:
def JSON2DICTParser(data):
    """Parse a JSON string (e.g. the content of a CSV cell) into Python objects.

    From https://stackoverflow.com/questions/20680272/parsing-a-json-string-which-was-loaded-from-a-csv-using-pandas

    Args:
        data: the JSON text to decode.

    Returns:
        The decoded object, or float('nan') when `data` is falsy or made up
        only of brackets, braces and whitespace (i.e. an empty container).

    Raises:
        ValueError: if `data` is not valid JSON; the offending text is printed
            before the error is propagated.
    """
    import re  # local import: `re` is not imported at the top of this file

    # Raw string so that the regex escapes are not read as (invalid) string escapes
    if data and re.sub(r'[\[\]{}\s]+', '', data):
        try:
            return json.loads(data)
        except ValueError:
            # Show the payload that could not be decoded to ease debugging
            print(data)
            raise
    return float('nan')

# Load the csv into a Pandas DataFrame
# NOTE(review): json_filepath is set to a '.json' file in the Parameters cell, yet it is read
# here with pd.read_csv, and `df` is reassigned further down from the normalized json records.
# This looks like a leftover from an earlier CSV-based version - confirm and remove if so.
#df = pd.read_csv(json_filepath, converters={'software_metadata':JSON2DICTParser, 'parameters_dict':JSON2DICTParser, 'results_dict':JSON2DICTParser})
df = pd.read_csv(json_filepath)


In [None]:
# Read the raw content of the dump, then decode it into nested Python objects
with open(json_filepath, 'rb') as f:
    j = f.read()
jdict = json.loads(j)

In [None]:
# Reorder the json dict so that it is organized per scan session instead of per project
def ensure_list(obj):
    '''Return obj as a collection keyed by id, wrapping it first if it is a single record.

    A single record is recognised by the presence of an '@ID' key; it is wrapped
    as {record['@ID']: record} so that callers can always loop with .items().
    Anything without an '@ID' key is returned unchanged.
    '''
    if '@ID' not in obj:
        return obj
    # A dict (rather than a list) is used as the wrapper, keyed by the record id
    return {obj['@ID']: obj}

def copy_dict_exclude(d, exclude):
    '''Return a shallow copy of d without the keys listed in exclude.'''
    kept = {}
    for key, value in d.items():
        if key in exclude:
            continue
        kept[key] = value
    return kept

# Build one record per scan, each one carrying a copy of the infos of its parent levels
jdict2 = {'scans': []}
for projectname, project in ensure_list(jdict['projects']).items():
    for subjectname, subject in ensure_list(project['subjects']).items():
        for experimentname, experiment in ensure_list(subject['experiments']).items():
            for scanname, scan in ensure_list(experiment['scans']).items():
                # Start from the scan's own fields and register the record right away
                currecord = copy_dict_exclude(scan, ['@xsi:schemaLocation'])
                jdict2['scans'].append(currecord)
                # Parent infos are shared by all the scans below them, so they are duplicated
                # into every scan record (without the child collections, to avoid nesting the tree again)
                currecord['experiment'] = copy_dict_exclude(experiment, ['scans', 'xnat:scans', '@xsi:schemaLocation'])
                currecord['subject'] = copy_dict_exclude(subject, ['experiments', 'xnat:experiments', '@xsi:schemaLocation'])
                currecord['project'] = copy_dict_exclude(project, ['subjects', 'xnat:subjects', '@xsi:schemaLocation'])
                # Compute aggregate scores for the QA fields.
                # This is best effort: any error (e.g. no QA results) leaves the record without the missing qa.* fields.
                try:
                    # strict=False is needed because of non-escaped \n and \r (line return + carriage return)
                    a = json.loads(
                        currecord['experiment']['xnat:assessors']['xnat:assessor']['ext:results_dict'],
                        strict=False,
                    )
                    currecord['qa.protocol_check'] = a['protocol_check']
                    currecord['qa.head_coverage'] = [[k, v] for k, v in a['head_coverage'].items()]
                    # Global verdicts: True only if every individual check has the expected value
                    currecord['qa.protocol_check_global'] = all(v.lower() == 'pass' for _, v in a['protocol_check'])
                    currecord['qa.head_coverage_global'] = all(v.lower() == 'good' for v in a['head_coverage'].values())
                except Exception:
                    pass


In [None]:
# Postprocessing: we convert nested dicts into json strings
#for scanid in xrange(len(jdict2['scans'])):
#    for key, value in jdict2['scans'][scanid].items():
#        if isinstance(value, dict):
#            for subkey, subvalue in jdict2['scans'][scanid][key].items():
#                if isinstance(subvalue, dict):
#                    jdict2['scans'][scanid][key][subkey] = json.dumps(subvalue)

In [None]:
# You should check these files to see if the structure is correct and has all required fields

# Save one record for test
# (currecord is the last record built by the reordering loop above)
with open('center-tbi-statistics-from-full-rest-dump_onerecordtest.json', 'w') as f:
    json.dump(currecord, f, indent=4, sort_keys=True)
# Save all reordered records (the whole {'scans': [...]} structure)
with open('center-tbi-statistics-from-full-rest-dump_persession.json', 'w') as f:
    json.dump(jdict2, f, indent=4, sort_keys=True)

In [None]:
# Flatten keys so that pandas can access all nested dicts
# (nested keys are joined with '.', e.g. record['project']['id'] becomes 'project.id')
jdict3 = [fdict.flatkeys(scan, sep='.') for scan in jdict2['scans']]
# Convert to a pandas dataframe!
# pd.io.json.json_normalize is deprecated since pandas 1.0 in favour of the top-level
# pd.json_normalize, so prefer the latter and only fall back to the old location on older pandas.
if hasattr(pd, 'json_normalize'):
    df = pd.json_normalize(jdict3)
else:
    df = pd.io.json.json_normalize(jdict3)

In [None]:
# Report the size and the column names of the dataframe, then display it
print('The dataframe has %s (lines, columns)' % (df.shape,))
print('Columns of the dataframe: ')
print(df.columns)
df  # a bare dataframe name as the last line of a cell gets pretty-printed by the notebook

In [None]:
# Pair the center id of each scan with its global protocol-check verdict
a = df.apply(lambda row: (row['project.id'], row['qa.protocol_check_global']), axis=1)
print('Number of validated scans per center')
# Count how many scans fall into each (center, verdict) pair
a.value_counts()

In [None]:
print('Only centers with validated scans:')
# Keep only the (center, verdict) pairs whose verdict is True, then count them per pair
(
    a.apply(lambda pair: pair if pair[1] == True else None)
    .dropna()
    .value_counts()
)

In [None]:
print('Total number of validated scans:')
# Number of (center, verdict) pairs left once those with a non-True verdict are dropped
len(
    a.apply(lambda pair: pair if pair[1] == True else None)
    .dropna()
)

In [None]:
print('Only centers with unvalidated scans:')
# Keep only the (center, verdict) pairs whose verdict is anything but True, then count them per pair
(
    a.apply(lambda pair: pair if pair[1] != True else None)
    .dropna()
    .value_counts()
)

In [None]:
#df[['project.id', 'qa.protocol_check_global']].groupby(['project.id']).value_counts()

In [None]:
# Pair the center id of each scan with its global head-coverage verdict
a = df.apply(lambda row: (row['project.id'], row['qa.head_coverage_global']), axis=1)
print('Number of validated head coverage scans per center')
# Count how many scans fall into each (center, verdict) pair
a.value_counts()

In [None]:
# Center ids of the rows whose visit id is 'Phantom' (other rows are blanked out, then dropped)
phantom_validated_centers = (
    df['project.id']
    .where(df['experiment.@visit_id'] == 'Phantom')
    .dropna()
    .unique()
)
print('List of Phantom validated centers:')
for center in sorted(phantom_validated_centers):
    print('* %s' % center)

In [None]:
# TODO: filter by resting state fmri
# TODO: make figures of all of these infos (number of validated/unvalidated scans per center, number of fmri validated scans)

## Stats figures
Generate interesting stats figures.

## Resources
Useful resources:
* https://stackoverflow.com/questions/34092808/extract-nested-json-embedded-as-string-in-pandas-dataframe#
* https://stackoverflow.com/questions/39899005/how-to-flatten-a-pandas-dataframe-with-some-columns-as-json
* http://mindtrove.info/flatten-nested-json-with-pandas/