# Center-TBI statistics (from full dump via XNAT REST API/pyxnat)
Compute statistics from the full database dumped as JSON from the XNAT REST API (via pyxnat). Please generate `xnat_data_extract.json` beforehand by executing `xnat_data_extractor.ipynb` (or get the file from somewhere else: since it contains private project data, it cannot be stored publicly here).

Please edit login.cfg with your credentials before executing this script.

Before (re-)running this script, please clear output, shutdown and relaunch kernel, close down and reopen your browser, and then (re-)launch all the cells! Else the memory is not correctly freed (this is a bug in ipywidgets or jupyter notebook).


In [None]:
%load_ext autoreload
%autoreload 2
# BEWARE: autoreload works on functions and on general code, but NOT on new class methods:
# if you add or change the name of a method, you have to reload the kernel!
# also it will fail if you use super() calls in the classes you change

# Profilers:
# http://pynash.org/2013/03/06/timing-and-profiling/
# http://mortada.net/easily-profile-python-code-in-jupyter.html
# use %lprun -m module func(*args, **kwargs)
try:
    %load_ext line_profiler
    %load_ext memory_profiler
    from fdict import fdict
except ImportError as exc:
    pass

import json
import pandas as pd

# Setup some display options for pandas.
# The fully qualified option names are used on purpose: abbreviated names are
# resolved by pattern matching and are rejected as soon as they match more than
# one registered option.
pd.set_option('display.max_columns', 400)  # show up to 400 columns before truncating
pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames over several blocks

## Parameters

In [None]:
# Edit the filepath to the json file here
json_filepath = 'xnat_data_extract.json'

## Auxiliary Functions

In [None]:
def json_load_dict(json_filepath):
    """Decode the JSON document stored at `json_filepath` and return the resulting object."""
    # The file is opened in binary mode and handed directly to the json decoder
    with open(json_filepath, 'rb') as handle:
        return json.load(handle)

def save_df_as_csv(d, output_file, fields_order=None, csv_order_by=None, encoding='ascii', escapechar='\\', index=False, verbose=False, **kwargs):
    """Save a dataframe as a semicolon-separated csv file.

    Parameters
    ----------
    d : dataframe to save.
    output_file : destination passed as-is to `DataFrame.to_csv`.
    fields_order : optional list of column names to write first, in that order.
        Every other column of `d` is appended after them in alphabetical order.
        The list given by the caller is left untouched.
    csv_order_by : optional column name (or list of names) used to sort the rows
        before writing; when empty/None the rows keep their current order.
    encoding, escapechar, index : forwarded to `DataFrame.to_csv`.
    verbose : if True, print the final column order.
    **kwargs : any additional argument for `DataFrame.to_csv`.

    Returns
    -------
    True once the file has been written.
    """
    # Define CSV fields order.
    # Work on a copy: appending to the caller's list would leak the columns of
    # this dataframe into the caller's next call.
    fields_order = list(fields_order) if fields_order is not None else []
    # Then automatically add any other field (in alphabetical order)
    already_listed = set(fields_order)
    fields_order.extend(col for col in sorted(d.columns) if col not in already_listed)
    if verbose:
        print('CSV fields order: '+str(fields_order))

    # Write the csv
    # Note that escapechar is particularly important if you have nested fields (lists, json, etc)
    d2 = d.sort_values(csv_order_by) if csv_order_by else d
    d2.to_csv(output_file, sep=';', index=index, columns=fields_order, encoding=encoding, escapechar=escapechar, **kwargs)
    return True

## Extracting pertinent database fields into a Dataframe
The original structure is deeply nested; here we extract only the relevant fields and flatten them into a 2D table with one row per scan.

In [None]:
# Load the json dict
jdict = json_load_dict(json_filepath)

In [None]:
# Reorder the json dict so that it is organized per scan session instead of per project
def ensure_list(obj):
    '''Return `obj` as a collection keyed by id, so that callers can always loop over it.

    An object carrying an '@ID' entry is a single record: it is wrapped into a
    one-entry dict keyed by that id (dicts are used rather than lists). Anything
    else is assumed to already be a collection and is returned unchanged.
    '''
    if '@ID' not in obj:
        return obj
    return {obj['@ID']: obj}

def copy_dict_exclude(d, exclude):
    '''Build a shallow copy of `d` that leaves out every key listed in `exclude`'''
    kept = {}
    for key, value in d.items():
        if key in exclude:
            continue
        kept[key] = value
    return kept

exclude_resources = False  # exclude resources because they use up a lot of columns and are pretty much useless for stats (only useful for download)

# Initialization, do not touch
scans_excludes = ['@xsi:schemaLocation']
if exclude_resources:
    scans_excludes.append('resources')

# Create one flat record per scan: jdict2['scans'] is a list of scan dicts, each
# one enriched with the infos of its parent experiment, subject and project.
jdict2 = {'scans': []}
for projectname, project in ensure_list(jdict['projects']).items():
    # For each project
    for subjectname, subject in ensure_list(project['subjects']).items():
        # For each subject
        for experimentname, experiment in ensure_list(subject['experiments']).items():
            # For each experiment
            for scanname, scan in ensure_list(experiment['scans']).items():
                # For each scan
                # Get the scan infos (excluding the columns/fields we do not want)
                scan_temp = copy_dict_exclude(scan, scans_excludes)
                if 'resources' in scan_temp:
                    # Wrap the files in a list so that they do not get flattened as columns.
                    # NOTE: the copy above is shallow, so this also edits the nested
                    # 'resources' dict of the source jdict (which is deleted afterwards).
                    resources = scan_temp['resources']
                    if 'SNAPSHOTS' in resources and 'files' in resources['SNAPSHOTS']:
                        resources['SNAPSHOTS']['files'] = [resources['SNAPSHOTS']['files'].copy()]
                    if 'files' in resources:
                        resources['files'] = [resources['files'].copy()]
                # Add the scan infos: reuse the record prepared above instead of copying the scan a second time
                jdict2['scans'].append(scan_temp)
                currecord = scan_temp  # keep it at hand so we can add infos of above levels
                # Add the experiment infos for this scan (these infos are shared across all scans of this experiment, so we duplicate for each scan)
                currecord['experiment'] = copy_dict_exclude(experiment, ['scans', 'xnat:scans', '@xsi:schemaLocation'])
                # Same for subject infos
                currecord['subject'] = copy_dict_exclude(subject, ['experiments', 'xnat:experiments', '@xsi:schemaLocation'])
                # Same for project infos
                currecord['project'] = copy_dict_exclude(project, ['subjects', 'xnat:subjects', '@xsi:schemaLocation'])
                # Compute aggregated scores for the QA fields (best effort: scans
                # without a usable QA assessor simply do not get the qa.* fields)
                try:
                    a = json.loads(currecord['experiment']['xnat:assessors']['xnat:assessor']['ext:results_dict'], strict=False)  # need strict=False because of non-escaped \n and \r (line return + carriage return)
                    currecord['qa.protocol_check'] = a['protocol_check']
                    currecord['qa.head_coverage'] = [[k,v] for k,v in a['head_coverage'].items()]
                    # Global flags are True only if every single check passed
                    currecord['qa.protocol_check_global'] = all(v.lower() == 'pass' for _, v in a['protocol_check'])
                    currecord['qa.head_coverage_global'] = all(v.lower() == 'good' for v in a['head_coverage'].values())
                except (KeyError, TypeError, ValueError, AttributeError):
                    # Missing assessor/field, unexpected structure or invalid json
                    pass


In [None]:
del jdict

In [None]:
# Postprocessing: we convert nested dicts into json strings
#for scanid in xrange(len(jdict2['scans'])):
#    for key, value in jdict2['scans'][scanid].items():
#        if isinstance(value, dict):
#            for subkey, subvalue in jdict2['scans'][scanid][key].items():
#                if isinstance(subvalue, dict):
#                    jdict2['scans'][scanid][key][subkey] = json.dumps(subvalue)

In [None]:
# You should check these files to see if the structure is correct and has all required fields

# Save one record for test
with open('center-tbi-statistics-from-full-rest-dump_onerecordtest.json', 'w') as f:
    json.dump(currecord, f, indent=4, sort_keys=True)
# Save all reordered records
with open('center-tbi-statistics-from-full-rest-dump_persession.json', 'w') as f:
    json.dump(jdict2, f, indent=4, sort_keys=True)

In [None]:
# Flatten keys so that pandas can access all nested dicts
jdict3 = [fdict.flatkeys(scan, sep='.') for scan in jdict2['scans']]
# Save flattened records
with open('center-tbi-statistics-from-full-rest-dump_persession_flattened.json', 'w') as f:
    json.dump(jdict3, f, indent=4, sort_keys=True)
# Free up memory
del jdict2

## Loading final table from json file and showing stats

In [None]:
try:
    del jdict3
except Exception:
    pass
with open('center-tbi-statistics-from-full-rest-dump_persession_flattened.json', 'r') as f:
    jdict3 = json.load(f)

In [None]:
# Convert to a pandas dataframe!
# TODO: if out of memory error, can try to use json-streamer with a mockup-class to return a dict-like generator to pandas: https://github.com/kashifrazzaqui/json-streamer
# Recent pandas exposes json_normalize at top level (the pd.io.json location is
# deprecated), older releases only have the pd.io.json one: support both.
_normalize = pd.json_normalize if hasattr(pd, 'json_normalize') else pd.io.json.json_normalize
df = _normalize(jdict3)
# Free up memory
del jdict3

In [None]:
# Show structure of df
print('The dataframe has %s (lines, columns)' % str(df.shape))
print('Columns of the dataframe: ')
print(df.columns)
df  # to pretty print, just put the dataframe name as the last line, without print or anything

In [None]:
print('Count of unique values per columns:')
for c in df.columns:
    try:
        print('* %s: %s' % (c, str(len(df[c].unique()))))
    except TypeError as exc:
        print('* %s: type error (list), cannot compute length' % c)

In [None]:
print('Count per value for all columns:')
#with pd.option_context("display.max_rows", -1, "display.max_columns", -1):
for c in df.columns:
    try:
        # print() calls (as in the rest of this notebook) instead of Python 2
        # print statements, which are a syntax error under Python 3
        print("---- %s ---" % c)
        print(df[c].value_counts())
    except TypeError:
        # Values of this column cannot be counted (eg, lists are unhashable): skip it
        pass

In [None]:
a = df.apply(lambda df: (df['project.id'], df['qa.protocol_check_global']), axis=1)
print('Number of validated scans per center')
a.value_counts()

In [None]:
print('Only centers with validated scans:')
a.apply(lambda x: x if x[1] == True else None).dropna().value_counts()

In [None]:
print('Total number of validated scans:')
len(a.apply(lambda x: x if x[1] == True else None).dropna())

In [None]:
print('Only centers with unvalidated scans:')
a.apply(lambda x: x if x[1] != True else None).dropna().value_counts()

In [None]:
#df[['project.id', 'qa.protocol_check_global']].groupby(['project.id']).value_counts()

In [None]:
a = df.apply(lambda df: (df['project.id'], df['qa.head_coverage_global']), axis=1)
print('Number of validated head coverage scans per center')
a.value_counts()

In [None]:
phantom_validated_centers = df.where(df['experiment.@visit_id'] == 'Phantom')['project.id'].dropna().unique()
print('List of Phantom validated centers:')
for center in sorted(phantom_validated_centers):
    print('* %s' % center)

### rs_fMRI

In [None]:
print('Scan types:')
for t in sorted(df['@type'].unique()):
    print('* '+ str(t))

In [None]:
# Show count per scan type
with pd.option_context('display.max_rows', None):  # show all rows (remove pandas default limit)
    print(df['@type'].value_counts().sort_values(ascending=False))
# Alternative way (and equivalent):
#print('Scan types:')
#for k, t in df['@type'].value_counts().sort_values(ascending=False).iteritems():
#    print('* '+k+': '+str(t))

In [None]:
# Extract only resting state data
rsfMRI_filter = ['rs fMRI FE EPI', 'rs fMRI FE_EPI', 'rs fMRI FE_EPI SENSE', 'rs-fMRI', 'rs_fMRI', 'rsfMRI', 'rsfMRI 60CM']
df_rest = df.loc[df['@type'].isin(rsfMRI_filter), :]
#df_rest = df.loc[df['@type'] == 'rs_fMRI', :]  # deprecated, only for rs_fmri, does not include all variants of the name in db
df_rest[['@UID', 'project.id', 'qa.protocol_check_global']]

In [None]:
# Save rs_fMRI only data to csv file
save_df_as_csv(df_rest, 'xnat_data_extract_rs-fMRI-only.csv', csv_order_by=['project.id', '@UID'])

In [None]:
# Save full database as csv file
save_df_as_csv(df, 'xnat_data_extract_fulldb.csv', csv_order_by=['project.id', '@UID'], encoding='utf-8')

In [None]:
# Generate only validated scans csv list
# .loc is used instead of .ix (deprecated, then removed from pandas); with a
# boolean mask both select the same rows.
# '== True' is kept on purpose: scans without QA infos hold NaN, not False.
df_validated = df.loc[df['qa.protocol_check_global'] == True]
# Explicit copy because later cells add new columns to this dataframe
df_rest_validated = df_rest.loc[df_rest['qa.protocol_check_global'] == True].copy()
# Save them
save_df_as_csv(df_validated, 'xnat_data_extract_validatedonly.csv', csv_order_by=['project.id', '@UID'], encoding='utf-8')
save_df_as_csv(df_rest_validated, 'xnat_data_extract_rs-fMRI-only-validatedonly.csv', csv_order_by=['project.id', '@UID'], encoding='utf-8')

## Stats figures
Generate interesting stats figures.

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')  # for nicer plots
#plt.xkcd()  # uncomment this line for xkcd style plots!!!

In [None]:
def df_couples(df, cols):
    '''Count how many rows of `df` share each combination of values of the columns `cols`'''
    def as_combination(row):
        # One tuple per row, holding the values of the requested columns in order
        return tuple([row[name] for name in cols])

    combinations = df.apply(as_combination, axis=1)
    return combinations.value_counts()

def plot_boolean(df, col1, col2, morecols=None, where=None, groupby=None, truename='validated', falsename='not validated', title=None, nostats=False, nograph=False):
    '''Plot the boolean column col2 according to col1, counting true vs non-true values (true/false or true/nan).

    Parameters
    ----------
    df : dataframe holding the data.
    col1 : column defining the categories (one bar per unique value).
    col2 : boolean column to count.
    morecols : optional list of additional columns to extract along col1 and col2.
    where : optional row selector given to `df.loc` to restrict the rows.
    groupby : optional column name or list of column names used as additional
        grouping levels below col1. The caller's list is not modified.
    truename, falsename : names of the resulting "true" and "not true" count columns.
    title : optional title of the plots.
    nostats : if True, do not print the tables and statistics.
    nograph : if True, do not draw the plots.

    Returns
    -------
    (c, d) : c is the count table per (col1 + groupby) group, d is c summed per col1 value.
    '''
    # Prepare stuff (always work on copies so that the caller's lists are never modified)
    morecols = list(morecols) if morecols else []
    if not groupby:
        groupby = []
    elif isinstance(groupby, str):
        groupby = [groupby]
    else:
        groupby = list(groupby)
    # Extract the columns
    wanted_columns = [col1, col2] + morecols + groupby
    if where is not None:
        a = df.loc[where, wanted_columns]
    else:
        a = df[wanted_columns]
    # Group by the first column (we call this column the categories), then by the additional levels
    group_keys = [col1] + groupby
    b = a.groupby(group_keys)
    # Aggregate using the sum (or count of True values) and size (the total number of entries)
    c = b.aggregate(['sum', 'size'])
    # Replace nan/none values by 0 (when there is absolutely no true value for one category)
    c.fillna(0, inplace=True)
    # Because of aggregation we get a hierarchical multiindex, we flatten the indices for easier access
    c.columns = c.columns.get_level_values(1)
    # Compute the number of non-true values (total values count - true values count)
    c['size'] = c['size'] - c['sum']
    # Rename the columns
    c = c.rename(columns={'sum': truename, 'size': falsename})
    # Sum per category (first index level); without additional grouping levels this leaves the counts as they are
    d = c.groupby(level=0).sum()
    if not nograph:
        # Plot as stacked bars
        d.plot.bar(stacked=True, title=title)
        plt.show()
        # Plot only validated (title is optional, so do not concatenate blindly)
        validated_title = (title + ' (validated only)') if title else '(validated only)'
        d[truename].plot.bar(stacked=True, title=validated_title)
        plt.show()
    if not nostats:
        # Print the table
        print(c)
        # Print stats
        print(c.describe())
        print('\nTotal sum:\n'+str(c.sum()))
        print('\nNumber of nonzeros %s entries:\n%s' % (str(group_keys), str((c != 0).sum())) )
    # Return c and d in case user wants to manipulate them further (or debug)
    return c, d

In [None]:
def plot_categorical(df, col1, col2, morecols=None, where=None, groupby=None, truename='validated', falsename='not validated', title=None, sort=None, nostats=False, nograph=False):
    '''Plot the categorical column col2 according to col1, counting the entries of each category.

    Parameters
    ----------
    df : dataframe holding the data.
    col1 : column defining the bars (one bar per unique value).
    col2 : categorical column, each unique value becomes one count column.
    morecols : optional list of additional columns to extract along col1 and col2.
    where : optional row selector given to `df.loc` to restrict the rows.
    groupby : optional column name or list of column names used as additional
        grouping levels. When given, the plotted value is the number of groups
        having at least one entry in the category (instead of the number of entries).
    truename, falsename : unused, kept for signature parity with plot_boolean.
    title : optional title of the plots.
    sort : optional old-style comparison function cmp(a, b) -> negative/0/positive
        used to order the category columns.
    nostats : if True, do not print the tables and statistics.
    nograph : if True, do not draw the plots.

    Returns
    -------
    (b, d) : b is the count table per (col1 + groupby) group, d is the table aggregated per col1 value.
    '''
    import functools  # local import: only needed to adapt the `sort` comparison function

    # Prepare stuff
    if not morecols:
        morecols = []
    if not groupby:
        groupby = []
    elif isinstance(groupby, str):
        groupby = [groupby]
    # Extract both columns
    if where is not None:
        a = df.loc[where, [col1, col2]+morecols+groupby]
    else:
        a = df[[col1, col2]+morecols+groupby]
    # Group by all columns we want (including the categories col2) and then unstack to compute the count for each category
    b = a.groupby([col1]+groupby+[col2]).size().unstack(fill_value=0)
    # Reorder the columns (will make it easier to read)
    if sort is not None:
        # list.sort(cmp) only exists in Python 2: cmp_to_key gives the same ordering on both Python 2.7 and 3
        col_order = sorted(b.columns, key=functools.cmp_to_key(sort))
        b = b[col_order]
    # Group by first level if necessary
    if groupby:
        # If groupby, count the number of non zeros entries
        d = (b != 0).groupby(level=0).sum()
    else:
        # If no groupby, calculate the sum
        d = b.groupby(level=0).sum()
    if not nograph:
        # Plot as stacked bars
        ax = d.plot.bar(stacked=True, title=title, figsize=(12,12))
        plt.legend(prop={'size':15}, loc='best')
        plt.xticks(size=15)
        plt.yticks(size=15)
        ax.title.set_size(20)
        plt.show()
        # Plot one subplot per category (title is optional, so do not concatenate blindly)
        per_category_title = (title + ' (per category)') if title else '(per category)'
        ax = d.plot.bar(subplots=True, title=per_category_title, figsize=(15,20))
        plt.tight_layout()  # tighten the space between plots
        plt.xticks(size=15)
        for axe in ax:  # resize all titles and legends and labels
            axe.title.set_size(15)
            axe.legend(prop={'size':15}, loc='best')
        plt.suptitle(per_category_title, size=20)  # resize main title
        plt.show()
    if not nostats:
        # Print the table
        print(b)
        # Print stats
        print(b.describe())
        print('\nTotal sum:\n'+str(b.sum()))
        print('\nNumber of nonzeros %s entries:\n%s' % (str(groupby), str((b != 0).sum())) )
    # Return b and d in case user wants to manipulate them further (or debug)
    return b, d

In [None]:
plot_boolean(df, 'project.id', 'qa.protocol_check_global', title='Protocol check global (all scan types)')

In [None]:
plot_boolean(df, 'project.id', 'qa.head_coverage_global', truename='Good', falsename='Bad', title='Head coverage global (all scan types)')

In [None]:
# One center is validated but has bad head coverage, show which one
# (.loc replaces the deprecated/removed .ix indexer, the selection is the same)
df_bad_head_coverage = df.loc[(df['qa.head_coverage_global'] != True) & (df['qa.protocol_check_global'] == True)]
df_good_head_coverage = df.loc[(df['qa.head_coverage_global'] == True) & (df['qa.protocol_check_global'] == True)]
print('One example:')
print(df_bad_head_coverage['qa.head_coverage'].iloc[0])
print('List of validated centers but which have some scans with bad head coverage:')
print(df_bad_head_coverage['project.id'].unique())
print("Total number of scans that would be validated but have bad head coverage: %i" % len(df_bad_head_coverage))
# Centers that have validated scans with bad head coverage and not a single validated scan with good head coverage
df_centers_no_good_head_coverage = set(df_bad_head_coverage['project.id']) - set(df_good_head_coverage['project.id'].unique())
print('Validated centers with no good head coverage:')
print(df_centers_no_good_head_coverage)
print('List of validated scans with bad head coverage:')
df_bad_head_coverage

In [None]:
plot_boolean(df, 'project.id', 'qa.protocol_check_global', where=(df['@type'].isin(rsfMRI_filter)), title='Protocol check global (rs_fMRI)')

In [None]:
print('Number of subjects with validated rs_fMRI')
plot_boolean(df, 'project.id', 'qa.protocol_check_global', where=(df['@type'].isin(rsfMRI_filter)), groupby=['subject.@ID'], title='Protocol check global (rs_fMRI)', nograph=True)

In [None]:
print('Number of subjects with validated rs_fMRI')
plot_boolean(df, 'subject.@ID', 'qa.protocol_check_global', where=(df['@type'].isin(rsfMRI_filter)), title='Protocol check global (rs_fMRI)', nograph=True)

In [None]:
print('Number of subjects with validated rs_fMRI')
plot_boolean(df, 'subject.@ID', 'qa.protocol_check_global', where=(df['@type'].isin(rsfMRI_filter)), title='Protocol check global (rs_fMRI)', nograph=True)

In [None]:
print('Number of subjects with validated T1')
plot_boolean(df, 'project.id', 'qa.protocol_check_global', where=(df['@type'] == 'T1'), title='Protocol check global (T1)')
plot_boolean(df, 'project.id', 'qa.protocol_check_global', where=(df['@type'] == 'T1'), groupby=['subject.@ID'], title='Protocol check global (T1)', nograph=True)

In [None]:
# Find healthy controls
# Rows and columns are selected in a single .loc call (.ix is deprecated/removed in pandas)
df_healthy = df_rest.loc[df_rest['experiment.@visit_id'] == 'Healthy Volunteer', ['project.id', 'subject.id', 'experiment.@visit_id']]
df_healthy

In [None]:
# Plot total number of validated controls
df_healthy.groupby(['project.id'])['experiment.@visit_id'].count().plot(kind='bar', title='Total per center of healthy volunteers')
plt.show()

In [None]:
# Show number of patients with validated rs_fMRI
# The row mask is built from df_rest itself (the frame being filtered) rather than
# from the full df, so the selection does not rely on implicit index alignment.
plot_boolean(df_rest, 'project.id', 'qa.protocol_check_global', where=(df_rest['experiment.@visit_id'] != 'Healthy Volunteer'), title='Protocol check global (rs_fMRI patients only)')

In [None]:
# Plot number of validated healthy controls
# The row mask is built from df_rest itself (the frame being filtered), and the
# title now names the healthy volunteers this plot selects (it said "patients only").
plot_boolean(df_rest, 'project.id', 'qa.protocol_check_global', where=(df_rest['experiment.@visit_id'] == 'Healthy Volunteer'), title='Protocol check global (rs_fMRI healthy volunteers only)')

In [None]:
# Check TR parameter and others among validated scans
print(df_rest_validated['xnat:parameters.xnat:tr'].unique())
print(df_couples(df_rest_validated, ['project.id', 'xnat:parameters.xnat:tr']))
df_rest_validated['xnat:parameters.xnat:voxelRes@xyz'] = df_rest_validated['xnat:parameters.xnat:voxelRes.@x'].astype('str') + 'x' + df_rest_validated['xnat:parameters.xnat:voxelRes.@y'] + 'x' + df_rest_validated['xnat:parameters.xnat:voxelRes.@z']
print(df_rest_validated['xnat:parameters.xnat:voxelRes@xyz'].unique())

In [None]:
# Extract all scanner models of validated scans
df_rest_validated['experiment.xnat:scannermodel'] = df_rest_validated['experiment.xnat:scanner.@manufacturer'].astype('str') + ' ' + df_rest_validated['experiment.xnat:scanner.@model'].astype('str')
# fill the nan ones by the other scanner model field
idxs = (df_rest_validated['experiment.xnat:scannermodel'] == 'nan nan')
df_rest_validated.ix[idxs, 'experiment.xnat:scannermodel'] = df_rest_validated[idxs]['experiment.xnat:scanner']
# Show unique scanner models list
df_rest_validated['experiment.xnat:scannermodel'].unique()
#df_couples(df_rest_validated, ['project.id', 'experiment.xnat:scannermodel'])

In [None]:
plot_categorical(df, 'project.id', 'subject.@group', title='Subjects groups (all scans)')

In [None]:
plot_categorical(df, 'project.id', 'subject.@group', groupby=['subject.id'], title='Subjects groups')

In [None]:
plot_categorical(df_rest_validated, 'project.id', 'subject.@group', groupby=['subject.id'], title='Subjects groups (validated+rsfMRI)')

In [None]:
def custom_sort(a, b):
    '''Comparison function: sort by MR early first and then by the shortest MR timeframe first.

    Labels are ordered as follows:
    1. 'MR Early ...'
    2. 'MR <integer> ...', by increasing integer
    3. other labels starting with 'MR' whose second word is not an integer
    4. any other label
    Labels of equal rank are ordered alphabetically so that the comparison is
    consistent whatever the order of the arguments (which a sort requires).

    Returns -1 if a goes before b, 1 if a goes after b, 0 if they are equal.
    '''
    def _rank(label):
        # Sortable rank tuple of one label, following the order described above
        parts = label.split()
        if not label.startswith('MR') or len(parts) < 2:
            return (3, 0)
        if parts[1] == 'Early':
            return (0, 0)
        try:
            return (1, int(parts[1]))
        except ValueError:
            return (2, 0)

    if a == b:
        return 0
    rank_a, rank_b = _rank(a), _rank(b)
    if rank_a != rank_b:
        return -1 if rank_a < rank_b else 1
    # Same rank: alphabetical tie-break
    return -1 if a < b else 1

plot_categorical(df_rest, 'project.id', 'experiment.@visit_id', groupby=['subject.id'], title='Longitudinal scans availability', sort=custom_sort)

In [None]:
# nb of validated rs_fmri subjects
len(df_rest_validated.groupby('subject.id'))

In [None]:
# List of all available longitudinal categories
df_rest['experiment.@visit_id'].unique()

In [None]:
# Test
b = df_rest[['subject.id', 'experiment.@visit_id']].groupby(['experiment.@visit_id'])
b.aggregate('sum')

In [None]:
# Build longitudinal table for every subjects (without project id)
# b maps each (subject, visit) couple to its number of validated rs_fMRI scans
b = df_couples(df_rest_validated, ['subject.id', 'experiment.@visit_id'])
# One column per visit type; rows (subjects) are added on the fly below
df_rest_longitudinal = pd.DataFrame(columns=df_rest_validated['experiment.@visit_id'].unique())
# .items() and .loc replace .iteritems() and .ix, both removed from recent pandas
for k, v in b.items():
    df_rest_longitudinal.loc[k[0], k[1]] = v  # row = subject, column = visit
df_rest_longitudinal.fillna('', inplace=True)
save_df_as_csv(df_rest_longitudinal, 'xnat_data_extract_rs-fMRI-validated-longitudinal-noproject.csv', index=True, encoding='utf-8')
df_rest_longitudinal

In [None]:
# Generate longitudinal data table for validated rs_fMRI sessions (with project id)
# b maps each (project, subject, visit) triplet to its number of validated rs_fMRI scans
b = df_couples(df_rest_validated, ['project.id', 'subject.id', 'experiment.@visit_id'])
# Rows are indexed by (project, subject), with one column per visit type
index = pd.MultiIndex.from_tuples(df_couples(df_rest_validated, ['project.id', 'subject.id']).index)
df_rest_longitudinal = pd.DataFrame(columns=df_rest_validated['experiment.@visit_id'].unique(), index=index)
# .items() and .loc replace .iteritems() and .ix, both removed from recent pandas
for k, v in b.items():
    df_rest_longitudinal.loc[(k[0], k[1]), k[2]] = v
save_df_as_csv(df_rest_longitudinal.fillna(''), 'xnat_data_extract_rs-fMRI-validated-longitudinal.csv', index=True, encoding='utf-8')
df_rest_longitudinal.fillna('')

In [None]:
# Generate table of only subjects with at least 2 sessions
df_rest_longitudinal_morethan1 = df_rest_longitudinal.dropna(axis=0, thresh=2)
print((~df_rest_longitudinal_morethan1.isnull()).sum(axis=1))
save_df_as_csv(df_rest_longitudinal_morethan1.fillna(''), 'xnat_data_extract_rs-fMRI-validated-longitudinal_morethan1.csv', index=True, encoding='utf-8')
df_rest_longitudinal_morethan1.fillna('')

In [None]:
# Test: show all sessions for one specific subject
# (.loc with an explicit list of columns replaces the deprecated/removed .ix indexer)
df_rest.loc[df_rest['subject.id'] == 'CTBI_S01190', ['subject.id', 'experiment.@visit_id', 'experiment.xnat:date', 'experiment.xnat:time', 'project.@ID', 'qa.head_coverage', 'qa.head_coverage_global', 'qa.protocol_check', 'qa.protocol_check_global', 'xnat:quality']]

In [None]:
# Show all available quality categories
df['xnat:quality'].unique()

In [None]:
# Show all usable rs_fmri scans for every centers
df_couples(df_rest, ['project.id', 'xnat:quality'])

In [None]:
# Plot number of usable rs_fMRI subjects per center
df['qa.quality'] = (df['xnat:quality'] == 'usable')
df_rest['qa.quality'] = (df_rest['xnat:quality'] == 'usable')
c, d = plot_boolean(df_rest, 'project.id', 'qa.quality', groupby=['subject.@ID'], title='Quality (rs_fMRI)', truename='usable', falsename='unusable')
d

In [None]:
# Evolution of the number of rsfMRI sessions acquisition over time
# .loc replaces the deprecated/removed .ix; the explicit copy makes the column
# assignments below safe (they would otherwise write into a slice of df_rest)
dtime = df_rest.loc[:, ['experiment.xnat:date', 'qa.protocol_check_global']].copy()
dtime['experiment.xnat:date'] = pd.to_datetime(dtime['experiment.xnat:date'], format='%Y-%m-%d')
# Turn the QA flag into two strict boolean columns (missing values count as not validated)
dtime['qa.protocol_check_global'] = (dtime['qa.protocol_check_global'] == True)
dtime['qa.protocol_check_global_false'] = (~dtime['qa.protocol_check_global'])
dtime2 = dtime.set_index('experiment.xnat:date')
dtime2.sort_index(inplace=True)
# Cumulative count of validated / not validated sessions along the (sorted) dates
dtimecum = dtime2.cumsum()
dtimecum.plot(kind='area', title='Cumulative number of sessions over time')
plt.figure()
dtimecum['qa.protocol_check_global'].plot(kind='area', title='Cumulative number of sessions over time (only validated)')
plt.show()

In [None]:
# Evolution of the number of acquired rsfMRI sessions over time - per project
# .loc replaces the deprecated/removed .ix; the explicit copy makes the column
# assignments below safe (they would otherwise write into a slice of df_rest)
dtime = df_rest.loc[:, ['experiment.xnat:date', 'project.id', 'qa.protocol_check_global']].copy()
dtime['experiment.xnat:date'] = pd.to_datetime(dtime['experiment.xnat:date'], format='%Y-%m-%d')
# Turn the QA flag into two strict boolean columns (missing values count as not validated)
dtime['qa.protocol_check_global'] = (dtime['qa.protocol_check_global'] == True)
dtime['qa.protocol_check_global_false'] = (~dtime['qa.protocol_check_global'])
dtime2 = dtime.set_index(['experiment.xnat:date', 'project.id'])
dtime2.sort_index(inplace=True)
# Cumulative counts computed separately for each project (index level 1)
a = dtime2.groupby(level=[1]).cumsum() #(kind='area', subplots=True)
a = a.reset_index()
a = a.set_index(['experiment.xnat:date'])
#a.groupby('project.id').plot(kind='area')
for title, group in a.groupby('project.id'):
    group.plot(kind='area', title=title)
    group.plot(kind='area', title=title, subplots=True)
plt.show()

In [None]:
# Most missing fields: ratio of missing values per column, from the emptiest column down
df_nan = df.isnull().sum() / len(df)
# .items() replaces .iteritems(), which was removed from recent pandas
for key, val in df_nan.sort_values(ascending=False).items():
    print('%s: %.4f' % (key, val))

In [None]:
# Most complete fields: ratio of non-missing values per column, from the fullest column down
df_nan = df.count() / len(df)
# .items() replaces .iteritems(), which was removed from recent pandas
for key, val in df_nan.sort_values(ascending=False).items():
    print('%s: %.4f' % (key, val))

In [None]:
# Plot distribution/histogram of each column
# NOTE(review): the 'display.mpl_style' option is believed to have been removed from
# recent pandas releases - confirm against the pandas version in use
pd.options.display.mpl_style = 'default'
# Approximate age at scan time: scan date minus January 1st of the year of birth
df['subject.age_days'] = pd.to_datetime(df['experiment.xnat:date'], format='%Y-%m-%d') - pd.to_datetime(df['subject.xnat:demographics.xnat:yob'], format='%Y')
# Convert the time difference to years (missing values are left as they are)
# NOTE(review): `d.days / 365` is an integer division under Python 2 and a float
# division under Python 3 - confirm which one is intended
df['subject.age'] = df['subject.age_days'].apply(lambda d: d.days / 365 if pd.notnull(d) else d)
#df['subject.xnat:demographics.xnat:gender']
#df.ix[:, ('subject.age', 'subject.xnat:demographics.xnat:gender')].fillna(0).value_counts().plot(kind='bar')
#df.fillna(0).hist()
#df.plot(kind='hist', subplots=True)
#plt.show()
for col in df.columns:
    try:
        # Skip the columns that hold no value at all
        if df[col].count() > 0:
            print(col)
            try:
                # One new figure per column; missing values are plotted as 0
                plt.figure()
                df[col].fillna(0).plot(kind='hist', title=col)
            except TypeError:
                # For categorical type values, we cannot use hist (which expects only continuous values), thus fallback to value_counts().barplot()
                # Columns with 100 unique values or more are skipped
                if len(df[col].unique()) < 100:
                    df[col].fillna(0).value_counts().plot(kind='bar', title=col)
                else:
                    continue
    except TypeError:
        # A TypeError raised by the fallback itself: give up on this column
        pass
plt.show()

In [None]:
# Find buggy age patients (with age > 100)
# Rows and columns are selected in a single .loc call (.ix is deprecated/removed in pandas)
df.loc[df['subject.age'] > 100, ['project.id', 'subject.id', 'subject.age', 'experiment.xnat:date', 'subject.xnat:demographics.xnat:yob']]

--------
### Under work

--------------------------------
## Resources
Useful resources:
* https://stackoverflow.com/questions/34092808/extract-nested-json-embedded-as-string-in-pandas-dataframe#
* https://stackoverflow.com/questions/39899005/how-to-flatten-a-pandas-dataframe-with-some-columns-as-json
* http://mindtrove.info/flatten-nested-json-with-pandas/

----------------------------------------------
## Random code

In [None]:
def flatten_json(y):
    """Flatten nested dicts/lists into a single-level dict.

    Keys of the result are the path to each leaf value, joined with '_'
    (list items contribute their position in the list).
    """
    # From https://medium.com/@amirziai/flattening-json-objects-in-python-f5343c794b10
    flat = {}

    def walk(node, prefix=''):
        kind = type(node)
        if kind is dict:
            for key, child in node.items():
                walk(child, prefix + key + '_')
        elif kind is list:
            for position, child in enumerate(node):
                walk(child, prefix + str(position) + '_')
        else:
            # Leaf value: store it under its path, dropping the trailing separator
            flat[prefix[:-1]] = node

    walk(y)
    return flat

In [None]:
def JSON2DICTParser(data):
    """Parse a JSON string (eg, a csv cell) into the corresponding Python object.

    Returns NaN when `data` is empty or only made of brackets, braces and
    whitespace (ie, an empty JSON container). When the string is not valid
    JSON, it is printed (to ease debugging) and the ValueError is re-raised.
    """
    # From https://stackoverflow.com/questions/20680272/parsing-a-json-string-which-was-loaded-from-a-csv-using-pandas
    import re  # local import: `re` is not among the imports visible at the top of this notebook

    # Raw string so that \[ \] \s reach the regex engine untouched
    if data and re.sub(r'[\[\]{}\s]+', '', data):
        try:
            return json.loads(data)
        except ValueError:
            print(data)
            raise
    return float('nan')

# Load the csv into a Pandas DataFrame
#df = pd.read_csv(json_filepath, converters={'software_metadata':JSON2DICTParser, 'parameters_dict':JSON2DICTParser, 'results_dict':JSON2DICTParser})
