In [1]:
import json
import os
import requests

import rdflib as rl
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import numpy as np



In [2]:
# Download the data-structure listing from the NDA data dictionary API.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# as a confusing JSON/KeyError in a later cell.
r = requests.get('https://ndar.nih.gov/api/datadictionary/v2/datastructure',
                 timeout=60)
r.raise_for_status()

In [3]:
ds_data = r.json()

In [4]:
ds_data[0]

{u'categories': [u'Questionnaire'],
 u'dataType': u'Clinical Assessments',
 u'publishDate': None,
 u'shortName': u'grit01',
 u'sources': [u'RDoC'],
 u'status': u'Published',
 u'title': u'12-Item Grit Scale'}

In [5]:
# Collect the short name and the title of every listed data structure into
# two parallel lists (same order as ds_data).
ds_names = []
ds_titles = []
for entry in ds_data:
    ds_names.append(entry['shortName'])
    ds_titles.append(entry['title'])

In [6]:
ds_names[:10]

[u'grit01',
 u'nepsy01',
 u'ace_fammedhist01',
 u'ace_subjmedhist01',
 u'ace_physexam01',
 u'adhdrs01',
 u'adhdc01',
 u'agre_ados1_200101',
 u'agre_ados1_200102',
 u'agre_ados2_200102']

In [25]:
def fetch_named_ds(name, timeout=60):
    """Return the data elements of one NDA data structure, using a JSON file cache.

    The definition is read from ``<name>.json`` in the current working
    directory when that file exists; otherwise it is downloaded from the NDA
    data dictionary API and written to that file for later calls.

    Parameters
    ----------
    name : str
        Short name of the data structure (e.g. ``'grit01'``). It is used both
        in the request URL and as the base name of the cache file.
    timeout : float, optional
        Seconds to wait for the server before giving up. Only used when the
        structure is not cached yet.

    Returns
    -------
    list or None
        ``None`` when the structure's ``status`` is ``"Archived"``. Otherwise
        one ``[name, description, aliases]`` list per data element, where
        ``aliases`` is a list of strings; an alias that contains a space is
        split on whitespace into separate entries.

    Raises
    ------
    requests.HTTPError
        If the download answers with an HTTP error status. Nothing is written
        to the cache in that case.
    """
    named_json = '{0}.json'.format(name)
    if os.path.exists(named_json):
        with open(named_json, 'rt') as fp:
            named_data = json.load(fp)
    else:
        r = requests.get(
            'https://ndar.nih.gov/api/datadictionary/v2/datastructure/{0}'.format(name),
            timeout=timeout)
        # Check the status BEFORE caching: an error payload written to disk
        # would be re-read on every later call and never refreshed.
        r.raise_for_status()
        named_data = r.json()
        with open(named_json, 'wt') as fp:
            json.dump(named_data, fp)
    if named_data['status'] == "Archived":
        return None
    info = []
    for element in named_data['dataElements']:
        aliases = []
        # A missing or null alias list is treated as "no aliases".
        for alias in element.get('aliases') or []:
            if ' ' in alias:
                # Several aliases packed into one space-separated string.
                aliases.extend(alias.split())
            else:
                aliases.append(alias)
        info.append([element['name'], element['description'], aliases])
    return info

In [26]:
fetch_named_ds(ds_names[0])

[[u'subjectkey',
  u'The NDAR Global Unique Identifier (GUID) for research subject',
  []],
 [u'src_subject_id', u"Subject ID how it's defined in lab/project", []],
 [u'interview_date',
  u'Date on which the interview/genetic test/sampling/imaging was completed. MM/DD/YYYY',
  []],
 [u'interview_age',
  u'Age in months at the time of the interview/test/sampling/imaging.',
  []],
 [u'gender', u'Sex of the subject', []],
 [u'grit1',
  u'I have overcome setbacks to conquer an important challenge.',
  []],
 [u'grit2',
  u'New ideas and projects sometimes distract me from previous ones.',
  []],
 [u'grit3', u'My interests change from year to year.', []],
 [u'grit4', u"Setbacks don't discourage me.", []],
 [u'grit5',
  u'I have been obsessed with a certain idea or project for a short time but later lost interest.',
  []],
 [u'grit6', u'I am a hard worker.', []],
 [u'grit7',
  u'I often set a goal but later choose to pursue a different one.',
  []],
 [u'grit8',
  u'I have difficulty maintaini

In [27]:
# Build one record per data structure that is not archived:
# {'name': short name, 'description': title, 'info': data elements}.
all_info = []
for ds_name, ds_title in zip(ds_names, ds_titles):
    elements = fetch_named_ds(ds_name)
    if elements is None:
        # fetch_named_ds returns None for archived structures; leave them out.
        continue
    all_info.append({'name': ds_name, 'description': ds_title, 'info': elements})

In [28]:
len(all_info)

1247

In [29]:
def generate_graph(info):
    """Build an RDF graph of assessments and their items, and collect description text.

    Parameters
    ----------
    info : list of dict
        Each dict has ``'name'``, ``'description'`` and ``'info'``; ``'info'``
        is a list of ``[item name, item description, aliases]`` entries.

    Returns
    -------
    (graph, tokens)
        ``graph`` is an ``rl.ConjunctiveGraph`` with the ``nda`` and ``ex``
        prefixes bound. For every assessment it holds an ``ex:description``
        triple and one ``ex:hasItem`` triple per item and per alias. Every
        item node carries two ``ex:description`` literals (its description
        and its name); every alias node is linked from its item through
        ``ex:isSameas`` and carries its alias text as ``ex:description``.
        ``tokens`` is the list of assessment and item descriptions, in
        insertion order (item names and aliases are not included).
    """
    nda_ns = rl.Namespace("https://ndar.nih.gov/api/datadictionary/v2/datastructure/")
    ex_ns = rl.Namespace("https://example.org/")
    graph = rl.ConjunctiveGraph()
    graph.bind("nda", nda_ns)
    graph.bind("ex", ex_ns)

    # Predicates used throughout; looked up once instead of per triple.
    description = ex_ns['description']
    has_item = ex_ns['hasItem']
    same_as = ex_ns['isSameas']

    tokens = []
    for entry in info:
        assessment = nda_ns[entry['name']]
        # Items and aliases live "below" their assessment: <nda>/<name>/<item>.
        item_prefix = str(nda_ns) + entry['name'] + '/'

        graph.add((assessment, description, rl.Literal(entry['description'])))
        tokens.append(entry['description'])

        for item in entry['info']:
            item_node = rl.URIRef(item_prefix + item[0])
            graph.add((assessment, has_item, item_node))
            # Both the long description and the short name are stored under
            # ex:description; only the long description becomes a token.
            graph.add((item_node, description, rl.Literal(item[1])))
            graph.add((item_node, description, rl.Literal(item[0])))
            tokens.append(item[1])

            for alias in item[2]:
                alias_node = rl.URIRef(item_prefix + alias)
                graph.add((item_node, same_as, alias_node))
                graph.add((alias_node, description, rl.Literal(alias)))
                graph.add((assessment, has_item, alias_node))
    return graph, tokens

In [30]:
g, tokens = generate_graph(all_info)

In [33]:
# De-duplicate the collected descriptions (np.unique returns them sorted),
# then drop empty entries; the trailing expression shows how many remain.
unique_tokens = np.unique(tokens).tolist()
utokens = [token for token in unique_tokens if token]
len(utokens)

106817

In [34]:
utokens[:20]

[u' Compared to one year ago, how would you rate your physical health in general now?',
 u" Does your child's condition interfere with his/her ability to learn?",
 u' Hostility. Becoming angry, argumentative. ',
 u' Increased level of care required, defined by an increase in clinic visits, emergency room visits or a change from general outpatient status to day hospital participation',
 u' Mother ethnicity (choices are Hispanic/Latino or Not Hispanic/Latino)',
 u' Must meet DSM-IV TR diagnostic criteria for Major Depressive Disorder, Bipolar I or II Depressed, Generalized Anxiety Disorder, Social Phobia, Panic Disorder, or Post Traumatic Stress Disorder',
 u'" 1. Difficult to hold, cuddle? (no molding to body)  "',
 u'" 10. Can\'t sit still, restless, or hyperactive  "',
 u'" 10. Exceptionally fussy, difficult to soothe? (not related  to colic)"',
 u'" 102. Underactive, slow moving, or lacks energy  "',
 u'" 103. Unhappy, sad, or depressed  "',
 u'" 108. show repeated movements like squ

In [36]:
# SPARQL template: find every node that is described by the given text, either
# directly or through one of its items. The property path
# `ex:hasItem?/ex:description` means "zero or one ex:hasItem step, then
# ex:description", so both the item and the assessment owning it are returned.
query = """
PREFIX ex: <https://example.org/>

SELECT ?assessment where
{
   ?assessment ex:hasItem?/ex:description "%s" .
}

"""

def _escape_sparql_string(text):
    """Escape ``text`` for use inside a double-quoted SPARQL string literal."""
    # Backslash goes first so the escapes added afterwards are not escaped again.
    # These four characters are the ones a "..." literal may not contain raw.
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text

def get_parent(g, val):
    """Query ``g`` for the nodes whose description (or whose item's description) is ``val``.

    Parameters
    ----------
    g : graph object with a ``query(sparql_string)`` method
        The graph to search, e.g. the one returned by ``generate_graph``.
    val : str
        Exact description text to look up. It is escaped before being placed
        in the query, so text containing double quotes, backslashes or line
        breaks is matched literally instead of breaking the query.

    Returns
    -------
    Whatever ``g.query`` returns for the filled-in ``query`` template.
    """
    # The one-element tuple keeps "%" formatting unambiguous.
    query_str = query % (_escape_sparql_string(val),)
    return g.query(query_str)

In [37]:
get_parent(g, 'My life is fast-paced').bindings

[{rdflib.term.Variable(u'assessment'): rdflib.term.URIRef(u'https://ndar.nih.gov/api/datadictionary/v2/datastructure/neo_ffi_form_s_adult_200301/neo_q47')},
 {rdflib.term.Variable(u'assessment'): rdflib.term.URIRef(u'https://ndar.nih.gov/api/datadictionary/v2/datastructure/neo_ffi_form_s_adult_200301')}]

In [38]:
process.extract("fath", utokens, limit=10)

[(u'"82.  Cuddliness, current (father)"', 90),
 (u'"Asperger Syndrome, father"', 90),
 (u'"Autism Disorder, father"', 90),
 (u'"Autism Spectrum Disorder, father"', 90),
 (u'"Cerebral Palsy, father"', 90),
 (u'"Congenital Blindness, father"', 90),
 (u'"Disrupted sleep patterns, father"', 90),
 (u'"Down Syndrome, father"', 90),
 (u'"Neurofibromatosis I, father"', 90),
 (u'"PDD-NOS, father"', 90)]

In [39]:
process.extract("fibromatosis", utokens, limit=10)

[(u'"Neurofibromatosis I, father"', 90),
 (u'"Neurofibromatosis I, mother"', 90),
 (u'"Neurofibromatosis I, sibling 1"', 90),
 (u'"Neurofibromatosis I, sibling 2"', 90),
 (u'"Neurofibromatosis I, sibling 3"', 90),
 (u'"Neurofibromatosis I, sibling 4"', 90),
 (u'"Neurofibromatosis I, sibling 5"', 90),
 (u'Child have Neurofibromatosis?', 90),
 (u'Does the child have Neurofibromatosis?', 90),
 (u'Family History: Neurofibromatosis Child', 90)]

In [40]:
process.extract("47. My life is fast-paced.", utokens, limit=10)

[(u'My life is fast-paced', 95),
 (u'fast', 90),
 (u'My lifeIs fast-paced.', 89),
 (u'" 16. Is happy, cheerful, and has a positive attitude.  "', 86),
 (u'" 37. Is patient and content, even when waiting in a long   line. "', 86),
 (u'" Adds qualifying words to nouns so that others will understand what or whom he/she is talking about (e.g.,   \'John, my brother, is nine years ald.\')  "',
  86),
 (u'" Becomes unusually upset when the order of a routine is changed (e.g., sees a teacher out of usual  sequence, has to go to a later class before an earlier   one, order of programs changed) "',
  86),
 (u'" Becomes upset when class is interrupted (e.g., fire drills, announcements over loudspeaker)  "',
  86),
 (u'" If person observed is not a parent, specify relationship.  "', 86),
 (u'" If setting is not home or clinic, specify setting.  "', 86)]

In [41]:
process.extract(utokens[1], utokens, limit=10)

[(u" Does your child's condition interfere with his/her ability to learn?",
  100),
 (u'condition', 90),
 (u' Increased level of care required, defined by an increase in clinic visits, emergency room visits or a change from general outpatient status to day hospital participation',
  86),
 (u'" 14. Feeding problems? (sucking problems, choking,   regurgitation, arching, anxiousness, refusal to feed)"',
  86),
 (u'" 16. Cruelty, bullying, or meanness to others  "', 86),
 (u'" 21. Exceptionally fussy, difficult to soothe?  "', 86),
 (u'" 25. Feeding problems? (sucking problems, choking,  regurgitation, arching, anxiousness, refusal to feed)  "',
  86),
 (u'" 42. splash, kick, or try to jump?  "', 86),
 (u'" 43. Overly sensitive/lack of sensitivity to noise, tactile   input, other sensory input or other unusual  sensory/visual response? "',
  86),
 (u'" 56. Overly sensitive/lack of sensitivity to noise, tactile   input, other sensory input or other unusual sensory/visual response? "',
  86)

In [42]:
# Search the de-duplicated, non-empty token list built earlier (utokens)
# instead of recomputing np.unique over every token, which also re-admitted
# the empty entries that utokens filters out.
matches = process.extract("subject", utokens, limit=2)
print(matches)
# For each fuzzy match, show the graph nodes that carry that exact text.
for match in matches:
    print(match, get_parent(g, match[0]).bindings)

[(u'Subject #1:', 95), (u'Subject #2:', 95)]
((u'Subject #1:', 95), [{rdflib.term.Variable(u'assessment'): rdflib.term.URIRef(u'https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01/trf_vii1subje')}, {rdflib.term.Variable(u'assessment'): rdflib.term.URIRef(u'https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01')}])
((u'Subject #2:', 95), [{rdflib.term.Variable(u'assessment'): rdflib.term.URIRef(u'https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01/trf_vii2subj')}, {rdflib.term.Variable(u'assessment'): rdflib.term.URIRef(u'https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01')}])


In [43]:
# Same lookup as the previous cell, but print the matched nodes as plain URI
# strings rather than raw binding dictionaries.
for match in matches:
    parent_rows = get_parent(g, match[0]).bindings
    parent_uris = [str(row['assessment']) for row in parent_rows]
    print(match, parent_uris)

((u'Subject #1:', 95), ['https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01/trf_vii1subje', 'https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01'])
((u'Subject #2:', 95), ['https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01/trf_vii2subj', 'https://ndar.nih.gov/api/datadictionary/v2/datastructure/trf01'])
