In [81]:
import json
import os
import requests

import rdflib as rl
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import numpy as np

In [7]:
# Download the index of all data structures from the NDAR data dictionary.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error here instead of letting a
# later cell fail while decoding or indexing an error payload.
r = requests.get('https://ndar.nih.gov/api/datadictionary/v2/datastructure', timeout=60)
r.raise_for_status()

In [13]:
# Decode the body of the index response as JSON.
ds_data = r.json()

In [14]:
# Show the first entry to see which keys each data-structure record carries.
ds_data[0]

{u'categories': [u'Questionnaire'],
 u'dataType': u'Clinical Assessments',
 u'publishDate': None,
 u'shortName': u'grit01',
 u'sources': [u'RDoC'],
 u'status': u'Published',
 u'title': u'12-Item Grit Scale'}

In [23]:
# Collect the short name and the title of every data structure, keeping the
# two lists aligned index-for-index.
ds_names = []
ds_titles = []
for entry in ds_data:
    ds_names.append(entry['shortName'])
    ds_titles.append(entry['title'])

In [17]:
# Preview the first ten short names.
ds_names[:10]

[u'grit01',
 u'nepsy01',
 u'ace_fammedhist01',
 u'ace_subjmedhist01',
 u'ace_physexam01',
 u'adhdrs01',
 u'adhdc01',
 u'agre_ados1_200101',
 u'agre_ados1_200102',
 u'agre_ados2_200102']

In [41]:
def fetch_named_ds(name):
    """Return the data elements of one NDAR data structure.

    The structure's JSON definition is read from ``<name>.json`` in the
    current working directory when that file exists. Otherwise it is
    downloaded from the NDAR data-dictionary API and written to that file,
    so later calls for the same name skip the network.

    Parameters
    ----------
    name : str
        Short name of the data structure (e.g. ``'grit01'``).

    Returns
    -------
    list of list
        One ``[name, description, aliases]`` entry per item of the
        definition's ``'dataElements'``. ``aliases`` is always a list: a
        missing or null value is returned as ``[]`` so callers can iterate
        over it unconditionally.

    Raises
    ------
    requests.HTTPError
        If the download answers with an error status. Nothing is written
        to the cache file in that case.
    KeyError
        If the definition lacks ``'dataElements'`` or an element lacks
        ``'name'`` or ``'description'``.
    """
    named_json = '{0}.json'.format(name)
    if os.path.exists(named_json):
        with open(named_json, 'rt') as fp:
            named_data = json.load(fp)
    else:
        r = requests.get(
            'https://ndar.nih.gov/api/datadictionary/v2/datastructure/{0}'.format(name),
            timeout=60)
        # Check the status before caching: otherwise an error payload would
        # be stored on disk and silently reused by every later call.
        r.raise_for_status()
        named_data = r.json()
        with open(named_json, 'wt') as fp:
            json.dump(named_data, fp)
    info = []
    for element in named_data['dataElements']:
        info.append([
            element['name'],
            element['description'],
            element.get('aliases') or [],
        ])
    return info

In [43]:
# Fetch (or load from the local JSON cache) the elements of the first
# data structure and display them.
fetch_named_ds(ds_names[0])

[[u'subjectkey',
  u'The NDAR Global Unique Identifier (GUID) for research subject',
  []],
 [u'src_subject_id', u"Subject ID how it's defined in lab/project", []],
 [u'interview_date',
  u'Date on which the interview/genetic test/sampling/imaging was completed. MM/DD/YYYY',
  []],
 [u'interview_age',
  u'Age in months at the time of the interview/test/sampling/imaging.',
  []],
 [u'gender', u'Sex of the subject', []],
 [u'grit1',
  u'I have overcome setbacks to conquer an important challenge.',
  []],
 [u'grit2',
  u'New ideas and projects sometimes distract me from previous ones.',
  []],
 [u'grit3', u'My interests change from year to year.', []],
 [u'grit4', u"Setbacks don't discourage me.", []],
 [u'grit5',
  u'I have been obsessed with a certain idea or project for a short time but later lost interest.',
  []],
 [u'grit6', u'I am a hard worker.', []],
 [u'grit7',
  u'I often set a goal but later choose to pursue a different one.',
  []],
 [u'grit8',
  u'I have difficulty maintaini

In [None]:
# Build one record per data structure: its short name, its title (stored
# under 'description') and its element list. Records are appended one at a
# time, so an interrupted run still leaves the ones gathered so far.
all_info = []
for name, title in zip(ds_names, ds_titles):
    record = {'name': name, 'description': title}
    record['info'] = fetch_named_ds(name)
    all_info.append(record)

In [91]:
def generate_graph(info):
    """Turn data-structure records into an RDF graph plus a token list.

    Each record in ``info`` is a mapping with ``'name'``, ``'description'``
    and ``'info'``; every entry of ``'info'`` is indexed as
    ``[item_name, item_description, aliases]``.

    For every record the graph receives:

    * an ``ex:description`` triple on the structure node;
    * per item, an ``ex:hasItem`` link from the structure and two
      ``ex:description`` triples on the item (its description and its name);
    * per alias, an ``ex:isSameas`` link from the item, an
      ``ex:description`` triple carrying the alias text, and an
      ``ex:hasItem`` link from the structure to the alias node.

    Returns ``(graph, tokens)``. ``tokens`` lists, in traversal order, the
    structure description followed by each item's name, description and
    aliases.
    """
    nda = rl.Namespace("https://ndar.nih.gov/api/datadictionary/v2/datastructure/")
    ex = rl.Namespace("https://example.org/")
    g = rl.ConjunctiveGraph()
    for prefix, namespace in (("nda", nda), ("ex", ex)):
        g.bind(prefix, namespace)

    described_by = ex['description']
    has_item = ex['hasItem']
    same_as = ex['isSameas']

    tokens = []
    for structure in info:
        structure_node = nda[structure['name']]
        # Item and alias nodes live one path segment below the structure.
        child_prefix = str(nda) + structure['name'] + '/'

        g.add((structure_node, described_by, rl.Literal(structure['description'])))
        tokens.append(structure['description'])

        for entry in structure['info']:
            item_name, item_text = entry[0], entry[1]
            item_node = rl.URIRef(child_prefix + item_name)
            g.add((structure_node, has_item, item_node))
            g.add((item_node, described_by, rl.Literal(item_text)))
            # The name is stored as a description as well.
            g.add((item_node, described_by, rl.Literal(item_name)))
            tokens.extend([item_name, item_text])

            for alias in entry[2]:
                alias_node = rl.URIRef(child_prefix + alias)
                g.add((item_node, same_as, alias_node))
                g.add((alias_node, described_by, rl.Literal(alias)))
                g.add((structure_node, has_item, alias_node))
                tokens.append(alias)
    return g, tokens

In [92]:
# Build the graph and token list from the first ten data structures only.
g, tokens = generate_graph(all_info[:10])

In [93]:
# Print the graph serialized in Turtle syntax.
print(g.serialize(format='turtle'))

@prefix ex: <https://example.org/> .
@prefix nda: <https://ndar.nih.gov/api/datadictionary/v2/datastructure/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

nda:ace_fammedhist01 ex:description "ACE Family Medical History" ;
    ex:hasItem <https://ndar.nih.gov/api/datadictionary/v2/datastructure/ace_fammedhist01/ad_fath>,
        <https://ndar.nih.gov/api/datadictionary/v2/datastructure/ace_fammedhist01/ad_moth>,
        <https://ndar.nih.gov/api/datadictionary/v2/datastructure/ace_fammedhist01/ad_sib1>,
        <https://ndar.nih.gov/api/datadictionary/v2/datastructure/ace_fammedhist01/ad_sib2>,
        <https://ndar.nih.gov/api/datadictionary/v2/datastructure/ace_fammedhist01/ad_sib3>,
        <https://ndar.nih.gov/api/datadictionary/v2/datastructure/ace_fammedhist01/ad_sib4>,
        <https://ndar.nih.gov/api

In [129]:
# SPARQL template with two "%s" slots, both filled with the same escaped
# text. The patterns are joined with UNION so that ?assessment is actually
# bound: a lone FILTER(EXISTS ...) only tests for a match and never binds
# the variable, so it cannot return the matching node.
query = """
PREFIX ex: <https://example.org/>

SELECT DISTINCT ?assessment where
{
    { ?assessment ex:description "%s" }
    UNION
    { ?assessment ex:hasItem/ex:description "%s" }
}

"""


def _escape_sparql_string(text):
    """Escape ``text`` for use inside a double-quoted SPARQL string literal."""
    # Format first so non-string values are accepted as "%s" would accept them.
    text = '%s' % (text,)
    # The backslash must be handled first so later escapes are not doubled.
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text


def get_parent(g, val):
    """Query ``g`` for nodes described by ``val`` directly or via an item.

    Parameters
    ----------
    g : object with a ``query(str)`` method
        The graph to search (an rdflib graph in this notebook).
    val : str
        Description text to look for. Quotes, backslashes and line breaks
        are escaped, so tokens that themselves contain ``"`` still yield
        a well-formed query.

    Returns
    -------
    Whatever ``g.query`` returns for the filled-in template.
    """
    escaped = _escape_sparql_string(val)
    query_str = query % (escaped, escaped)
    return g.query(query_str)

In [134]:
# Fuzzy-match "fath" against the de-duplicated tokens; keep at most ten results.
process.extract("fath", np.unique(tokens).tolist(), limit=10)

[(u'"Asperger Syndrome, father"', 90),
 (u'"Autism Disorder, father"', 90),
 (u'"Autism Spectrum Disorder, father"', 90),
 (u'"Cerebral Palsy, father"', 90),
 (u'"Congenital Blindness, father"', 90),
 (u'"Disrupted sleep patterns, father"', 90),
 (u'"Down Syndrome, father"', 90),
 (u'"Neurofibromatosis I, father"', 90),
 (u'"PDD-NOS, father"', 90),
 (u'"Rett Syndrome, father"', 90)]

In [137]:
# Fuzzy-match "fibromatosis" against the de-duplicated tokens; keep at most ten results.
process.extract("fibromatosis", np.unique(tokens).tolist(), limit=10)

[(u'"Neurofibromatosis I, father"', 90),
 (u'"Neurofibromatosis I, mother"', 90),
 (u'"Neurofibromatosis I, sibling 1"', 90),
 (u'"Neurofibromatosis I, sibling 2"', 90),
 (u'"Neurofibromatosis I, sibling 3"', 90),
 (u'"Neurofibromatosis I, sibling 4"', 90),
 (u'"Neurofibromatosis I, sibling 5"', 90),
 (u'Ptosis', 75),
 (u'oman', 68),
 (u'Are there neurofibromas (bumps under the skin, sometimes with a bluish tinge)?',
  64)]

In [None]:
# NOTE(review): same "fibromatosis" lookup as an earlier cell in this
# notebook - looks like a leftover duplicate; confirm and remove.
process.extract("fibromatosis", np.unique(tokens).tolist(), limit=10)

In [136]:
# Fuzzy-match "subject" against the de-duplicated tokens (at most ten
# results) and print the (text, score) pairs.
matches = process.extract("subject", np.unique(tokens).tolist(), limit=10)
print(matches)
for match in matches:
    # Placeholder body: the parent lookup below is commented out, so this
    # loop currently does nothing.
    pass
    #print(match, get_parent(g, match[0]).bindings)

[(u'ACE Subject Medical History', 90), (u'ACE Subject Physical Exam', 90), (u'Does subject have a special diet?', 90), (u'Sex of the subject', 90), (u"Subject ID how it's defined in lab/project", 90), (u'src_subject_id', 90), (u'subjectkey_father', 90), (u'subjectkey_mother', 90), (u'subjectkey_sibling1', 90), (u'subjectkey_sibling2', 90)]
