# DbGaP FHIR API Exploration
2023-08-16 ZD  

This notebook will explore options to gather data from the [Database of Genotypes and Phenotypes (DbGaP)](https://www.ncbi.nlm.nih.gov/gap/) using the [dbGaP FHIR API](https://github.com/ncbi/DbGaP-FHIR-API-Docs/tree/production). 

Much of the code used in this notebook is reused from the [DbGaP-FHIR-API-Docs repo](https://github.com/ncbi/DbGaP-FHIR-API-Docs/tree/production).

## Copy `dbgap_fhir.py` from the [DbGaP-FHIR-API-Docs repo](https://github.com/ncbi/DbGaP-FHIR-API-Docs/tree/production)

In [1]:
import os
import sys
import json
import requests
import pandas as pd 
import numpy as np
from pathlib import Path
from datetime import datetime
import time
import pprint

class DbGapFHIR:
    """Small client for running search queries against a dbGaP FHIR server.

    The client keeps one ``requests.Session`` (``self.s``) so every request
    carries the same headers, follows Bundle paging links, and tracks how many
    response bytes the most recent query retrieved (``self.bytes_retrieved``).
    """

    def __init__(self, fhir_server, verify_ssl = True, api_key=None, passport=None, debug=False, show_stats=True):
        """Create the session and probe the server's ``/metadata`` endpoint.

        Args:
            fhir_server: Base URL of the FHIR server (no trailing slash).
            verify_ssl: Passed to the session's ``verify`` setting. Set to
                False only when certificate errors actually occur, e.g.
                behind a corporate proxy with self-signed certificates.
            api_key: Optional key appended to every query URL as ``api_key``.
            passport: Optional path to a file holding a bearer token.
            debug: When True, print passport diagnostics.
            show_stats: Default for ``run_query``'s statistics printout.
        """
        # Silence urllib3's warnings when verification is deliberately off.
        if not verify_ssl:
            requests.packages.urllib3.disable_warnings()

        self.fhir_server = fhir_server
        self.api_key = api_key
        self.passport = passport
        self.debug = debug
        self.show_stats = show_stats

        # We make a requests.Session to ensure consistent headers/cookie across all the requests we make
        self.s = requests.Session()
        self.s.headers.update({'Accept': 'application/fhir+json'})
        # handle security needed for dbGaP
        self.__add_passport()
        self.s.verify = verify_ssl
        self.bytes_retrieved = 0

        # Test out the client by querying the server metadata
        r = self.s.get(f"{self.fhir_server}/metadata")

        # An HTML page instead of a FHIR document means the request did not
        # reach a working FHIR endpoint.
        if "<!DOCTYPE html>" in r.text:
            sys.stderr.write('ERROR: Could not get the server capability statement. ')

    def _with_api_key(self, url):
        """Return ``url`` with the API key appended, or unchanged if no key is set.

        Uses ``?`` when the URL has no query string yet and ``&`` otherwise,
        so queries without search parameters still produce a valid URL.
        """
        if self.api_key is None:
            return url
        separator = '&' if '?' in url else '?'
        return f"{url}{separator}api_key={self.api_key}"

    @staticmethod
    def _next_page_link(bundle):
        """Return the Bundle's ``next`` link entry, or None on the last page.

        Raises:
            KeyError: if the bundle has no ``link`` list (or a link has no
                ``relation``); the bundle is printed first to aid debugging.
        """
        try:
            return next((link for link in bundle['link'] if link['relation'] == 'next'), None)
        except KeyError:
            print('Key error link/next_page')
            print(json.dumps(bundle, indent=3))
            raise

    def _get_with_retry(self, url, max_tries=10, retry_sleep=10):
        """GET ``url``, retrying while the server answers HTTP 500.

        The first retries are immediate; from the seventh attempt onwards the
        client waits ``retry_sleep`` seconds before each retry. The last
        response is returned even if it is still a 500.
        """
        tries = 1
        r = self.s.get(url)
        while r.status_code == 500 and tries < max_tries:
            tries += 1
            if tries > 6:
                time.sleep(retry_sleep)
                print(f"trying again - waiting {retry_sleep}s")
            else:
                print("trying again")
            r = self.s.get(url)
        if tries > 1:
            print(f'took {tries} tries')
        return r

    # Resolves all pages for the bundle. Returns an array with all Bundles, including the original Bundle.
    def resolve_pages(self, bundle, debug=False, sleep=None):
        """Follow ``next`` links starting at ``bundle`` and collect every page.

        Args:
            bundle: The first Bundle (already fetched) as a dict.
            debug: When True, print each page URL and the full last response.
            sleep: Optional number of seconds to wait before each page fetch.

        Returns:
            A list of Bundles in page order, starting with ``bundle``.

        Raises:
            KeyError: if any Bundle lacks a ``link`` list.
        """
        # Iterative rather than recursive so the number of pages is not
        # bounded by the interpreter's recursion limit.
        bundles = [bundle]
        next_page_link = self._next_page_link(bundle)
        while next_page_link:
            if sleep is not None:
                time.sleep(sleep)
            fhir_query = self._with_api_key(next_page_link['url'])
            if debug:
                print('_'*80)
                print(fhir_query)
            r = self._get_with_retry(fhir_query)
            next_page = r.json()
            self.bytes_retrieved += len(r.content)
            next_page_link = self._next_page_link(next_page)
            if debug and not next_page_link:
                print('Full last response')
                print(json.dumps(next_page, indent=3))
            bundles.append(next_page)
        return bundles

    # Run a query, and get the whole set of results back as a list of resources
    def run_query(self, query, limit=None, debug=False, sleep=None, show_stats=None):
        """Run a FHIR search and return the matching resources as a list.

        Args:
            query: Path and search parameters relative to the server base,
                e.g. ``"ResearchStudy?sponsor=NCI"``.
            limit: Leave as None to follow all pages; pass any other value to
                return only the first page.
            debug: When True, print the request URL and the first response.
            sleep: Optional number of seconds to wait before each page fetch.
            show_stats: Print resource/byte/page/time statistics; defaults to
                the value given to the constructor.

        Returns:
            The ``resource`` of every Bundle entry across the fetched pages.
            If the first Bundle's first ``meta.tag`` has code ``SUBSETTED``,
            the list holds that Bundle itself instead.
        """
        if show_stats is None:
            show_stats = self.show_stats

        t_start = time.perf_counter()

        self.bytes_retrieved = 0

        fhir_query = self._with_api_key(f"{self.fhir_server}/{query}")
        if debug:
            print(fhir_query)
        r = self.s.get(fhir_query)
        first_bundle = r.json()
        self.bytes_retrieved += len(r.content)
        if debug:
            print(json.dumps(first_bundle, indent=3))
            print('got response')

        # if it's just a summary; tolerate a missing or empty tag list
        tags = first_bundle.get('meta', {}).get('tag') or []
        subset = bool(tags) and tags[0].get('code') == 'SUBSETTED'

        if subset or limit is not None:
            all_bundles = [first_bundle]
        else:
            all_bundles = self.resolve_pages(first_bundle, debug, sleep)

        t_end = time.perf_counter()
        pagecount = len(all_bundles)

        if subset:
            resources = [first_bundle]
        else:
            resources = [
                entry['resource']
                for bundle in all_bundles
                for entry in bundle.get('entry', [])
            ]

        elapsed = t_end - t_start
        if show_stats:
            print(f"Total  Resources: {len(resources)}")
            print(f"Total  Bytes: {self.bytes_retrieved}")
            print(f"Total  Pages: {pagecount}")
            print(f"Time elapsed {elapsed:0.4f} seconds")
        return resources

    def __add_passport(self, passport=None):
        '''Adds Passport/TST to session header

        Reads the token from the file at ``passport`` (default: the path
        given to the constructor, with ``~`` expanded) and sets it as a
        Bearer ``Authorization`` header. Does nothing when no path is set;
        prints a message when the file cannot be read.
        '''
        if passport is None:
            passport = self.passport

        if passport is None:
            return

        full_key_path = os.path.expanduser(passport)
        if self.debug: print(f"passport path {full_key_path}")
        try:
            with open(full_key_path) as f:
                # Surrounding whitespace/newlines are not part of the token
                # and do not belong in an HTTP header value.
                file_content = f.read().strip()
        except OSError:
            # Only file-access failures are reported here; other errors
            # should surface instead of being mislabelled.
            print("Could not find passport file")
            return

        # NOTE(review): debug mode prints the token itself; avoid in shared logs.
        if self.debug: print(f"content of passport file {file_content}")
        self.s.headers.update({'Authorization': f'Bearer {file_content}'})

def obs_to_df(observations):
    """Pivot FHIR Observation dicts into one DataFrame row per subject.

    Each observation contributes one cell: the row is its
    ``subject.reference`` and the column is the ``display`` of its first
    coding. Columns named SUBJECT_ID or SAMPLE_ID get the coding's ``code``
    appended so they stay distinct. The cell holds the ``valueQuantity``
    value, else the first ``valueCodeableConcept`` display, else 'unknown'.
    A later observation for the same subject and column overwrites the
    earlier one.
    """
    rows_by_subject = {}

    for observation in observations:
        subject = observation['subject']['reference']
        primary_coding = observation['code']['coding'][0]

        column = primary_coding['display']
        if column in ('SUBJECT_ID', 'SAMPLE_ID'):
            column = f"{column}_{primary_coding['code']}"

        if 'valueQuantity' in observation:
            cell = observation['valueQuantity']['value']
        elif 'valueCodeableConcept' in observation:
            cell = observation['valueCodeableConcept']['coding'][0]['display']
        else:
            cell = 'unknown'

        # Create the subject's row on first sight, then fill in the cell.
        rows_by_subject.setdefault(subject, {})[column] = cell

    return pd.DataFrame.from_dict(rows_by_subject, orient='index')
    
def prettyprint(some_json):
    """Print a JSON-serializable object as text indented by three spaces."""
    rendered = json.dumps(some_json, indent=3)
    print(rendered)

In [8]:
# for a given resource find the extension identified by a given url
# The assumption is that there is only one such extension within a given resource
# For the dbGaP ResearchStudy resource that is true
def getExtension(resource, uri):
    """Return the first extension of ``resource`` whose ``url`` equals ``uri``.

    Args:
        resource: A dict that may carry an ``extension`` list (a FHIR
            resource or another extension). May be None.
        uri: The extension URL to look for.

    Returns:
        The matching extension dict, or None when ``resource`` is None/empty,
        has no ``extension`` list, or holds no extension with that URL.
    """
    # Callers chain lookups (an extension nested inside another), so a failed
    # outer lookup arrives here as None and must not raise.
    if not resource:
        return None
    for ext in resource.get('extension') or []:
        if ext.get('url') == uri:
            return ext
    return None

In [9]:
def _find_study_extension(resource, uri):
    """Look up one extension via getExtension, tolerating resources that carry none."""
    if not resource or 'extension' not in resource:
        return None
    return getExtension(resource, uri)


def studies_to_df(documents, verbose=False):
    """Summarize ResearchStudy resources as a DataFrame, one row per study.

    Args:
        documents: Iterable of ResearchStudy resource dicts.
        verbose: When True, print each study's id, title and consent groups.

    Returns:
        A DataFrame with columns ``id``, ``title``, ``num_subjects``,
        ``focus``, ``focus_mesh`` and ``consents``. Studies without a subject
        count get 0, without a focus get empty strings, and without consents
        get an empty list. Empty input yields an empty frame with the same
        columns.
    """
    base = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition"
    columns = ["id", "title", "num_subjects", "focus", "focus_mesh", "consents"]

    studies = []
    for s in documents:

        if verbose:
            print (s['id'])
            print (s['title'])
        # find the "study content" extension
        content = _find_study_extension(s, f"{base}/ResearchStudy-Content")
        # find the "number of subjects" extension nested within the content extension
        subject_ext = _find_study_extension(content, f"{base}/ResearchStudy-Content-NumSubjects")
        # Handle the fact that not all studies may have this extension
        value_count = subject_ext.get('valueCount', {}) if subject_ext is not None else {}
        subject_count = value_count.get('value', 0)

        # Now find the extension containing the study consents
        consent_ext = _find_study_extension(s, f"{base}/ResearchStudy-StudyConsents")
        # extract the display name for each consent group and print them
        if consent_ext is not None:
            consents = [d['valueCoding']['display'] for d in consent_ext['extension']]
            if verbose:
                print(consents)
        else:
            consents = []

        # focus: only the first entry is reported
        focus = ''
        focus_code = ''
        focus_entries = s.get('focus') or []
        if focus_entries:
            first_focus = focus_entries[0]
            focus = first_focus.get('text', '')
            codings = first_focus.get('coding') or []
            if codings:
                focus_code = codings[0]['code']

        # Add the relevant details to our list of studies
        studies.append({"id":s['id'], "title":s["title"], "num_subjects":subject_count,
                        "focus":focus,"focus_mesh":focus_code,"consents":consents})
        if verbose:
            print('_'*40)

    # Build the frame once, after the loop; explicit columns keep the layout
    # stable even when there are no studies.
    return pd.DataFrame(studies, columns=columns)

## Call API
Again reusing code from https://github.com/ncbi/DbGaP-FHIR-API-Docs/blob/production/jupyter/Notebook02_studies.ipynb

In [2]:
import json
import os
import pandas as pd

# Base URL of the dbGaP FHIR server that every query below is sent to.
FHIR_SERVER = 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1'

# No API Key in use
    
# Constructing the client also issues a request to the server's /metadata endpoint.
mf = DbGapFHIR(FHIR_SERVER)

In [5]:
# Fetch every ResearchStudy whose sponsor is NCI, following all result pages.
nci_studies = mf.run_query("ResearchStudy?sponsor=NCI")

Total  Resources: 557
Total  Bytes: 11296929
Total  Pages: 6
Time elapsed 34.8254 seconds


In [12]:
# Condense the raw resources into one summary row per study.
df_from_docs = studies_to_df(nci_studies)

In [13]:
# Display the per-study summary table.
df_from_docs

Unnamed: 0,id,title,num_subjects,focus,focus_mesh,consents
0,phs001287,CPTAC 3 Study,5495,Neoplasms,D009369,[GRU]
1,phs000527,National Cancer Institute (NCI) Cancer Genome ...,0,"Lymphoma, Non-Hodgkin",D008228,"[DS-CA-MDS, GRU, GRU-IRB]"
2,phs000528,National Cancer Institute (NCI) Cancer Genome ...,0,"Lymphoma, AIDS-Related",D016483,"[DS-CA-MDS, GRU, GRU-IRB]"
3,phs000531,National Cancer Institute (NCI) Cancer Genome ...,0,Medulloblastoma,D008527,"[DS-CA-MDS, GRU, GRU-IRB]"
4,phs000530,National Cancer Institute (NCI) Cancer Genome ...,16,"Lymphoma, AIDS-Related",D016483,"[DS-CA-MDS, GRU, GRU-IRB]"
...,...,...,...,...,...,...
552,phs000336,DCEG Lung Cancer Study,11517,Lung Neoplasms,D008175,"[CADM, SLD]"
553,phs000124,Genome-Wide Association Study of Neuroblastoma,1662,Neuroblastoma,D009447,[GRU]
554,phs000093,A Genome Wide Scan of Lung Cancer and Smoking,5566,Lung Neoplasms,D008175,"[NRUP, CADM, SLD]"
555,phs000207,CGEMS Prostate Cancer GWAS - Stage 1 - PLCO,2252,Prostatic Neoplasms,D011471,[CADM]


In [14]:
# Load the raw resources directly: one column per top-level resource field.
df = pd.DataFrame(nci_studies)

In [15]:
# Display the raw ResearchStudy table.
df

Unnamed: 0,resourceType,id,meta,extension,identifier,title,status,category,focus,condition,keyword,description,enrollment,sponsor,partOf
0,ResearchStudy,phs001287,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,CPTAC 3 Study,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,"\nRecently, significant progress has been made...",[{'reference': 'Group/phs001287.v13.p5-all-sub...,"{'reference': 'Organization/NCI', 'type': 'Org...",
1,ResearchStudy,phs000527,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,National Cancer Institute (NCI) Cancer Genome ...,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThe Office of Cancer Genomics at the Nationa...,[{'reference': 'Group/phs000527.v15.p5-all-sub...,"{'reference': 'Organization/NCI', 'type': 'Org...",[{'reference': 'ResearchStudy/phs000235'}]
2,ResearchStudy,phs000528,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,National Cancer Institute (NCI) Cancer Genome ...,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThe Office of Cancer Genomics at the Nationa...,[{'reference': 'Group/phs000528.v15.p5-all-sub...,"{'reference': 'Organization/NCI', 'type': 'Org...",[{'reference': 'ResearchStudy/phs000235'}]
3,ResearchStudy,phs000531,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,National Cancer Institute (NCI) Cancer Genome ...,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThe Office of Cancer Genomics (OCG) at the N...,[{'reference': 'Group/phs000531.v15.p5-all-sub...,"{'reference': 'Organization/NCI', 'type': 'Org...",[{'reference': 'ResearchStudy/phs000235'}]
4,ResearchStudy,phs000530,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,National Cancer Institute (NCI) Cancer Genome ...,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThe Office of Cancer Genomics at the Nationa...,[{'reference': 'Group/phs000530.v15.p5-all-sub...,"{'reference': 'Organization/NCI', 'type': 'Org...",[{'reference': 'ResearchStudy/phs000235'}]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,ResearchStudy,phs000336,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,DCEG Lung Cancer Study,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThree genetic loci for lung cancer risk have...,[{'reference': 'Group/phs000336.v1.p1-all-subj...,"{'reference': 'Organization/NCI', 'type': 'Org...",
553,ResearchStudy,phs000124,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,Genome-Wide Association Study of Neuroblastoma,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nNeuroblastoma is a malignancy of the develop...,[{'reference': 'Group/phs000124.v2.p1-all-subj...,"{'reference': 'Organization/NCI', 'type': 'Org...",
554,ResearchStudy,phs000093,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,A Genome Wide Scan of Lung Cancer and Smoking,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThe majority of cases of lung cancer are the...,[{'reference': 'Group/phs000093.v2.p2-all-subj...,"{'reference': 'Organization/NCI', 'type': 'Org...",
555,ResearchStudy,phs000207,"{'versionId': '1', 'lastUpdated': '2022-02-14T...",[{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/f...,[{'type': {'coding': [{'system': 'https://dbga...,CGEMS Prostate Cancer GWAS - Stage 1 - PLCO,completed,[{'coding': [{'system': 'https://dbgap-api.ncb...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,[{'coding': [{'system': 'urn:oid:2.16.840.1.11...,\nThe Cancer Genetic Markers of Susceptibility...,[{'reference': 'Group/phs000207.v1.p1-all-subj...,"{'reference': 'Organization/NCI', 'type': 'Org...",


In [19]:
# List the distinct study status values present in the results.
df.status.unique()

array(['completed'], dtype=object)

In [22]:
# Inspect the per-resource metadata dicts (versionId, lastUpdated, ...).
df['meta']

0      {'versionId': '1', 'lastUpdated': '2022-02-14T...
1      {'versionId': '1', 'lastUpdated': '2022-02-14T...
2      {'versionId': '1', 'lastUpdated': '2022-02-14T...
3      {'versionId': '1', 'lastUpdated': '2022-02-14T...
4      {'versionId': '1', 'lastUpdated': '2022-02-14T...
                             ...                        
552    {'versionId': '1', 'lastUpdated': '2022-02-14T...
553    {'versionId': '1', 'lastUpdated': '2022-02-14T...
554    {'versionId': '1', 'lastUpdated': '2022-02-14T...
555    {'versionId': '1', 'lastUpdated': '2022-02-14T...
556    {'versionId': '1', 'lastUpdated': '2022-02-14T...
Name: meta, Length: 557, dtype: object

In [29]:
# The first 10 characters of the lastUpdated timestamp are its date part.
df['meta'][0]['lastUpdated'][:10]

'2022-02-14'

In [30]:
def extract_last_updated_date(meta):
    """Return the first ten characters of ``meta['lastUpdated']``.

    For timestamps such as '2022-02-14T02:04:51.822-05:00' this is the
    date part, '2022-02-14'.
    """
    timestamp = meta['lastUpdated']
    return timestamp[0:10]

In [31]:
# Derive a date-only column from each resource's metadata.
df['last_updated'] = df['meta'].apply(extract_last_updated_date)

In [33]:
# List the distinct last-updated dates across all returned studies.
df.last_updated.unique()

array(['2022-02-14'], dtype=object)

### Issue: Every study in the response has the same `lastUpdated` date, 2022-02-14.
Does this date reflect when the entire API database was last refreshed, rather than when each individual study was last updated?

In [None]:
# Look up a single study by its dbGaP accession (the CPTAC 3 Study).
cptac3 = mf.run_query('ResearchStudy?_id=phs001287')

Total  Resources: 1
Total  Bytes: 13916
Total  Pages: 1
Time elapsed 0.6606 seconds


In [None]:
# Display the full resource returned for the study.
cptac3

[{'resourceType': 'ResearchStudy',
  'id': 'phs001287',
  'meta': {'versionId': '1',
   'lastUpdated': '2022-02-14T02:04:51.822-05:00',
   'source': '#qRs8LVqZYunRZQh4',
   'security': [{'system': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/DbGaPConcept-SecurityStudyConsent',
     'code': 'public',
     'display': 'public'}]},
  'extension': [{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-StudyOverviewUrl',
    'valueUrl': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001287.v13.p5'},
   {'url': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-ReleaseDate',
    'valueDate': '2021-12-13'},
   {'url': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-StudyConsents',
    'extension': [{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-StudyConsents-StudyConsent',
      'valueCoding': {'system': 'https://dbgap-api.ncbi.nlm.

### Issue: Searching the phs ID for the CPTAC study returns outdated results compared to the DbGaP site
https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001287.v17.p6  

Most recent version in API results: 
```
'identifier': [{'type': {'coding': [{'system': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/DbGaPConcept-DbGaPStudyIdentifier',
       'code': 'dbgap_study_id',
       'display': 'dbgap_study_id'}]},
    'value': 'phs001287.v13.p5'}],
  'title': 'CPTAC 3 Study',
  ```