# Data Discovery Studio Jupyter Notebook Dispatcher

## 1. Execute the cells below to get parameters from DDS and select a notebook for processing ###

Example call with Document ID from DDS: CinergiDispatch.ipynb?documentId=61cc7f6afb5246d2be41811e94a1a8ea


In [1]:
# Check if the parameters are correct
from __future__ import print_function
import ntpath
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import webbrowser
from lxml import etree  #supposed to be better than xml.etree
import json

documentID=''

In [2]:
%%javascript
function getQueryStringValue (key)
{  
    return unescape(window.location.search.replace(new RegExp("^(?:.*[&\\?]" + escape(key).replace(/[\.\+\*]/g, "\\$&") + "(?:\\=([^&]*))?)?.*$", "i"), "$1"));
}
IPython.notebook.kernel.execute("documentID='".concat(getQueryStringValue("documentId")).concat("'"));
IPython.notebook.kernel.execute("user='".concat(getQueryStringValue("user")).concat("'"));
IPython.notebook.kernel.execute("full_notebook_url='" + window.location + "'"); 

<IPython.core.display.Javascript object>

## 2. Print out parameters passed to Jupyter: ##

In [3]:

# use hardwired values for testing
# these only get used if no documentID is passes as an argument when 
#   opening the notebook (per getQueryStringValue javascript function at start)
catalogURL = "http://datadiscoverystudio.org/geoportal/"
if (len(documentID)==0):
    documentID="e3619c5df2644204b67f51f48525a0b1"    #NGDS wfs
    #documentID="4db8156abb6d4119aa5c35aa39514b42"   #sciencebase WFS
    #documentID="61cc7f6afb5246d2be41811e94a1a8ea"   #ndbc data
    #documentID="de5383bf941d4d60ae9443bd7ffa9a33"   #Magic data
    #documentID="b20f8f12ef594520abb0e5efbcd891fe"   #nwis qwdata

In [4]:

url_partitioned = full_notebook_url.partition('/CinergiDispatch')
base_url = url_partitioned[0];

print("User: ",user)
print("DocumentID: ", documentID)
# print("full notebook url partition", url_partitioned)
print("Full notebook URL: ", full_notebook_url)

User:  
DocumentID:  e3619c5df2644204b67f51f48525a0b1
Full notebook URL:  http://localhost:8889/notebooks/jupyter-dispatchtests/ShutongCinergiDispatch.ipynb


---------------------------------------------------------------------------------------------------------------------

### New stuff

In [5]:
import pandas as pd
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import requests

#construct url for metadata
catalogURL = "http://datadiscoverystudio.org/geoportal/"
metadataURLx=catalogURL + 'rest/metadata/item/' + documentID + '/xml'
#metadata = requests.get(metadataURLx)

print("metadata URL: ", metadataURLx)

metadata URL:  http://datadiscoverystudio.org/geoportal/rest/metadata/item/e3619c5df2644204b67f51f48525a0b1/xml


#### parsing the xml metadata
P.S. Repetitive keys have its key value pair stored in a list at its level of hiearchy in the dict

In [6]:
#!pip install xmltodict

In [7]:
#import xmltodict
#from collections import OrderedDict

#metadata_dict = xmltodict.parse(metadata.text)

In [8]:
#set up namespace map for ISO metadata
NSMAP = {"gmi":"http://www.isotc211.org/2005/gmi" ,
    "gco":"http://www.isotc211.org/2005/gco" ,
    "gmd":"http://www.isotc211.org/2005/gmd" ,
    "gml":"http://www.opengis.net/gml" ,
    "gmx":"http://www.isotc211.org/2005/gmx" ,
    "gts":"http://www.isotc211.org/2005/gts" ,
    "srv":"http://www.isotc211.org/2005/srv" ,
    "xlink":"http://www.w3.org/1999/xlink"}

In [9]:
#tree is an element tree
tree = etree.parse(metadataURLx)
#root = etree.tostring(tree.getroot())
root = tree.getroot()
docinfo = tree.docinfo
print(docinfo.xml_version)

1.0


In [10]:
title=tree.find("//gmd:identificationInfo//gmd:title/gco:CharacterString",namespaces=NSMAP).text
print(title)

Colorado Active Faults


In [11]:
#title of the dataset, not sure how useful
#title = metadata_dict['gmi:MI_Metadata']['gmd:identificationInfo']['gmd:MD_DataIdentification']['gmd:citation']['gmd:CI_Citation']['gmd:title']['gco:CharacterString']

In [12]:
# development of logic for reading the xml tree
for  dist in tree.getiterator("{http://www.isotc211.org/2005/gmd}MD_Distribution"):
    if dist.find("gmd:distributionFormat/gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None: 
        # this code is not handling multiple formats under a distribution at this point. 
        # have to consider how to deal with binding between formats and digital transfer options
        
        dist_format = dist.find("gmd:distributionFormat/gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text
    else:
        dist_format = ''
    print(dist_format)
    if dist.find("gmd:distributor/gmd:MD_Distributor//gmd:organisationName/.",namespaces=NSMAP) is not None:
        distorg =dist.xpath("gmd:distributor//gmd:organisationName/child::node()/text()",namespaces=NSMAP)[0]
        print('distribution org: ' + distorg)
    else:
        distorg = ''
        
    if dist.find("gmd:distributor/gmd:MD_Distributor//gmd:MD_DigitalTransferOptions//gmd:CI_OnlineResource.",namespaces=NSMAP) is not None:
        distonline =dist.find("gmd:distributor/gmd:MD_Distributor//gmd:MD_DigitalTransferOptions//gmd:CI_OnlineResource",namespaces=NSMAP)
    else:
        distonline = ''
        
    print(distonline)


distribution org: Colorado Geological Survey



In [13]:
# define distribution object with CI_ONlineResource elements, plus the distributor organization 
#  and list of possible formats

class DistObj:
    def __init__(self,aname):
        self.name = aname
        self.url = ''
        self.description = ''
        self.protocol = ''
        self.appprofile = ''
        self.functioncode = ''
        self.functiontext = ''
        self.distorg = ''
        self.formatlist = []
        
    def dump(self):
        return {"adistobj": {'name': self.name,
                               'url': self.url,
                               'description': self.description,
                               'protocol': self.protocol,
                                'appprofile': self.appprofile,
                               'functioncode': self.functioncode,
                               'functiontext': self.functiontext,
                               'distorg': self.distorg,
                               'formatlist' : self.formatlist                            
                             }}     
    

In [18]:
#count = 0

distlist=[]  # this will be a list of distribution objects

# plan -- iterate over the CI_OnlineResource Elements that are in MD_DigitalTransfer Options
for  elt in tree.getiterator("{http://www.isotc211.org/2005/gmd}MD_DigitalTransferOptions"):
    # only want OnlineResources that are in distribution//MD_DigitalTransferOptions
    #  TBD-- figure out what to do with CI_OnlineResource inside SV_OperationMetadata
    
#iterate through CI_OnlineResource elements
    for onlineres in elt.getiterator("{http://www.isotc211.org/2005/gmd}CI_OnlineResource"):
        
        if ((onlineres.find("gmd:linkage/gmd:URL",namespaces=NSMAP) is not None) and
            (onlineres.find("gmd:linkage/gmd:URL",namespaces=NSMAP).text is not None)  ):
            theURL=onlineres.find("gmd:linkage/gmd:URL",namespaces=NSMAP).text
        else:
            theURL = 'empty'
        
        #print('theURL ' + theURL + '\n')
        
        if (onlineres.find("gmd:name/gco:CharacterString",namespaces=NSMAP) is not None):
            thename=onlineres.find("gmd:name/gco:CharacterString",namespaces=NSMAP).text
        else:
            thename=''
        
        if (onlineres.find("gmd:description/gco:CharacterString",namespaces=NSMAP) is not None):
            thedescription=onlineres.find("gmd:description/gco:CharacterString",namespaces=NSMAP).text
        else:
            thedescription=''
            
        if (onlineres.find("gmd:protocol/gco:CharacterString",namespaces=NSMAP) is not None):
            theprotocol=onlineres.find("gmd:protocol/gco:CharacterString",namespaces=NSMAP).text
        else:
            theprotocol=''
        
        if (onlineres.find("gmd:applicationProfile/gco:CharacterString",namespaces=NSMAP) is not None):
            theappprofile=onlineres.find("gmd:applicationProfile/gco:CharacterString",namespaces=NSMAP).text
        else:
            theappprofile=''
            
        if (onlineres.find("gmd:function/gmd:CI_OnLineFunctionCode",namespaces=NSMAP) is not None):
            thefunctioncode=onlineres.find("gmd:function/gmd:CI_OnLineFunctionCode",namespaces=NSMAP).get("codeListValue")
        else:
            thefunctioncode=''
            
        if (onlineres.find("gmd:function/gmd:CI_OnLineFunctionCode",namespaces=NSMAP) is not None):    
            thefunctiontext=onlineres.find("gmd:function/gmd:CI_OnLineFunctionCode",namespaces=NSMAP).text
        else:
            thefunctiontext=''
            
        #print('\n Distribution: name-%s;\n  url- %s; \n  description--%s; \n   protocol-%s, app profile- %s; function- %s; %s' %
        #      (thename,theURL,thedescription,theprotocol,theappprofile,thefunctioncode,thefunctiontext))

        #Handle format and distributor organization
# have to figure out who is the distributor
#check to see if have multiple distributors, if so they should have distributor formats and transfer options
#   if they don't then assume all formats apply to all distributions
        formatlist = []  #initialize

        if len(onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributor",namespaces=NSMAP)) <= 1:
     #have zero or one distributor; 
            distorg = ''
            if len(onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributor",namespaces=NSMAP)) == 1:
                dist = onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributor",namespaces=NSMAP)[0]
                if len(dist.xpath("gmd:MD_Distributor//gmd:organisationName",namespaces=NSMAP)) > 0:
                    distorg =dist.xpath("gmd:MD_Distributor//gmd:organisationName/child::node()/text()",namespaces=NSMAP)[0]
            #print("distorg: " + distorg)                        
    #get formats. Formats might be on Distribution, Distributor, or DigitalTransferOption
            thedistformats = onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributionFormat",namespaces=NSMAP)
            #get formats at gmd:MD_Distribution/gmd:distributionFormat/gmd:MD_Format       
            for aformat in thedistformats:
                if ((aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                    (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text not in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)

            thedistformats = onlineres.xpath("./ancestor::gmd:MD_DigitalTransferOptions/gmd:distributionFormat",namespaces=NSMAP)
            #get formats on the parent gmd:MD_DigitalTransferOptions       
            for aformat in thedistformats:
                if (( aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                            (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text not in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)


            thedistformats = onlineres.xpath("./ancestor::gmd:MD_Distribution//gmd:distributorFormat",namespaces=NSMAP)
            #get formats on the gmd:MD_Distributor; the transfer options might not be child of distributor    
            for aformat in thedistformats:
                if ((aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                       (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)
  
        elif len(onlineres.xpath("./ancestor::gmd:MD_Distributor",namespaces=NSMAP)) == 1:
            #MD_DigitalTransferOptions is child of MD_Distributor; there are multiple distributors
            distorg = ''
            dist = onlineres.xpath("./ancestor::gmd:MD_Distributor",namespaces=NSMAP)[0]
            if len(dist.xpath("gmd:distributorContact//gmd:organisationName",namespaces=NSMAP)) > 0:
                distorg =dist.xpath("gmd:distributorContact//gmd:organisationName/child::node()/text()",namespaces=NSMAP)[0]
            #print("distorg: " + distorg) 
            
            
            #check if they have distributorFormat 
            thedistformats = onlineres.xpath("./ancestor::gmd:MD_Distributor/gmd:distributorFormat",namespaces=NSMAP)
            #get formats on the gmd:MD_Distributor; note in this case look for specific distributor that is parent 
            #   of the digital transfer options/online resource.
            for aformat in thedistformats:
                if ((aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                       (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)
                    
                    
            thedistformats = onlineres.xpath("./ancestor::gmd:MD_DigitalTransferOptions/gmd:distributionFormat",namespaces=NSMAP)
            #get formats specific to the parent gmd:MD_DigitalTransferOptions       
            for aformat in thedistformats:
                if (( aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                            (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text not in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)
            
            thedistformats = onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributionFormat",namespaces=NSMAP)
            #get formats at gmd:MD_Distribution/gmd:distributionFormat; assume these apply to all digital transfer options       
            for aformat in thedistformats:
                if ((aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                    (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text not in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)
                    
        else:
            # multiple distributors, but digital transfer options are not associated with specific distributor
            
            distorg = ''
            # arbitrarily take the first distributor organization
            dist = onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributor",namespaces=NSMAP)[0]
            if len(dist.xpath("gmd:MD_Distributor//gmd:organisationName",namespaces=NSMAP)) > 0:
                distorg =dist.xpath("gmd:MD_Distributor//gmd:organisationName/child::node()/text()",namespaces=NSMAP)[0]
            #print("distorg: " + distorg) 
            
            #  assume all distributors offer all digital transfer options and formats that are child of distribution
            thedistformats = onlineres.xpath("./ancestor::gmd:MD_DigitalTransferOptions/gmd:distributionFormat",namespaces=NSMAP)
            #get formats specific to the parent gmd:MD_DigitalTransferOptions       
            for aformat in thedistformats:
                if (( aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                            (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text not in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)
            
            thedistformats = onlineres.xpath("./ancestor::gmd:MD_Distribution/gmd:distributionFormat",namespaces=NSMAP)
            #get formats at gmd:MD_Distribution/gmd:distributionFormat; assume these apply to all digital transfer options       
            for aformat in thedistformats:
                if ((aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP) is not None) and
                    (aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text not in formatlist)):
                    formatlist.append(aformat.find("gmd:MD_Format/gmd:name/gco:CharacterString",namespaces=NSMAP).text)
        
        thisdistobj = DistObj(thename)
        #thisdistobj.name = thename
        #print("theName: " + thename)
        thisdistobj.url = theURL
        thisdistobj.description = thedescription
        thisdistobj.protocol = theprotocol
        thisdistobj.appprofile = theappprofile
        thisdistobj.functioncode = thefunctioncode
        thisdistobj.functiontext = thefunctiontext
        thisdistobj.distorg = distorg
        thisdistobj.formatlist = formatlist
                
        distlist.append(thisdistobj)                    

    
    
json.dumps([theobj.dump() for theobj in distlist])
#json.dumps(distlist[1].dump(distlist[1]))

'[{"adistobj": {"name": "Service Description", "url": "http://web2.nbmg.unr.edu/ArcGIS/services/CO_Data/COActiveFaults/MapServer/WMSServer?request=GetCapabilities&service=WMS", "description": "parameters:{layers:\\"ActiveFault\\"}", "protocol": "OGC:WMS", "appprofile": "", "functioncode": "381", "functiontext": "webService", "distorg": "Colorado Geological Survey", "formatlist": []}}, {"adistobj": {"name": "WFS Capabilities", "url": "http://web2.nbmg.unr.edu/ArcGIS/services/CO_Data/COActiveFaults/MapServer/WFSServer?request=GetCapabilities&service=WFS", "description": "parameters:{typeName:\\"ActiveFault\\"}", "protocol": "OGC:WFS", "appprofile": "", "functioncode": "381", "functiontext": "webService", "distorg": "Colorado Geological Survey", "formatlist": []}}, {"adistobj": {"name": "ESRI Service Endpoint", "url": "http://web2.nbmg.unr.edu/ArcGIS/rest/services/CO_Data/COActiveFaults/MapServer", "description": "", "protocol": "ESRI", "appprofile": "", "functioncode": "381", "functionte

In [None]:
#subtree involving meta data distribution information

# this is problematic because it depends on use of the standard namespace prefixes, which
#  is not required by xml
md_dist = metadata_dict['gmi:MI_Metadata']['gmd:distributionInfo']['gmd:MD_Distribution']
distribute_format = md_dist['gmd:distributionFormat']['gmd:MD_Format']['gmd:name']['gco:CharacterString']
distributor = md_dist['gmd:distributor'][0]['gmd:MD_Distributor']['gmd:distributorContact']['gmd:CI_ResponsibleParty']['gmd:organisationName']['gco:CharacterString']
dist_landing = md_dist['gmd:distributor'][0]['gmd:MD_Distributor']['gmd:distributorTransferOptions']['gmd:MD_DigitalTransferOptions']['gmd:onLine']['gmd:CI_OnlineResource']['gmd:linkage']

#### Map the properties of metadata to suggested tokens

In [None]:
#look up the combination of metadata features in a table and map the combination to potential tokens
def md_to_token(df, *features):
    #look up the combination of the features
    #precondition : features comes in order, if feature for corresponding column does not exist, leave it as null
    featurelist = list(features)
    assert len(featurelist) <= (df.shape[1] - 1), 'feature space dimension mismatch'
    sets = []
    col_indx = 0
    non_empty_sets = 0
    for feature in featurelist:
        index_set = set(df.loc[df.iloc[:, col_indx] == feature].index)
        sets += [index_set]
        if (len(index_set) > 0):
            non_empty_sets += 1
        col_indx = col_indx + 1
    
    #if the feature set is complete
    if non_empty_sets == df.shape[1] - 1:
        idx = list(set.intersection(*sets))
        return list(df['tokens'].loc[idx])
    
    #the feature set is not complete, suggest all possibilities
    else:
        idx = list(set.union(*sets))
        return list(df['tokens'].loc[idx])

#### Map tokens to the notebook url

In [None]:
#map the given set to tokens to a set of urls
def token_to_url(tokens, df):
    #tokens is a list of string tokens
    #assume the token space of this url-mapping df is the same as the one of the token mapping df
    urls = list(df.loc[df['tokens'].apply(lambda x: x in tokens)]['url'])
    return urls

In [None]:
#not really used
def token_to_desc(tokens, df):
    #df is a mapping table from tokens to labels and descriptions
    #return a dict with keys as tokens and values as another dict - 
    #whose keys are descprtion and label and values are corresponding values
    subset = df.loc[df['tokens'].apply(lambda x: x in tokens)].set_index('tokens')
    return subset.to_dict('index')

#### Demo, for testing.

an example token map


In [None]:
md_token_map = pd.DataFrame({'feature1': ['a', 'b', 'c', 'b'],
                             'feature2': ['d', 'e', 'f', 'x'],
                             'feature3': ['g', 'h', 'i', 'y'],
                             'tokens': [1, 2, 3, 4]})
md_token_map

In [None]:
token_url_map = pd.DataFrame({'tokens': [1, 2, 3, 4],
                             'url': ['url1', 'url2', 'url3', 'url4']})
token_url_map

In [None]:
token_desc_map = pd.DataFrame({'tokens': [1, 2, 3, 4],
                              'label': ['label1', 'label2', 'label3', 'label4'],
                              'description': ['desc1', 'desc2', 'desc3', 'desc4']})
token_desc_map

In [None]:
url_desc_map = pd.merge(left = token_url_map, right = token_desc_map, on = 'tokens')[['url', 'label', 'description']].set_index('url')
url_desc_map

suppose I attain feature combination of [b, e, h]. Actual datasets should have more combinations 

In [None]:
feature1 = 'b'
feature2 = 'e'
feature3 = 'h'
tokens = md_to_token(md_token_map, feature1, feature2, feature3)
urls = token_to_url(tokens, token_url_map)
token_info = token_to_desc(tokens, token_desc_map)

test_menu = OrderedDict()
for url in urls:
    menu_label = dict(url_desc_map.loc[url])['label']
    if menu_label in test_menu.keys():
        menu_label = menu_label + '1'
    test_menu[menu_label] = url
def wrapper(menu):
    return menu
test_out = interact(wrapper, menu = test_menu)

Another example that returns >1 url

In [None]:
import numpy as np

In [None]:
feature4 = 'b'
feature5 = np.nan
feature6 = np.nan
tokens2 = md_to_token(md_token_map, feature4, feature5, feature6)
urls2 = token_to_url(tokens2, token_url_map)

test_menu2 = OrderedDict()
for url in urls2:
    menu_label = dict(url_desc_map.loc[url])['label']
    if menu_label in test_menu2.keys():
        menu_label = menu_label + '1'
    test_menu2[menu_label] = url
def wrapper(menu):
    return menu
test_out2 = interact(wrapper, menu = test_menu2)

----------------------------------------------------------------------------------------------------------------------------

## 4. Open the URL of the selected notebook ##

In [None]:
chosen_nb_name = nb_menu[out.widget.result]
url1 = ('{base_url}/operations/{nb_name}?'+'documentId='+documentID+'&'+'user='+user).format(base_url=base_url, nb_name=chosen_nb_name)

#webbrowser.open(url1)
webbrowser.open_new(url1)
print("CLICK TO OPEN THE URL: ", url1)
