# Extract Distribution information from ISO 19139 metadata

This notebook is opened with a documentID that is used to pull an ISO XML record from the CINERGI catalog.
The record is parsed to extract distribution information and generate a dispatchList object.

The dispatchList object is then passed to a dispatcher, which looks up the mapping between the application profiles in the dispatchList and the endpoint applications that can open them.


In [None]:
# package dependency imports

#import xmltodict
import requests
#import json
from lxml import etree  #supposed to be better than xml.etree
#from io import StringIO,BytesIO

In [12]:
%%javascript
// Return the value of query-string parameter `key` from the page URL
// (window.location.search), or "" when the parameter is absent or has no value.
//
// How the one-liner works:
//  - escape(key) percent-encodes the key, then '.', '+' and '*' are
//    backslash-escaped so they are matched literally inside the RegExp.
//  - The RegExp matches the whole search string (case-insensitively) and
//    captures, as group 1, everything after "key=" up to the next '&'.
//  - replace(..., "$1") therefore swaps the whole search string for just the
//    captured value; when the key is not present the group is empty.
//  - unescape() decodes percent-escapes in the captured value.
function getQueryStringValue (key)
{  
    return unescape(window.location.search.replace(new RegExp("^(?:.*[&\\?]" + escape(key).replace(/[\.\+\*]/g, "\\$&") + "(?:\\=([^&]*))?)?.*$", "i"), "$1"));
}
// Copy the query-string values and the page URL into the Python kernel as the
// string variables documentID, user and full_notebook_url.
//
// The values come straight from the browser URL, i.e. they are untrusted.
// JSON.stringify produces a double-quoted literal with quotes, backslashes and
// control characters escaped, which is also a valid Python string literal, so a
// quote character in the URL can no longer terminate the string early and have
// the rest of the value executed as Python code in the kernel.
IPython.notebook.kernel.execute("documentID=" + JSON.stringify(getQueryStringValue("documentId")));
IPython.notebook.kernel.execute("user=" + JSON.stringify(getQueryStringValue("user")));
IPython.notebook.kernel.execute("full_notebook_url=" + JSON.stringify(String(window.location)));

<IPython.core.display.Javascript object>

In [14]:
def testurl(theurl, timeout=30):
    """Return True if *theurl* answers with HTTP 200 (OK), otherwise False.

    A HEAD request is tried first so that a large response document is not
    downloaded.  If HEAD does not return 200 (some servers implement HTTP
    incompletely), a GET is tried; it is streamed and closed immediately so
    only the status line and headers are read, not the body.

    theurl  -- the URL to probe
    timeout -- seconds to wait for the server before giving up on a request

    Any requests-level failure (connection error, timeout, malformed URL,
    unsupported scheme such as ftp://) is reported as False instead of
    propagating, so one bad link does not abort the caller's loop.
    """
    try:
        r = requests.head(theurl, timeout=timeout)
        if r.status_code == requests.codes.ok:
            return True
        # check GET in case this is an incomplete http implementation
        r = requests.get(theurl, stream=True, timeout=timeout)
        try:
            return r.status_code == requests.codes.ok
        finally:
            # release the connection without reading the body
            r.close()
    except requests.exceptions.RequestException:
        return False


In [15]:
# Hardwired values used for testing

# Base URL of the catalog that serves the metadata records
catalogURL = "http://cinergi.sdsc.edu/geoportal/"

# Fall back to a known record when no documentID was supplied in the URL
# (alternate test record: "e3619c5df2644204b67f51f48525a0b1")
if not documentID:
    documentID = "4db8156abb6d4119aa5c35aa39514b42"

In [16]:
# Split the notebook URL around this notebook's file name. The part before the
# file name is kept as base_url; the dispatcher cell prepends it to the names of
# the target notebooks when building links.
url_partitioned = full_notebook_url.partition('ISOmetadata-ExtractDistributions.ipynb')
base_url = url_partitioned[0]

# Echo the values received from the browser
print("User: ",user)
print("DocumentID: ", documentID)
print("full notebook url partition", url_partitioned)
print("full notebook url", full_notebook_url)

('User: ', '')
('DocumentID: ', 'ad5868280db04bfc84d9a75b9bf00fbd')
('full notebook url partition', ('http://localhost:8889/notebooks/DispatchTesting/', 'ISOmetadata-ExtractDistributions.ipynb', '?documentId=ad5868280db04bfc84d9a75b9bf00fbd'))
('full notebook url', 'http://localhost:8889/notebooks/DispatchTesting/ISOmetadata-ExtractDistributions.ipynb?documentId=ad5868280db04bfc84d9a75b9bf00fbd')


In [17]:


# Build the catalog REST address that returns this record as XML
metadataURLx = ''.join([catalogURL, 'rest/metadata/item/', documentID, '/xml'])

print ("metadata URL: ", metadataURLx)

# Download the XML record from the catalog
the_page = requests.get(metadataURLx)


('metadata URL: ', 'http://cinergi.sdsc.edu/geoportal/rest/metadata/item/ad5868280db04bfc84d9a75b9bf00fbd/xml')


In [18]:
# use this to generate JSON representation of the metadata record
#the_isojson = json.loads(json.dumps(xmltodict.parse(the_page.text)))

#print(the_isojson.keys())
#print(the_isojson["gmi:MI_Metadata"])

In [19]:
# Prefix -> namespace URI map for the ISO metadata schemas; passed as the
# `namespaces` argument to the find/findall calls on the parsed record.
NSMAP = dict(
    gmi="http://www.isotc211.org/2005/gmi",
    gco="http://www.isotc211.org/2005/gco",
    gmd="http://www.isotc211.org/2005/gmd",
    gml="http://www.opengis.net/gml",
    gmx="http://www.isotc211.org/2005/gmx",
    gts="http://www.isotc211.org/2005/gts",
    srv="http://www.isotc211.org/2005/srv",
    xlink="http://www.w3.org/1999/xlink",
)

In [20]:
# Parse the record that was already downloaded into the_page, instead of
# having lxml fetch metadataURLx over the network a second time.
# Fail early with an HTTP error if the catalog did not return the record.
the_page.raise_for_status()

# Parse from the raw bytes so lxml honours the encoding declared in the XML.
root = etree.fromstring(the_page.content)

# tree is the element tree that wraps the parsed document
tree = root.getroottree()
docinfo = tree.docinfo
print(docinfo.xml_version)

# './/' searches all descendants of the root; a leading '//' is rewritten to
# './/' by lxml anyway, but with a FutureWarning.
print(tree.findall(".//gmd:MD_DigitalTransferOptions",namespaces=NSMAP))



1.0
[<Element {http://www.isotc211.org/2005/gmd}MD_DigitalTransferOptions at 0x6b69388>]


In [21]:
# Iterate through the digital transfer options and build the dispatch list.
# The dispatch list is a list of 'options'; each option is a dict with
#   "profile": an application profile (string, from the EC resource registry)
#              that the dispatcher uses to identify target notebooks
#   "url":     the URL of the information resource handed to that target
#   "label":   the online resource name (webbrowser and nwis_rdb options only)
# e.g. dispatchlist = [{"profile":"profile1","url":"url1"}, {"profile":"profile2","url":"url2"}]

dispatchlist = []

# Clark-notation prefix for element tags in the gmd namespace
GMD_TAG_PREFIX = "{http://www.isotc211.org/2005/gmd}"

# URL fragments that identify USGS NWIS data requests
NWIS_PATHS = ('/nwis/qwdata', '/nwis/gwlevels', '/nwis/uv',
              '/nwis/peak', '/nwis/measurements')


def _child_text(parent, path):
    """Return the text of the first element under *parent* matching *path*.

    Returns '' when the element is missing or present but empty (its .text is
    None), so callers can always use string methods on the result.
    """
    node = parent.find(path, namespaces=NSMAP)
    if node is None or node.text is None:
        return ''
    return node.text


for elt in tree.iter(GMD_TAG_PREFIX + "MD_DigitalTransferOptions"):
    # only want OnlineResources that are in distribution//MD_DigitalTransferOptions
    #  TBD-- figure out what to do with CI_OnlineResource inside SV_OperationMetadata

    # iterate through CI_OnlineResource elements
    for onlineres in elt.iter(GMD_TAG_PREFIX + "CI_OnlineResource"):
        theURL = _child_text(onlineres, "gmd:linkage/gmd:URL")
        if not theURL:
            continue  # don't bother if there's no URL!

        thename = _child_text(onlineres, "gmd:name/gco:CharacterString")
        thedescription = _child_text(onlineres, "gmd:description/gco:CharacterString")
        theprotocol = _child_text(onlineres, "gmd:protocol/gco:CharacterString")
        theappprofile = _child_text(onlineres, "gmd:applicationProfile/gco:CharacterString")

        # the function code carries both an attribute value and element text
        functionelt = onlineres.find("gmd:function/gmd:CI_OnLineFunctionCode",
                                     namespaces=NSMAP)
        if functionelt is not None:
            thefunctioncode = functionelt.get("codeListValue", '')
            thefunctiontext = functionelt.text or ''
        else:
            thefunctioncode = ''
            thefunctiontext = ''

        print(theURL,thename,thedescription,theprotocol,theappprofile,thefunctioncode,thefunctiontext)

        # lower-cased copies for the case-insensitive tests below
        url_lower = theURL.lower()
        protocol_lower = theprotocol.lower()
        # service address without any query string
        serviceurl = theURL.split('?')[0]

        # series of tests to determine what application profiles are applicable
        # for this online resource; a resource may match more than one profile

        # OGC WFS Web feature service
        if 'wfs' in protocol_lower or 'service=wfs' in url_lower:
            # the wfs dispatcher gets the base URL for the service;
            # only offer it if the service answers a getCapabilities request
            tryurl = serviceurl + '?service=wfs&request=getCapabilities'
            if testurl(tryurl):
                dispatchlist.append({"profile":"wfsclient","url":serviceurl})

        # OGC WMS; open in QGIS, ArcGIS, or OpenLayers web client
        # (the request=kml exclusion is because of GeoServer's handling of a
        # kml response for wms)
        if ('wms' in protocol_lower
                or ('service=wms' in url_lower and 'request=kml' not in url_lower)):
            # the wms dispatcher gets the base URL for the service
            tryurl = serviceurl + '?service=wms&request=getCapabilities'
            if testurl(tryurl):
                dispatchlist.append({"profile":"wmsclient","url":serviceurl})

        # KML client-- open in GoogleEarth or ?OpenLayers? kml client
        if ('kml download' in thedescription.lower()
                or ('request=kml' in url_lower and 'mode=download' in url_lower)
                or '.kml' in url_lower or '.kmz' in url_lower):
            # the kml dispatcher gets the full URL of the resource
            if testurl(theURL):
                dispatchlist.append({"profile":"kmlclient","url":theURL})

        # other http URL-- check that the URL works and actually returns html
        if '.html' in url_lower or 'http' in protocol_lower:
            if testurl(theURL):
                r = requests.get(theURL)
                # a response without a Content-Type header is treated as not html
                if 'html' in r.headers.get('Content-Type', ''):
                    dispatchlist.append({"profile":"webbrowser","url":theURL,
                                       "label":thename })

        # nwis rdb data
        if any(nwispath in url_lower for nwispath in NWIS_PATHS):
            if testurl(theURL):
                dispatchlist.append({"profile":"nwis_rdb","url":theURL,
                                       "label":thename })

print(dispatchlist)

('ftp://anonymous:fireftp%40example.com@ftp.nbmg.unr.edu/pub/Geothermal/07_Groundwater_Data/NV_DepthToGroundwater.zip', 'Zipped folder containing Depth to Groundwater Geo Database and Shape Files', 'Downloadable Data FTP Site', '', '', '375', 'download')
[]


Call the dispatcher with the dispatchlist.
The dispatcher will need to access a registry that maps application profile values to the endpoints that can 'open' the URL associated with that profile in a dispatch option.

In the long run, the dispatcher should be a separate component accessed via URL; for now it is hard-wired here.



In [22]:
#Simple dispatcher
# For each option in dispatchlist, print a link that opens the option's URL in
# the application registered (hard-wired here) for its profile.
try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2


def _as_query_value(url):
    """Percent-encode *url* so it can be passed as one query-string value.

    ':' and '/' are left readable; characters such as '?', '&', '=' and '#'
    are encoded, otherwise a resource URL carrying its own query string would
    be cut off at its first '&' when the target page parses its parameters.
    NOTE(review): assumes the target notebooks percent-decode the parameter
    (as this notebook's getQueryStringValue does) -- confirm.
    """
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return quote(url, safe=':/')


for option in dispatchlist:
    profile = option['profile']
    if profile == 'wfsclient':
        # offer a link to the notebook that consumes a generic WFS endpoint
        wfsurl = base_url + 'WFSprocessor.ipynb?endpoint=' + _as_query_value(option['url'])
        print('To inspect the Dataset contents via OGC Web Feature Service interface, click here: %s' % wfsurl)
    elif profile == 'webbrowser':
        # plain web page: link directly to the resource
        print('To display %s in browser, click here: %s' % (option['label'],option['url']))
    elif profile == 'nwis_rdb':
        # offer a link to the notebook that explores NWIS rdb data
        nwisgdburl = base_url + 'NWIS-explore2.ipynb?dataurl=' + _as_query_value(option['url'])
        print('To inspect data for %s, click here: %s' % (option['label'],nwisgdburl))
    # NOTE: 'wmsclient' and 'kmlclient' options have no target application
    # wired in yet, so they are skipped here.
        