In [4]:
import gensim, logging, nltk, re, os, sys
from nltk.stem.lancaster import LancasterStemmer
from gensim.models import Phrases
import googlemaps
import requests
import dateparser
from dateparser.search import search_dates

## Geoparsing
This is based on the Google Maps API and uses Python 2.

In [5]:
def geoParse(query, api_key=None):
    """Geocode a free-text query with the Google Maps client.

    Only the first candidate returned by the service is used.

    Args:
        query: Free-text string that may mention a place.
        api_key: Google Maps API key. When omitted, the key is read from the
            ``GOOGLE_MAPS_API_KEY`` environment variable so that no
            credential has to be stored in the notebook.

    Returns:
        dict with two keys:
            'bounds': the candidate's ``geometry['bounds']``; when the
                response carries no bounds, its ``geometry['viewport']``
                is used instead. ``None`` if nothing matched.
            'string': the candidate's ``formatted_address``, or ``''`` if
                nothing matched.

    Raises:
        ValueError: if no API key was passed and the environment variable
            is not set.
    """
    # Nothing to look up: skip the network round trip entirely.
    if not query or not query.strip():
        return {'bounds': None, 'string': ''}

    key = api_key or os.environ.get('GOOGLE_MAPS_API_KEY')
    if not key:
        raise ValueError(
            'No Google Maps API key: pass api_key or set the '
            'GOOGLE_MAPS_API_KEY environment variable')

    gmaps = googlemaps.Client(key=key)
    geocode_result = gmaps.geocode(query)
    if not geocode_result:
        # The service found no candidate; keep the return shape stable so
        # callers can still read both keys.
        return {'bounds': None, 'string': ''}

    best = geocode_result[0]
    geometry = best.get('geometry', {})
    # 'bounds' is an optional field of a geocoding result; fall back to the
    # viewport rather than raising KeyError.
    bounds = geometry.get('bounds') or geometry.get('viewport')
    return {'bounds': bounds, 'string': best.get('formatted_address', '')}

In [6]:
# Try the geocoder on a free-text query. In the recorded run below the
# service matched "sst" to a street address rather than to the ocean.
print(geoParse('sst pacific ocean'))

{'bounds': {u'northeast': {u'lat': 48.7297227, u'lng': -122.4090338}, u'southwest': {u'lat': 48.717229, u'lng': -122.416576}}, 'string': u'SST, Bellingham, WA 98229, USA'}


## Temporal parsing
This is based on the dateparser library. Internally, it is a named entity recognition (NER) process. In this case, dateparser uses a pre-built time detection NLP model.

In [7]:
def temporalParse(query):
    """Find date/time expressions in ``query`` using dateparser's search_dates.

    The value produced by ``search_dates`` is handed back untouched. In the
    recorded run further down it is a list of ``(matched_text, datetime)``
    tuples.

    NOTE(review): dateparser documents a ``None`` result when no date is
    found; confirm for the installed version before indexing the result.
    """
    detected = search_dates(query)
    return detected

In [8]:
# Relative expression: the printed timestamp depends on when the cell runs.
print(dateparser.parse('1 hour ago'))

2018-06-26 14:30:59.663437


In [9]:
# Extract the date expression embedded in a longer free-text query.
print(search_dates("ocean wind in March 3rd, 2004"))

[(u'in March 3rd, 2004', datetime.datetime(2004, 3, 3, 0, 0))]


## Semantic analysis
This is based on MUDROD, a previous NASA-funded project (https://github.com/apache/incubator-sdap-mudrod). Internally, it is based on a collaborative filtering algorithm trained on user behavior and metadata.

In [10]:
# MUDROD vocabulary-search endpoint, without the query string.
MUDROD_VOCAB_ENDPOINT = "http://199.26.254.151:8080/mudrod-service/SearchVocab"
# Kept with its original value for code that builds the URL by hand.
baseUrl = MUDROD_VOCAB_ENDPOINT + "?concept="

def expandQuery(query, timeout=10):
    """Ask the MUDROD service for vocabulary related to ``query``.

    Args:
        query: Concept to expand, e.g. ``'sst'``.
        timeout: Seconds to wait for the service before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response. In the recorded run below it
        is a dict of the form ``{'graph': {'ontology': [{'word': ...,
        'weight': ...}, ...]}}``.

    Raises:
        requests.exceptions.HTTPError: if the service answers with a 4xx or
            5xx status.
    """
    # Pass the concept through ``params`` so requests percent-encodes it;
    # plain concatenation lets characters such as '&' or '#' corrupt the URL.
    response = requests.get(
        MUDROD_VOCAB_ENDPOINT, params={'concept': query}, timeout=timeout)
    # Surface HTTP errors directly instead of failing later in JSON decoding.
    response.raise_for_status()
    return response.json()

In [11]:
# Expand a single concept through the MUDROD service (network call).
print(expandQuery('sst'))

{u'graph': {u'ontology': [{u'word': u'sea surface temperature', u'weight': 1.0}, {u'word': u'ocean temperature', u'weight': 1.0}, {u'word': u'group high resolution sea surface temperature dataset', u'weight': 0.89}, {u'word': u'ghrsst', u'weight': 0.87}, {u'word': u'surface temperature', u'weight': 0.78}, {u'word': u'l2p', u'weight': 0.78}, {u'word': u'sea surface temperature monthly', u'weight': 0.72}, {u'word': u'l4', u'weight': 0.68}, {u'word': u'caspian sea', u'weight': 0.68}, {u'word': u'aqua', u'weight': 0.62}]}}


## Phrase detection
This is based on the co-occurrence matrix computed by gensim. The bigram and trigram models are trained on PO.DAAC metadata.

In [12]:
# Phrases models already read from disk, keyed by the path they came from.
_PHRASE_MODEL_CACHE = {}


def _loadPhraseModel(path):
    """Load the Phrases model at ``path`` once and reuse it afterwards."""
    if path not in _PHRASE_MODEL_CACHE:
        _PHRASE_MODEL_CACHE[path] = Phrases.load(path)
    return _PHRASE_MODEL_CACHE[path]


def phraseDetect(query, bigram_path='model/bigram', trigram_path='model/trigram'):
    """Merge multi-word phrases in ``query`` into single tokens.

    The query is split on whitespace, passed through the bigram model and
    then through the trigram model. In the recorded run below, detected
    phrases come back joined with underscores (e.g. ``level_2``).

    Args:
        query: Whitespace-separated query text.
        bigram_path: Location of the saved bigram Phrases model.
        trigram_path: Location of the saved trigram Phrases model.

    Returns:
        The token sequence produced by the trigram model.
    """
    # Loading is cached: reading both models from disk on every call is
    # wasted work when the function is used repeatedly.
    bigram = _loadPhraseModel(bigram_path)
    trigram = _loadPhraseModel(trigram_path)
    tokens = query.split()
    return trigram[bigram[tokens]]

In [14]:
# Run phrase detection on a sample query (needs the saved models on disk).
print(phraseDetect('modis level 2 sea surface temperature'))

[u'modis', u'level_2', u'sea_surface_temperature']


## Putting all this together

In [15]:
def deepQuery(query):
    """Run the full pipeline on a free-text query: time, place, phrases, semantics.

    Each stage removes what it recognised from the query before the next
    stage runs, so the phrase detector only sees the remaining words.

    Args:
        query: Free-text search string.

    Returns:
        dict with four keys:
            'time': whatever ``temporalParse`` returned for the full query.
            'geo': the ``'bounds'`` entry of the ``geoParse`` result.
            'phrase': the tokens from ``phraseDetect`` for the remaining text.
            'semantics': ``expandQuery`` result for the first phrase, or
                ``None`` when no phrase tokens remain.
    """
    res = {}

    # temporal parsing
    res['time'] = temporalParse(query)
    if res['time']:
        # Only the first matched date expression is removed. The matched
        # text is used as-is: wrapping it in str() fails on non-ASCII
        # unicode under Python 2.
        query = query.replace(res['time'][0][0], '')

    # geoparsing
    geo = geoParse(query)
    res['geo'] = geo['bounds']
    place = geo['string']
    if place:
        # Case-insensitive removal: the geocoder returns a capitalised
        # address, and the query may use any casing.
        query = re.sub(re.escape(place), '', query, flags=re.IGNORECASE)

    # phrase detection on whatever text is left
    res['phrase'] = phraseDetect(query.strip())

    # semantic expansion, just to expand the first phrase as an example
    res['semantics'] = None
    if res['phrase']:
        phrase = res['phrase'][0]
        res['semantics'] = expandQuery(phrase.replace('_', ' '))
    return res

In [16]:
# End-to-end example: one query carrying a topic, a place and a date.
testQ = 'sea surface temperature modis level 2 pacific ocean in March 3rd, 2004'
deep_result = deepQuery(testQ)
print(deep_result)

{'phrase': [u'sea_surface_temperature', u'modis', u'level_2'], 'geo': {u'northeast': {u'lat': 59.48222930000001, u'lng': -66.51908139999999}, u'southwest': {u'lat': -77.8225785, u'lng': 128.576489}}, 'semantics': {u'graph': {u'ontology': [{u'word': u'sst', u'weight': 1.0}, {u'word': u'ocean temperature', u'weight': 1.0}, {u'word': u'ghrsst', u'weight': 1.0}, {u'word': u'group high resolution sea surface temperature dataset', u'weight': 0.97}, {u'word': u'reynolds sea surface temperature', u'weight': 0.83}, {u'word': u'surface temperature', u'weight': 0.82}, {u'word': u'mur', u'weight': 0.81}, {u'word': u'caspian sea', u'weight': 0.81}, {u'word': u'sea surface temperature el nino', u'weight': 0.76}, {u'word': u'ocean tempetature', u'weight': 0.76}]}}, 'time': [(u'in March 3rd, 2004', datetime.datetime(2004, 3, 3, 0, 0))]}
