### 1) Create a new database in PostgreSQL

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys

#set this user line 
user = 'jared'

PARALLEL = 4 # assuming a quad-core machine
ATTRIBUTE = "entity_phone"
os.environ['SNORKELDBNAME'] = "location_extraction"

if user == 'accenture':
    os.environ['SNORKELDB'] = 'postgresql://localhost:5432/' + os.environ['SNORKELDBNAME']
    sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/fonduer/memex/')
elif user == 'jared':
    os.environ['SNORKELDB'] = 'postgres://jdunnmon:123@localhost:5432/' + os.environ['SNORKELDBNAME']
    sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/fonduer/memex/')
    
#from sqlalchemy import create_engine
#snorkeldb = create_engine('postgresql://localhost:5432/', isolation_level="AUTOCOMMIT")

 ## 1.1 Defining a Candidate Schema2) Candidate Schema

In [2]:
from snorkel.contrib.fonduer import SnorkelSession

session = SnorkelSession()

In [3]:
import os
from snorkel.contrib.fonduer.models import candidate_subclass

Location_Extraction = candidate_subclass('location_extraction', ["location"])


## 1.2 Parsing and Transforming the Input Documents into Unified Data Models

### Configuring an `HTMLPreprocessor`

In [6]:
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser

if user == 'accenture':
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/fonduer/memex/data/profiles_chtap/'
elif user == 'jared':
    docs_path = '/lfs/local/0/jdunnmon/chtap/data/s3/chtap_profiles_20170928/'

doc_preprocessor = HTMLPreprocessor(docs_path)

### Configuring an `OmniParser`

In [7]:
corpus_parser = OmniParser(structural=True, lingual=True)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 45.3 s, sys: 1.16 s, total: 46.4 s
Wall time: 44min 11s


In [8]:
from snorkel.contrib.fonduer.models import Document, Phrase,Table

print "Documents:", session.query(Document).count()
print "Phrases:", session.query(Phrase).count()
print "Table", session.query(Table).count()


Documents: 1019
Phrases: 239581
Table 1571


## 1.3 Dividing the Corpus into Test and Train

In [9]:
docs = session.query(Document).order_by(Document.name).all()
ld   = len(docs)

train_docs = set()
dev_docs   = set()
test_docs  = set()
splits = (0.8, 0.9)
data = [(doc.name, doc) for doc in docs]
data.sort(key=lambda x: x[0])
for i, (doc_name, doc) in enumerate(data):
    if i < splits[0] * ld:
        train_docs.add(doc)
    elif i < splits[1] * ld:
        dev_docs.add(doc)
    else:
        test_docs.add(doc)
from pprint import pprint
#pprint([x.name for x in train_docs])
print "train:",len(train_docs)
print "dev:" ,len(dev_docs)
print "test:",len(test_docs)
# from pprint import pprint
# pprint([x.name for x in train_docs])

train: 816
dev: 102
test: 101


### Phase 2: Candidate Extraction & Multimodal Featurization

In [10]:
from snorkel.matchers import *
location_matcher = LocationMatcher(longest_match_only=True) 

####Define a relation's ContextSpaces

from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams
location_ngrams = OmniNgrams(n_max=6, split_tokens=[])


### Defining candidate Throttlers

In [11]:
from snorkel.contrib.fonduer.lf_helpers import *
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)


    
def location_currencies_filter(location):
    list_currencies = [ "dollar", "dollars", "lira","kwacha","rials","rial","dong","dongs","fuerte","euro",
                       "euros","vatu","som","peso","sterling","sterlings","soms","pestos",
                       "pounds", 
                  "pound","dirham","dirhams","hryvnia","manat","manats","liras","lira",
                       "dinar","dinars","pa'anga","franc","baht","schilling",
                  "somoni","krona","lilangeni","rupee","rand","shilling","leone","riyal","dobra",
                  "tala","ruble","zloty","peso","sol","quarani","kina","guinean","balboa","krone","naira",
                  "cordoba","kyat","metical","togrog","leu","ouguiya","rufiyaa","ringgit","kwacha",
                  "ariary","denar","litas","loti","lats","kip","som","won","tenge","yen","shekel","rupiah",
                  "forint","lempira","gourde","quetzal","cedi","lari","dalasi","cfp","birr","kroon","nakfa",
                  "cfa","Peso","koruna","croatian","colon","yuan","escudo","cape","riel","lev","real"
                  ,"real","mark","boliviano","ngultrum","taka","manat","dram","kwanza","lek","afghani","renminbi"]

    
    cand_right_tokens = list(get_right_ngrams(location,window=2))
    #print len(cand_right_tokens)
    #print cand_right_tokens#(get_right_ngrams(location,window=4))
    for cand in cand_right_tokens:
        #print "["+cand+"]"
        if cand not in list_currencies:
            #print "["+cand+"]"
            #print location
            return location
    
candidate_filter = location_currencies_filter

In [12]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor

candidate_extractor = CandidateExtractor(Location_Extraction,
                                         [location_ngrams], [location_matcher],
                                         candidate_filter=candidate_filter)


#                         candidate_filter=candidate_filter

%time candidate_extractor.apply(train_docs, split=0, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 284 ms, sys: 283 ms, total: 567 ms
Wall time: 6min 59s


In [13]:
train_cands = session.query(Location_Extraction).filter(Location_Extraction.split == 0).all()
print "Number of candidates:", len(train_cands) 

Number of candidates: 7121


### Exploring the candidate 

In [14]:
from snorkel.contrib.fonduer.fonduer.lf_helpers import*
from snorkel.contrib.fonduer.candidates import*
###################### print candidates and text spans

# for cand in train_cands:
#     print cand
#     print cand.get_parent()
    #print cand
#cand = train_cands[0]



In [15]:
cand_16= train_cands[16]
print cand_16
cand_18= train_cands[18]
print cand_18
cand_19= train_cands[19]
print cand_19

location_extraction(Span("Houston Body Rubs", sentence=392310, chars=[0,16], words=[0,2]))
location_extraction(Span("Hawaii", sentence=497348, chars=[28,33], words=[5,5]))
location_extraction(Span("Hawaii", sentence=497353, chars=[0,5], words=[0,0]))


In [17]:
cand_16= train_cands[16]
print "text for the 16th candidate:\n", cand_16.get_parent()
print "16th candidate\n:",cand_16
ance_16 = get_ancestor_tag_names(cand_16)
print "ancestor of 16th candidate\n:", ance_16 
print "***************************************************"
cand_17= train_cands[17]
print "text for the 17th candidate:\n", cand_17.get_parent()
print "17th candidate:",cand_17
ance_17 = get_ancestor_tag_names(cand_17)
print "ancestor of 17th candidate\n:", ance_17
print "***************************************************"

cand_19= train_cands[19]
print "text for the 19th candidate:\n", cand_19.get_parent()
print "19th candidate:",cand_19
ance_19 = get_ancestor_tag_names(cand_18)
print "ancestor of 19th candidate\n:", ance_19

text for the 16th candidate:
Phrase (Doc: a07e995c-1bae-4b13-9843-c13e34a788d4, Index: 0, Text: Houston Body Rubs in Texas)
16th candidate
: location_extraction(Span("Houston Body Rubs", sentence=392310, chars=[0,16], words=[0,2]))
ancestor of 16th candidate
: ['html', 'head', 'title']
***************************************************
text for the 17th candidate:
Phrase (Doc: a07e995c-1bae-4b13-9843-c13e34a788d4, Index: 25, Text: Texas    »)
17th candidate: location_extraction(Span("Texas", sentence=392410, chars=[0,4], words=[0,0]))
ancestor of 17th candidate
: ['html', 'body', 'div', 'div']
***************************************************
text for the 19th candidate:
Phrase (Doc: 5f8b3b55-f796-42aa-a49a-e10e77fd8834, Index: 53, Text: Hawaii By Night)
19th candidate: location_extraction(Span("Hawaii", sentence=497353, chars=[0,5], words=[0,0]))
ancestor of 19th candidate
: ['html', 'body', 'div', 'div', 'div', 'div', 'li']


### Repeating for development and test splits

In [18]:
%%time
for i, docs in enumerate([dev_docs, test_docs]):
    candidate_extractor.apply(docs, split=i+1)
    print "Number of candidates:", session.query(Location_Extraction).filter(Location_Extraction.split == i+1).count()

Clearing existing...
Running UDF...
Number of candidates: 918
Clearing existing...
Running UDF...
Number of candidates: 878
CPU times: user 48.6 s, sys: 2.5 s, total: 51.1 s
Wall time: 3min 41s


In [19]:
session.rollback()

In [20]:
dev_cands = session.query(Location_Extraction).filter(Location_Extraction.split == 1).all()
print "Number of candidates:", len(train_cands)
dev_cand1= dev_cands[300]
# for cand in dev_cand:
#     print cand
#     print cand.get_parent()
print get_ancestor_tag_names(dev_cand1)
print dev_cand1.get_parent()

Number of candidates: 7121
['html', 'head', 'title']
Phrase (Doc: d7e35a5a-9966-4b1c-a322-59d3c591e822, Index: 0, Text: Boston Escorts - Boston Female Escorts - Female Escorts in Boston - Massachusetts Call Girls)


## Getting Place Names from Google API

In [3]:
#getting google place autocomplete API
import googlemaps as gm

def get_candidate_locations(plc):
    """
    INPUTS
    plc: string describing place to match

    OUTPUTS
    jsn: full json structure returned from API call
    plcs: list of candidate location strings
    """
    api_key = 'AIzaSyDbk3lLZHuQVKDRBN99_oz-p4AJjIzhA0w' 
    gmaps = gm.Client(key=api_key)
    qo = gm.places.places_autocomplete(gmaps,test_loc)
    cl = [a['description'] for a in qo]
    return qo,cl

In [4]:
#testing on a single candidate
test_loc = 'Jupiter'
query_out,can_locs = get_candidate_locations(test_loc)

print "TEST LOCATION:"
print test_loc

print "" 

print "CANDIDATE LOCATIONS:"
for p in can_locs: print p
    
print ""
    
print "API RETURN STRUCTURE KEYS:"
for p in query_out[0].keys(): print p

TEST LOCATION:
Jupiter

CANDIDATE LOCATIONS:
Jupiter, Shattuck Avenue, Berkeley, CA, United States
Jupiter Research Foundation Inc., 2nd Street, Los Altos, CA, United States
Jupiter, FL, United States
Jupiter Systems, Huntwood Avenue, Hayward, CA, United States
Jupiter Drive, Milpitas, CA, United States

API RETURN STRUCTURE KEYS:
terms
description
reference
structured_formatting
matched_substrings
place_id
id
types
