# Step 1: Parsing Files, Adding Candidates and Labels to Database

In [1]:
import json
# Loading config
with open("run_config.json") as fl:
    cfg = json.load(fl)
cfg_params = cfg['parameters']

# Setting snorkel path and output root
import os
from os.path import join
output_root = join(cfg_params['output_path'],cfg_params['experiment_name'])
os.environ['FONDUERDBNAME'] = cfg['postgres_db_name']
os.environ['SNORKELDB'] = join(cfg['postgres_location'],os.environ['FONDUERDBNAME'])

# For loading input files
import pandas as pd

# For running Snorkel
from snorkel.contrib.fonduer import SnorkelSession
from snorkel.contrib.fonduer.models import candidate_subclass
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser
from utils import HTMLListPreprocessor

from sqlalchemy import create_engine
snorkeldb = create_engine(os.environ['SNORKELDB'], isolation_level="AUTOCOMMIT")

In [2]:
# Load labeled data from tsv
pth_labeled = join(cfg['data_path'],'labels_and_splits')
fl_labeled = cfg['labeled_data_file']
df_labeled = pd.read_csv(join(pth_labeled,fl_labeled),sep='\t')
path_list_labeled = [_+'.html' for _ in df_labeled['file name'].tolist()]

#Load unlabeled data from tsv
fl_unlabeled = cfg['unlabeled_data_file']
df_unlabeled = pd.read_csv(join(pth_labeled,fl_unlabeled),sep='\t')
path_list_unlabeled = [_+'.html' for _ in df_unlabeled['file name'].tolist()]

In [3]:
# Start snorkel session and creating location subclass
session = SnorkelSession()
Location_Extraction = candidate_subclass('location_extraction',\
                          ["location"])

# Parsing documents 
max_docs = cfg['max_docs']
data_loc = join(cfg['data_path'],'raw_data')
path_list = path_list_labeled[:max_docs]+path_list_unlabeled[:max_docs]
doc_preprocessor = HTMLListPreprocessor(data_loc,\
                                file_list=path_list)
corpus_parser = OmniParser(structural=True, lingual=True, visual=False)
%time corpus_parser.apply(doc_preprocessor, parallelism=cfg['parallel'])

Clearing existing...
Running UDF...




CPU times: user 532 ms, sys: 164 ms, total: 696 ms
Wall time: 1min 7s


In [6]:
from snorkel.contrib.fonduer.models import Document, Phrase

# Checking database contents
print("Documents:", session.query(Document).count())
print("Phrases:", session.query(Phrase).count())

('Documents:', 8L)
('Phrases:', 2388L)


## Dividing into Test/Train, Extracting Features, Throttling

In [12]:
docs = session.query(Document).order_by(Document.name).all()
ld   = len(docs)

train_docs = set()
dev_docs   = set()
test_docs  = set()
data = [(doc.name+'.html', doc) for doc in docs]
data.sort(key=lambda x: x[0])
for i, (doc_name, doc) in enumerate(data):
    if doc_name in path_list_unlabeled:
        train_docs.add(doc)
    else:
        if len(dev_docs)<=len(test_docs):
            dev_docs.add(doc)
        else:
            test_docs.add(doc)

print "train:",len(train_docs)
print "dev:" ,len(dev_docs)
print "test:",len(test_docs)

from pprint import pprint
pprint([x.name for x in train_docs])

train: 4
dev: 2
test: 2
[u'005dd27d-91c5-4569-b285-489391dcff4f',
 u'0069a7dd-9a03-4240-9073-77744c10b467',
 u'001a5f8b-82c5-4428-b539-0c8a0f2f87c4',
 u'0034ff21-5d7a-4edf-9150-e22c5188dde1']


In [13]:
from snorkel.matchers import *
location_matcher = LocationMatcher(longest_match_only=True) 

from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams
location_ngrams = OmniNgrams(n_max=6, split_tokens=[])

In [14]:
from snorkel.contrib.fonduer.lf_helpers import *
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)


    
def location_currencies_filter(location):
    list_currencies = [ "dollar", "dollars", "lira","kwacha","rials","rial","dong","dongs","fuerte","euro",
                       "euros","vatu","som","peso","sterling","sterlings","soms","pestos",
                       "pounds", 
                  "pound","dirham","dirhams","hryvnia","manat","manats","liras","lira",
                       "dinar","dinars","pa'anga","franc","baht","schilling",
                  "somoni","krona","lilangeni","rupee","rand","shilling","leone","riyal","dobra",
                  "tala","ruble","zloty","peso","sol","quarani","kina","guinean","balboa","krone","naira",
                  "cordoba","kyat","metical","togrog","leu","ouguiya","rufiyaa","ringgit","kwacha",
                  "ariary","denar","litas","loti","lats","kip","som","won","tenge","yen","shekel","rupiah",
                  "forint","lempira","gourde","quetzal","cedi","lari","dalasi","cfp","birr","kroon","nakfa",
                  "cfa","Peso","koruna","croatian","colon","yuan","escudo","cape","riel","lev","real"
                  ,"real","mark","boliviano","ngultrum","taka","manat","dram","kwanza","lek","afghani","renminbi"]

    
    cand_right_tokens = list(get_right_ngrams(location,window=2))
    #print len(cand_right_tokens)
    #print cand_right_tokens#(get_right_ngrams(location,window=4))
    for cand in cand_right_tokens:
        #print "["+cand+"]"
        if cand not in list_currencies:
            #print "["+cand+"]"
            #print location
            return location
    
candidate_filter = location_currencies_filter

  from ._conv import register_converters as _register_converters


In [17]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor

candidate_extractor = CandidateExtractor(Location_Extraction,
                                         [location_ngrams], [location_matcher],
                                         candidate_filter=candidate_filter)

%time candidate_extractor.apply(train_docs, split=0, parallelism=cfg['parallel'])
print("Number of candidates:", session.query(Location_Extraction).filter(Location_Extraction.split == i+1).count())
%time
for i, docs in enumerate([dev_docs, test_docs]):
    candidate_extractor.apply(docs, split=i+1, parallelism=cfg['parallel'])
    print("Number of candidates:", session.query(Location_Extraction).filter(Location_Extraction.split == i+1).count())

Clearing existing...
Running UDF...
CPU times: user 24 ms, sys: 300 ms, total: 324 ms
Wall time: 3.7 s
('Number of candidates:', 12L)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10 µs
Clearing existing...
Running UDF...
('Number of candidates:', 24L)
Clearing existing...
Running UDF...
('Number of candidates:', 12L)


In [20]:
from snorkel.contrib.fonduer import BatchFeatureAnnotator

featurizer = BatchFeatureAnnotator(Location_Extraction)
%time F_train = featurizer.apply(split=0, replace_key_set=True, parallelism=cfg['parallel'])
print(F_train.shape)
%time F_dev = featurizer.apply(split=1, replace_key_set=False, parallelism=cfg['parallel'])
print(F_dev.shape)
%time F_test = featurizer.apply(split=2, replace_key_set=False, parallelism=cfg['parallel'])
print(F_test.shape)

Clearing existing...
Running UDF...


Process BatchAnnotatorUDF-113:
Traceback (most recent call last):
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py27snorkel/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
    for y in self.apply(x, **self.apply_kwargs):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/udf.py", line 156, in run
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/async_annotations.py", line 210, in apply
    for id, k, v in self.anno_generator(list(candidates)):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/features/features.py", line 11, in get_all_feats
    for id, f, v in get_content_feats(candidates):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/features/content_features.py", line 32, in get_content_feats
    get_tdl_feats = compile_entity_feature_generator()
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/featur

Copying location_extraction_feature to postgres
COPY 0

CPU times: user 52 ms, sys: 280 ms, total: 332 ms
Wall time: 3.5 s
(9, 0)
Clearing existing...
Running UDF...


Process BatchAnnotatorUDF-129:
Traceback (most recent call last):
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py27snorkel/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/udf.py", line 156, in run
    for y in self.apply(x, **self.apply_kwargs):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/async_annotations.py", line 210, in apply
    for id, k, v in self.anno_generator(list(candidates)):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/features/features.py", line 11, in get_all_feats
    for id, f, v in get_content_feats(candidates):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/features/content_features.py", line 32, in get_content_feats
    get_tdl_feats = compile_entity_feature_generator()
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/featur

Copying location_extraction_feature_updates to postgres
COPY 0

CPU times: user 52 ms, sys: 204 ms, total: 256 ms
Wall time: 3.38 s
(24, 0)
Clearing existing...
Running UDF...


Process BatchAnnotatorUDF-145:
Traceback (most recent call last):
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py27snorkel/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
    for y in self.apply(x, **self.apply_kwargs):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/udf.py", line 156, in run
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/async_annotations.py", line 210, in apply
    for id, k, v in self.anno_generator(list(candidates)):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/features/features.py", line 11, in get_all_feats
    for id, f, v in get_content_feats(candidates):
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/features/content_features.py", line 32, in get_content_feats
    get_tdl_feats = compile_entity_feature_generator()
  File "/lfs/local/0/jdunnmon/chtap/backup/snorkel/snorkel/contrib/fonduer/fonduer/featur

Copying location_extraction_feature_updates to postgres
COPY 0

CPU times: user 68 ms, sys: 216 ms, total: 284 ms
Wall time: 3.41 s
(12, 0)


In [None]:
# TODO: add ground truth labels, add to db