# Step 1: Parsing Files, Adding Candidates and Labels to Database

In [1]:
import json
# Loading config
with open("run_config.json") as fl:
    cfg = json.load(fl)
cfg_params = cfg['parameters']

# Setting snorkel path and output root
import os
from os.path import join
output_root = join(cfg_params['output_path'],cfg_params['experiment_name'])
os.environ['FONDUERDBNAME'] = cfg['postgres_db_name']
os.environ['SNORKELDB'] = join(cfg['postgres_location'],os.environ['FONDUERDBNAME'])

# For loading input files
import pandas as pd

# For running Snorkel
from snorkel.contrib.fonduer import SnorkelSession
from snorkel.contrib.fonduer.models import candidate_subclass
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser
from utils import HTMLListPreprocessor

from sqlalchemy import create_engine
snorkeldb = create_engine(os.environ['SNORKELDB'], isolation_level="AUTOCOMMIT")

In [2]:
# Load labeled data from tsv
pth_labeled = join(cfg['data_path'],'labels_and_splits')
fl_labeled = cfg['labeled_data_file']
df_labeled = pd.read_csv(join(pth_labeled,fl_labeled),sep='\t')
path_list_labeled = [_+'.html' for _ in df_labeled['file name'].tolist()]

#Load unlabeled data from tsv
fl_unlabeled = cfg['unlabeled_data_file']
df_unlabeled = pd.read_csv(join(pth_labeled,fl_unlabeled),sep='\t')
path_list_unlabeled = [_+'.html' for _ in df_unlabeled['file name'].tolist()]

In [3]:
# Start snorkel session and creating location subclass
session = SnorkelSession()
Location_Extraction = candidate_subclass('location_extraction',\
                          ["location"])

# Parsing documents 
max_docs = cfg['max_docs']
data_loc = join(cfg['data_path'],'raw_data')
path_list = path_list_labeled[:max_docs]+path_list_unlabeled[:max_docs]
doc_preprocessor = HTMLListPreprocessor(data_loc,\
                                file_list=path_list)
corpus_parser = OmniParser(structural=True, lingual=True, visual=False)
%time corpus_parser.apply(doc_preprocessor, parallelism=cfg['parallel'])

Clearing existing...
Running UDF...




CPU times: user 612 ms, sys: 116 ms, total: 728 ms
Wall time: 53.9 s


In [4]:
from snorkel.contrib.fonduer.models import Document, Phrase

# Checking database contents
print("Documents:", session.query(Document).count())
print("Phrases:", session.query(Phrase).count())

('Documents:', 8L)
('Phrases:', 2388L)


## Dividing into Test/Train, Extracting Features, Throttling

In [5]:
docs = session.query(Document).order_by(Document.name).all()
ld   = len(docs)

train_docs = set()
dev_docs   = set()
test_docs  = set()
data = [(doc.name+'.html', doc) for doc in docs]
data.sort(key=lambda x: x[0])
for i, (doc_name, doc) in enumerate(data):
    if doc_name in path_list_unlabeled:
        train_docs.add(doc)
    else:
        if len(dev_docs)<=len(test_docs):
            dev_docs.add(doc)
        else:
            test_docs.add(doc)

print "train:",len(train_docs)
print "dev:" ,len(dev_docs)
print "test:",len(test_docs)

from pprint import pprint
pprint([x.name for x in train_docs])

train: 4
dev: 2
test: 2
[u'001a5f8b-82c5-4428-b539-0c8a0f2f87c4',
 u'005dd27d-91c5-4569-b285-489391dcff4f',
 u'0069a7dd-9a03-4240-9073-77744c10b467',
 u'0034ff21-5d7a-4edf-9150-e22c5188dde1']


In [6]:
from snorkel.matchers import *
location_matcher = LocationMatcher(longest_match_only=True) 

from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams
location_ngrams = OmniNgrams(n_max=6, split_tokens=[])

In [7]:
from snorkel.contrib.fonduer.lf_helpers import *
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)


    
def location_currencies_filter(location):
    list_currencies = [ "dollar", "dollars", "lira","kwacha","rials","rial","dong","dongs","fuerte","euro",
                       "euros","vatu","som","peso","sterling","sterlings","soms","pestos",
                       "pounds", 
                  "pound","dirham","dirhams","hryvnia","manat","manats","liras","lira",
                       "dinar","dinars","pa'anga","franc","baht","schilling",
                  "somoni","krona","lilangeni","rupee","rand","shilling","leone","riyal","dobra",
                  "tala","ruble","zloty","peso","sol","quarani","kina","guinean","balboa","krone","naira",
                  "cordoba","kyat","metical","togrog","leu","ouguiya","rufiyaa","ringgit","kwacha",
                  "ariary","denar","litas","loti","lats","kip","som","won","tenge","yen","shekel","rupiah",
                  "forint","lempira","gourde","quetzal","cedi","lari","dalasi","cfp","birr","kroon","nakfa",
                  "cfa","Peso","koruna","croatian","colon","yuan","escudo","cape","riel","lev","real"
                  ,"real","mark","boliviano","ngultrum","taka","manat","dram","kwanza","lek","afghani","renminbi"]

    
    cand_right_tokens = list(get_right_ngrams(location,window=2))
    for cand in cand_right_tokens:
        if cand not in list_currencies:
            return location
    
candidate_filter = location_currencies_filter

  from ._conv import register_converters as _register_converters


In [9]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor

candidate_extractor = CandidateExtractor(Location_Extraction,
                                         [location_ngrams], [location_matcher],
                                         candidate_filter=candidate_filter)

%time candidate_extractor.apply(train_docs, split=0, parallelism=cfg['parallel'])
print("Number of candidates:", session.query(Location_Extraction).filter(Location_Extraction.split == i+1).count())
%time
for i, docs in enumerate([dev_docs, test_docs]):
    candidate_extractor.apply(docs, split=i+1, parallelism=cfg['parallel'])
    print("Number of candidates:", session.query(Location_Extraction).filter(Location_Extraction.split == i+1).count())

Clearing existing...
Running UDF...
CPU times: user 36 ms, sys: 268 ms, total: 304 ms
Wall time: 4.29 s
('Number of candidates:', 12L)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11 µs
Clearing existing...
Running UDF...
('Number of candidates:', 26L)
Clearing existing...
Running UDF...
('Number of candidates:', 12L)


In [10]:
cands = session.query(Location_Extraction).filter(Location_Extraction.split == 1).order_by(Location_Extraction.id).all()
cand = cands[0]
args = cand.get_contexts()
span = args[0]
c = span.sentence.is_lingual()
a = span.get_parent()
print c, args, span
print a

True (Span("Dallas", sentence=31081, chars=[34,39], words=[8,8]),) Span("Dallas", sentence=31081, chars=[34,39], words=[8,8])
Phrase (Doc: 0397de89-5130-4f56-8a46-3e533d393d8d, Index: 50, Text: on  Friday, January 19th, 2018 in Dallas Escorts • PostID 9076788)


In [11]:
from snorkel.contrib.fonduer import BatchFeatureAnnotator
featurizer = BatchFeatureAnnotator(Location_Extraction)
%time F_train = featurizer.apply(split=0, replace_key_set=True, parallelism=1)
print(F_train.shape)
#%time F_dev = featurizer.apply(split=1, replace_key_set=False, parallelism=1)
#print(F_dev.shape)
#%time F_test = featurizer.apply(split=2, replace_key_set=False, parallelism=1)
#print(F_test.shape)

Clearing existing...
Running UDF...


NameError: global name 'unary_tdl_feats' is not defined

## Loading Gold Labels

In [None]:
def load_chtap_labels(session, candidate_class, df, target, annotator_name='gold'):
    ak = session.query(GoldLabelKey).filter(GoldLabelKey.name == annotator_name).first()
    if ak is None:
        ak = GoldLabelKey(name=annotator_name)
        session.add(ak)
        session.commit()   
        
    candidates = session.query(candidate_class).all()
    gold_dict = get_gold_dict(filename, attribute=attribute)
    cand_total = len(candidates)
    print('Loading', cand_total, 'candidate labels')
    pb = ProgressBar(cand_total)
    labels=[]
    for i, c in enumerate(candidates):
        pb.bar(i)
        doc = c[0].sentence.document.name
        doc_html = doc+'.html'
        text = c[0].get_span()
        target_string = df[df['file name']==doc_html]['meep']
        #context_stable_ids = '~~'.join([i.stable_id for i in c.get_contexts()])
        label = session.query(GoldLabel).filter(GoldLabel.key == ak).filter(GoldLabel.candidate == c).first()
        if label is None:
            if (doc, formation, measurement) in gold_dict:
                label = GoldLabel(candidate=c, key=ak, value=1)
            else:
                label = GoldLabel(candidate=c, key=ak, value=-1)
            session.add(label)
            labels.append(label)
    session.commit()
    pb.close()

    session.commit()
    print("AnnotatorLabels created: %s" % (len(labels),))

In [None]:
target = 'location'
load_chtap_labels(session, Location_Extraction, df_labeled, target ,annotator_name='gold')

In [None]:
def get_gold_dict(filename, doc_on=True, formation_on=True, measurement_on=True, attribute=None, docs=None, integerize=False):
    attribute = [_.upper() for _ in attribute]
    with codecs.open(filename, encoding="utf-8") as csvfile:
        gold_reader = csv.reader(csvfile)
        gold_dict = set()
        for row in gold_reader:
            (doc, m1, formation, m2, measurement) = row
            if docs is None or doc.upper() in docs:
                if attribute and m2.upper() not in attribute:
                    continue
                else:
                    key = []
                    if doc_on: key.append(doc.upper())
                    if formation_on: key.append(formation.upper())
                    if measurement_on:
                        if integerize:
                            key.append(int(float(measurement)))
                        else:
                            key.append(measurement.upper())
                    gold_dict.add(tuple(key))
    return gold_dict

def load_paleo_labels(session, candidate_class, filename, attribute, annotator_name='gold'):

    ak = session.query(GoldLabelKey).filter(GoldLabelKey.name == annotator_name).first()
    if ak is None:
        ak = GoldLabelKey(name=annotator_name)
        session.add(ak)
        session.commit()

    candidates = session.query(candidate_class).all()
    gold_dict = get_gold_dict(filename, attribute=attribute)
    cand_total = len(candidates)
    print('Loading', cand_total, 'candidate labels')
    pb = ProgressBar(cand_total)
    labels=[]
    for i, c in enumerate(candidates):
        pb.bar(i)
        doc = (c[0].sentence.document.name).upper()
        formation = (c[0].get_span()).upper()
        measurement = (''.join(c[1].get_span().split())).upper()
        context_stable_ids = '~~'.join([i.stable_id for i in c.get_contexts()])
        label = session.query(GoldLabel).filter(GoldLabel.key == ak).filter(GoldLabel.candidate == c).first()
        if label is None:
            if (doc, formation, measurement) in gold_dict:
                label = GoldLabel(candidate=c, key=ak, value=1)
            else:
                label = GoldLabel(candidate=c, key=ak, value=-1)
            session.add(label)
            labels.append(label)
    session.commit()
    pb.close()

    session.commit()
    print("AnnotatorLabels created: %s" % (len(labels),))