# Composing LFs

In [35]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Setting Snorkel DB location
import os
import sys
sys.path.append('..')

#For PostgreSQL
postgres_location = 'postgresql://jdunnmon:123@localhost:5432'
postgres_db_name = 'memex_db_snorkel'
os.environ['SNORKELDB'] = os.path.join(postgres_location,postgres_db_name)

# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting parallelism
parallelism = 16

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
from snorkel.annotations import load_gold_labels

L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

In [7]:
from snorkel.models import Candidate, candidate_subclass

# Designing candidate subclasses
LocationExtraction = candidate_subclass('Location', ['location'])

cands_dev = session.query(LocationExtraction).filter(LocationExtraction.split == 1).all()

In [16]:
import geograpy
# installed fro pip install git+https://github.com/reach2ashish/geograpy.git
ind = 17
cand = cands_dev[ind]
label = int(L_gold_dev[ind].toarray())
txt = cand.location.get_span()
sent = cand.get_parent().text
doc = cand.get_parent().document.sentences
print(f'cand: {txt}')
print(f'sent: {sent}')
print(f'label: {label:d}')
#print(f'doc: {doc}')
placs = geograpy.get_place_context(text=sent)
placs.country_regions
#placs.address_strings

from geograpy import extraction

e = extraction.Extractor(text=sent)
e.find_entities()
print(e.places)

cand:        East Bay
sent: Short notice OK.\r\n        \r\n      \r\n    ' b"Poster's age: 28" b'\r\n         Location: \r\n        East Bay, North Bay, Potrero Hill, San Francisco, San Jose / South Bay, San Mateo\r\n      ' b' Post ID: 10077567 sf' b'email to friend' b' ' b' .posting ' b' .mainBody '
label: -1
['Short', 'Poster', 'North Bay', 'Potrero Hill', 'San Francisco', 'San Jose', 'South Bay', 'San', 'Post ID']


In [281]:
lf_from(cand)

(based|from|out)


1

In [19]:
import re
from fonduer.lf_helpers import (
    get_left_ngrams, get_right_ngrams, get_between_ngrams
)
from snorkel.lf_helpers import get_tagged_text
from snorkel.learning.utils import MentionScorer
import geotext
from geograpy import extraction
from snorkel_utils import *

def test_LF(test_candidates, test_labels, lf):
    """
    Gets the accuracy of a single LF on a split of the candidates, w.r.t. annotator labels,
    and also returns the error buckets of the candidates.
    """
    scorer          = MentionScorer(test_candidates, test_labels)
    test_marginals  = np.array([0.5 * (lf(c) + 1) for c in test_candidates])
    return scorer.score(test_marginals, set_unlabeled_as_neg=False, set_at_thresh_as_neg=False)
                
# List to parenthetical
def ltp(x):
    return '(' + '|'.join(x) + ')'

def rule_regex_search_before_A(candidate, pattern, sign):
    return sign if re.search(pattern + r'*{{A}}', get_tagged_text(candidate), flags=re.I) else 0

def overlap(a, b):
    """Check if a overlaps b.
    This is typically used to check if ANY of a list of phrases is in the ngrams returned by an lf_helper.
    :param a: A collection of items
    :param b: A collection of items
    :rtype: boolean
    """
    return not set(a).isdisjoint(b)

# PUT LFs HERE

def lf_geograpy_entity_neg(c):
    txt = c.location.get_span().lower()
    sent = c.get_parent().text
    e = extraction.Extractor(text=sent)
    e.find_entities()
    places = [p.lower() for p in e.places]
    if txt not in places:
        return -1
    else:
        return 0
    
def lf_geograpy_entity_pos(c):
    txt = c.location.get_span().lower()
    sent = c.get_parent().text
    e = extraction.Extractor(text=sent)
    e.find_entities()
    places = [p.lower() for p in e.places]
    if txt not in places:
        return 0
    else:
        return 1
    
def lf_geograpy_country(c):
    txt = c.location.get_span().lower()
    sent = c.get_parent().text
    places = geograpy.get_place_context(text=sent)
    if places.countries:
        return 1
    else:
        return 0
    
#def lf_location_words(c):
#    location_words = ['place']
#    txt = c.location.get_span().lower()
#    sent = c.get_parent().text
#    if any([a in sent.lower() for a in location_words]):
#        return 1 
#    else: 
#        return 0

def lf_call(c):
    call_words = ['call']
    return -1 if overlap(
      call_words, 
      get_left_ngrams(c, window=1)) else 0

def lf_many_locations(c):
    txt = c.location.get_span().lower()
    sent = c.get_parent().text
    e = extraction.Extractor(text=sent)
    e.find_entities()
    thresh = 3
    return -1 if len(e.places)>thresh else 0


def lf_is_country(c):
    txt = c.location.get_span().lower()
    if lookup_country_name(txt).lower() != 'no country': return 1 
    if lookup_country_alpha2(txt).lower() != 'no country': return 1 
    if lookup_country_alpha3(txt).lower() != 'no country': 
        return 1 
    else:
        return 0
    
    
def lf_is_state(c):
    txt = c.location.get_span().lower()
    if lookup_state_name(txt).lower() != 'no state' : return 1
    if lookup_state_abbr(txt).lower() != 'no state':
        return 1 
    else:
        return 0
    
def lf_following_words(c):
    following_words = ['area', 'escort', 'province']
    return 1 if overlap(
      following_words, 
      get_left_ngrams(c, window=3)) else 0

def lf_preceding_words(c):
    preceding_words = ['escort','province','area']
    return 1 if overlap(
      preceding_words, 
      get_right_ngrams(c, window=3)) else 0

def lf_escort(c):
    words = ['escort']
    return 1 if overlap(
      words, 
      get_right_ngrams(c, window=2)) or overlap(
      words, 
      get_left_ngrams(c, window=10))else 0

def lf_from(c):
    words = ['based']
    return 1 if overlap(
      words, 
      get_right_ngrams(c, window=5)) or overlap(
      words, 
      get_left_ngrams(c, window=5))else 0

def lf_area(c):
    words = ['area']
    return 1 if overlap(
      words, 
      get_right_ngrams(c, window=5)) or overlap(
      words, 
      get_left_ngrams(c, window=5))else 0


def lf_long_context(c):
    sent = c.get_parent().text.split()
    return -1 if len(sent)>10 else 0

#def lf_comma(c):
#    words = [',']
#    return 1 if overlap(
#      words, 
#      get_right_ngrams(c, window=1)) and overlap(
#      words, 
#      get_left_ngrams(c, window=1))else 0


#def lf_prep_before(c):
#    prep_words = ['to']
#    return -1 if overlap(
#      prep_words, 
#      get_left_ngrams(c, window=1)) else 0

In [20]:
tp, fp, tn, fn = test_LF(cands_dev, L_gold_dev, lf_geograpy_country)

Scores (Un-adjusted)
Pos. class accuracy: 1.0
Neg. class accuracy: 0.0
Precision            0.0833
Recall               1.0
F1                   0.154
----------------------------------------
TP: 1 | FP: 11 | TN: 0 | FN: 0



In [38]:
LFs = [
    lf_geograpy_entity_pos,
    lf_geograpy_entity_neg,
    #lf_location_words,
    lf_call,
    lf_many_locations,
    lf_following_words,
    lf_from,
]

In [22]:
from  snorkel.annotations import LabelAnnotator
import numpy as np
labeler = LabelAnnotator(lfs=LFs)

np.random.seed(1701)
%time L_dev = labeler.apply(split=1, parallelism=16)

Clearing existing...
Running UDF...
CPU times: user 252 ms, sys: 492 ms, total: 744 ms
Wall time: 9.36 s


In [23]:
L_dev.lf_stats(session, L_gold_dev)

  ac = (tp+tn) / (tp+tn+fp+fn)


Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.
lf_geograpy_entity_pos,0,0.321429,0.214286,0.214286,0,9,0,0,0.0
lf_geograpy_entity_neg,1,0.678571,0.142857,0.035714,0,0,1,18,0.947368
lf_call,2,0.0,0.0,0.0,0,0,0,0,
lf_many_locations,3,0.321429,0.321429,0.214286,0,0,1,8,0.888889
lf_following_words,4,0.035714,0.035714,0.035714,0,1,0,0,0.0
lf_from,5,0.0,0.0,0.0,0,0,0,0,


In [24]:
from snorkel.viewer import SentenceNgramViewer
labeled = []
cands_dev_red = cands_dev[400:500]
L_gold_dev_red = L_gold_dev[400:500]
for ii, c in enumerate(cands_dev_red):
    if L_gold_dev_red[ii] == 1:
        labeled.append(c)
print("Number labeled:", len(labeled))

SentenceNgramViewer(labeled, session)

Number labeled: 0


<IPython.core.display.Javascript object>

SentenceNgramViewer(html='<head>\n<style>\nspan.candidate {\n    background-color: rgba(255,255,0,0.3);\n}\n\n…

In [36]:
L_dev.lf_stats(L_gold_dev)

AttributeError: query not found

In [29]:
from  snorkel.annotations import LabelAnnotator
import numpy as np
labeler = LabelAnnotator(lfs=LFs)

np.random.seed(1701)
%time L_train = labeler.apply(split=0)
%time L_test = labeler.apply(split=2)
#L_train

# can also load with:
# %time L_train = labeler.load_matrix(session, split=0)

Clearing existing...
Running UDF...

CPU times: user 9min 48s, sys: 9.49 s, total: 9min 57s
Wall time: 11min 48s
Clearing existing...
Running UDF...

CPU times: user 9min 45s, sys: 9.16 s, total: 9min 54s
Wall time: 11min 35s


In [30]:
%time L_test = labeler.apply(split=2)

Clearing existing...
Running UDF...

CPU times: user 2.87 s, sys: 140 ms, total: 3.01 s
Wall time: 3.9 s


In [39]:
L_train.lf_stats(session)

NoResultFound: No row was found for one()

## Training the Generative Model

In [44]:
from snorkel.learning import GenerativeModel
from snorkel.learning import RandomSearch

param_ranges = {
    'step_size' : [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    'decay' : [1.0, 0.95, 0.9],
    'epochs' : [20, 50, 100]
}

gen_model = GenerativeModel()
searcher = RandomSearch(GenerativeModel, param_ranges, L_train, n=5)
%time
gen_model, run_stats = searcher.fit(L_dev, L_gold_dev)
run_stats
#gen_model.fit(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6, n_threads=parallelism)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 18.6 µs
[1] Testing step_size = 1.00e-04, decay = 9.00e-01, epochs = 50
Inferred cardinality: 2
[GenerativeModel] F-1 Score: 0.0
[GenerativeModel] Model saved as <GenerativeModel_0>.
[GenerativeModel] Model saved as <GenerativeModel_best>.
[2] Testing step_size = 1.00e-06, decay = 9.50e-01, epochs = 100
Inferred cardinality: 2
[GenerativeModel] F-1 Score: 0.0
[3] Testing step_size = 1.00e-04, decay = 9.00e-01, epochs = 50
Inferred cardinality: 2
[GenerativeModel] F-1 Score: 0.0
[4] Testing step_size = 1.00e-03, decay = 9.50e-01, epochs = 20
Inferred cardinality: 2
[GenerativeModel] F-1 Score: 0.0
[5] Testing step_size = 1.00e-05, decay = 1.00e+00, epochs = 100
Inferred cardinality: 2
[GenerativeModel] F-1 Score: 0.0
[GenerativeModel] Model <GenerativeModel_0> loaded.


Unnamed: 0,step_size,decay,epochs,Prec.,Rec.,F-1
0,0.0001,0.9,50,0.0,0.0,0.0
1,1e-06,0.95,100,0.0,0.0,0.0
2,0.0001,0.9,50,0.0,0.0,0.0
3,0.001,0.95,20,0.0,0.0,0.0
4,1e-05,1.0,100,0.0,0.0,0.0


In [45]:
gen_model.weights.lf_accuracy

array([0.19891975, 0.6095837 , 0.13340544, 0.23601688, 0.13190345,
       0.13227257])

In [46]:
train_marginals = gen_model.marginals(L_train)

In [None]:
cands_train = session.query(LocationExtraction).filter(LocationExtraction.split == 0).all()

In [59]:
from collections import defaultdict
doc_extractions = {}
num_train_cands = L_train.shape[0]
train_cand_preds = (gen_model.marginals(L_train)>0.5)*2-1
for ind in range(num_train_cands):
    cand = L_train.get_candidate(session,ind)
    parent = cand.get_parent()
    doc_name = parent.document.name
    # Initializing key if it doesn't exist
    doc_extractions[doc_name] = defaultdict(list)
    loc = cand.location.get_span().lower()
    if train_cand_preds[ind] == 1:
        doc_extractions[doc_name]['location'].append(loc)
    doc_extractions[doc_name]['dummy'].append('dummy_ext')

In [67]:
a = list(doc_extractions.keys())[0:5]
doc_extractions

#url: "http://url1.com/....", uid: "unique_identifier_from_the_doc_id_field_in_memex-data", extractions: {location: "sdfsdf", name: "bob"}}

{'http://www.eroticmugshots.com/saltlakecity-escorts/801-980-8053/?pid=8282828': defaultdict(list,
             {'dummy': ['dummy_ext']}),
 'http://myproviderguide.com/barcelona/escorts/6598920': defaultdict(list,
             {'dummy': ['dummy_ext']}),
 'http://www.eroticmugshots.com/eastoregon-escorts/310-561-7325/?pid=9662561': defaultdict(list,
             {'dummy': ['dummy_ext']}),
 'http://www.eroticmugshots.com/eastoregon-escorts/310-561-7325/?pid=56867111': defaultdict(list,
             {'dummy': ['dummy_ext']}),
 'http://www.eroticmugshots.com/eastkentucky-escorts/323-929-4361/?pid=18646197': defaultdict(list,
             {'location': ['hot east coast lady'], 'dummy': ['dummy_ext']}),
 'http://www.eroticmugshots.com/mobile-escorts/805-328-3461/?pid=13474949': defaultdict(list,
             {'dummy': ['dummy_ext']}),
 'http://www.eroticmugshots.com/sanfrancisco-escorts/571-839-1097/?pid=26085054': defaultdict(list,
             {'dummy': ['dummy_ext']}),
 'http://www.eroticm

In [None]:
import matplotlib.pyplot as plt
plt.hist(train_marginals, bins=20)
plt.show()

In [58]:
gen_model.learned_lf_stats()

AttributeError: 'GenerativeModel' object has no attribute 'fg'

In [None]:
L_dev = labeler.apply_existing(split=1)

In [None]:
tp, fp, tn, fn = gen_model.error_analysis(session, L_dev, L_gold_dev)

## Error Analysis

In [None]:
from snorkel.viewer import SentenceNgramViewer

# NOTE: This if-then statement is only to avoid opening the viewer during automated testing of this notebook
# You should ignore this!
import os
if 'CI' not in os.environ:
    sv = SentenceNgramViewer(fn, session)
else:
    sv = None

In [None]:
sv

In [None]:
c = sv.get_selected() if sv else list(fp.union(fn))[0]
c

In [None]:
c.labels

In [None]:
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])

In [None]:
from snorkel.annotations import save_marginals
%time save_marginals(session, L_train, train_marginals)