In [1]:
# Load IPython's autoreload extension and select mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Setup

In [4]:
# Notebook-level settings for the spouse demo. Later cells read these keys to
# build $SNORKELDB and then pass the mapping through merge_configs().
config = dict(
    domain='spouse',
    postgres=True,
    parallelism=1,
    db_name='babble_spouse_demo',
    debug=False,
    babbler_candidate_split=1,
    babbler_label_split=1,
)

In [5]:
# Build the database connection string from `config` and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Fallback name: 'babble_<domain>', with a '_debug' suffix for debug runs.
default_db_name = 'babble_' + config['domain']
if config.get('debug', False):
    default_db_name += '_debug'
DB_NAME = config.get('db_name', default_db_name)

# A missing or falsy 'postgres' entry selects sqlite, whose name gets a '.db' suffix.
if config.get('postgres'):
    DB_TYPE = 'postgres'
else:
    DB_TYPE, DB_NAME = 'sqlite', DB_NAME + '.db'

# The address part stays empty unless an explicit port is configured.
DB_ADDR = ""
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = postgres:///babble_spouse_demo


In [6]:
# Open the Snorkel session (this relies on $SNORKELDB having been exported already).
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

# Debug runs override the document cap, both search-space sizes and the epoch counts.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    for key, value in (('max_docs', 100),
                       ('gen_model_search_space', 2),
                       ('disc_model_search_space', 2)):
        config[key] = value
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

Overwriting domain=None to domain=spouse
Overwriting babbler_candidate_split=0 to babbler_candidate_split=1
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting postgres=False to postgres=True
Overwriting babbler_label_split=0 to babbler_label_split=1


In [7]:
from snorkel.models import candidate_subclass
from snorkel.contrib.babble import ExplanationIO
from tutorials.babble.spouse import SpousePipeline

# Candidate class 'Spouse' with arguments 'person1' and 'person2'; the same
# class object is also bound to `candidate_class`.
candidate_class = Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
pipe = SpousePipeline(session, candidate_class, config)

## Parse, Extract, Load

In [8]:
# %time pipe.parse()

In [9]:
# %time pipe.extract()

In [10]:
# %time pipe.load_gold()

In [9]:
from tutorials.babble.spouse.spouse_examples import get_explanations, get_user_lists

# Fetch every Spouse candidate with split == 0, then link the example
# explanations to them and load the accompanying user lists.
candidates = (
    session.query(Spouse)
    .filter(Spouse.split == 0)
    .all()
)
spouse_explanations = get_explanations(candidates)
spouse_user_lists = get_user_lists()

Building list of target candidate ids...
Collected 11 unique target candidate ids from 11 explanations.
Gathering desired candidates...
Found 11/11 desired candidates
Linking explanations to candidates...
Linked 11/11 explanations


## Now the real work begins...

In [10]:
from snorkel.contrib.babble import BabbleStream

# Stream over Spouse candidates using the 'linear' strategy.
bs = BabbleStream(
    session,
    candidate_class=Spouse,
    strategy='linear',
)

In [11]:
# Seed the stream with the example explanations and user lists loaded earlier.
bs.preload(
    user_lists=spouse_user_lists,
    explanations=spouse_explanations,
)

Created grammar with 494 rules
11 explanation(s) out of 11 were parseable.
24 parse(s) generated from 11 explanation(s).
20 parse(s) remain (4 parse(s) removed by DuplicateSemanticsFilter).
13 parse(s) remain (7 parse(s) removed by ConsistencyFilter).
Applying labeling functions to split 1

12 parse(s) remain (1 parse(s) removed by UniformSignatureFilter: (1 None, 0 All)).
9 parse(s) remain (3 parse(s) removed by DuplicateSignatureFilter).
Added 9 parse(s) to set. (Total # parses = 9)
Added 9 explanation(s) to set. (Total # explanations = 9)


In [12]:
# Display the stream's current label matrix (rendered below as a sparse-matrix
# summary; the column count appears to track the number of parses — TODO confirm).
bs.get_label_matrix()

<2456x9 sparse matrix of type '<type 'numpy.float64'>'
	with 7156 stored elements in Compressed Sparse Row format>

In [14]:
# from snorkel.contrib.babble import Babbler
# bb = Babbler('text', Spouse, spouse_explanations, user_lists=spouse_user_lists)
# bb.apply(split=1)

In [15]:
# Pull the next item from the stream; later cells index it (c[0], c[1]) and pass
# it as the `candidate` of new Explanation objects.
c = bs.next()

In [16]:
# Import only the helper the rendering cell uses instead of a wildcard, so it
# stays clear where names come from, and take the display utilities from
# IPython's public `IPython.display` module.
from snorkel.lf_helpers import get_text_splits
from IPython.display import display, HTML

In [88]:
# Render candidate `c` as an HTML snippet with both argument spans highlighted.
# All text taken from the corpus is HTML-escaped before it is interpolated, so
# characters such as '<' or '&' in a sentence cannot corrupt the markup.
try:
    from html import escape as _escape_html  # Python 3
except ImportError:
    from cgi import escape as _escape_html   # Python 2

chunks = get_text_splits(c)
arg_tmpl = u'<b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">{0}<small style="color:#4B86A8; font-size:9.5pt;">{1}</small></b>'
sent_tmpl = u'<p style="font-size:12pt;">{}</p>'
div_tmpl = u'''<div style="border: 1px dotted #858585; border-radius:8px;
    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt">{}</div>'''

# Placeholder chunk -> (index of the candidate argument, badge shown after the span).
arg_badges = {u"{{A}}": (0, " arg1 "), u"{{B}}": (1, " arg2 ")}

pieces = []
for s in chunks:
    if s in arg_badges:
        arg_idx, badge = arg_badges[s]
        pieces.append(arg_tmpl.format(_escape_html(c[arg_idx].get_span()), badge))
    else:
        # Escape first, then turn newlines into explicit line breaks.
        pieces.append(_escape_html(s).replace(u"\n", u"<BR/>"))
text = u"".join(pieces)

html = div_tmpl.format(sent_tmpl.format(text.strip()))
HTML(html)

In [16]:
# Show the text of the two argument spans of candidate `c`.
# print() with a single argument behaves the same on Python 2 and Python 3 and
# matches the call style used by the other cells.
print(c[0].get_span())
print(c[1].get_span())

Mr Simms
Penny Wright


In [48]:
from snorkel.viewer import SentenceNgramViewer

# Open a viewer on the single candidate `c` and show the first selected item.
sv = SentenceNgramViewer(
    [c],
    session,
    height=150,
    n_per_page=1,
)
# sv.get_selected()[0].sentence.text
sv.get_selected()[0]

<IPython.core.display.Javascript object>

Span("Mr Simms", sentence=65556, chars=[0,7], words=[0,1])

In [17]:
from snorkel.contrib.babble import Explanation

# Explanation attached to candidate `c`: label False with the condition below.
condition = "'the' is immediately to the left of arg 1"
label = False
explanation = Explanation(
    condition,
    label,
    candidate=c,
    name='',
)

In [18]:
# Display the repr of the explanation built in the previous cell.
explanation

Explanation("False, 'the' is immediately to the left of arg 1")

In [19]:
# Apply the explanation through the stream and time the call; the result unpacks
# into the surviving parses, their confusion matrices and their statistics.
%time parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

1 explanation(s) out of 1 were parseable.
1 parse(s) generated from 1 explanation(s).
1 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
1 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
Applying labeling functions to split 1

1 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
1 parse(s) remain (0 parse(s) removed by DuplicateSignatureFilter).
CPU times: user 233 ms, sys: 20.7 ms, total: 254 ms
Wall time: 243 ms


In [20]:
# Report accuracy and class coverage of the first entry in stats_list.
first_stats = stats_list[0]
print("Accuracy: {}".format(first_stats.accuracy))
print("Class Coverage: {}".format(first_stats.class_coverage))

Accuracy: 0.983870967742
Class Coverage: 0.0261603375527


In [21]:
# Show up to five candidates from the first confusion matrix's `tn` collection
# (presumably the true negatives — confirm against the confusion-matrix class).
list(conf_matrix_list[0].tn)[:5]

[Spouse(Span("Qataris", sentence=51908, chars=[68,74], words=[11,11]), Span("Mozah", sentence=51908, chars=[140,144], words=[23,23])),
 Spouse(Span("Rafael", sentence=63403, chars=[30,35], words=[7,7]), Span("Barack Obama", sentence=63403, chars=[155,166], words=[30,31])),
 Spouse(Span("Juju", sentence=60386, chars=[7,10], words=[2,2]), Span("  ", sentence=60386, chars=[92,93], words=[20,20])),
 Spouse(Span("Knot", sentence=31091, chars=[70,73], words=[13,13]), Span("Erika Christensen", sentence=31091, chars=[83,99], words=[17,18])),
 Spouse(Span("Kanye West", sentence=60996, chars=[59,68], words=[13,14]), Span("Bowl", sentence=60996, chars=[250,253], words=[49,49]))]

In [22]:
# Commit to the stream; the output below reports one parse and one explanation
# being added to the set. No indices are passed here, so this presumably commits
# everything from the preceding bs.apply() call — TODO confirm.
bs.commit()

Added 1 parse(s) to set. (Total # parses = 10)
Added 1 explanation(s) to set. (Total # explanations = 10)


In [35]:
# NOTE(review): `parse_results` is not assigned in any visible cell, so on a fresh
# Restart & Run All this line would raise NameError unless it is defined elsewhere.
# `conf_matrix_list` and `stats_list` are already bound by the earlier
# `bs.apply(explanation)` cell — confirm whether this cell is stale.
conf_matrix_list, stats_list = parse_results

In [51]:
# Print the sentence text of the first argument of every candidate found in the
# `tn` collection of each confusion matrix.
# NOTE(review): `conf_matrix_items` is never populated in the visible cells.
conf_matrix_items = []
for matrix in conf_matrix_list:
    for item in matrix.tn:
        # print() call form: same output on Python 2, and valid on Python 3.
        print(item[0].sentence.text)

Mr Bennett, who walked with two sticks due to a hip replacement, had his back to the van, which was about a metre behind him.     '
The alleged plan never came to fruition as Labour was removed from power and replaced by the Coalition soon after Mr Blumenthal's email.     
Mr Simms will replace Penny Wright in the Senate after she announced her decision to quit politics because of a family illness.   
Outgoing Greens senator Penny Wright is expected to be replaced by Adelaide councillor Robert Simms.   


In [30]:
# Keep the stream's current label matrix as L_train and display its summary repr.
L_train = bs.get_label_matrix()
L_train

<2456x11 sparse matrix of type '<type 'numpy.float64'>'
	with 7224 stored elements in COOrdinate format>

### Add another explanation

In [30]:
from snorkel.contrib.babble import Explanation

# Second explanation for candidate `c`: label False with the condition below.
condition = "'where' is within two words to the right of arg 1"
label = False
explanation = Explanation(
    condition,
    label,
    candidate=c,
    name='',
)

In [31]:
# Apply the new explanation through the stream and time the call; the result
# unpacks into the surviving parses, their confusion matrices and their statistics.
%time parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

1 explanation(s) out of 1 were parseable.
2 parse(s) generated from 1 explanation(s).
2 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
2 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
Applying labeling functions to split 1

2 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
1 parse(s) remain (1 parse(s) removed by DuplicateSignatureFilter).
CPU times: user 351 ms, sys: 14.3 ms, total: 365 ms
Wall time: 360 ms


In [32]:
# Report accuracy and class coverage of the first entry in stats_list.
first_stats = stats_list[0]
print("Accuracy: {}".format(first_stats.accuracy))
print("Class Coverage: {}".format(first_stats.class_coverage))

Accuracy: 1.0
Class Coverage: 0.00084388185654


In [33]:
# Commit to the stream; the output below reports one more parse and explanation
# being added to the set (presumably those from the preceding bs.apply() call —
# TODO confirm).
bs.commit()

Added 1 parse(s) to set. (Total # parses = 11)
Added 1 explanation(s) to set. (Total # explanations = 11)


In [34]:
# Take the first parse from the last bs.apply() result and display its
# semantics, shown below as a nested tuple.
parse = parse_list[0]
parse.semantics

('.root',
 ('.label',
  ('.bool', False),
  ('.call',
   ('.in',
    ('.extract_text',
     ('.right',
      ('.arg', ('.int', 1)),
      ('.string', '.eq'),
      ('.int', 2),
      ('.string', 'words')))),
   ('.string', 'where'))))

In [36]:
# Ask the semantic parser's grammar to translate the semantics into the readable
# pseudo-code string shown below.
bs.semparser.grammar.translate(parse.semantics)

"return -1 if call(in text(right(arg1,'.eq',2,'words')), 'where') else 0"

In [None]:
# Pipeline `label` step. This cell has no execution count, i.e. it was not run
# in the saved session.
pipe.label()

In [None]:
# Pipeline `supervise` step. This cell has no execution count, i.e. it was not
# run in the saved session.
pipe.supervise()

In [None]:
# Pipeline `classify` step. This cell has no execution count, i.e. it was not
# run in the saved session.
pipe.classify()

In [38]:
# All candidates of the pipeline's candidate class with split == 0, ordered by id.
train = (
    session.query(pipe.candidate_class)
    .filter(pipe.candidate_class.split == 0)
    .order_by(pipe.candidate_class.id)
    .all()
)

In [39]:
# Number of candidates returned by the query above. print() call form matches
# the other cells and is valid on both Python 2 and Python 3.
print(len(train))

23490
