# Environment Setup

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# Notebook-level settings for the spouse demo. These are later passed through
# merge_configs(), which combines them with the local and global defaults.
config = dict(
    domain='spouse',
    postgres=False,
    parallelism=1,
    db_name='babble_spouse_demo',
    babbler_candidate_split=1,
    babbler_label_split=1,
    supervision='majority_vote',
    gen_model_search_space=1,
    disc_model_search_space=1,
)

In [6]:
# Build the database connection string from `config` and export it.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Fallback name, used only when the config carries no explicit 'db_name'.
if config.get('debug', False):
    default_db_name = 'babble_' + config['domain'] + '_debug'
else:
    default_db_name = 'babble_' + config['domain']
DB_NAME = config.get('db_name', default_db_name)

# sqlite is the default backend; its database name gets a '.db' suffix.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The host part stays empty unless a port was configured.
DB_ADDR = ""
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [7]:
# Open the Snorkel session. snorkel is imported here, after $SNORKELDB has
# been exported by the previous cell.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# The merged result replaces the notebook-level `config` dict.
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting domain=None to domain=spouse
Overwriting print_freq=1 to print_freq=5
Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting babbler_label_split=0 to babbler_label_split=1
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting gen_model_search_space=10 to gen_model_search_space=1
Overwriting babbler_candidate_split=0 to babbler_candidate_split=1
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=majority_vote


In [8]:
from snorkel.models import candidate_subclass
from tutorials.babble.spouse import SpousePipeline

# Candidate type with two arguments, named person1 and person2.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
# Generic alias for the candidate type.
candidate_class = Spouse
# Pipeline object used for the parse/extract/load_gold steps below and for
# the final supervise/classify cells.
pipe = SpousePipeline(session, Spouse, config)

### Initialization

In [9]:
# %time pipe.parse()

In [10]:
# %time pipe.extract()

In [11]:
# %time pipe.load_gold()

### Pre-load Explanations (10) + User Lists (5)

In [12]:
from snorkel.contrib.babble import BabbleStream
# Stream of Spouse candidates used throughout the demo; the fixed seed is
# passed so that the shuffled order can be repeated between runs
# (presumably -- confirm against BabbleStream).
bs = BabbleStream(session, candidate_class=Spouse, balanced=True, shuffled=True, seed=1234)

In [13]:
from tutorials.babble.spouse.spouse_examples import get_explanations, get_user_lists

# Candidates from split 0 are handed to get_explanations(), which links the
# example explanations to them (see the progress output of this cell).
candidates = session.query(Spouse).filter(Spouse.split == 0).all()
spouse_explanations = get_explanations(candidates)
# Uncomment to start the demo without any pre-loaded explanations.
# spouse_explanations = []
spouse_user_lists = get_user_lists()

Building list of target candidate ids...
Collected 10 unique target candidate ids from 10 explanations.
Gathering desired candidates...
Found 10/10 desired candidates
Linking explanations to candidates...
Linked 10/10 explanations


In [14]:
bs.preload(explanations=spouse_explanations, user_lists=spouse_user_lists)

Created grammar with 499 rules
Flushing all parses from previous explanation set.
All previously uncommitted parses have been flushed.
10 explanation(s) out of 10 were parseable.
29 parse(s) generated from 10 explanation(s).
19 parse(s) remain (10 parse(s) removed by DuplicateSemanticsFilter).
12 parse(s) remain (7 parse(s) removed by ConsistencyFilter).
Applying labeling functions to split 1

11 parse(s) remain (1 parse(s) removed by UniformSignatureFilter: (1 None, 0 All)).
9 parse(s) remain (2 parse(s) removed by DuplicateSignatureFilter).
Added 9 parse(s) to set. (Total # parses = 9)


# Start Demo:

### View user_lists

In [15]:
# Print each alias with its values; lists longer than ten entries are cut to
# their first ten items and marked with a trailing "...".
user_lists = bs.user_lists
for alias, values in user_lists.items():
    if len(values) > 10:
        print("{}:\n {}...\n".format(alias, list(values)[:10]))
    else:
        print("{}:\n {}\n".format(alias, values))

known_spouses:
 [('Eleanor Powell', 'Glenn Ford'), ('Andronikos Doukas', 'Maria of Bulgaria'), ('Marjorie Rambeau', 'Willard Mack'), ('Margo St. James', 'Paul Avery'), ('Joan of England', 'William II the Good'), ('Maiko Jeong Shun Lee', 'The Viscount Rothermere'), ('Heinrich von Coudenhove-Kalergi', 'Mitsuko Aoyama'), ('Kiran Nadar', 'Shiv Nadar ( )'), ('Cecilia Mnsdotter Eka', 'Erik Johansson Vasa'), ('Bonne of Bohemia', 'John the Good')]...

spouse:
 ['spouse', 'wife', 'husband', 'ex-wife', 'ex-husband']

other:
 ['boyfriend', 'girlfriend', 'boss', 'employee', 'secretary', 'co-worker']

family:
 ['father', 'father', 'mother', 'sister', 'sisters', 'brother', 'brothers', 'son', 'sons', 'daughter']...

last_names:
 [('Merezhkovsky', 'Gippius'), ('Deakin', 'Simkins'), ('Mattson', 'Roth'), ('Kenyatta', 'Kenyatta'), ('Neville', 'Howard'), ('Nelson', 'Mason'), ('Troy', 'McNamara'), ('Vuuren', 'Vuuren'), ('Menthe', 'Brunswick-Lneburg'), ('Tyson', 'Davis')]...



Optionally add another user_list.

In [16]:
# Name of the extra user list; leave as None to skip this step.
ALIAS = None    # e.g., 'marriage_words'
# Entries of the extra user list.
VALUES = []    # e.g., ['engaged', 'betrothed', 'proposed']

# Register the list only when an alias was provided.
if ALIAS:
    bs.add_user_lists({ALIAS: VALUES})

### View a candidate

In [17]:
from snorkel.viewer import SentenceNgramViewer

# Pull the next candidate from the stream and show it in a viewer whose
# height grows with the sentence length (two units per word, at least 80).
c = bs.next()
sv = SentenceNgramViewer(
    [c],
    session,
    n_per_page=1,
    height=max(2 * len(c.get_parent().words), 80),
)
sv

<IPython.core.display.Javascript object>

A Jupyter Widget

In [18]:
from snorkel.lf_helpers import *
from IPython.core.display import HTML

In [19]:
def candidate_html(c):
    """Render a candidate's text as an HTML card with both arguments highlighted.

    The text chunks come from ``get_text_splits(c)``; the placeholder chunks
    ``{{A}}`` and ``{{B}}`` are replaced by the highlighted spans of ``c[0]``
    and ``c[1]`` respectively. All corpus text is HTML-escaped before being
    inserted, so characters such as ``<`` or ``&`` are displayed literally
    instead of being interpreted as markup.

    Args:
        c: candidate accepted by ``get_text_splits``; ``c[0]`` and ``c[1]``
            must provide ``get_span()``.

    Returns:
        The HTML snippet as a (unicode) string.
    """
    # html.escape exists on Python 3; cgi.escape is the Python 2 equivalent.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    div_tmpl = u'''<div style="border: 1px #858585; box-shadow:0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);
    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt; width: 80%; margin: auto; margin-top: 2%">{}</div>'''
    arg_tmpl = u'<b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">{}</b>'
    sent_tmpl = u'<p style="font-size:12pt;">{}</p>'

    # Placeholder chunk -> index of the candidate argument it stands for.
    arg_index = {u"{{A}}": 0, u"{{B}}": 1}

    pieces = []
    for chunk in get_text_splits(c):
        if chunk in arg_index:
            span = c[arg_index[chunk]].get_span()
            pieces.append(arg_tmpl.format(escape(span, quote=False)))
        else:
            # Escape first, then turn newlines into line breaks, so the
            # <BR/> tags added here are not escaped themselves.
            pieces.append(escape(chunk, quote=False).replace(u"\n", u"<BR/>"))

    text = u"".join(pieces)
    return div_tmpl.format(sent_tmpl.format(text.strip()))

In [54]:
c

Spouse(Span("Faith Jenkinson", sentence=49632, chars=[72,86], words=[13,14]), Span("Andrew Asher", sentence=49632, chars=[125,136], words=[25,26]))

In [20]:
HTML(candidate_html(c))

### Give an explanation

(See MTurk instructions for examples)

In [21]:
# LABEL is the label the explanation assigns; CONDITION is the
# natural-language condition. Both are passed to Explanation() in the next cell.
LABEL = True
CONDITION = "'wife' is immediately to the left of arg 1"
# Alternative conditions to try:
# CONDITION = "there are no people between the arg 1 and arg 2 and 'husband' is immediately to the left of arg 2"
# CONDITION = "'announcing' is less than six words to the left of arg 2"
# CONDITION = "'syndrome' occurs to within three words to the right of arg 2"

With the user input and the current candidate, we make an Explanation object.

In [22]:
from snorkel.contrib.babble import Explanation
explanation = Explanation(CONDITION, LABEL, candidate=c)

In [23]:
explanation

Explanation("True, 'wife' is immediately to the left of arg 1")

### Parse and view labeling stats

In [24]:
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

Flushing all parses from previous explanation set.
All previously uncommitted parses have been flushed.
1 explanation(s) out of 1 were parseable.
1 parse(s) generated from 1 explanation(s).
1 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
1 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
Applying labeling functions to split 1

1 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
1 parse(s) remain (0 parse(s) removed by DuplicateSignatureFilter).
CPU times: user 537 ms, sys: 235 ms, total: 771 ms
Wall time: 685 ms


In [25]:
bs.filtered_analysis(filtered_parses)

No filtered parses to analyze.


In [31]:
# One (parse, confusion matrix, stats) tuple per parse. Materialised as a list
# because it is indexed afterwards; a bare zip() is only indexable on Python 2.
tup_list = list(zip(parse_list, conf_matrix_list, stats_list))

In [53]:
# List the candidates that the first parse labeled correctly.
# print() is called as a function, as in the rest of the notebook.
for item in tup_list[0][1].correct:
    print(item)

Spouse(Span("Faith Jenkinson", sentence=49632, chars=[72,86], words=[13,14]), Span("Andrew Asher", sentence=49632, chars=[125,136], words=[25,26]))
Spouse(Span("Rachel Weisz", sentence=20804, chars=[44,55], words=[7,8]), Span("Mr Craig", sentence=20804, chars=[68,75], words=[12,13]))
Spouse(Span("Kim", sentence=23467, chars=[42,44], words=[8,8]), Span("Andy", sentence=23467, chars=[158,161], words=[31,31]))
Spouse(Span("Christi", sentence=67290, chars=[154,160], words=[29,29]), Span("  Gibson", sentence=67290, chars=[220,227], words=[41,42]))


In [26]:
# Choose which parse to inspect and pull out its confusion matrix and stats.
PARSE_IDX = 0
parse, conf_matrix, stats = (
    parse_list[PARSE_IDX],
    conf_matrix_list[PARSE_IDX],
    stats_list[PARSE_IDX],
)

# Show the parse as translated by the grammar, then its labeling statistics.
print("Parse {}:\n{}\n".format(
    PARSE_IDX,
    bs.semparser.grammar.translate(parse.semantics),
))
print(stats.accuracy)
print(stats.class_coverage)
print(stats.coverage)

Parse 0:
return 1 if call(in text(left(arg1,'.eq',1,'words')), 'wife') else 0

Accuracy: 21.05% (4/19)
ClassCoverage: 10.38% (19/183)
Coverage: 0.78% (19/2448)


### View labeled candidates

Select the subset of labeled candidates you would like to view.

In [37]:
SUBSET = 'correct' # Must be one of ['correct', 'incorrect', 'abstained']

# Fail early with a clear message instead of an AttributeError from getattr().
if SUBSET not in ('correct', 'incorrect', 'abstained'):
    raise ValueError(
        "SUBSET must be one of ['correct', 'incorrect', 'abstained'], got {!r}".format(SUBSET))

# Look at the parse selected via PARSE_IDX in the stats cell rather than
# always at the first parse.
subset = getattr(conf_matrix_list[PARSE_IDX], SUBSET)

def candidate_generator(subset):
    """Yield the candidates of ``subset`` one at a time, in iteration order."""
    for c in subset:
        yield c

subset_generator = candidate_generator(subset)

In [38]:
from snorkel.viewer import SentenceNgramViewer
# Re-run this cell to step through the selected subset one candidate at a time.
# The built-in next() advances the generator on Python 2.6+ and Python 3 alike;
# the .next() method exists only on Python 2.
c = next(subset_generator)
sv = SentenceNgramViewer([c], session, n_per_page=3, height=max(len(c.get_parent().words)*2, 80))
sv

<IPython.core.display.Javascript object>

A Jupyter Widget

In [63]:
# Collect the rendered HTML of every correctly labeled candidate, across all
# parses. The dict is created once, before the loop, so results from earlier
# parses are kept and the name exists even when conf_matrix_list is empty.
tf_sentence_dict = {"True": []}
for parse_conf_matrix in conf_matrix_list:
    tf_sentence_dict["True"].extend(
        candidate_html(candidate) for candidate in parse_conf_matrix.correct
    )

In [64]:
tf_sentence_dict

{'True': [u'<div style="border: 1px #858585; box-shadow:0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);\n    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt; width: 80%; margin: auto; margin-top: 2%"><p style="font-size:12pt;">Michael Ulatowski (pictured) did not finalise the divorce with his wife <b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">Faith Jenkinson</b>, but she went to marry another man, \xa0<b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">Andrew Asher</b>.</p></div>',
  u'<div style="border: 1px #858585; box-shadow:0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);\n    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt; width: 80%; margin: auto; margin-top: 2%"><p style="font-size:12pt;">Daniel Craig pictured with his actress wife <b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">Rachel Weisz</b>   But when <b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">Mr Craig</b> attacks James Bond 

### Commit parses

If you are satisfied with the given parses, commit them.

In [39]:
bs.commit()

Added 1 parse(s) to set. (Total # parses = 10)


### View global stats

In [40]:
bs.get_global_coverage()

GlobalCoverage: 97.63% (2390/2448)

In [41]:
bs.num_dev_total

2448

In [42]:
bs.get_lf_stats()

Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.
LF_spouse_to_left_0,0,0.051471,0.051471,0.051471,80,46,0,0,0.634921
LF_no_spouse_in_sentence_0,1,0.822304,0.795343,0.005719,0,0,63,1950,0.968703
LF_married_after_3,2,0.009804,0.009804,0.009804,10,14,0,0,0.416667
LF_family_between_0,3,0.965278,0.941993,0.065359,0,0,179,2184,0.924249
LF_family_to_left_0,4,0.078023,0.077614,0.003268,0,0,7,184,0.963351
LF_other_between_0,5,0.026961,0.026961,0.000817,0,0,3,63,0.954545
LF_too_far_apart_0,6,0.555964,0.555964,0.035131,0,0,76,1285,0.944159
LF_third_wheel_0,7,0.365196,0.365196,0.020833,0,0,36,858,0.959732
LF_identical_args_0,8,0.04902,0.048611,0.000817,0,0,0,120,1.0
Explanation0_0,9,0.007761,0.006944,0.006944,4,15,0,0,0.210526


In [28]:
bs.get_parses()

["return 1 if any(map(in text(left(arg2,'.leq',2,'words')), $'spouse'$)) else 0",
 "return -1 if call((= 0), sum(map(in text(sentence()), $'spouse'$))) else 0",
 "return 1 if (call(in text(between([arg1,arg2])), 'and') and any(map(in text(right(arg2)), ['married','marriage']))) else 0",
 'return -1 if call((>= 1), count(filter(between([arg1,arg2]), words, \\w+\\S*))) else 0',
 "return -1 if any(map(in text(left(arg2,'.leq',3,'words')), $'family'$)) else 0",
 "return -1 if any(map(in text(between([arg1,arg2])), $'other'$)) else 0",
 'return -1 if call((> 10), count(between([arg1,arg2]))) else 0',
 'return -1 if call((>= 1), count(filter(between([arg1,arg2]), ner_tags, PERSON))) else 0',
 'return -1 if call((= text(arg2)), text(arg1)) else 0']

In [29]:
bs.get_lfs()

[<function snorkel.contrib.babble.grammar.grammar.LF_spouse_to_left_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_no_spouse_in_sentence_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_married_after_3>,
 <function snorkel.contrib.babble.grammar.grammar.LF_family_between_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_family_to_left_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_other_between_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_too_far_apart_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_third_wheel_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_identical_args_0>]

In [30]:
bs.get_explanations()

[Explanation("LF_spouse_to_left: True, there is a spouse word within two words to the left of arg 1 or arg 2"),
 Explanation("LF_no_spouse_in_sentence: False, there are no spouse words in the sentence"),
 Explanation("LF_married_after: True, the word 'and' is between arg 1 and arg 2 and 'married' or 'marriage' is after arg 2"),
 Explanation("LF_family_between: False, there is a family word between arg 1 and arg 2"),
 Explanation("LF_family_to_left: False, there is a family word within three words to the left of arg 1 or arg 2"),
 Explanation("LF_other_between: False, there is an other word between arg 1 and arg 2"),
 Explanation("LF_too_far_apart: False, the number of words between arg 1 and arg 2 is larger than 10"),
 Explanation("LF_third_wheel: False, there is a person between arg 1 and arg 2"),
 Explanation("LF_identical_args: False, arg 1 is identical to arg 2")]

## REPEAT (go back to "START DEMO")

When you have entered all of the explanations that you would like to, run these final cells.

In [43]:
# NOTE(review): the saved output of this cell ends in an AssertionError --
# re-run on a fresh kernel and confirm before relying on this step.
%time pipe.supervise()

Using L_train: <23425x1 sparse matrix of type '<type 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>
Using L_gold_train: <23425x1 sparse matrix of type '<type 'numpy.int64'>'
	with 1230 stored elements in Compressed Sparse Row format>
Positive Fraction: 0.3%



AssertionError: 

In [44]:
# NOTE(review): the saved output of this cell is "IndexError: list index out
# of range"; possibly a consequence of the failed supervise() step -- TODO confirm.
%time pipe.classify()

IndexError: list index out of range