# Environment Setup

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to library code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings for this demo run. They are passed through
# merge_configs() and then handed to the pipeline further down.
config = dict(
    domain='spouse',
    postgres=False,
    parallelism=1,
    db_name='babble_spouse_demo',
    babbler_candidate_split=1,
    babbler_label_split=1,
    supervision='majority',
    disc_model_class='logreg',
    gen_model_search_space=1,
    disc_model_search_space=1,
)

In [3]:
# Build the database connection string from `config` and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Fallback database name, used only when config has no 'db_name' entry.
debug_suffix = '_debug' if config.get('debug', False) else ''
default_db_name = 'babble_' + config['domain'] + debug_suffix
DB_NAME = config.get('db_name', default_db_name)

# A truthy 'postgres' entry selects postgres; otherwise a sqlite file is used
# and the database name gets a '.db' extension.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The host part stays empty unless a port was configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

snorkel_db_url = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
os.environ['SNORKELDB'] = snorkel_db_url
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [4]:
# Open the Snorkel session. This import deliberately lives here rather than at
# the top of the notebook: the earlier cell notes that $SNORKELDB must be set
# before any snorkel import.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# The notebook's `config` dict is replaced by the merged result; the overrides
# applied are printed as "Overwriting ..." lines.
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting domain=None to domain=spouse
Overwriting print_freq=1 to print_freq=5
Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting gen_model_search_space=10 to gen_model_search_space=1
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=majority


In [5]:
from snorkel.models import candidate_subclass
from tutorials.babble.spouse import SpousePipeline

# Candidate type for this task: a 'Spouse' candidate with two arguments,
# named 'person1' and 'person2'.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
# Generic alias for the candidate class (the cells visible in this notebook
# pass `Spouse` directly).
candidate_class = Spouse
# Pipeline object used by the final cells (label / supervise / classify).
pipe = SpousePipeline(session, Spouse, config)

### Initialization

In [6]:
# %time pipe.parse()

In [7]:
# %time pipe.extract()

In [8]:
# %time pipe.load_gold()

In [9]:
# %time pipe.featurize()

### Pre-load Explanations (10) + User Lists (4)

In [10]:
# Create the BabbleStream (`bs`) that the rest of the demo drives: it serves
# candidates, parses explanations, and tracks committed parses.
# NOTE(review): the fixed seed presumably makes the shuffled candidate order
# repeatable across runs -- confirm against BabbleStream.
from snorkel.contrib.babble import BabbleStream
bs = BabbleStream(session, candidate_class=Spouse, balanced=True, shuffled=True, seed=1234)

Created grammar with 596 rules


In [11]:
# from tutorials.babble.spouse.spouse_examples import get_explanations, get_user_lists

# spouse_explanations = get_explanations()
# spouse_user_lists = get_user_lists()
# spouse_explanations = []
# spouse_user_lists = {}

In [12]:
# bs.preload(explanations=spouse_explanations, user_lists=spouse_user_lists)

# Start Demo:

### View user_lists

In [13]:
# Print every user list known to the stream, previewing at most 10 values each.
user_lists = bs.user_lists
for alias, values in user_lists.items():
    is_long = len(values) > 10
    # Long lists are cut to their first 10 entries and marked with "...".
    preview = list(values)[:10] if is_long else values
    template = "{}:\n {}...\n" if is_long else "{}:\n {}\n"
    print(template.format(alias, preview))

Optionally add another user_list.

In [14]:
# Fill in both values to register an extra user list; leave ALIAS as None to skip.
ALIAS = None    # e.g., 'marriage_words'
VALUES = []    # e.g., ['engaged', 'betrothed', 'proposed']

# Only a truthy ALIAS triggers registration, so the defaults above are a no-op.
if ALIAS:
    bs.add_user_lists({ALIAS: VALUES})

### View a candidate

In [15]:
# Pull the next candidate from the stream; `c` is the candidate the following
# explanation will be attached to.
c = bs.next()
from snorkel.viewer import SentenceNgramViewer
# Viewer height is twice the number of words in the candidate's parent, with a floor of 80.
sv = SentenceNgramViewer([c], session, n_per_page=1, height=max(len(c.get_parent().words)*2, 80))
sv

<IPython.core.display.Javascript object>

A Jupyter Widget

In [20]:
# Show the stable id of the candidate currently held in `c`.
c.get_stable_id()

u'affed66e-3a08-40d0-b11f-51b5727dccc4::span:224:226~~affed66e-3a08-40d0-b11f-51b5727dccc4::span:340:343'

### Give an explanation

(See MTurk instructions for examples)

In [16]:
# Label the explanation assigns when CONDITION holds for a candidate.
LABEL = True
# Natural-language condition, phrased in terms of the candidate arguments X and Y.
CONDITION = "there are no people between X and Y and 'husband' is immediately to the left of Y"

# CONDITION = ("'husband' is immediately to the left of Y and there are no people between X and Y")
# CONDITION = ("there are no people between X and Y and 'husband' is immediately to the left of Y")

# LABEL = True
# CONDITION = "X and Y are less than 10 words apart and 'wife' is between them"

# LABEL = False
# CONDITION = "'syndrome' occurs within three words to the right of arg 2"

# LABEL = False
# CONDITION = "')' is between X and Y"

# LABEL = True
# CONDITION = "'wife' is immediately before Y and X and Y are within 7 words of each other"


With the user input and the current candidate, we make an Explanation object.

In [17]:
from snorkel.contrib.babble import Explanation
# Bundle the condition text, its label, and the candidate currently held in `c`.
explanation = Explanation(CONDITION, LABEL, candidate=c)

### Parse and view labeling stats

In [18]:
# Parse the explanation and apply the resulting parse(s), timing the call.
# parse_list, conf_matrix_list and stats_list are indexed together by the next
# cell; filtered_parses is what bs.filtered_analysis() reports on.
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

Flushing all parses from previous explanation set.
Created grammar with 595 rules
1 explanation(s) out of 1 were parseable.
2 parse(s) generated from 1 explanation(s).
1 parse(s) remain (1 parse(s) removed by DuplicateSemanticsFilter).
1 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 6.4s.

1 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
1 parse(s) remain (0 parse(s) removed by DuplicateSignatureFilter).
CPU times: user 9.9 s, sys: 615 ms, total: 10.5 s
Wall time: 10.7 s


In [19]:
# Which of the parses returned by bs.apply() to inspect.
PARSE_IDX = 0
if not parse_list:
    print("No valid parses were found.")
else:
    # The three lists are indexed together: entry i of each belongs to parse i.
    parse = parse_list[PARSE_IDX]
    conf_matrix = conf_matrix_list[PARSE_IDX]
    stats = stats_list[PARSE_IDX]

    # Show the grammar's translation of the parse, then its labeling statistics.
    translated = bs.semparser.grammar.translate(parse.semantics)
    print("Parse {}:\n{}\n".format(PARSE_IDX, translated))
    for stat_name in ('accuracy', 'class_coverage', 'coverage'):
        print(getattr(stats, stat_name))

Parse 0:
return 1 if (count([w for w in the word(s) between([X,Y]) if w.ner_tags == PERSON]).(= 0) and 'husband'.in(text(exactly 1 word(s) to the left of Y))) else 0

Accuracy: 80.77% (21/26)
ClassCoverage: 10.71% (21/196)
Coverage: 0.93% (26/2796)


In [20]:
# Summarize the parses that the filters removed, with the reason for each.
bs.filtered_analysis(filtered_parses)

SUMMARY
1 TOTAL:
0 Unparseable Explanation
1 Duplicate Semantics
0 Inconsistency with Example
0 Uniform Signature
0 Duplicate Signature

[#1]: Duplicate Semantics

Parse: return 1 if (count([w for w in the word(s) between([X,Y]) if w.ner_tags == PERSON]).(= 0) and 'husband'.in(text(exactly 1 word(s) to the left of Y))) else 0

Reason: This parse is identical to one produced by the following explanation:
	"there are no people between X and Y and 'husband' is immediately to the left of Y"



In [21]:
# bs.semparser.grammar.print_chart()

### View labeled candidates

Select the subset of labeled candidates you would like to view.

In [22]:
SUBSET = 'correct' # Must be one of ['correct', 'incorrect', 'abstained']

# Fail early with a readable message on a typo in SUBSET, instead of an
# AttributeError from the getattr() below.
VALID_SUBSETS = ('correct', 'incorrect', 'abstained')
if SUBSET not in VALID_SUBSETS:
    raise ValueError("SUBSET must be one of {}, got {!r}".format(list(VALID_SUBSETS), SUBSET))

# Use the confusion matrix of the parse selected with PARSE_IDX in the
# "Parse and view labeling stats" cell, so the candidates shown here belong to
# the same parse whose statistics were printed (previously index 0 was
# hard-coded, which only agrees when PARSE_IDX == 0).
subset = getattr(conf_matrix_list[PARSE_IDX], SUBSET)


def candidate_generator(subset):
    """Yield the candidates in `subset` one at a time, in iteration order."""
    for c in subset:
        yield c


# Shared generator: each run of the viewing cell below consumes one candidate.
subset_generator = candidate_generator(subset)

In [23]:
from snorkel.viewer import SentenceNgramViewer
# Advance to the next candidate of the selected subset. The builtin next() is
# used instead of the generator's .next() method, which exists only on Python 2;
# next() behaves the same on Python 2.6+ and Python 3.
# Raises StopIteration once the subset is exhausted (re-run the previous cell to restart).
c = next(subset_generator)
# Viewer height is twice the number of words in the candidate's parent, with a floor of 80.
sv = SentenceNgramViewer([c], session, n_per_page=3, height=max(len(c.get_parent().words)*2, 80))
sv

<IPython.core.display.Javascript object>

### Commit parses

If you are satisfied with the given parses, commit them.

In [24]:
# Add the parse(s) from the latest bs.apply() to the stream's committed set
# (the call prints how many were added and the running total).
bs.commit()

Added 1 parse(s) from 1 explanations to set. (Total # parses = 1)


In [25]:
# Majority-vote quality of the committed parses on split 1, returned as a 3-tuple.
# NOTE(review): names suggest (F1, precision, recall) -- confirm against
# get_majority_quality. `re` here is a plain float, not the stdlib `re` module.
(f1, pr, re) = bs.get_majority_quality(split=1)
print(f1, pr, re)

(0.1891891891891892, 0.80769230769230771, 0.10714285714285714)


In [26]:
# Convert the F1 score above into an "equivalent number of labels" figure.
# The recorded output shows the value comes back as a string.
num_labels_equiv = bs.get_labeled_equivalent(f1)
num_labels_equiv

'1594'

In [27]:
# %time bs.label_split(0)

In [28]:
# %time bs.label_split(2)

In [29]:
# This is here for illustration purposes.
# No need to call this every time; it will be called by set_babbler_matrices.
# Displays the label matrix for split 1 (a sparse matrix in the recorded output).
%time bs.get_label_matrix(1)

CPU times: user 51.5 ms, sys: 1.32 ms, total: 52.8 ms
Wall time: 51.7 ms


<2796x1 sparse matrix of type '<type 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

### View global stats

In [30]:
# Overall coverage of the committed parses, shown as a percentage and a count.
bs.get_global_coverage()

GlobalCoverage: 0.93% (26/2796)

In [31]:
# Per-labeling-function table: coverage, overlaps, conflicts, TP/FP/FN/TN and empirical accuracy.
bs.get_lf_stats()

Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.
Explanation0_0,0,0.009299,0.0,0.0,21,5,0,0,0.807692


In [32]:
# List the committed parses (shown as translated pseudocode strings in the recorded output).
bs.get_parses()

["return 1 if (count([w for w in the word(s) between([X,Y]) if w.ner_tags == PERSON]).(= 0) and 'husband'.in(text(exactly 1 word(s) to the left of Y))) else 0"]

In [33]:
# List the labeling functions currently held by the stream.
bs.get_lfs()

[<function snorkel.contrib.babble.grammar.grammar.Explanation0_0>]

In [34]:
# List the explanations currently held by the stream.
bs.get_explanations()

[Explanation("Explanation0: True, there are no people between X and Y and 'husband' is immediately to the left of Y")]

## REPEAT (go back to "START DEMO")

When you have entered all of the explanations that you would like to, run these final cells.

In [35]:
# NOTE: Don't use this. Instead, incrementally label using bs.label_split() after each commit.
# Hands the stream's labeling functions to the pipeline, then labels splits 0
# and 2 from scratch. In the recorded run split 0 took about 2.5 minutes.
pipe.lfs = bs.get_lfs()
%time pipe.label(split=0)
%time pipe.label(split=2)

Clearing existing...
Running UDF...

Labeled split 0: (22195,1) sparse (nnz = 279)

CPU times: user 2min 15s, sys: 2.78 s, total: 2min 17s
Wall time: 2min 29s
Clearing existing...
Running UDF...

Labeled split 2: (2697,1) sparse (nnz = 49)

CPU times: user 15.3 s, sys: 218 ms, total: 15.5 s
Wall time: 15.7 s


In [36]:
# Split 1 is the split the stream labeled itself, so its matrix is taken from
# `bs` rather than recomputed by pipe.label().
%time pipe.set_babbler_matrices(bs, split=1) # Pulls out and saves label matrices from babbler.

CPU times: user 47.9 ms, sys: 1.02 ms, total: 48.9 ms
Wall time: 48.1 ms


In [37]:
# Run the pipeline's supervision stage (config['supervision'] is 'majority' in
# this notebook); the recorded run ends by saving training marginals.
%time pipe.supervise()

Using L_train: <22195x1 sparse matrix of type '<type 'numpy.int64'>'
	with 279 stored elements in Compressed Sparse Row format>
Using L_gold_train: <22195x1 sparse matrix of type '<type 'numpy.int64'>'
	with 22195 stored elements in Compressed Sparse Row format>
Positive Fraction: 7.0%

Using L_dev: <2796x1 sparse matrix of type '<type 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>
Using L_gold_dev: <2796x1 sparse matrix of type '<type 'numpy.int64'>'
	with 2796 stored elements in Compressed Sparse Row format>
Positive Fraction: 7.0%

Using L_test: <2697x1 sparse matrix of type '<type 'numpy.int64'>'
	with 49 stored elements in Compressed Sparse Row format>
Using L_gold_test: <2697x1 sparse matrix of type '<type 'numpy.int64'>'
	with 2697 stored elements in Compressed Sparse Row format>
Positive Fraction: 8.3%

Saved 22195 marginals
CPU times: user 1.8 s, sys: 75.4 ms, total: 1.88 s
Wall time: 1.84 s


In [None]:
# Run the pipeline's classification stage (config['disc_model_class'] is
# 'logreg' in this notebook). No output is recorded for this cell.
%time pipe.classify()

Note: in general, we expect the discriminative model (Disc) to outperform the generative model (Gen). However, with small sample sizes, major class imbalance, or no grid search, that ordering may flip.

## Scratch

This portion of the notebook can be used to find candidates that match a certain explanation.

In [None]:
# Load every candidate of the stream's candidate class that belongs to split 1.
candidates = session.query(bs.candidate_class).filter(bs.candidate_class.split == 1).all()

In [None]:
from snorkel.contrib.babble import Explanation
# Scratch explanation used only to search for matching candidates;
# candidate=None means it is not tied to any example candidate.
# Note that this overwrites the `explanation` variable from the demo section.
explanation = Explanation(
    label=False,
    condition="The last word of X is different than the last word of Y",
    candidate=None)

In [None]:
# Parse and apply the scratch explanation. This overwrites parse_list,
# filtered_parses, conf_matrix_list and stats_list from the demo section.
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
# bs.filtered_analysis(filtered_parses)

In [None]:
# bs.semparser.grammar.print_chart()

In [None]:
# Take the callable labeling function of the first parse.
# Raises IndexError if the explanation produced no parses.
lf = parse_list[0].function

In [None]:
# Keep the split-1 candidates for which the labeling function returns a truthy
# (non-abstaining) value.
matches = [c for c in candidates if lf(c)]

In [None]:
from snorkel.viewer import SentenceNgramViewer
# Browse the matching candidates, three per page, in a fixed-height viewer.
sv = SentenceNgramViewer(matches, session, n_per_page=3, height=300)
sv

In [None]:
# Fetch the candidate currently selected in the viewer above and print its stable id.
c = sv.get_selected()
print(c.get_stable_id())