# Environment Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. These take precedence over the local and global
# configs once merge_configs() is applied further down.
config = dict(
    domain='spouse',
    postgres=False,
    parallelism=1,
    db_name='babble_spouse_demo',
    babbler_candidate_split=1,
    babbler_label_split=1,
    supervision='majority',
    disc_model_class='logreg',
    gen_model_search_space=1,
    disc_model_search_space=1,
)

In [3]:
# Build the database connection string from `config` and export it as
# $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Fallback name used only when config has no 'db_name':
# 'babble_<domain>' plus a '_debug' suffix when config['debug'] is truthy.
default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if config.get('postgres'):
    DB_TYPE = 'postgres'
    # Only a server-backed database has a host:port part.
    DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
    # SQLite URLs must have an empty host ('sqlite:///file.db'); a stray
    # 'db_port' in config must not leak into the URL.
    DB_ADDR = ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [4]:
# snorkel is imported here rather than at the top of the notebook because
# $SNORKELDB has to be set first (see the NOTE in the previous cell).
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# NOTE(review): this rebinds `config`, so re-running the cell feeds the
# already-merged config back into merge_configs -- presumably harmless, confirm.
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting domain=None to domain=spouse
Overwriting print_freq=1 to print_freq=5
Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting gen_model_search_space=10 to gen_model_search_space=1
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=majority


In [5]:
from snorkel.models import candidate_subclass
from experiments.babble.spouse import SpousePipeline

# Candidate class for this task, with two arguments named person1 and person2.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
# Alias for Spouse; not referenced again in the visible cells.
candidate_class = Spouse
# Pipeline object used for the (commented-out) initialization steps and for the
# final label / supervise / classify cells.
pipe = SpousePipeline(session, Spouse, config)

### Initialization

In [6]:
# %time pipe.parse()

In [7]:
# %time pipe.extract()

In [8]:
# %time pipe.load_gold()

In [9]:
# %time pipe.featurize()

### Pre-load Explanations (10) + User Lists (4)

In [10]:
from snorkel.contrib.babble import BabbleStream
# `bs` drives the interactive demo below: it hands out candidates, parses
# explanations and tracks committed parses.
# The seed is fixed -- presumably so the candidate order is reproducible (confirm).
bs = BabbleStream(session, candidate_class=Spouse, balanced=True, shuffled=True, seed=1234)

Created grammar with 596 rules


In [11]:
# from experiments.babble.spouse.spouse_examples import get_explanations, get_user_lists

# spouse_explanations = get_explanations()
# spouse_user_lists = get_user_lists()
# spouse_explanations = []
# spouse_user_lists = {}

In [12]:
# bs.preload(explanations=spouse_explanations, user_lists=spouse_user_lists)

# Start Demo:

### View user_lists

In [13]:
# Print every user list known to the BabbleStream, truncating long ones.
user_lists = bs.user_lists
preview_limit = 10
for alias, values in user_lists.items():
    if len(values) > preview_limit:
        # Long list: show only the first entries, flagged with an ellipsis.
        preview = list(values)[:preview_limit]
        print("{}:\n {}...\n".format(alias, preview))
    else:
        print("{}:\n {}\n".format(alias, values))

Optionally add another user_list.

In [14]:
# Name of the user list to add; leave as None (or empty) to skip this step.
ALIAS = None    # e.g., 'marriage_words'
# Entries of the new user list.
VALUES = []    # e.g., ['engaged', 'betrothed', 'proposed']

# Nothing is registered unless ALIAS is set to a non-empty value.
if ALIAS:
    bs.add_user_lists({ALIAS: VALUES})

### View a candidate

In [15]:
from snorkel.viewer import SentenceNgramViewer

# Pull the next candidate from the stream; `c` is what the explanation
# cell below attaches to.
c = bs.next()

# Viewer height grows with the number of words in the candidate's parent,
# with a floor of 80.
viewer_height = max(2 * len(c.get_parent().words), 80)
sv = SentenceNgramViewer([c], session, n_per_page=1, height=viewer_height)
sv

<IPython.core.display.Javascript object>

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


In [20]:
# Display the stable ID of the candidate currently bound to `c`.
c.get_stable_id()

u'5f335353-bb93-493a-864a-c451cdcdab0a::span:409:422~~5f335353-bb93-493a-864a-c451cdcdab0a::span:512:521'

### Give an explanation

(See MTurk instructions for examples)

In [21]:
# User input for the current candidate: the label to assign, and the
# natural-language condition under which that label applies.
LABEL = True
CONDITION = "'are' is within 5 characters of Y"

# Alternative inputs -- uncomment one LABEL/CONDITION pair to try it.

# CONDITION = ("'husband' is immediately to the left of Y and there are no people between X and Y")
# CONDITION = ("there are no people between X and Y and 'husband' is immediately to the left of Y")

# LABEL = True
# CONDITION = "X and Y are less than 10 words apart and 'wife' is between them"

# LABEL = False
# CONDITION = "'syndrome' occurs within three words to the right of arg 2"

# LABEL = False
# CONDITION = "')' is between X and Y"

# LABEL = True
# CONDITION = "'wife' is immediately before Y and X and Y are within 7 words of each other"


With the user input and the current candidate, we make an Explanation object.

In [22]:
from snorkel.contrib.babble import Explanation
# Bundle the condition text, its label, and the candidate `c` it was written for.
explanation = Explanation(CONDITION, LABEL, candidate=c)

### Parse and view labeling stats

In [23]:
# Parse the explanation and unpack the four results; the cells below index
# parse_list, conf_matrix_list and stats_list with the same position.
# NOTE(review): the saved output shows a (Pdb) prompt inside
# text_helpers.get_within_phrase -- it looks like a leftover breakpoint in the
# library pauses this call until `c` is typed; confirm and remove before demoing.
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

Flushing all parses from previous explanation set.
1 explanation(s) out of 1 were parseable.
1 parse(s) generated from 1 explanation(s).
1 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
> /Users/bradenjh/repos/snorkel/snorkel/contrib/babble/text/text_helpers.py(153)get_within_phrase()
-> return phrase[j:k]
(Pdb) c
0 parse(s) remain (1 parse(s) removed by ConsistencyFilter).
CPU times: user 21.2 ms, sys: 3.2 ms, total: 24.4 ms
Wall time: 1.57 s


In [None]:
# Which of the surviving parses to inspect.
PARSE_IDX = 0
if not parse_list:
    print("No valid parses were found.")
else:
    # The three lists are indexed by the same position.
    parse = parse_list[PARSE_IDX]
    conf_matrix = conf_matrix_list[PARSE_IDX]
    stats = stats_list[PARSE_IDX]

    # Show the parse translated by the grammar, followed by its labeling stats.
    translated = bs.semparser.grammar.translate(parse.semantics)
    print("Parse {}:\n{}\n".format(PARSE_IDX, translated))
    for stat_name in ('accuracy', 'class_coverage', 'coverage'):
        print(getattr(stats, stat_name))

In [None]:
# Report on the parses that bs.apply() filtered out -- presumably why each one
# was removed (confirm against BabbleStream.filtered_analysis).
bs.filtered_analysis(filtered_parses)

In [None]:
# bs.semparser.grammar.print_chart()

### View labeled candidates

Select the subset of labeled candidates you would like to view.

In [None]:
SUBSET = 'correct' # Must be one of ['correct', 'incorrect', 'abstained']

# Fail early with a readable message instead of an AttributeError from getattr.
VALID_SUBSETS = ('correct', 'incorrect', 'abstained')
if SUBSET not in VALID_SUBSETS:
    raise ValueError("SUBSET must be one of {}; got {!r}".format(list(VALID_SUBSETS), SUBSET))

def candidate_generator(subset):
    """Yield the candidates in `subset` one at a time, in iteration order."""
    for c in subset:
        yield c

if conf_matrix_list:
    # Use the same parse that was inspected above rather than always the first.
    subset = getattr(conf_matrix_list[PARSE_IDX], SUBSET)
else:
    # bs.apply() left no parses, so there is nothing to index into.
    print("conf_matrix_list is empty; there are no labeled candidates to view.")
    subset = []
subset_generator = candidate_generator(subset)

In [None]:
from snorkel.viewer import SentenceNgramViewer

# Re-run this cell to step through the selected subset one candidate at a time.
# The builtin next() with a default works on Python 2 and 3 and avoids a bare
# StopIteration once the subset is exhausted.
# A dedicated name is used so that `c` (the candidate the explanation is
# attached to) is not overwritten.
labeled_candidate = next(subset_generator, None)
if labeled_candidate is None:
    print("No more candidates in the '{}' subset.".format(SUBSET))
    sv = None
else:
    # Same height rule as the main candidate viewer: grows with the number of
    # words in the candidate's parent, with a floor of 80.
    viewer_height = max(len(labeled_candidate.get_parent().words) * 2, 80)
    sv = SentenceNgramViewer([labeled_candidate], session, n_per_page=3, height=viewer_height)
sv

### Commit parses

If you are satisfied with the given parses, commit them.

In [None]:
# Accept the parses from the most recent bs.apply() call.
# NOTE(review): presumably this is what makes them visible to get_lfs() /
# get_label_matrix() below -- confirm.
bs.commit()

In [None]:
# Quality of the committed parses on split 1; the names suggest the triple is
# (F1, precision, recall) -- confirm against get_majority_quality.
# NOTE(review): `re` shadows the name of the stdlib regex module for the rest
# of the kernel session.
(f1, pr, re) = bs.get_majority_quality(split=1)
# The notebook appears to run on Python 2 (see the u'' repr in saved output),
# where this prints the three values as a tuple.
print(f1, pr, re)

In [None]:
# Convert the F1 from the previous cell into a "labeled equivalent" count --
# presumably the number of hand labels needed to reach the same F1 (confirm).
num_labels_equiv = bs.get_labeled_equivalent(f1)
num_labels_equiv

In [None]:
# %time bs.label_split(0)

In [None]:
# %time bs.label_split(2)

In [None]:
# This is here for illustration purposes.
# No need to call this every time; it will be called by set_babbler_matrices.
# The argument 1 is presumably the split, matching split=1 used elsewhere in
# this notebook -- confirm.
%time bs.get_label_matrix(1)

### View global stats

In [None]:
# Display the value returned by get_global_coverage().
bs.get_global_coverage()

In [None]:
# Display the statistics returned by get_lf_stats().
bs.get_lf_stats()

In [None]:
# Display the parses currently held by the BabbleStream.
bs.get_parses()

In [None]:
# Display the labeling functions; the same call feeds pipe.lfs in the final cells.
bs.get_lfs()

In [None]:
# Display the explanations currently held by the BabbleStream.
bs.get_explanations()

## REPEAT (go back to "Start Demo")

When you have entered all of the explanations that you would like to, run these final cells.

In [None]:
# NOTE: Don't use this. Instead, incrementally label using bs.label_split() after each commit.
# NOTE(review): this cell is still live code, so "Run All" executes it despite
# the warning above; consider commenting it out.
# Hands the BabbleStream's labeling functions to the pipeline, then labels
# splits 0 and 2, timing each call.
pipe.lfs = bs.get_lfs()
%time pipe.label(split=0)
%time pipe.label(split=2)

In [None]:
# split=1 is the same split used for get_majority_quality and get_label_matrix above.
%time pipe.set_babbler_matrices(bs, split=1) # Pulls out and saves label matrices from babbler.

In [None]:
# Run the pipeline's supervision step, timed.
# NOTE(review): presumably uses config['supervision'] ('majority' here) -- confirm.
%time pipe.supervise()

In [None]:
# Run the pipeline's classification step, timed.
# NOTE(review): presumably trains config['disc_model_class'] ('logreg' here) -- confirm.
%time pipe.classify()

Note: in general, we expect the discriminative model (Disc) to outperform the generative model (Gen). However, with small sample sizes, major class imbalance, or no grid search, that ordering may flip.

## Scratch

This portion of the notebook can be used to find candidates that match a certain explanation.

In [None]:
# Load every candidate of the stream's candidate class that belongs to split 1.
candidates = session.query(bs.candidate_class).filter(bs.candidate_class.split == 1).all()

In [None]:
from snorkel.contrib.babble import Explanation
# A free-standing explanation: candidate=None, so it is not tied to a specific
# example. Note that this rebinds `explanation` from the demo section above.
explanation = Explanation(
    label=False,
    condition="The last word of X is different than the last word of Y",
    candidate=None)

In [None]:
# Parse the scratch explanation. This overwrites parse_list, filtered_parses,
# conf_matrix_list and stats_list from the demo section above.
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
# bs.filtered_analysis(filtered_parses)

In [None]:
# bs.semparser.grammar.print_chart()

In [None]:
# Take the callable of the first parse as the labeling function to test.
# Raises IndexError if bs.apply() produced no parses.
lf = parse_list[0].function

In [None]:
# Keep the candidates for which the labeling function returns a truthy value.
# A dedicated loop name is used so the notebook-level `c` (the current demo
# candidate) is not overwritten as a side effect.
matches = [cand for cand in candidates if lf(cand)]

In [None]:
from snorkel.viewer import SentenceNgramViewer
# Browse the matching candidates, three per page, in a fixed-height viewer.
sv = SentenceNgramViewer(matches, session, n_per_page=3, height=300)
sv

In [None]:
# Rebind `c` to the candidate currently selected in the viewer above and print
# its stable ID.
# NOTE(review): presumably requires the widget to be rendered with a selection;
# if get_selected() returns None, the next line raises AttributeError -- confirm.
c = sv.get_selected()
print(c.get_stable_id())