In [1]:
%load_ext autoreload
%autoreload 2

## Setup

In [2]:
# Notebook-level settings. When merged further down they take precedence over
# the other config sources (nb_config > local_config > global_config).
config = {
    # Must agree with the pipeline used in this notebook (SpousePipeline with
    # Spouse candidates); it also determines the default database name,
    # 'babble_<domain>'.
    'domain': 'spouse',
    # Uncomment to override the default database name.
#     'db_name': 'babble_spouse_demo',
    'postgres': False,
    'parallelism': 1,
    'debug': False,
    'babbler_candidate_split': 1,
    'babbler_label_split': 1,
    'disc_model_search_space': 1,
    'gen_model_search_space': 1,
    'supervision': 'majority',
}

In [3]:
# Assemble the database URL and publish it through the environment.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Debug runs get their own database so they do not touch the regular one.
if config.get('debug', False):
    default_db_name = 'babble_' + config['domain'] + '_debug'
else:
    default_db_name = 'babble_' + config['domain']
DB_NAME = config.get('db_name', default_db_name)

# Anything other than a truthy 'postgres' setting selects SQLite, in which
# case the database name gets a '.db' suffix.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The host part is only filled in when a port has been configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [4]:
# Open the database session (uses the $SNORKELDB value exported earlier).
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    # Shrink the run so a debug pass finishes quickly: top-level settings first...
    for setting, value in (
        ('max_docs', 100),
        ('gen_model_search_space', 2),
        ('disc_model_search_space', 2),
    ):
        config[setting] = value
    # ...then the nested per-model training defaults.
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting domain=None to domain=spouse
Overwriting print_freq=1 to print_freq=5
Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting gen_model_search_space=10 to gen_model_search_space=1
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=majority


In [5]:
from snorkel.models import candidate_subclass
from experiments.babble.spouse import SpousePipeline

# The candidate type is a pair of person mentions; it is exposed under both
# the domain-specific name (Spouse) and the generic one (candidate_class).
candidate_class = Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

# The pipeline bundles the session, the candidate type and the merged config.
pipe = SpousePipeline(session, candidate_class, config)

## Parse, Extract, Load

In [6]:
# %time pipe.parse()

In [7]:
# %time pipe.extract()

In [8]:
# %time pipe.load_gold()

## Now the real work begins...

In [9]:
from snorkel.contrib.babble import BabbleStream

# Interactive stream of Spouse candidates; the fixed seed keeps the order
# of served candidates repeatable across runs (presumably — confirm).
bs = BabbleStream(
    session,
    candidate_class=Spouse,
    balanced=True,
    seed=123,
)

Created grammar with 596 rules


In [10]:
from experiments.babble.spouse.spouse_examples import get_explanations, get_user_lists

# All Spouse candidates stored in split 0.
# NOTE(review): `candidates` is not referenced by any later cell visible in
# this notebook — confirm it is still needed before keeping the query.
split_zero_query = session.query(Spouse).filter(Spouse.split == 0)
candidates = split_zero_query.all()

# Pre-written explanations and the user-defined word lists they rely on.
spouse_explanations, spouse_user_lists = get_explanations(), get_user_lists()

In [11]:
# Seed the stream with the pre-written explanations and their word lists.
bs.preload(
    explanations=spouse_explanations,
    user_lists=spouse_user_lists,
)

Created grammar with 596 rules
Flushing all parses from previous explanation set.
11 explanation(s) out of 12 were parseable.
29 parse(s) generated from 12 explanation(s).
16 parse(s) remain (13 parse(s) removed by DuplicateSemanticsFilter).
Note: 16 LFs did not have candidates and therefore could not be filtered.
16 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 28.4s.

11 parse(s) remain (5 parse(s) removed by UniformSignatureFilter: (5 None, 0 All)).
9 parse(s) remain (2 parse(s) removed by DuplicateSignatureFilter).
Added 9 parse(s) from 9 explanations to set. (Total # parses = 9)


In [43]:
# Pull the next candidate from the stream; the explanation cells below are
# written against this candidate.
c = bs.next()

In [44]:
from snorkel.viewer import SentenceNgramViewer

# Render the single current candidate in its sentence context.
sv = SentenceNgramViewer(
    [c],
    session,
    n_per_page=1,
    height=200,
)
sv

<IPython.core.display.Javascript object>

In [46]:
# Show the stable id of the candidate currently selected in the viewer.
# NOTE(review): the result depends on the interactive selection made in the
# viewer widget, so it is not reproducible from code alone.
sv.get_selected().get_stable_id()

u'7fc3e510-c4e6-44c2-a24b-f9a39bfcfb07::span:2486:2499~~7fc3e510-c4e6-44c2-a24b-f9a39bfcfb07::span:2537:2551'

In [None]:
from snorkel.contrib.babble import Explanation

# Label the candidate as a true Spouse relation when the condition holds.
label = True
# The literal word is wrapped in single quotes, the same way the 'where'
# explanation later in this notebook writes its literal, so the parser can
# tell the word being searched for apart from the surrounding grammar.
condition = "'married' is within two words to the left of arg 2"
explanation = Explanation(condition, label, candidate=c)
explanation

In [None]:
# Apply the explanation and time the call. The three returned lists are
# indexed together below (element 0 of each is inspected).
%time parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
# Accuracy and class coverage of the first entry returned by bs.apply().
first_stats = stats_list[0]
print(first_stats.accuracy)
print(first_stats.class_coverage)

In [None]:
from snorkel.viewer import SentenceNgramViewer

# NOTE(review): despite its name, `error_set` holds the `.correct` bucket of
# the first confusion matrix — confirm whether the errors were meant to be
# shown here instead.
error_set = conf_matrix_list[0].correct

# Only the first ten candidates are handed to the viewer, three per page.
preview_candidates = list(error_set)[:10]
sv = SentenceNgramViewer(
    preview_candidates,
    session,
    n_per_page=3,
    height=300,
)
sv

In [None]:
# Global statistics of the stream before the commit cell runs; kept in
# `global_coverage` so they can be compared with the value printed after it.
global_coverage = bs.get_global_stats()
print(global_coverage)

In [None]:
# Permanently add the parses produced by the preceding bs.apply() call.
# The earlier form, `bs.commit([])`, passed an empty index list, so no parse
# was selected and the coverage check that follows had nothing to show.
# Calling commit() without arguments matches the second commit cell in this
# notebook (presumably committing every pending parse — confirm); pass a list
# of parse indices instead to keep only a subset.
bs.commit()

Confirm that after committing, global coverage goes up.

In [None]:
# Re-read the global statistics after the commit cell so they can be compared
# with the value printed before it.
global_coverage = bs.get_global_stats()
print(global_coverage)

In [None]:
# Fetch the label matrix from the stream and display it.
L_train = bs.get_label_matrix()
L_train

### Add another explanation

In [None]:
from snorkel.contrib.babble import Explanation

# A negative explanation: label the candidate False when the condition holds.
label = False
condition = "'where' is within two words to the right of arg 1"
explanation = Explanation(
    condition,
    label,
    candidate=c,
    name='',
)

In [None]:
# Apply the second explanation and time the call; this overwrites the
# parse_list / conf_matrix_list / stats_list values from the first explanation.
%time parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
# Accuracy and class coverage of the first entry returned by bs.apply().
first_stats = stats_list[0]
print(first_stats.accuracy)
print(first_stats.class_coverage)

In [None]:
# Commit the parses from the preceding bs.apply() call, using commit()'s
# default selection (no explicit indices are passed).
bs.commit()

In [None]:
# Inspect the semantics of the first parse returned by the last bs.apply().
parse = parse_list[0]
parse.semantics

In [None]:
# Pass the parse's semantics through the grammar's translate() and display
# the result (presumably a human-readable rendering — confirm).
bs.semparser.grammar.translate(parse.semantics)

In [None]:
# Hand the labeling function of every parse held by the stream to the
# pipeline, then run its labeling step.
# A generator expression is used instead of a list comprehension: under
# Python 2 a list-comprehension variable leaks into the notebook namespace,
# and one named `parse` would overwrite the `parse` inspected in the cells above.
pipe.lfs = list(committed.function for committed in bs.parses)
pipe.label()

In [None]:
# %time pipe.supervise()

In [None]:
# %time pipe.classify()