# Environment Setup

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to project source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. Later cells derive the database name from these
# values and pass the dict to merge_configs(), where notebook values take
# precedence (nb_config > local_config > global_config).
config = dict(
    domain='spouse',
    postgres=False,
    parallelism=1,
    # db_name='babble_spouse_demo',  # uncomment to override the default DB name
    babbler_candidate_split=1,
    babbler_label_split=1,
    supervision='generative',
    disc_model_class='logreg',
    gen_model_search_space=1,
    disc_model_search_space=1,
)

In [3]:
# Assemble the database connection string and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default name is 'babble_<domain>', plus '_debug' when config['debug'] is truthy;
# an explicit config['db_name'] takes precedence.
debug_suffix = '_debug' if config.get('debug', False) else ''
default_db_name = 'babble_' + config['domain'] + debug_suffix
DB_NAME = config.get('db_name', default_db_name)

# Postgres only when explicitly enabled; otherwise a SQLite file '<DB_NAME>.db'.
if config.get('postgres'):
    DB_TYPE = 'postgres'
else:
    DB_TYPE = 'sqlite'
    DB_NAME = DB_NAME + '.db'

# The host part stays empty unless a port is configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse.db


In [4]:
# Imports are grouped at the top of the cell. This cell must run after the one
# that sets $SNORKELDB, because that variable has to exist before snorkel is
# imported.
from snorkel import SnorkelSession
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
from snorkel.models import candidate_subclass

session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
config = merge_configs(config)

# 'debug' is treated as optional here, matching the config.get('debug', False)
# lookup used when the database name is built; a missing key means "not debug"
# instead of a KeyError.
if config.get('debug', False):
    print("NOTE: --debug=True: modifying parameters...")
    # Debug mode overrides these settings in place on the merged config.
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Build the candidate class from the name and entities in the merged config.
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# get_local_pipeline returns a callable for this domain; calling it with the
# session, candidate class and config yields the pipeline object used below.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting babbler_candidate_split=[0, 1, 2] to babbler_candidate_split=1
Overwriting domain=None to domain=spouse
Overwriting print_freq=1 to print_freq=5
Overwriting disc_model_search_space=10 to disc_model_search_space=1
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting gen_model_search_space=10 to gen_model_search_space=1
Overwriting disc_model_class=lstm to disc_model_class=logreg


### Initialization

In [5]:
# %time pipe.parse()

In [6]:
# %time pipe.extract()

In [7]:
# %time pipe.load_gold()

In [8]:
# %time pipe.featurize()

### Pre-load Explanations (10) + User Lists (4)

In [9]:
# NOTE(review): every `bs.*` call later in this notebook needs the BabbleStream
# created here, so this cell must be uncommented and run on a fresh kernel.
# `Spouse` is never defined in this notebook; the candidate class built during
# setup is bound to `candidate_class`, so that name is used below.
# from snorkel.contrib.babble import BabbleStream
# bs = BabbleStream(session, candidate_class=candidate_class, balanced=True, shuffled=True, seed=1234)

In [10]:
# from experiments.babble.spouse.spouse_examples import get_explanations, get_user_lists

# spouse_explanations = get_explanations()
# spouse_user_lists = get_user_lists()
# spouse_explanations = []
# spouse_user_lists = {}

In [11]:
# bs.preload(explanations=spouse_explanations, user_lists=spouse_user_lists)

# Start Demo:

### Find a candidate whose sentence contains a given phrase

In [16]:
# Load every candidate of this class from the database; the search loop and
# the labeling-function matching cell below both iterate over this collection.
candidates = session.query(candidate_class).all()

In [17]:
print(len(candidates))

27688


In [20]:
# Scan for the first candidate whose parent text contains the phrase, print its
# two spans, the text and its stable id, then stop. The loop variable `c` is
# intentionally left bound afterwards; the next cell inspects it.
phrase = 'says her formal arrest'
for c in candidates:
    parent = c.get_parent()
    if phrase not in parent.text:
        continue
    print(c[0].get_span(), c[1].get_span())
    print(parent.text)
    print(c.get_stable_id())
    print('')
    break

(u'Gillis', u'  ')
Gillis says her formal arrest last weekend prompted him to speak out.   
850fbb5c-5b4e-4e47-8f2a-884a024e7968::span:3338:3343~~850fbb5c-5b4e-4e47-8f2a-884a024e7968::span:3408:3409



In [27]:
# `c` is whatever the search loop above left bound (the matching candidate if
# the loop reached `break`). Display the NER tags of its parent context.
c.get_parent().ner_tags

[u'PERSON',
 'O',
 'O',
 'O',
 'O',
 u'DATE',
 u'DATE',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 u'PERSON']

###  Make an explanation

With the user input and the current candidate, we make an Explanation object.

In [None]:
# User inputs for the explanation; both are passed unchanged to Explanation(...)
# in the next cell.
# NOTE(review): 'Y' in CONDITION presumably refers to one of the candidate's two
# entities - confirm against the Babble grammar.
LABEL = False
CONDITION = "The word 'son' is immediately to the left of Y"

In [None]:
from snorkel.contrib.babble import Explanation
# Build the Explanation from the condition text and label set in the previous cell.
# NOTE(review): the accompanying text says the current candidate is used, but
# candidate=None is passed here - confirm which is intended.
explanation = Explanation(CONDITION, LABEL, candidate=None)

### Parse and view labeling stats

In [None]:
# Run the explanation through the babbler and time the call. The four return
# values are unpacked here; parse_list, conf_matrix_list and stats_list are
# indexed by the same parse position in the next cell.
# NOTE(review): `bs` is only assigned in a commented-out cell earlier in this
# notebook, so on a fresh kernel this line raises NameError unless that cell is
# restored and run first.
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
PARSE_IDX = 0  # position of the parse to inspect

if not parse_list:
    print("No valid parses were found.")
else:
    # Pull out the chosen parse together with its confusion matrix and stats,
    # which sit at the same position in their respective lists.
    parse = parse_list[PARSE_IDX]
    conf_matrix = conf_matrix_list[PARSE_IDX]
    stats = stats_list[PARSE_IDX]

    # Render the parse's semantics through the grammar's translate() and print it.
    translation = bs.semparser.grammar.translate(parse.semantics)
    print("Parse {}:\n{}\n".format(PARSE_IDX, translation))

    # Print the three labeling statistics, one per line.
    for stat_name in ('accuracy', 'class_coverage', 'coverage'):
        print(getattr(stats, stat_name))

In [None]:
bs.filtered_analysis(filtered_parses)

In [None]:
# bs.semparser.grammar.print_chart()

In [None]:
# Take the labeling function attached to the first parse.
# NOTE(review): the index is hard-coded to 0 rather than PARSE_IDX, so it can
# diverge from the parse whose stats were printed above; it also raises
# IndexError when parse_list is empty.
lf = parse_list[0].function

In [None]:
# Keep, in their original order, the candidates for which the labeling
# function returns a truthy value.
matches = [candidate for candidate in candidates if lf(candidate)]

In [None]:
from snorkel.viewer import SentenceNgramViewer
# Show at most the first 100 matches, three per page. The bare `sv` on the last
# line makes the notebook render the viewer widget.
sv = SentenceNgramViewer(matches[:100], session, n_per_page=3, height=300)
sv

In [None]:
# Rebind `c` to the candidate currently selected in the viewer and print its
# stable id.
# NOTE(review): depends on interactive viewer state; presumably fails if
# nothing is selected - confirm what get_selected() returns in that case.
c = sv.get_selected()
print(c.get_stable_id())

## SCRATCH

### View labeled candidates

Select the subset of labeled candidates you would like to view.

In [None]:
SUBSET = 'correct' # Must be one of ['correct', 'incorrect', 'abstained']

def candidate_generator(subset):
    """Yield the candidates in `subset` one at a time, in iteration order."""
    for candidate in subset:
        yield candidate

# Read the chosen subset off the first parse's confusion matrix and wrap it in
# a generator so the next cell can step through it one candidate per run.
subset = getattr(conf_matrix_list[0], SUBSET)
subset_generator = candidate_generator(subset)

In [None]:
from snorkel.viewer import SentenceNgramViewer

# Advance to the next candidate in the chosen subset. The builtin next() works
# on Python 2.6+ and Python 3, whereas the generator's .next() method exists
# only on Python 2. StopIteration here means the subset is exhausted; re-run
# the previous cell to start over.
c = next(subset_generator)

# Scale the viewer height with the number of words in the candidate's parent
# (2 per word), never going below 80.
viewer_height = max(len(c.get_parent().words) * 2, 80)
sv = SentenceNgramViewer([c], session, n_per_page=3, height=viewer_height)
sv

### Commit parses

If you are satisfied with the given parses, commit them.

In [None]:
bs.commit()

In [None]:
bs.get_majority_quality(split=2)

In [None]:
# %time bs.label_split(0)

In [None]:
# %time bs.label_split(2)

In [None]:
# This is here for illustration purposes.
# No need to call this every time; it will be called by set_babbler_matrices.
%time bs.get_label_matrix(0)

### View global stats

In [None]:
bs.get_global_coverage()

In [None]:
bs.get_lf_stats()

In [None]:
bs.get_parses()

In [None]:
bs.get_lfs()

In [None]:
bs.get_explanations()

## REPEAT (go back to "START DEMO")

When you have entered all of the explanations that you would like to, run these final cells.

In [None]:
# NOTE: Don't use this. Instead, incrementally label using bs.label_split() after each commit.
# pipe.lfs = bs.get_lfs()
# %time pipe.label()

In [None]:
# %time pipe.set_babbler_matrices(bs) # Pulls out and saves label matrices from babbler.

In [None]:
# %time pipe.supervise()

In [None]:
# %time pipe.classify()

Note: in general, we expect the discriminative model (Disc) to outperform the generative model (Gen). However, with small sample sizes, major class imbalance, or no grid search, that ordering may flip.