In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. A later cell passes this dict through
# merge_configs(), where notebook values take precedence over the local and
# global configs.
config = dict(
    domain='spouse',
    debug=False,
    postgres=False,
    parallelism=1,
    splits=[0, 1, 2],
    disc_model_class='logreg',
    supervision='traditional',
    # TEMP overrides (currently disabled):
    # seed=100,
    # max_train=100,
    # disc_model_search_space=1,
    # # disc_params_range={},
    # disc_params_default={
    #     'batch_size': 64,
    #     'n_epochs': 30,
    #     'lr': 0.001,
    #     'rebalance': 0.25,
    # },
)

In [3]:
# Assemble the database connection string, keep its parts as globals, and
# publish the result through the SNORKELDB environment variable.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# The default name comes from the domain, plus a '_debug' suffix in debug mode.
default_db_name = 'babble_' + config['domain']
if config.get('debug', False):
    default_db_name += '_debug'
DB_NAME = config.get('db_name', default_db_name)

# Postgres only when explicitly enabled; otherwise sqlite, whose database
# name gets a '.db' extension.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The address part of the URL stays empty unless a port is configured.
DB_ADDR = ""
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse.db


In [4]:
# Open a Snorkel session. This import is deliberately placed after the cell
# that sets $SNORKELDB, which must be set before any snorkel imports.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# NOTE(review): `config` is rebound to the merged result, so re-running this
# cell feeds an already-merged config back into merge_configs -- presumably
# harmless, but confirm against merge_configs.
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

# In debug mode, override a handful of settings in the merged config.
# The nested 'gen_params_default' / 'disc_params_default' dicts are not set
# in this notebook; presumably they are supplied by merge_configs -- verify.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Obtain the candidate class from the name and entity list in the merged config
# ('candidate_name' / 'candidate_entities' are not set in this notebook, so
# they presumably come from the merged-in configs -- verify).
from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# Look up the pipeline for the configured domain and instantiate it with the
# session, candidate class, and merged config.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=spouse
Overwriting print_freq=1 to print_freq=5
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting decay=0.95 to decay=0.99
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=traditional


In [5]:
# %time pipe.parse()

In [6]:
# %time pipe.extract()

In [7]:
# %time pipe.load_gold()

In [8]:
# %time pipe.featurize()

In [9]:
# %time pipe.collect()

In [10]:
# Load every candidate whose `split` attribute equals 1 (the same split
# number that is later passed to babbler.apply).
candidates = (
    session.query(candidate_class)
    .filter(candidate_class.split == 1)
    .all()
)

In [11]:
from snorkel.viewer import SentenceNgramViewer

# Show a slice of the loaded candidates (indices 5 up to, not including, 100)
# in the interactive viewer widget.
viewer_candidates = candidates[5:100]
sv = SentenceNgramViewer(
    viewer_candidates,
    session,
    n_per_page=3,
    height=300,
)
sv

<IPython.core.display.Javascript object>

In [12]:
# Grab the candidate currently selected in the viewer widget and print it,
# the text of its parent, and its stable id.
# NOTE: the result depends on what is selected in the widget when this runs.
c = sv.get_selected()
print(c)
parent = c.get_parent()
print(parent.text)
stable_id = c.get_stable_id()
print(stable_id)

Spouse(Span("Cindy Crawford", sentence=3435, chars=[0,13], words=[0,1]), Span("Rande Gerber", sentence=3435, chars=[88,99], words=[18,19]))
Cindy Crawford was spotted wearing a Casamigos tank top while walking hand-in-hand with Rande Gerber in Malibu, California, on Sunday   Cindy displayed her long legs in short shorts as she pulled her brunette hair with a ponytail and snapback cap.    
1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:426:437


In [13]:
from snorkel.contrib.babble import Explanation
from snorkel.contrib.babble.utils import link_explanation_candidates

# Stable id of the candidate that the first explanation refers to.
spotted_candidate_id = "1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:426:437"

# Explanations as written by hand: the first is tied to a candidate by its
# stable id, the second carries no candidate at all.
unlinked_explanations = [
    Explanation(
        label=True,
        condition="""'spotted' is within 3 words to the right of X or Y""",
        candidate=spotted_candidate_id,
    ),
    Explanation(label=True, condition="The moon is full", candidate=None),
#     Explanation(
#         label=False,
#         condition="""At least one word right of X or Y is uppercase or lowercase""",
#         candidate="1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:426:437"
#     ),
#     Explanation(
#         label=False,
#         condition="""The word 'spotted' is two words right of X or Y""",
#         candidate="1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:474:478"
#     ),
]

# Link the explanations to the loaded candidates.
explanations = link_explanation_candidates(unlinked_explanations, candidates)

Building list of target candidate ids...
Collected 1 unique target candidate ids from 2 explanations.
Gathering desired candidates...
Found 1/1 desired candidates
Linking explanations to candidates...
Linked 1/2 explanations


In [14]:
# NOTE(review): namedtuple is not referenced in any visible cell; it is kept
# in case something outside this view relies on it.
from collections import namedtuple
from snorkel.contrib.babble import Babbler

# Build a text-mode Babbler for the candidate class, with no user lists.
babbler = Babbler(
    session,
    mode='text',
    candidate_class=candidate_class,
    user_lists={},
)

# Apply the linked explanations to split 1, then collect the resulting parses
# with translate=False.
babbler.apply(explanations, split=1)
parses = babbler.get_parses(translate=False)

Flushing all parses from previous explanation set.
Created grammar with 590 rules
1 explanation(s) out of 2 were parseable.
4 parse(s) generated from 2 explanation(s).
2 parse(s) remain (2 parse(s) removed by DuplicateSemanticsFilter).
2 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 3.3s.

2 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
2 parse(s) remain (0 parse(s) removed by DuplicateSignatureFilter).
Added 2 parse(s) from 1 explanations to set. (Total # parses = 2)


In [26]:
# Print a summary of the explanations and parses that were filtered out,
# followed by the reason given for each one.
babbler.filtered_analysis()

SUMMARY
3 TOTAL:
1 Unparseable Explanation
2 Duplicate Semantics
0 Inconsistency with Example
0 Uniform Signature
0 Duplicate Signature

[#1]: Duplicate Semantics

Parse: return 1 if 'spotted'.(any([in(text(exactly 3 word(s) to the right of X)),in(text(exactly 3 word(s) to the right of Y))])) else 0

Reason: This parse is identical to one produced by the following explanation:
	'spotted' is within 3 words to the right of X or Y


[#2]: Duplicate Semantics

Parse: return 1 if 'spotted'.(any([in(text(no more than 3 word(s) to the right of X)),in(text(no more than 3 word(s) to the right of Y))])) else 0

Reason: This parse is identical to one produced by the following explanation:
	'spotted' is within 3 words to the right of X or Y


[#3]: Unparseable Explanation

Explanation: The moon is full

Reason: This explanation couldn't be parsed.



In [16]:
# For each parse, print its raw semantics and then the grammar's translation
# of those semantics, each followed by an empty line.
for parse in parses:
    semantics = parse.semantics
    print(semantics)
    print("")
    grammar = babbler.semparser.grammar
    print(grammar.translate(semantics))
    print("")

('.root', ('.label', ('.bool', True), ('.call', ('.composite_or_func', ('.list', ('.in', ('.extract_text', ('.right', ('.arg', ('.int', 1)), ('.string', '.eq'), ('.int', 3), ('.string', 'words')))), ('.in', ('.extract_text', ('.right', ('.arg', ('.int', 2)), ('.string', '.eq'), ('.int', 3), ('.string', 'words')))))), ('.string', 'spotted'))))

return 1 if 'spotted'.(any([in(text(exactly 3 word(s) to the right of X)),in(text(exactly 3 word(s) to the right of Y))])) else 0

('.root', ('.label', ('.bool', True), ('.call', ('.composite_or_func', ('.list', ('.in', ('.extract_text', ('.right', ('.arg', ('.int', 1)), ('.string', '.leq'), ('.int', 3), ('.string', 'words')))), ('.in', ('.extract_text', ('.right', ('.arg', ('.int', 2)), ('.string', '.leq'), ('.int', 3), ('.string', 'words')))))), ('.string', 'spotted'))))

return 1 if 'spotted'.(any([in(text(no more than 3 word(s) to the right of X)),in(text(no more than 3 word(s) to the right of Y))])) else 0



In [17]:
# Call the first parse's function on `c`, the candidate picked in the viewer;
# the return value is shown as the cell output.
parses[0].function(c)

1

In [18]:
# babbler.semparser.grammar.print_chart()

In [19]:
# parses = sorted(parses, key=lambda x: x.explanation.name)
# new_parses = []
# seen = set()
# for parse in parses:
#     if parse.explanation.name not in seen:
#         seen.add(parse.explanation.name)
#         new_parses.append(parse)
#         print(parse.explanation.name)
#         print(babbler.semparser.grammar.translate(parse.semantics))
#         print("")
# print(len(new_parses))
# new_parses

In [20]:
# from collections import defaultdict
# matches = defaultdict(list)
# for i, parse in enumerate(new_parses):
#     for c in candidates:
#         if parse.function(c) and c.get_stable_id() not in matches.values():
#             matches[parse.explanation.name] = c.get_stable_id()
#             break
#     if parse.explanation.name not in matches:
#         print(babbler.semparser.grammar.translate(parse.semantics))

In [21]:
# print(len(candidates))
# print(len(explanations))
# print(len(parses))
# print(len(new_parses))
# print(len(matches))

In [22]:
# for name, candidate_id in sorted(matches.items()):
#     print("{}\t\t: {}".format(name, candidate_id))

In [23]:
# %time pipe.label()

In [24]:
# %time pipe.supervise()

In [25]:
# %time pipe.classify()