In [1]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules automatically before each cell is executed, so edits to library
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Notebook-level settings. A later cell passes this dict through
# merge_configs(), where notebook values take precedence over the local and
# global configs.
config = dict(
    domain='drink',
    # When true, a later cell shrinks max_docs, search spaces and epoch counts,
    # and the database name gets a '_debug' suffix.
    debug=True,
    # False selects the SQLite backend when the connection string is built.
    postgres=False,
    parallelism=1,
    splits=[0, 1, 2],
    disc_model_class='logreg',
    supervision='generative',
)

In [3]:
# Build the database connection string, keep its parts as notebook globals,
# and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default database name is 'babble_<domain>', plus '_debug' for debug runs;
# an explicit config['db_name'] takes precedence.
default_db_name = ''.join((
    'babble_',
    config['domain'],
    '_debug' if config.get('debug', False) else '',
))
DB_NAME = config.get('db_name', default_db_name)

# Postgres only when the config enables it; otherwise a SQLite file, whose
# name carries the '.db' extension.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# A host:port part is only included when a port has been configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_drink_debug.db


In [4]:
# Snorkel is imported only in this cell, i.e. after $SNORKELDB was exported.
from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
from snorkel.models import candidate_subclass

# Resolve config conflicts (nb_config > local_config > global_config)
config = merge_configs(config)

# Debug runs trade accuracy for speed: fewer documents, smaller hyperparameter
# searches and fewer training epochs.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Candidate class named and shaped by the merged config.
candidate_class = candidate_subclass(
    config['candidate_name'],
    config['candidate_entities'],
)

# Look up the pipeline class for this domain and instantiate it; the later
# cells drive the individual stages through `pipe`.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting splits=[0, 1] to splits=[0, 1, 2]
Overwriting disc_model_class=inception_v3 to disc_model_class=logreg
Overwriting epochs=[25, 50, 75] to epochs=[50]
Overwriting step_size=[0.01, 0.001, 0.0001, 1e-05] to step_size=[1e-05]
Overwriting reg_param=[0.0, 0.01, 0.1, 0.25, 0.5] to reg_param=[0.01]
Overwriting decay=[0.9, 0.95, 0.99] to decay=[0.9]
Overwriting lr=[0.01, 0.001, 0.0001] to lr=[50, 100]
Overwriting babbler_candidate_split=1 to babbler_candidate_split=0
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting decay=0.95 to decay=0.9
Overwriting epochs=50 to epochs=100
Overwriting reg_param=0.1 to reg_param=0.01
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting domain=None to domain=drink
Overwriting tune_b=True to tune_b=False
Overwriting debug=False to debug=True
Overwriting gen_model_search_space=10 to gen_model_search_space=1
NOTE: --debug=True: modifying parameters...


In [5]:
# Run the pipeline's parse stage; %time reports CPU and wall-clock time.
%time pipe.parse()

Clearing existing...
Running UDF...
Running UDF...
CPU times: user 19.8 s, sys: 520 ms, total: 20.3 s
Wall time: 20.8 s


In [6]:
# Run the extract stage. In the saved run it only reported per-split candidate
# counts, stating that extraction had already happened during parsing.
%time pipe.extract()

Extraction was performed during parse stage.
Candidates [Split 0]: 15711
Candidates [Split 1]: 3377
Candidates [Split 2]: 0
CPU times: user 16 ms, sys: 16 ms, total: 32 ms
Wall time: 18.7 ms


In [7]:
# Load gold labels. The saved output shows the train and val CSVs being read
# and AnnotatorLabels being created; this took about a minute.
%time pipe.load_gold()

Reading train CSV!
Num HITs unique: 1525
Num HITs total: 3050
Unanimous: 4542
Majority: 2211
Bad: 450
Reading val CSV!
Num HITs unique: 184
Num HITs total: 368
Unanimous: 474
Majority: 318
Bad: 100
AnnotatorLabels created: 5648
AnnotatorLabels created: 634
CPU times: user 59.2 s, sys: 552 ms, total: 59.8 s
Wall time: 59.8 s


In [8]:
# Featurization stage is disabled in this run (left commented out).
# NOTE(review): the reason is not recorded here - confirm whether a later
# stage needs it before re-enabling or deleting this cell.
# %time pipe.featurize()

In [9]:
# Run the collect stage. Per the saved output this links explanations to
# candidates, calls the babbler to parse them, and filters the resulting parses.
%time pipe.collect()

Reading train CSV!
Num HITs unique: 44
Num HITs total: 132
Unanimous: 112
Majority: 54
Bad: 37
Building list of target candidate ids...
Collected 139 unique target candidate ids from 370 explanations.
Gathering desired candidates...
Found 139/139 desired candidates
Linking explanations to candidates...
Linked 370/370 explanations
Calling babbler...
Flushing all parses from previous explanation set.
Created grammar with 453 rules
121 explanation(s) out of 370 were parseable.
171 parse(s) generated from 370 explanation(s).
49 parse(s) remain (122 parse(s) removed by DuplicateSemanticsFilter).
24 parse(s) remain (25 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 4.6s.

18 parse(s) remain (6 parse(s) removed by UniformSignatureFilter: (0 None, 6 All)).
14 parse(s) remain (4 parse(s) removed by DuplicateSignatureFilter).
Added 14 parse(s) from 13 explanations to set. (Total # parses = 14)
CPU times: user 16.5 s, sys: 1.42 s, total: 17.9 s
Wal

In [10]:
# Run the label stage. The saved output shows a sparse label matrix being
# produced for split 0 and split 1, followed by a per-explanation summary table.
%time pipe.label()

Clearing existing...
Running UDF...

Labeled split 0: (15711,14) sparse (nnz = 87722)

Clearing existing...
Running UDF...

Labeled split 1: (3377,14) sparse (nnz = 18749)

                   j  Coverage  Overlaps  Conflicts   TP   FP   FN   TN  \
Explanation3_0     0  0.859639  0.859639   0.859639    0    0   87  332   
Explanation5_0     1  0.058632  0.058632   0.058632   54  103    0    0   
Explanation6_2     2  0.852236  0.852236   0.851051  158  433    0    0   
Explanation9_0     3  0.038496  0.038496   0.037903   31   47    0    0   
Explanation38_0    4  0.514658  0.514658   0.513473  129  325    0    0   
Explanation42_1    5  0.481492  0.481492   0.480308   73  246    0    0   
Explanation51_0    6  0.159313  0.159313   0.159017   70  139    0    0   
Explanation61_0    7  0.326917  0.326917   0.325733   89  149    0    0   
Explanation61_1    8  0.156648  0.156648   0.155760   58  162    0    0   
Explanation126_0   9  0.904649  0.904649   0.904649    0    0  165  428   
Ex

In [14]:
# Run the supervise stage.
# NOTE(review): the saved output of this cell ends in
# "TypeError: too many arguments: expected 6, got 7", so the stage did not
# complete in the recorded run. The execution count also jumps from 10 to 14,
# so this cell was re-run several times - re-check on a fresh kernel.
%time pipe.supervise()

Using L_train: <15711x14 sparse matrix of type '<type 'numpy.int64'>'
	with 87722 stored elements in Compressed Sparse Row format>
Using L_gold_train: <15711x1 sparse matrix of type '<type 'numpy.int64'>'
	with 5648 stored elements in Compressed Sparse Row format>
Positive Fraction: 7.5%

Using L_dev: <3377x14 sparse matrix of type '<type 'numpy.int64'>'
	with 18749 stored elements in Compressed Sparse Row format>
Using L_gold_dev: <3377x1 sparse matrix of type '<type 'numpy.int64'>'
	with 634 stored elements in Compressed Sparse Row format>
Positive Fraction: 5.0%

Using L_test: <0x14 sparse matrix of type '<type 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>
Using L_gold_test: <0x1 sparse matrix of type '<type 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>
[1] Testing epochs = 50, step_size = 1.00e-05, reg_param = 1.00e-02, decay = 9.00e-01
Inferred cardinality: 2


TypeError: too many arguments: expected 6, got 7

In [None]:
config['display_marginals'] = False
config['download_data'] = False
config['disc_model_class'] = 'inception_v3'

%time pipe.classify(slim_ws_path='/dfs/scratch0/paroma/slim_new/slim_ws/')