In [1]:
# Notebook setup: auto-reload edited modules before each cell runs and render
# matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os

# TO USE A DATABASE OTHER THAN SQLITE, UNCOMMENT THE LINE BELOW.
# Note that this is necessary for parallel execution, among other things.
# NOTE(review): the line sits ahead of the snorkel import, presumably because
# SNORKELDB has to be set before snorkel is imported -- confirm.
# os.environ['SNORKELDB'] = 'postgres://localhost:5432/babble_test_bike'

from snorkel import SnorkelSession
# Database session used by the candidate-extraction and labeling cells below.
session = SnorkelSession()

In [2]:
import numpy as np

# anns_folder = '/dfs/scratch0/paroma/coco/annotations/'
# Annotation files are located relative to $SNORKELHOME. The trailing slash is
# kept on purpose: file names are appended to anns_folder by plain string
# concatenation, here and in later cells.
anns_folder = os.environ['SNORKELHOME'] + '/experiments/babble/bike/data/'
train_path = anns_folder + 'train_anns.npy'
val_path = anns_folder + 'val_anns.npy'

# allow_pickle=True is passed explicitly because NumPy >= 1.16.3 defaults it to
# False and then refuses to load arrays that hold pickled Python objects
# (presumably the case for these annotation files -- the flag is harmless if not).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_anns = np.load(train_path, allow_pickle=True).tolist()
val_anns = np.load(val_path, allow_pickle=True).tolist()

In [3]:
from snorkel.models import candidate_subclass

# Candidate class for this task, built from the name 'Biker' and the list
# ['person', 'bike'] (presumably the names of the candidate's two image
# regions -- confirm against candidate_subclass).
Biker = candidate_subclass('Biker', ['person', 'bike'])

In [4]:
from snorkel.parser import ImageCorpusExtractor, CocoPreprocessor

# One extractor, configured for Biker candidates, is applied to both
# annotation files below.
corpus_extractor = ImageCorpusExtractor(candidate_class=Biker)

# Training annotations are tagged source=0. This first call relies on the
# default `clear`; the recorded output shows "Clearing existing..." for it.
coco_preprocessor = CocoPreprocessor(train_path, source=0)
%time corpus_extractor.apply(coco_preprocessor)

# Validation annotations are tagged source=1. clear=False is passed here,
# presumably so the training candidates created above are kept -- confirm.
coco_preprocessor = CocoPreprocessor(val_path, source=1)
%time corpus_extractor.apply(coco_preprocessor, clear=False)

# Sanity check: count the Biker candidates stored under each split.
# NOTE(review): assumes the preprocessor's `source` becomes the candidate's
# `split` -- confirm.
for split in [0, 1]:
    num_candidates = session.query(Biker).filter(Biker.split == split).count()
    print("Split {} candidates: {}".format(split, num_candidates))

Clearing existing...
Running UDF...
CPU times: user 11.2 s, sys: 408 ms, total: 11.6 s
Wall time: 11.6 s
Running UDF...
CPU times: user 2.49 s, sys: 108 ms, total: 2.6 s
Wall time: 2.65 s
Split 0 candidates: 2406
Split 1 candidates: 1037


In [5]:
from snorkel.models import StableLabel
from snorkel.db_helpers import reload_annotator_labels

def assign_gold_labels(session, labels_by_candidate):
    """Store gold labels as StableLabels and reload them as annotator labels.

    Args:
        session: database session used to query, add and commit StableLabel
            rows; it is also passed on to ``reload_annotator_labels``.
        labels_by_candidate: mapping from a candidate hash of the form
            ``"<set>:<image_idx>:<bbox1_idx>:<bbox2_idx>"`` (``<set>`` is
            ``'train'`` or ``'val'``) to a truthy/falsy label. Truthy labels
            are stored as ``1``, falsy ones as ``-1``.

    Raises:
        KeyError: if a hash names a set other than ``'train'`` or ``'val'``.
        ValueError: if a hash does not split into exactly four ``':'`` parts.

    A StableLabel is only added when no row with the same context ids exists
    for the ``'gold'`` annotator, so re-running the function adds no
    duplicates. After committing, the annotator labels are reloaded once for
    every split that appeared in ``labels_by_candidate``; an empty mapping
    therefore commits and reloads nothing.
    """
    candidate_class = Biker
    annotator_name = 'gold'
    source_by_set_name = {'train': 0, 'val': 1}

    # Every split touched by this call. Previously only the split of the last
    # key iterated was reloaded (and an empty mapping raised NameError).
    sources_seen = set()

    for candidate_hash, label in labels_by_candidate.items():
        set_name, image_idx, bbox1_idx, bbox2_idx = candidate_hash.split(':')
        source = source_by_set_name[set_name]
        sources_seen.add(source)

        stable_id_1 = "{}:{}::bbox:{}".format(source, image_idx, bbox1_idx)
        stable_id_2 = "{}:{}::bbox:{}".format(source, image_idx, bbox2_idx)
        context_stable_ids = "~~".join([stable_id_1, stable_id_2])

        query = session.query(StableLabel).filter(StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(StableLabel(
                context_stable_ids=context_stable_ids,
                annotator_name=annotator_name,
                value=1 if label else -1))

    session.commit()

    # Sorted so the reload order does not depend on dict iteration order.
    for seen_source in sorted(sources_seen):
        reload_annotator_labels(session, candidate_class, annotator_name, split=seen_source, filter_label_split=False)

In [6]:
# The file stores a pickled dict (assign_gold_labels calls .items() on it), and
# NumPy >= 1.16.3 refuses to load pickled objects unless allow_pickle=True.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
validation_labels_by_candidate = np.load(anns_folder + 'labels_by_candidate.npy', allow_pickle=True).tolist()
assign_gold_labels(session, validation_labels_by_candidate)

AnnotatorLabels created: 906


In [7]:
# The file stores a pickled dict (assign_gold_labels calls .items() on it), and
# NumPy >= 1.16.3 refuses to load pickled objects unless allow_pickle=True.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_labels_by_candidate = np.load(anns_folder + 'train_labels_by_candidate.npy', allow_pickle=True).tolist()
assign_gold_labels(session, train_labels_by_candidate)

AnnotatorLabels created: 2102


In [8]:
from snorkel.annotations import load_gold_labels

# Load the 'gold' annotator's labels for split 1 (the split assign_gold_labels
# uses for 'val' keys); the bare name on the last line displays the result.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

<1037x1 sparse matrix of type '<type 'numpy.int64'>'
	with 906 stored elements in Compressed Sparse Row format>

In [9]:
# Same import as in the previous cell; repeating it is harmless.
from snorkel.annotations import load_gold_labels

# Load the 'gold' annotator's labels for split 0 (the split assign_gold_labels
# uses for 'train' keys); the bare name on the last line displays the result.
L_gold_train = load_gold_labels(session, annotator_name='gold', split=0)
L_gold_train

<2406x1 sparse matrix of type '<type 'numpy.int64'>'
	with 2102 stored elements in Compressed Sparse Row format>