In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# import os
# os.remove('snorkel.db')

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import csv
import os

# Location of the MTurk output CSV, resolved relative to $SNORKELHOME.
mturk_out_path = (os.environ['SNORKELHOME'] +
                  '/experiments/babble/spouse/data/mturk_spouse_all_out.csv')

# Build `turk`: the set of document stable ids referenced by any column whose
# header starts with 'Input.span1'.
with open(mturk_out_path, 'r') as csvfile:
    csvreader = csv.reader(csvfile)

    # The built-in next() works on Python 2.6+ and Python 3, whereas the
    # reader's .next() method exists on Python 2 only.
    header = next(csvreader)
    turk = set()
    for row in csvreader:
        # zip() pairs each field with its column name and tolerates rows that
        # are shorter or longer than the header.
        for column, field in zip(header, row):
            if not column.startswith('Input.span1'):
                continue
            # Keep the text before the first '::'. Fields without the
            # separator are skipped; slicing with find() == -1 would silently
            # drop the last character and add a bogus id.
            doc_name, separator = field.partition('::')[:2]
            if separator:
                turk.add(doc_name + "::document:0:0")
print("Turk: {} documents".format(len(turk)))

Turk: 75 documents


In [4]:
# Location of the gold-label TSV, resolved relative to $SNORKELHOME.
gold_path = (os.environ['SNORKELHOME'] +
             '/tutorials/intro/data/gold_labels.tsv')

# Build `gold`: the set of document stable ids that appear in the first column
# of the gold-label file.
with open(gold_path, 'r') as tsvfile:
    # Parse as tab-separated, matching the other TSV cells in this notebook.
    # The id is the prefix of the line, so it is the same with either delimiter.
    tsvreader = csv.reader(tsvfile, delimiter='\t')
    gold = set()
    for row in tsvreader:
        if not row:
            # Blank line: nothing to index.
            continue
        # Keep the text before the first '::'. Rows whose first field has no
        # separator (e.g. a header line) are skipped instead of being turned
        # into a truncated, bogus id.
        doc_name, separator = row[0].partition('::')[:2]
        if separator:
            gold.add(doc_name + "::document:0:0")
print("Gold: {} documents".format(len(gold)))

Gold: 444 documents


In [5]:
import os
import sys

from snorkel.parser import TSVDocPreprocessor
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser

# Upper bound on the number of documents read; None is passed through as-is.
MAX_DOCS = None

articles_path = os.environ['SNORKELHOME'] + '/tutorials/intro/data/articles.tsv'
doc_preprocessor = TSVDocPreprocessor(articles_path, max_docs=MAX_DOCS)

# Alternative path that runs the Spacy-based corpus parser (disabled here):
# corpus_parser = CorpusParser(parser=Spacy())
# %time corpus_parser.apply(doc_preprocessor, count=MAX_DOCS)

# Shortcut: store only the documents yielded by the preprocessor.
# To use this shortcut, comment out corpus_parser lines and delete snorkel.db
for doc, _text in doc_preprocessor:
    # The second element of each yielded pair is not needed here.
    session.add(doc)
session.commit()

In [6]:
from snorkel.models import Document, Sentence

# Report how many Document rows the session can see. Formatting the message
# first gives the same output on Python 2 and 3 (passing two arguments to
# print shows a tuple on Python 2) and matches the other cells' print style.
print("Documents: {}".format(session.query(Document).count()))

('Documents:', 2591)


In [7]:
import random
from snorkel.models import Document

from tutorials.intro import number_of_people

# Assign every document to exactly one of three splits:
#   * documents in `gold` but not in `turk` are divided between dev and test
#     by a seeded coin flip;
#   * all other documents (including every `turk` document) go to train.
docs = session.query(Document).order_by(Document.name).all()

train_docs, dev_docs, test_docs = set(), set(), set()

random.seed(2222)
for doc in docs:
    # One draw per document, whether or not it is used, so the stream of
    # random numbers (and therefore the dev/test assignment) stays the same.
    rand = random.random()
    held_out = doc.stable_id in gold and doc.stable_id not in turk
    if not held_out:
        train_docs.add(doc)
    elif rand < 0.5:
        dev_docs.add(doc)
    else:
        test_docs.add(doc)

print("TRAIN docs: {}".format(len(train_docs)))
print("DEV docs: {}".format(len(dev_docs)))
print("TEST docs: {}".format(len(test_docs)))

TRAIN docs: 2223
DEV docs: 187
TEST docs: 181


In [8]:
# Destination file for the document -> split assignments. Only
# assignment_path is bound here, so mturk_out_path keeps pointing at the
# MTurk CSV and the cell that reads it can still be re-run.
assignment_path = (os.environ['SNORKELHOME'] +
                   '/experiments/babble/spouse/data/split_assignments.tsv')
with open(assignment_path, 'w') as tsvfile:
    tsvwriter = csv.writer(tsvfile, delimiter='\t')
    # One tab-separated row per document: stable id, then the split index
    # (position in the list below: 0 = train, 1 = dev, 2 = test).
    for split, doc_set in enumerate([train_docs, dev_docs, test_docs]):
        for doc in doc_set:
            tsvwriter.writerow([doc.stable_id, split])

In [9]:
# TEST with this: read the assignments file back into a dict.
# Only assignment_path is bound here, so mturk_out_path keeps pointing at the
# MTurk CSV instead of being overwritten with this path.
assignment_path = (os.environ['SNORKELHOME'] +
                   '/experiments/babble/spouse/data/split_assignments.tsv')
with open(assignment_path, 'r') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter='\t')
    # Maps stable id -> split index. csv yields text, so the split values are
    # strings such as '0' and '1', not integers.
    split_assignments = {k: v for k, v in tsvreader}

In [10]:
# Preview the first 30 (stable id, split) pairs. Wrapping items() in list()
# keeps the slice valid on Python 3, where items() returns a view that cannot
# be sliced; on Python 2 the result is unchanged.
list(split_assignments.items())[:30]

[('d325336a-b3ef-4a6d-b7e1-2eca3a42cd2a::document:0:0', '0'),
 ('14112548-367a-4de1-85a7-11cd145b18e0::document:0:0', '1'),
 ('44c591cf-4fcb-42f3-86cb-ce469aa5bda7::document:0:0', '0'),
 ('240f6aee-21ca-44fa-8a11-f9f028e18b1f::document:0:0', '0'),
 ('e6a7cef9-ef82-43fe-9d92-39ef052bcad6::document:0:0', '0'),
 ('2667f876-2b6f-4516-b4b6-2024246d3d66::document:0:0', '0'),
 ('b19afa20-bfd9-497c-90be-c90d42df259e::document:0:0', '0'),
 ('a0ae0e02-2323-4fed-869f-a7a5a544278f::document:0:0', '0'),
 ('2a24e6e6-cb26-463b-8d71-977ca6e88bb6::document:0:0', '0'),
 ('e16a971f-23ce-42e4-81df-b2386126f8b3::document:0:0', '0'),
 ('35cabc94-63de-47ad-b28d-4b70861a2e3b::document:0:0', '0'),
 ('a8dfab81-2664-4586-92c7-d448550c88de::document:0:0', '0'),
 ('e37ad99f-cc66-4759-8f5f-746ca1ab7db0::document:0:0', '0'),
 ('ce4e1a2f-d81c-4cf4-93cb-cd30f4f4246f::document:0:0', '0'),
 ('29ff04ea-aefd-4f83-8778-3137862158e3::document:0:0', '0'),
 ('c314652b-f30d-428f-b720-abb85f0ddc6d::document:0:0', '0'),
 ('5b7ad

In [None]:
# import os 
# from tutorials.intro import load_external_labels
# from snorkel.annotations import load_gold_labels

# fpath = os.environ['SNORKELHOME'] + '/tutorials/intro/data/gold_labels.tsv'
# load_external_labels(session, Spouse, annotator_name='gold', path=fpath, 
#                      splits=[0,1,2])

# L_train = L_gold_train = load_gold_labels(session, annotator_name='gold', split=0)