# Building the Corpus

In [24]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Setting Snorkel DB location
import os
import sys

# For PostgreSQL
# NOTE(review): the fallback below embeds a username/password in the notebook.
# Set SNORKEL_POSTGRES_LOCATION in the environment to supply the connection
# prefix (including credentials) without committing it; the literal default is
# kept only so existing setups keep working unchanged.
postgres_location = os.environ.get(
    'SNORKEL_POSTGRES_LOCATION', 'postgresql://jdunnmon:123@localhost:5432')
postgres_db_name = 'memex_db_snorkel'
# A database URL is not a filesystem path: join with an explicit '/' rather
# than os.path.join, which uses the platform separator ('\' on Windows).
os.environ['SNORKELDB'] = '/'.join([postgres_location.rstrip('/'), postgres_db_name])

# Adding path above for utils (guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path)
if '..' not in sys.path:
    sys.path.append('..')

# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
# NOTE(review): SNORKELDB is set before this import on purpose -- presumably
# snorkel reads it when the package is imported; confirm before reordering.
from snorkel import SnorkelSession
session = SnorkelSession()

In [32]:
from snorkel_utils import MemexTSVDocPreprocessor, MEMEXJsonLGZIPPreprocessor, retrieve_all_files

# Location of the MEMEX source data
data_loc = '../../../data/data_sample'

# Take the paths returned for data_loc, keep only those ending in 'gz',
# then narrow the filtered list to the single entry at index 5
path_list = [a for a in retrieve_all_files(data_loc) if a.endswith('gz')][5:6]

# Maximum number of docs to ingest
max_docs = 10000

# Parallelism setting passed to the apply() calls in later cells
parallelism = 8

# Document preprocessor over the selected files
doc_preprocessor = MEMEXJsonLGZIPPreprocessor(
    data_loc,
    file_list=path_list,
    encoding='utf-8',
    max_docs=max_docs,
    verbose=False,
)

In [36]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism)

Clearing existing...
Running UDF...
CPU times: user 2min 15s, sys: 17 s, total: 2min 32s
Wall time: 15min 23s


In [37]:
from snorkel.models import Document, Sentence

# Print the row count for each model, documents first and then sentences
for count_label, orm_class in (("Documents:", Document), ("Sentences:", Sentence)):
    print(count_label, session.query(orm_class).count())

Documents: 10000
Sentences: 166822


In [38]:
import pickle

# Load the pickled gold object from the sample-data directory into gold_dict.
# NOTE(review): pickle.load can execute arbitrary code -- only use trusted files.
gold_path = '../../../data/data_sample/gold_loc.pickle'
with open(gold_path, 'rb') as gold_file:
    gold_dict = pickle.load(gold_file)

In [39]:
from snorkel_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=gold_dict)

Train: 8000 Docs, 134982 Sentences
Dev: 91 Docs, 1257 Sentences
Test: 92 Docs, 1203 Sentences
CPU times: user 27 s, sys: 7.99 s, total: 35 s
Wall time: 1min 45s


In [40]:
from snorkel.models import Candidate, candidate_subclass

# Designing candidate subclasses
# LocationExtraction is created by candidate_subclass with the name 'Location'
# and a single argument named 'location'. Later cells pass it to the candidate
# extractor and query it by split.
LocationExtraction = candidate_subclass('Location', ['location'])

In [41]:
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import LocationMatcher

# Ngram generator (n_max=7) and location matcher (longest_match_only=True)
location_ngrams = Ngrams(n_max=7)
location_matcher = LocationMatcher(longest_match_only=True)

# Candidate extractor for LocationExtraction, given one ngram generator and
# one matcher (each wrapped in a single-element list)
cand_extractor = CandidateExtractor(
    LocationExtraction,
    [location_ngrams],
    [location_matcher],
)

In [None]:
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=parallelism)
    print("Number of candidates:", session.query(LocationExtraction).filter(LocationExtraction.split == k).count())

Clearing existing...
Running UDF...


In [7]:
# All LocationExtraction candidates whose split is 1 (the dev split)
dev_candidate_query = session.query(LocationExtraction).filter(LocationExtraction.split == 1)
cands_dev = dev_candidate_query.all()

In [8]:
from snorkel_utils import get_gold_labels_from_meta

# Adding dev gold labels using dictionary
# Split 1 is the dev split (dev_sents sits at index 1 in the candidate
# extraction loop). Labels are stored under the annotator name 'gold'; the
# return value is kept in missed_dev.
%time missed_dev = get_gold_labels_from_meta(session, LocationExtraction, 'location', 1, annotator='gold')

# Adding test gold labels using dictionary
# Same call for split 2 (the test split); the return value is kept in
# missed_test.
%time missed_test = get_gold_labels_from_meta(session, LocationExtraction, 'location', 2, annotator='gold')

Loading 3981 candidate labels

AnnotatorLabels created: 3981
CPU times: user 35.9 s, sys: 1.27 s, total: 37.1 s
Wall time: 45.9 s
Loading 4172 candidate labels

AnnotatorLabels created: 4172
CPU times: user 37.2 s, sys: 1.12 s, total: 38.3 s
Wall time: 47.4 s


In [22]:
from snorkel.annotations import load_gold_labels

# Load the 'gold' annotator labels for the dev split (1) and then the test
# split (2), in that order
L_gold_dev, L_gold_test = (
    load_gold_labels(session, annotator_name='gold', split=split_id)
    for split_id in (1, 2)
)