# Step 1: Build the Dataset

The first thing to do is ensure that modules are auto-reloaded at runtime to allow for development in other files.

In [1]:
# Load the autoreload extension and reload modules automatically before running code,
# so edits made to helper modules in other files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output
%matplotlib inline

We then set the Snorkel database location, start a Snorkel session, and connect to the database.  By default, we use a PostgreSQL database backend, which can be created using `createdb DB_NAME` once psql is installed.  Note that Snorkel does *not* currently support parallel database processing with a SQLite backend.

In [2]:
# Setting Snorkel DB location
import os
import sys

import random
import numpy as np

# For networked PostgreSQL
# The server URL (which embeds credentials) can be overridden by exporting
# SNORKEL_POSTGRES_LOCATION before starting the kernel; the fallback keeps the
# original development default so existing setups behave exactly as before.
# NOTE(review): avoid committing real passwords here -- prefer the env var.
postgres_location = os.environ.get('SNORKEL_POSTGRES_LOCATION',
                                   'postgresql://jdufault:123@localhost:5432')
postgres_db_name = 'es_locs_1M'
# Database URLs always use '/', so join explicitly instead of with os.path.join,
# which inserts the OS-specific path separator (a backslash on Windows).
os.environ['SNORKELDB'] = (postgres_location
                           + ('' if postgres_location.endswith('/') else '/')
                           + postgres_db_name)

#For local PostgreSQL
#os.environ['SNORKELDB'] = 'postgres:///es_locs_small'

# Adding path above for utils (makes the dataset_utils helpers importable)
sys.path.append('../utils')

# For SQLite
#db_location = '.'
#db_name = "es_locs_small.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session (SNORKELDB is set above, before snorkel is imported)
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting parallelism: passed as the `parallelism` argument to later apply() calls
parallelism = 72

# Setting random seed for both the stdlib and NumPy generators
seed = 1701
random.seed(seed)
np.random.seed(seed)


We now set the document preprocessor to read raw data into the Snorkel database.  There are three possible data source options: JSONL files from the MEMEX project (option: `memex_jsons`), a raw TSV file of extractions from the MEMEX project, `content.tsv` (option: `content.tsv`), and TSV files with a format similar to `content.tsv` drawn from an Elasticsearch index of the data (option: `es`).  `data_source` selects which of these options is used, `data_loc` sets the location of the data, and `max_docs` controls the number of documents read by the preprocessor.  For the MEMEX JSON source, `data_loc` should be a directory, while in all other cases it should be a TSV file.

In [3]:
from dataset_utils import set_preprocessor, combine_dedupe

# Which kind of input to read: one of 'content.tsv', 'memex_jsons', 'es'
data_source = 'es'

# Upper bound on the number of documents to ingest
max_docs = 1000

# Where the input data lives (this path is for the ES option)
data_loc = '/lfs/raiders5/0/jdunnmon/data/chtap/output_location'

# Optional: merge in a tsv of additional documents, producing a combined tsv without duplicates
#data_loc = combine_dedupe(data_loc, 'output_location.tsv', 'combined.tsv')

# Regex applied to the raw data (in addition to getting title and body)
# when memex_raw_content is a content_field
term = r'\b[Ll]ocation:|\b[cC]ity:'

# Maximum document length in characters; remove to have no max
max_doc_length = 1500

# Build the preprocessor from the settings above
doc_preprocessor = set_preprocessor(
    data_source,
    data_loc,
    max_docs=max_docs,
    verbose=False,
    clean_docs=True,
    content_fields=['raw_content', 'url'],
    term=term,
    max_doc_length=max_doc_length,
)

Using parallelized loader


Now, we execute the preprocessor.  Parallelism is controlled by the `parallelism` variable set earlier in the notebook.  Note that we use the spaCy parser rather than CoreNLP, as this tends to give superior results.

In [4]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
# The parser is configured with the Spacy backend imported above
corpus_parser = CorpusParser(parser=Spacy())
# The preprocessor output is materialized into a list before being handed to the parser;
# `parallelism` is the value configured earlier, and %time reports how long the call takes
%time corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism, verbose=False)

Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Clearing existing...
Running UDF...
CPU times: user 2min 21s, sys: 2.1 s, total: 2min 24s
Wall time: 2min 37s


Checking the number of parsed documents and sentences in the database.

In [5]:
from snorkel.models import Document, Sentence

# Report how many rows of each parsed type are stored in the database
for heading, table in (("Documents:", Document), ("Sentences:", Sentence)):
    print(heading, session.query(table).count())

Documents: 1000
Sentences: 7450


Separating into train, dev, and test sets

In [6]:
from dataset_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
# Ordering by document name gives the split helper a stable input order
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
# dev_frac and test_frac each reserve a 0.1 fraction of the documents; no gold dictionary is supplied.
# NOTE(review): 'location' is presumably the extraction type used elsewhere in this notebook -- confirm in dataset_utils
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=None, dev_frac=0.1, test_frac=0.1)

Train: 800 Docs, 5951 Sentences
Dev: 100 Docs, 754 Sentences
Test: 100 Docs, 745 Sentences
CPU times: user 1.78 s, sys: 140 ms, total: 1.92 s
Wall time: 2.61 s


Create candidate extractor.

In [7]:
from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
from dataset_utils import create_candidate_class, LocationMatcher, city_index
from snorkel.matchers import Union, LambdaFunctionMatcher

# Setting extraction type -- should be a subfield in your data source extractions field!
extraction_type = 'location'

# Creating candidate class
# The helper returns both the class object and its name
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates (n_max=3)
location_ngrams = Ngrams(n_max=3)

# Define matchers
# The city index is built from the cities15000.txt file under ../utils/data, and its
# fast_loc method is wrapped as a lambda-function matcher.
# NOTE(review): fast_loc presumably returns a truthy value for spans that name a known city -- confirm in dataset_utils
cities = city_index('../utils/data/cities15000.txt')
geo_location_matcher = LambdaFunctionMatcher(func=cities.fast_loc)
# LocationMatcher is only used by the disabled alternative below
# spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create candidate extractor
# Only the geo matcher is currently included in the union
location_matcher = Union(geo_location_matcher)
cand_extractor   = CandidateExtractor(candidate_class, [location_ngrams], [location_matcher])

Applying candidate extractor to each split (train, dev, test)

In [8]:
# Applying candidate extractor to each split
# enumerate() supplies the split index: 0 = train, 1 = dev, 2 = test
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    # Run extraction over this split's sentences and time the call
    %time cand_extractor.apply(sents, split=k, parallelism=parallelism)
    # Report how many candidates are now stored for this split
    print("Number of candidates:", session.query(candidate_class).filter(candidate_class.split == k).count())

Clearing existing...
Running UDF...
CPU times: user 1.4 s, sys: 1.76 s, total: 3.17 s
Wall time: 12.8 s
Number of candidates: 6751
Clearing existing...
Running UDF...
CPU times: user 368 ms, sys: 1.4 s, total: 1.76 s
Wall time: 5.69 s
Number of candidates: 907
Clearing existing...
Running UDF...
CPU times: user 348 ms, sys: 1.51 s, total: 1.86 s
Wall time: 5.74 s
Number of candidates: 818


In [9]:
from snorkel.viewer import SentenceNgramViewer

# Dev candidates are stored under split 1 (the extractor loop enumerates train=0, dev=1, test=2);
# filtering on split 0 would load training candidates into a variable named for the dev set.
cands_dev = session.query(candidate_class).filter(candidate_class.split == 1).order_by(candidate_class.id).all()
# Only the first 100 candidates (ordered by id) are passed to the viewer
sv = SentenceNgramViewer(cands_dev[:100], session)
# Bare last expression so the notebook renders the viewer widget
sv

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[5], [4, 75], [95]], [[63], [33], [26]], [[38, 46], [2], [32, 58]], [[22], [78], [2…

Add gold labels.

In [10]:
from dataset_utils import get_gold_labels_from_meta

# Adding dev gold labels; gold_dict=None, so no gold dictionary is supplied
# NOTE(review): the positional 1 is presumably the dev split index (extractor loop: train=0, dev=1, test=2) -- confirm in dataset_utils
%time missed_dev = get_gold_labels_from_meta(session, candidate_class, extraction_type, 1, annotator='gold', gold_dict=None)

# Adding test gold labels; gold_dict=None, so no gold dictionary is supplied
# NOTE(review): the positional 2 is presumably the test split index -- confirm in dataset_utils
%time missed_test = get_gold_labels_from_meta(session, candidate_class, extraction_type, 2, annotator='gold', gold_dict=None)

Loading 907 candidate labels

AnnotatorLabels created: 907
CPU times: user 5.36 s, sys: 220 ms, total: 5.58 s
Wall time: 8.12 s
Loading 818 candidate labels

AnnotatorLabels created: 818
CPU times: user 5.66 s, sys: 172 ms, total: 5.84 s
Wall time: 7.64 s


In [11]:
# Checking percent of gold labels that are positive
from dataset_utils import check_gold_perc
# The helper's return value is kept in perc_pos for later use
perc_pos = check_gold_perc(session)

Percent Positive: 0.34


In [12]:
from dataset_utils import remove_gold_labels
# Remove gold labels if you want -- uncomment!
# Left disabled by default so running the whole notebook does not call remove_gold_labels
#remove_gold_labels(session)