# Step 1: Build the Dataset

The first thing to do is ensure that modules are auto-reloaded at runtime to allow for development in other files.

In [5]:
# Load the autoreload extension and use mode 2 so that modules edited in other
# files are re-imported automatically while developing.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We then set the Snorkel database location and start a session connected to it.  By default, we use a PostgreSQL database backend; the database can be created with `createdb DB_NAME` once PostgreSQL (`psql`) is installed.  Note that Snorkel does *not* currently support parallel database processing with a SQLite backend.

In [6]:
# Setting Snorkel DB location
import os
import sys

import random
import numpy as np

# For networked PostgreSQL
# NOTE(review): this connection string embeds a username and password; consider
# supplying it through the environment rather than committing it with the notebook.
postgres_location = 'postgresql://jdunnmon:123@localhost:5432'
postgres_db_name = 'phone_jd_30K'
# Join with an explicit '/': os.path.join uses the OS path separator (a
# backslash on Windows), which would corrupt the database URL.
os.environ['SNORKELDB'] = '/'.join([postgres_location.rstrip('/'), postgres_db_name])

#For local PostgreSQL
#os.environ['SNORKELDB'] = 'postgres:///es_locs_small'

# Make the modules in ../utils importable. The entry is a relative path, so it
# is resolved against the kernel's current working directory.
sys.path.append('../utils')

# For SQLite
#db_location = '.'
#db_name = "es_locs_small.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
# NOTE(review): presumably snorkel reads the SNORKELDB environment variable when
# it is imported, which would explain why this import comes after the
# os.environ assignment earlier in this cell — confirm before reordering.
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting random seed: seed both the stdlib and the NumPy generators with the
# same value so repeated runs draw the same random numbers.
seed = 1701
random.seed(seed)
np.random.seed(seed)

We now set the document preprocessor to read raw data into the Snorkel database.  There exist three possible data source options: JSONL files from the MEMEX project (option: `memex_jsons`), a raw tsv file of extractions from the memex project `content.tsv` (option: `content.tsv`), and tsvs with a similar format to `content.tsv` drawn from an Elasticsearch index of the data (option: `es`).  `max_docs` controls the number of documents read by the preprocessor, and `data_source` sets the location of the data.  For MEMEX json source, this should be a directory, while in all other cases it should be a tsv file.

In [12]:
from dataset_utils import set_preprocessor, combine_dedupe

# Set data source: options are 'content.tsv', 'memex_jsons', 'es'
data_source = 'es'

# Setting max number of docs to ingest
max_docs = 36000

# Setting location of data source

# For ES:
# NOTE(review): absolute, machine-specific path — adjust it to where the tsv
# lives on your system.
data_loc = '/lfs/local/0/jdunnmon/data/chtap/output_phone.tsv'

# Optional: add tsv with additional documents to create combined tsv without duplicates
#data_all_loc = '/dfs/scratch1/jdunnmon/data/memex-data/es/output_all.tsv'
#data_loc = combine_dedupe(data_loc, data_all_loc, '/dfs/scratch1/jdunnmon/data/memex-data/es/combined_phone_1M.tsv')

# Setting preprocessor: built from the source type, source location and
# document cap chosen above.
doc_preprocessor = set_preprocessor(data_source, data_loc, max_docs=max_docs, verbose=True,
                                    clean_docs=True, content_field=['memex_raw_content'])

Now, we execute the preprocessor.  Parallelism can be changed using the `parallelism` argument.  Note that we use the spaCy parser rather than CoreNLP, as this tends to give superior results.

In [None]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser: parse with spaCy. The preprocessor is materialised
# into a list before being handed to apply(); %time reports how long the
# parsing call takes.
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=1, verbose=True)

Clearing existing...
Running UDF...

Checking the number of parsed documents and sentences in the database.

In [9]:
from snorkel.models import Document, Sentence

# Report how many rows of each parsed type are stored in the database
for _label, _orm_class in (("Documents:", Document), ("Sentences:", Sentence)):
    print(_label, session.query(_orm_class).count())

Documents: 100
Sentences: 1707


Separating into train, dev, and test sets

In [None]:
from dataset_utils import create_test_train_splits

# Getting all documents parsed by Snorkel, ordered by name so the list handed
# to the splitter has a stable order.
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
# dev_frac / test_frac are passed as 0.1 each — presumably the fraction of
# documents assigned to the dev and test sets; verify in dataset_utils.
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'phone', gold_dict=None, dev_frac=0.1, test_frac=0.1)

Create candidate extractor.

In [None]:
import codecs
import io
import json
import random
import re

import phonenumbers
        
def find_phone_number(span_input):
    """Tell whether the span's text contains a phone number.

    Args:
        span_input: object exposing ``get_span()``, which returns the text
            to scan.

    Returns:
        bool: True when ``phonenumbers.PhoneNumberMatcher`` (region ``"US"``)
        yields at least one match in the text, False otherwise.
    """
    text = span_input.get_span()
    # Only the existence of a match matters, so stop at the first one instead
    # of formatting and collecting every number found.
    return any(True for _ in phonenumbers.PhoneNumberMatcher(text, "US"))

# Patterns tried by find_phone_number_reg. They are raw strings so sequences
# such as \d reach the regex engine untouched; in a plain string literal they
# are invalid escapes and trigger a warning on recent Python versions.
# NOTE(review): in the third pattern, `.-~` inside the character class is a
# range from '.' to '~' and so also admits digits and letters as the
# separator — confirm whether a literal '-' was intended.
_PHONE_PATTERNS = (
    r"\d{10}",
    r"(\d{3}\D{0,3}\d{3}\D{0,3}\d{4})",
    r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s*.-~]?\d{3}[\s*.-~]?\d{4}$",
    r"^(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}$)",
)


def find_phone_number_reg(span_input):
    """Tell whether the span's text matches any phone-number regex.

    Args:
        span_input: object exposing ``get_span()``, which returns the text
            to scan.

    Returns:
        bool: True when at least one pattern in ``_PHONE_PATTERNS`` matches
        somewhere in the text, False otherwise.
    """
    text = span_input.get_span()
    # re.search finds a match exactly when re.findall would return a non-empty
    # list, and it lets any() stop at the first pattern that hits.
    return any(re.search(pattern, text) is not None for pattern in _PHONE_PATTERNS)
    
def count_(span_input, pattern):
    """Count the non-overlapping occurrences of ``pattern`` in ``span_input``.

    Args:
        span_input (str): text to search.
        pattern (str): literal substring to look for (not a regex).

    Returns:
        int: number of non-overlapping occurrences, scanning left to right.
        An empty ``pattern`` yields 0.
    """
    if not pattern:
        # An empty substring is found at index 0 without consuming any text,
        # so the previous hand-written loop never terminated on this input.
        return 0
    return span_input.count(pattern)
    
def phone_matcher(span_input):
    """Tell whether the span's text holds exactly ten digit-like tokens.

    Digit characters and occurrences of the spelled-out words 'one' through
    'ten' are both counted; each word occurrence counts once.

    Args:
        span_input: object exposing ``get_span()``, which returns the text
            to inspect.

    Returns:
        bool: True when the combined count equals 10, False otherwise.
    """
    text = span_input.get_span()
    number_words = ('one', 'two', 'three', 'four', 'five',
                    'six', 'seven', 'eight', 'nine', 'ten')

    digit_chars = sum(1 for ch in text if ch.isdigit())
    spelled_out = sum(count_(text, word) for word in number_words)
    return digit_chars + spelled_out == 10

In [None]:
from snorkel.candidates import Ngrams, CandidateExtractor
from dataset_utils import CandidateExtractorFilter
from snorkel.matchers import *

# Wrap each span-level predicate in a Snorkel matcher
phone_lambda_matcher_1 = LambdaFunctionMatcher(func=find_phone_number)
phone_lambda_matcher = LambdaFunctionMatcher(func=find_phone_number_reg)
phone_lambda_matcher_2 = LambdaFunctionMatcher(func=phone_matcher)

# NOTE(review): this rebinds `phone_matcher` from the function defined earlier
# to the combined matcher, so re-running this cell without re-running the
# function definitions would pass the Union itself as `func` above.
phone_matcher = Union(
    phone_lambda_matcher_1,
    phone_lambda_matcher,
    phone_lambda_matcher_2,
)

In [None]:
from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
from dataset_utils import create_candidate_class, LocationMatcher

# Setting extraction type -- should be a subfield in your data source extractions field!
extraction_type = 'phone'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates (n_max=5)
ngrams = Ngrams(n_max=5)

# Using the combined phone matcher for the candidate extractor; no candidate
# filter is applied (candidate_filter=None).
matcher = phone_matcher
cand_extractor = CandidateExtractorFilter(candidate_class ,[ngrams],[matcher],candidate_filter=None)

Applying candidate extractor to each split (train, dev, test)

In [None]:
# Applying candidate extractor to each split
# The enumerate index is used as the split id: 0 = train, 1 = dev, 2 = test.
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=1)
    # Count the candidates stored for this split.
    print("Number of candidates:", session.query(candidate_class).filter(candidate_class.split == k).count())

Add gold labels.

In [None]:
from dataset_utils import get_gold_labels_from_meta

# Adding dev gold labels (split 1, matching the index used during candidate
# extraction); no gold dictionary is supplied (gold_dict=None).
%time missed_dev = get_gold_labels_from_meta(session, candidate_class, extraction_type, 1, annotator='gold', gold_dict=None)

# Adding test gold labels (split 2); no gold dictionary is supplied (gold_dict=None).
%time missed_test = get_gold_labels_from_meta(session, candidate_class, extraction_type, 2, annotator='gold', gold_dict=None)

In [None]:
# Checking percent of gold labels that are positive; the helper's return value
# is kept in perc_pos.
from dataset_utils import check_gold_perc
perc_pos = check_gold_perc(session)

In [None]:
from dataset_utils import remove_gold_labels
# Remove gold labels if you want -- uncomment!
#remove_gold_labels(session)