### Building the Corpus

In [5]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Setting Snorkel DB location
import os
import sys

import random
import numpy as np

# For PostgreSQL
# The server location (which embeds the user name and password) and the
# database name default to the values this notebook was developed with, but
# can be overridden through environment variables so that credentials do not
# have to be edited into the notebook itself.
#postgres_location = 'postgresql://jdunnmon:123@localhost:5432'
postgres_location = os.environ.get('SNORKEL_POSTGRES_LOCATION',
                                   'postgresql://saeideh:123@localhost:5432')
# Other databases used previously:
#postgres_db_name = 'memex_db_snorkel_large'
#postgres_db_name = 'memex_snorkel_db_extracted_text_10K'
#postgres_db_name = 'memex_snorkel_db_extracted_text_150K'
#postgres_db_name = 'memex_db_snorkel_tsv_1M'
#postgres_db_name ='memex_1M_sse_rest'
postgres_db_name = os.environ.get('SNORKEL_POSTGRES_DB', 'phone_sse_ver1')

# A database URL is not a filesystem path, so join it with '/' explicitly
# (os.path.join would insert the platform separator, e.g. '\\' on Windows).
# SNORKELDB is set before `snorkel` is imported below.
os.environ['SNORKELDB'] = '{0}/{1}'.format(postgres_location.rstrip('/'), postgres_db_name)

# Adding path above for utils
sys.path.append('..')

# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting random seed for both the stdlib and numpy generators.
# numpy is imported in this cell only for seeding, and its seed call used to
# be commented out, which left numpy-based randomness unseeded.
seed = 1701
random.seed(seed)
np.random.seed(seed)

We now set the document preprocessor to read raw data into the Snorkel database. There are three possible data source options: JSONL files from the MEMEX project (option: `memex_jsons`), the raw TSV file of extractions from the MEMEX project, `content.tsv` (option: `content.tsv`), and TSV files in a format similar to `content.tsv` drawn from an Elasticsearch index of the data (option: `es`). `data_source` selects one of these options, `max_docs` controls the number of documents read by the preprocessor, and `data_loc` sets the location of the data. For the MEMEX JSON source, the location should be a directory; in all other cases it should be a TSV file.

**Note:** To use the `es` option, you first have to create a TSV file exported from Elasticsearch (for example, `es_locations.tsv`).

In [7]:
from dataset_utils import set_preprocessor

# Data source type; valid options are 'content.tsv', 'memex_jsons' and 'es'
data_source = 'es'

# Upper bound on the number of documents to ingest
max_docs = 100000

# Location of the data source (for ES this is a tsv file)
data_loc = '/lfs/local/0/saeideh/extractors/src/elasticsearch_preprocessing/output_phone.tsv'

# Build the document preprocessor for the chosen source
doc_preprocessor = set_preprocessor(
    data_source,
    data_loc,
    max_docs=max_docs,
    verbose=False,
    clean_docs=True,
    content_field='extracted_text',
)

Now we run the corpus parser over the output of the preprocessor. The degree of parallelism can be changed with the `parallelism` argument. Note that we use the spaCy parser rather than CoreNLP, as this tends to give superior results.

## creating a preprocessor based on the files

In [8]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
# The parser is backed by spaCy. list() materialises the preprocessor output
# before it is passed to apply(), which is run here with parallelism=8;
# %time reports how long the whole call takes.
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=8, verbose=False)


Clearing existing...
Running UDF...
CPU times: user 22.1 s, sys: 1.39 s, total: 23.5 s
Wall time: 1min 20s


Checking the number of parsed documents and sentences in the database.

In [9]:
from snorkel.models import Document, Sentence

# Report how many documents and sentences are stored in the database
n_documents = session.query(Document).count()
n_sentences = session.query(Sentence).count()
print("Documents:", n_documents)
print("Sentences:", n_sentences)

Documents: 25560
Sentences: 108051


### Dividing into train, dev, and test splits

In [15]:
from dataset_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
# Ordering by document name gives the split helper a stable input order.
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
# dev_frac and test_frac request 1% of the documents each for dev and test;
# the rest is train. No gold dictionary is supplied (gold_dict=None).
# NOTE(review): 'phone' is presumably the extraction type the helper reads
# from each document -- confirm against dataset_utils.create_test_train_splits.
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'phone', gold_dict=None, dev_frac=0.01, test_frac=0.01,)

Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Train: 25048 Docs, 105883 Sentences
Dev: 256 Docs, 1109 Sentences
Test: 256 Docs, 1059 Sentences
CPU times: user 32.3 s, sys: 2.47 s, total: 34.7 s
Wall time: 54.6 s


### Set up Matchers

In [10]:
from snorkel.models import Candidate, candidate_subclass

# Designing candidate subclasses
# 'Phone' is the candidate class name; it has a single argument named 'phone'.
PhoneExtraction = candidate_subclass('Phone', ['phone'])

In [11]:
import random
import io
import codecs
import json
#from snorkel.matchers import *
from phonenumbers.python import phonenumbers
# #from models import TemporarySpan
# def find_phone_number_1(span_input):
#     patern = re.compile("(?:\+?(\d{1})?-?\(?(\d{3})\)?[\s\-.\/]?)?(\d{3})[\s\-.\/]?(\d{4})[\s\-.\/]?")
# #     result = prog.match(string)
#     span = span_input.get_span()
#     for w in span:   
#         result = patern.match(w)
#         if result:
#             return True
        
    

def find_phone_number(span_input):
    """Return True when the span text contains at least one phone number.

    Parameters
    ----------
    span_input : object exposing ``get_span()``
        The candidate span; ``get_span()`` supplies the text that is scanned.

    Returns
    -------
    bool
        True if ``phonenumbers.PhoneNumberMatcher`` (region ``"US"``) yields
        at least one match in the text, False otherwise.
    """
    text = span_input.get_span()

    # Only the existence of a match matters, so stop at the first one instead
    # of formatting and collecting every number. The previous version also
    # carried an unreachable print() after its return statement.
    matches = phonenumbers.PhoneNumberMatcher(text, "US")
    return any(True for _ in matches)

def find_phone_number_reg(span_input):
    """Return True when the span text matches any phone-number regex.

    Parameters
    ----------
    span_input : object exposing ``get_span()``
        The candidate span; ``get_span()`` supplies the text that is scanned.

    Returns
    -------
    bool
        True if at least one of the four patterns below is found in the text,
        False otherwise.
    """
    # Imported locally so the function does not rely on `re` having leaked
    # into the notebook namespace from some other import.
    import re

    text = span_input.get_span()

    # Raw strings spell exactly the same patterns as before, without relying
    # on Python passing unknown escapes such as "\d" through unchanged.
    patterns = (
        # ten consecutive digits
        r"\d{10}",
        # 3-3-4 digit groups separated by up to three non-digits
        r"(\d{3}\D{0,3}\d{3}\D{0,3}\d{4})",
        # whole-string match with optional country code and parentheses.
        # NOTE(review): inside [\s*.-~] the ".-~" part is a character RANGE
        # ('.' through '~'), so the separator also accepts digits and letters.
        # Left unchanged to preserve the existing matching behaviour.
        r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s*.-~]?\d{3}[\s*.-~]?\d{4}$",
        # string starting with a 10-digit number, "(ddd) ddd-dddd", or
        # consisting only of a 7-digit number
        r"^(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}$)",
    )

    # re.search finds a match exactly when re.findall returns a non-empty
    # list, and it stops at the first hit.
    return any(re.search(pattern, text) is not None for pattern in patterns)

    
def count_(span_input, pattern):
    """Count the non-overlapping occurrences of ``pattern`` in ``span_input``.

    Parameters
    ----------
    span_input : str
        Text to search.
    pattern : str
        Literal substring to count (not a regular expression).

    Returns
    -------
    int
        Number of non-overlapping occurrences, scanning left to right.
        An empty ``pattern`` counts as 0 occurrences.
    """
    # An empty pattern is found at index 0 of any string without consuming
    # anything, which made the previous hand-written loop spin forever.
    if not pattern:
        return 0
    # str.count performs the same left-to-right, non-overlapping scan.
    return span_input.count(pattern)
       
        
    
    
def phone_matcher(span_input):
    """Return True when the span text holds exactly ten "digits".

    A "digit" is either a digit character or one occurrence of a spelled-out
    number word ('one' ... 'ten', lower case only), so text such as
    "three one nine 4100196" counts as ten.

    Parameters
    ----------
    span_input : object exposing ``get_span()``
        The candidate span; ``get_span()`` supplies the text that is scanned.

    Returns
    -------
    bool
        True if digit characters plus number-word occurrences total exactly 10.
    """
    text = span_input.get_span()
    number_words = ('one', 'two', 'three', 'four', 'five',
                    'six', 'seven', 'eight', 'nine', 'ten')

    digit_total = sum(1 for char in text if char.isdigit())
    # str.count gives the non-overlapping occurrence count of each word.
    digit_total += sum(text.count(word) for word in number_words)

    # The original test `l1>=10 and l1<11` on an integer is simply `== 10`.
    return digit_total == 10

# print(l1)
# print(l1>=10 and l1<12)


In [12]:
# Imports for candidate extraction (exact duplicate import lines removed).
from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
from dataset_utils import create_candidate_class
# LambdaFunctionMatcher and Union, used below, are not imported by name
# anywhere in this notebook and so are picked up from this star import.
from snorkel.matchers import *
# NOTE(review): CandidateExtractorFilter is imported from two different
# modules. The second import rebinds the name, so the snorkel_utils version is
# the one actually used. Both lines are kept in their original order so
# behaviour is unchanged -- confirm which one is intended and drop the other.
from snorkel_utils_phone import CandidateExtractorFilter
from snorkel_utils import CandidateExtractorFilter
#from snorkel_utils import get_location_matcher, get_candidate_filter, CandidateExtractorFilter, LocationMatcher

# # Setting extraction type -- should be a subfield in your data source extractions field!
# extraction_type = 'phone'

# # Creating candidate class
# candidate_class, candidate_class_name  = create_candidate_class(extraction_type)

# Defining ngrams for candidates: candidate spans are n-grams with n_max=5
phone_ngrams = Ngrams(n_max=5)

# One matcher per detection function defined above; a span is accepted if any
# of the three functions accepts it (Union).
phone_lambda_matcher_1 = LambdaFunctionMatcher(func=find_phone_number)
phone_lambda_matcher = LambdaFunctionMatcher(func=find_phone_number_reg)
phone_lambda_matcher_2 = LambdaFunctionMatcher(func=phone_matcher)
phone_matcher_ = Union(phone_lambda_matcher_1, phone_lambda_matcher, phone_lambda_matcher_2)


In [13]:
# Extractor producing PhoneExtraction candidates from the n-gram span space,
# restricted by the combined phone matcher; no candidate filter is supplied.
cand_extractor = CandidateExtractorFilter(PhoneExtraction ,[phone_ngrams],[phone_matcher_],candidate_filter=None)

In [16]:
# Run extraction once per split. The enumerate index is used as the split id,
# so 0 = train, 1 = dev, 2 = test. After each run, report how many candidates
# are stored for that split.
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=8)
    print("Number of candidates:", session.query(PhoneExtraction).filter(PhoneExtraction.split == k).count())

Clearing existing...
Running UDF...
CPU times: user 18.1 s, sys: 2.96 s, total: 21.1 s
Wall time: 1min 3s
Number of candidates: 18383
Clearing existing...
Running UDF...
CPU times: user 236 ms, sys: 532 ms, total: 768 ms
Wall time: 4.29 s
Number of candidates: 204
Clearing existing...
Running UDF...
CPU times: user 212 ms, sys: 552 ms, total: 764 ms
Wall time: 4.18 s
Number of candidates: 175


# Adding Gold labels

For each candidate, the labelling cell below prints its `label`, `gold_value`, and `extracted_cand`.

In [18]:
from snorkel_utils_phone import  get_gold_phone_label_from_meta

# Adding dev gold labels (split 1) under the 'gold' annotator.
# No gold dictionary is passed (gold_dict=None).
# NOTE(review): judging by the helper's name, the gold phone values then come
# from the document metadata -- confirm in snorkel_utils_phone.
%time missed_dev = get_gold_phone_label_from_meta(session, PhoneExtraction, 'phone', 1, annotator='gold', gold_dict = None)

# Adding test gold labels (split 2) in the same way.
%time missed_test = get_gold_phone_label_from_meta(session, PhoneExtraction, 'phone', 2, annotator='gold', gold_dict= None)

Loading 204 candidate labels
[=                                       ] 0%index 0
label: -1 gold_value:  (319) 410-0196 extracted_cand:  319410019624
index 1
label: -1 gold_value:  (319) 410-0196 extracted_cand:  31941001962452
[=                                       ] 1%index 2
label: -1 gold_value:  (319) 410-0196 extracted_cand:  31941001962452
index 3
label: 1 gold_value:  (319) 410-0196 extracted_cand:  (319) 410-0196
[=                                       ] 2%index 4
label: 1 gold_value:  (319) 410-0196 extracted_cand:  (319) 410-0196
index 5
label: 1 gold_value:  (714) 727-7023 extracted_cand:  (714) 727-7023
[==                                      ] 3%index 6
label: 1 gold_value:  (916) 213-6108 extracted_cand:  (916) 213-6108
index 7
label: 1 gold_value:  (916) 213-6108 extracted_cand:  (916) 213-6108
[==                                      ] 4%index 8
label: 1 gold_value:  (916) 213-6108 extracted_cand:  (916) 213-6108
index 9
label: 1 gold_value:  (916) 213-6108 extract

label: -1 gold_value:  (323) 836-8645(415) 480-9134(818) 584-5060(818) 932-5308(858) 248-9108(930) 658-0557 extracted_cand:  202226052022202220222022260520222022260520222022260520224154809134
index 76
label: -1 gold_value:  (323) 836-8645(415) 480-9134(818) 584-5060(818) 932-5308(858) 248-9108(930) 658-0557 extracted_cand:  202226052022202220222022260520222022260520222022260520224154809134
index 77
label: 1 gold_value:  (626) 600-2789 extracted_cand:  (626) 600-2789
label: 1 gold_value:  (610) 316-2940 extracted_cand:  (610) 316-2940
index 79
label: 1 gold_value:  (209) 351-3240(408) 223-5335(530) 204-0468(650) 360-1284(925) 268-8061(973) 447-4539 extracted_cand:  (973) 447-4539
label: 1 gold_value:  (614) 607-1689 extracted_cand:  (614) 607-1689
index 81
label: 1 gold_value:  (616) 710-8360 extracted_cand:  (616) 710-8360
label: 1 gold_value:  (206) 922-9303(305) 849-8140(312) 600-8628(347) 940-1982(401) 324-9388(414) 914-3777(416) 554-3337(442) 222-0227(469) 510-5849(608) 609-5899(62

label: -1 gold_value:  (224) 433-7485 extracted_cand:  (290) 000-1214
label: -1 gold_value:  (224) 433-7485 extracted_cand:  (201) 317-0000

AnnotatorLabels created: 204
CPU times: user 3.19 s, sys: 820 ms, total: 4.01 s
Wall time: 3.91 s
Loading 175 candidate labels
[=                                       ] 0%index 0
label: 1 gold_value:  (507) 779-9395 extracted_cand:  (507) 779-9395
index 1
label: 1 gold_value:  (507) 779-9395 extracted_cand:  (507) 779-9395
[=                                       ] 1%index 2
label: 1 gold_value:  (507) 779-9395 extracted_cand:  (507) 779-9395
index 3
label: 1 gold_value:  (510) 387-5762 extracted_cand:  (510) 387-5762
[==                                      ] 2%index 4
label: 1 gold_value:  (587) 216-9917 extracted_cand:  (587) 216-9917
[==                                      ] 3%index 5
label: 1 gold_value:  (587) 216-9917 extracted_cand:  (587) 216-9917
index 6
label: 1 gold_value:  (587) 216-9917 extracted_cand:  (587) 216-9917
[==          

label: 1 gold_value:  (415) 626-1857 extracted_cand:  (415) 626-1857
index 59
label: 1 gold_value:  (415) 626-1857 extracted_cand:  (415) 626-1857
label: 1 gold_value:  (415) 626-1857 extracted_cand:  (415) 626-1857
label: 1 gold_value:  (323) 391-4945(323) 905-4021(702) 723-6202(818) 313-8688(818) 421-0922(951) 335-1559(951) 367-5673 extracted_cand:  (818) 421-0922
index 62
label: 1 gold_value:  (910) 229-7979 extracted_cand:  (910) 229-7979
label: 1 gold_value:  (910) 229-7979 extracted_cand:  (910) 229-7979
index 64
label: 1 gold_value:  (910) 229-7979 extracted_cand:  (910) 229-7979
label: 1 gold_value:  (910) 229-7979 extracted_cand:  (910) 229-7979
label: 1 gold_value:  (910) 229-7979 extracted_cand:  (910) 229-7979
index 67
label: 1 gold_value:  (910) 336-7852 extracted_cand:  (910) 336-7852
label: 1 gold_value:  (910) 336-7852 extracted_cand:  (910) 336-7852
index 69
label: 1 gold_value:  (910) 336-7852 extracted_cand:  (910) 336-7852
label: 1 gold_value:  (910) 336-7852 extrac

label: 1 gold_value:  (413) 776-0328(415) 424-9409(415) 735-6125(702) 712-4325(916) 274-1559(916) 807-3408 extracted_cand:  (415) 424-9409
index 118
label: 1 gold_value:  (413) 776-0328(415) 424-9409(415) 735-6125(702) 712-4325(916) 274-1559(916) 807-3408 extracted_cand:  (415) 424-9409
label: 1 gold_value:  (413) 776-0328(415) 424-9409(415) 735-6125(702) 712-4325(916) 274-1559(916) 807-3408 extracted_cand:  (415) 424-9409
index 120
label: 1 gold_value:  (224) 280-7780 extracted_cand:  (224) 280-7780
label: 1 gold_value:  (224) 280-7780 extracted_cand:  (224) 280-7780
label: 1 gold_value:  (224) 280-7780 extracted_cand:  (224) 280-7780
index 123
label: 1 gold_value:  (912) 656-7277 extracted_cand:  (912) 656-7277
label: 1 gold_value:  (912) 656-7277 extracted_cand:  (912) 656-7277
index 125
label: 1 gold_value:  (912) 656-7277 extracted_cand:  (912) 656-7277
label: 1 gold_value:  (912) 656-7277 extracted_cand:  (912) 656-7277
index 127
label: 1 gold_value:  (912) 656-7277 extracted_can

In [None]:
from snorkel.annotations import load_gold_labels
from snorkel.viewer import SentenceNgramViewer

# Gold labels for the dev split (split=1) written by the 'gold' annotator.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

# The label matrix has one row per dev *candidate* (Snorkel orders the rows by
# candidate id), not one per sentence. Iterate over the dev candidates in that
# same order; enumerating dev_sents, as before, paired each label with an
# unrelated sentence and could index past the end of the matrix.
dev_cands_by_id = (session.query(PhoneExtraction)
                   .filter(PhoneExtraction.split == 1)
                   .order_by(PhoneExtraction.id)
                   .all())

# Collect (and print) every dev candidate whose gold label is negative.
labeled = []
for ii, c in enumerate(dev_cands_by_id):
    if L_gold_dev[ii, 0] == -1:
        print(c)
        labeled.append(c)
print("Number labeled:", len(labeled))

# The viewer is given candidates, which is what it renders.
SentenceNgramViewer(labeled, session)

In [17]:
from snorkel_utils_phone import remove_gold_labels
#Remove gold labels if you want -- uncomment!
#remove_gold_labels(session)

In [19]:
from dataset_utils import check_gold_perc
# The helper prints a "Percent Positive" figure for the gold labels.
# NOTE(review): perc_pos is presumably that same figure -- confirm in dataset_utils.
perc_pos = check_gold_perc(session)

Percent Positive: 0.84


In [None]:
#train_cand = session.query(PhoneExtraction).filter(PhoneExtraction.split == 0).all()

In [20]:
from snorkel.annotations import load_gold_labels

# Load the gold labels of the dev split (split=1) written by the 'gold' annotator.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)


In [21]:
from snorkel.models import Candidate, candidate_subclass

# PhoneExtraction is already defined earlier in the notebook:
#PhoneExtraction = candidate_subclass('Phone', ['phone'])

# Dev-split candidates, ordered by id. Snorkel builds label matrices with rows
# in candidate-id order, so an explicit ORDER BY is needed for position i in
# this list to correspond to row i of the dev gold-label matrix; without it
# the database may return rows in any order.
cands_dev = (session.query(PhoneExtraction)
             .filter(PhoneExtraction.split == 1)
             .order_by(PhoneExtraction.id)
             .all())

In [56]:
from snorkel.viewer import SentenceNgramViewer
# Collect the dev candidates whose gold label is -1.
#   n_labeled    -- document name of every such candidate (duplicates kept)
#   my_n_labeled -- document name -> candidate; a later candidate from the
#                   same document overwrites the earlier one
n_labeled = []
my_n_labeled ={}
# Aliases of the full dev candidate list and dev gold-label matrix.
cands_dev_red = cands_dev
L_gold_dev_red = L_gold_dev
# NOTE(review): this pairs candidate ii with row ii of the label matrix, which
# assumes cands_dev is in the same order as the matrix rows -- confirm.
for ii, c in enumerate(cands_dev_red):
    if L_gold_dev_red[ii]==-1:
        my_n_labeled[c.get_parent().document.name]=c
        n_labeled.append(c.get_parent().document.name)

# Show the document-name -> candidate mapping of the negatively labelled candidates.
print(my_n_labeled)

{"b'94B73F18E0E06332542E91FBF22CDC51C621DCAA30283A401DB29305CD35A53C'": Phone(Span("b'DE DOMINGO A DOMINGO 3194100196'", sentence=283064, chars=[16,46], words=[2,6])), "b'ADBE924F8352DE81222ED118430FA1F2C2262E50EDCCE99197F23D3A59E36C85'": Phone(Span("b'\\\\\\\\n \\\\\\\\n 916 2136108 carmen\\\\\\\\n'", sentence=252756, chars=[237,271], words=[46,50])), "b'054D68E1DDACA2580E5AF580FF22CBBAC484ADA9A7E266E4B9F15592159EAECE'": Phone(Span("b'787........428........6067'", sentence=290051, chars=[247,272], words=[49,53])), "b'04329E208EE08FAEF44CFBB396974EE063B3D19E433242DD9079CACBE5B43189'": Phone(Span("b'Shunna 3346550314 \\\\\\\\n No Police'", sentence=193301, chars=[214,246], words=[45,49])), "b'8702795832E86C62D4358118436BC83552FA77FC00635657CC22F129B6BC7C93'": Phone(Span("b'b\'"**(347)805-6875**\\\\\\\\n'", sentence=283939, chars=[0,24], words=[0,2])), "b'B4F81DFE35E5E2FF84CA3D60E05E3E3B3435D5025D741CCCBA56049D1119D9C1'": Phone(Span("b'317-560-9310'", sentence=201883, chars=[12,23], word

### Make a JSONL file




In [None]:
from snorkel_utils_phone import phone_eval
def _clean_doc_name(raw_name):
    """Turn a stored document name into the key used in the output dictionary.

    NOTE(review): replace('b', '') removes EVERY lowercase 'b' in the name,
    not only a leading bytes-repr prefix; this is kept from the original code.
    Double quotes are now stripped for all three splits. Previously the train
    loop skipped that step, so train keys could be formatted differently from
    dev/test keys in the merged dictionary.
    """
    return raw_name.replace('b', '').replace('"', '')


def _collect_phones_by_doc(cands):
    """Map each cleaned document name to the distinct phones of its candidates.

    Parameters
    ----------
    cands : iterable of candidates
        Each candidate's first argument (``c[0]``) is the phone span.

    Returns
    -------
    dict
        ``{document name: [phone, ...]}`` with phones in first-seen order.
        Candidates for which ``phone_eval`` returns an empty value are
        skipped, so a document with no usable phone gets no entry.
    """
    phones_by_doc = {}
    for c in cands:
        # Evaluate the span once per candidate.
        # Assumes phone_eval returns a string (or another sized value) -- TODO confirm.
        phone = phone_eval(c[0].get_span())
        if len(phone) == 0:
            continue
        name = _clean_doc_name(c.get_parent().document.name)
        # Direct dict lookup instead of `name in list(d.keys())`, which
        # rebuilt and scanned the full key list for every candidate.
        doc_phones = phones_by_doc.setdefault(name, [])
        # Deduplicate while keeping a deterministic order
        # (list(set(...)) gave an arbitrary order).
        if phone not in doc_phones:
            doc_phones.append(phone)
    return phones_by_doc


# Candidates of each split: 0 = train, 1 = dev, 2 = test.
dev_cands = session.query(PhoneExtraction).filter(PhoneExtraction.split == 1).all()
train_cand = session.query(PhoneExtraction).filter(PhoneExtraction.split == 0).all()
test_cands = session.query(PhoneExtraction).filter(PhoneExtraction.split == 2).all()

final_dict_tr = _collect_phones_by_doc(train_cand)
final_dict_dev = _collect_phones_by_doc(dev_cands)
final_dict_test = _collect_phones_by_doc(test_cands)

# Merge dev and test into the train dictionary. From here on final_dict_tr
# holds the phones of ALL splits; on a key collision the later update wins.
final_dict_tr.update(final_dict_dev)
final_dict_tr.update(final_dict_test)

In [None]:
import json

# Write the merged document -> phones dictionary to disk.
# NOTE(review): despite the .jsonl extension, json.dump writes the whole
# dictionary as ONE JSON object rather than one record per line -- confirm
# that whatever reads this file expects that layout.
with open('phone_ver1.jsonl', 'w') as fp:
    json.dump(final_dict_tr, fp)
