In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [20]:
# Setting Snorkel DB location
import os
import sys

import random
import numpy as np

#For networked PostgreSQL
postgres_location = 'postgresql://jdunnmon:123@localhost:5432'
#postgres_db_name = 'phone_30K'
postgres_db_name = 'all_jd_1M'
os.environ['SNORKELDB'] = os.path.join(postgres_location,postgres_db_name)

#For local PostgreSQL
#os.environ['SNORKELDB'] = 'postgres:///es_locs_small'

# Adding path above for utils
sys.path.append('../utils')

# For SQLite
#db_location = '.'
#db_name = "es_locs_small.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting random seed
seed = 1701
random.seed(seed)
np.random.seed(seed)

In [3]:
from dataset_utils import set_preprocessor, combine_dedupe

# Set data source: options are 'content.tsv', 'memex_jsons', 'es'
data_source = 'es'

# Setting max number of docs to ingest
max_docs = 30000

# Setting location of data source

# For ES:
data_loc = '/lfs/local/0/jdunnmon/data/chtap/output_phone'

# Optional: add tsv with additional documents to create combined tsv without duplicates
#data_all_loc = '/dfs/scratch1/jdunnmon/data/memex-data/es/output_all.tsv'
#data_loc = combine_dedupe(data_loc, data_all_loc, '/dfs/scratch1/jdunnmon/data/memex-data/es/combined_phone_1M.tsv')

# Setting preprocessor
doc_preprocessor = set_preprocessor(data_source, data_loc, max_docs=max_docs, verbose=True,
                                    clean_docs=True, content_field=['memex_raw_content'])

Using parallelized loader


In [None]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=72, verbose=True)

In [21]:
from snorkel.models import Document, Sentence

# Printing number of docs/sentences
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

Documents: 999601
Sentences: 7034221


In [26]:
import json
import re
from dataset_utils import clean_extracted_text
import phonenumbers

def get_extraction_from_doc(doc,quantity,extractions_field='extractions', mode=None):
    """
    Getting extraction from MEMEX tsv 
    
    Candidate can: candidate to get extraction from
    string quantity: extraction quantity to retrieve
    string extractions_field: field where extractions dictionary is stored
    """
    # Getting cleaned string describing extractions fileld
    dict_string = clean_extracted_text(doc.meta[extractions_field]) 
    
    # String-to-dict, extract quantity of interest
    extraction = json.loads(dict_string)[quantity]
    
    if mode == 'phonenumbers':
        ext_list = []
        txt = extraction
        for match in phonenumbers.PhoneNumberMatcher(txt,"US"):
            format_match = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)
            ext_list.append(format_match)
        extraction = ext_list
    
    return extraction

def regex_matcher(doc, mode=phonenumbers):
    phone_list = []
    results_list = []
    r = re.compile(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}')
    if mode == 'regex':
        for s in doc.sentences:
            txt = s.text
            results = r.findall(txt)
            for x in results:
                phone_list.append(str(x))
    elif mode == 'phonenumbers':
         for s in doc.sentences:
            txt = s.text
            for match in phonenumbers.PhoneNumberMatcher(txt,"US"):
                format_match = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)
                phone_list.append(str(format_match))
                
    return list(set(phone_list))

def get_text_from_doc(doc):
    string = ''
    for s in doc.sentences:
        string = string + s.text
    return string

def chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

def process_doc_chunk(chunk, d, pool):
    for ii, doc in enumerate(chunk):
        name, text = doc
        d[name] = {}
        if ii % 100 == 0:
            print(f'Extracting regexes from doc {ii} out of {len(docs)}')
        d[name]['phone'] = regex_matcher_raw(text, mode='phonenumbers')

In [24]:
# Getting documents
print('Getting documents...')
docs = session.query(Document).all()

Getting documents...


In [22]:
# Creating extraction name-text tuples for parallel processing
print('Extracting text...')
doc_tuples = []
for ii, doc in enumerate(docs):
    doc_tuples.append((doc.name, get_text_from_doc(doc)))
    if ii % 1000 == 0:
        print(f'Getting text for doc {ii} of {len(docs)}')

Extracting text...


NameError: name 'docs' is not defined

In [12]:
from multiprocessing import Process, Manager

# Setting parallelism
parallelism = 4

# Chunking documents list
print('Creating chunks')
doc_chunks = chunks(doc_tuples, round(len(docs)/parallelism))

# Setting up MP manager and shared memory dict
manager = Manager()
doc_extractions = manager.dict()

# Defining processes
print('Defining processes')
job = [Process(target=process_doc_chunk, args=(doc_chunks[i], doc_extractions)) for i in range(parallelism)]

# Starting jobs, joining results
print('Starting jobs')
_ = [p.start() for p in job]
_ = [p.join() for p in job]

Creating chunks
Defining processes
Starting jobs
0
Extracting regexes from doc 0 out of 999601
0


Process Process-2:


Extracting regexes from doc 0 out of 999601
Extracting regexes from doc 0 out of 999601
0
0


Traceback (most recent call last):


Extracting regexes from doc 0 out of 999601


  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1193, in _execute_context
    context)
Process Process-3:
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 508, in do_execute
    cursor.execute(statement, parameters)
Process Process-4:
psycopg2.OperationalError: SSL SYSCALL error: EOF detected


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-10-373f6205176e>", line 67, in process_doc_chunk
    d[doc.name]['phone'] = regex_matcher(doc, mode='phonenumbers')
  File "<

  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1193, in _execute_context
    context)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/ext/baked.py", line 439, in all
    return list(self)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/ext/baked.py", line 439, in all
    return list(self)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 508, in do_execute
    cursor.execute(statement, parameters)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/ext/baked.py", line 346, in __iter__
    return q._execute_and_instances(context)
  File "/

  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/snorkel/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 2958, in _execute_and_instances
    result = conn.execute(querycontext.statement, self._params)
sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL SYSCALL error: EOF detected
 [SQL: 'SELECT sentence.id AS sentence_id, context.id AS context_id, context.type AS context_type, context.stable_id AS context_stable_id, sentence.document_id AS sentence_document_id, sentence.position AS sentence_position, sentence.text AS sentence_text, sentence.words AS sentence_words, sentence.char_offsets AS sentence_char_offsets, sentence.abs_char_offsets AS sentence_abs_char_offsets, sentence.lemmas AS sentence_lemmas, sentence.pos_tags AS sentence_pos_tags, sentence.ner_tags AS sentence_ner_tags, sentence.dep_parents AS sentence_dep_parents, sentence.dep_labels AS sentence_dep_labels, sentence.entity_cids AS sentence_entity_cids, sentence.entity_types AS sentence_entity_types

In [27]:
# Getting gold label for each doc
doc_extractions = {}

for ii, doc in enumerate(docs):
    doc_extractions[doc.name] = {}
    #results_dict[doc.name]['doc'] = doc
    if ii % 1000 == 0:
        print(f'Extracting regexes from doc {ii} out of {len(docs)}')
    #results_dict[doc.name]['gold'] = get_extraction_from_doc(doc, 'phone',mode='phonenumbers')
    doc_extractions[doc.name]['phone'] = regex_matcher(doc, mode='phonenumbers')

Extracting regexes from doc 0 out of 999601
Extracting regexes from doc 1000 out of 999601
Extracting regexes from doc 2000 out of 999601
Extracting regexes from doc 3000 out of 999601
Extracting regexes from doc 4000 out of 999601
Extracting regexes from doc 5000 out of 999601
Extracting regexes from doc 6000 out of 999601
Extracting regexes from doc 7000 out of 999601
Extracting regexes from doc 8000 out of 999601
Extracting regexes from doc 9000 out of 999601
Extracting regexes from doc 10000 out of 999601
Extracting regexes from doc 11000 out of 999601
Extracting regexes from doc 12000 out of 999601
Extracting regexes from doc 13000 out of 999601
Extracting regexes from doc 14000 out of 999601
Extracting regexes from doc 15000 out of 999601
Extracting regexes from doc 16000 out of 999601
Extracting regexes from doc 17000 out of 999601
Extracting regexes from doc 18000 out of 999601
Extracting regexes from doc 19000 out of 999601
Extracting regexes from doc 20000 out of 999601
Extra

Extracting regexes from doc 170000 out of 999601
Extracting regexes from doc 171000 out of 999601
Extracting regexes from doc 172000 out of 999601
Extracting regexes from doc 173000 out of 999601
Extracting regexes from doc 174000 out of 999601
Extracting regexes from doc 175000 out of 999601
Extracting regexes from doc 176000 out of 999601
Extracting regexes from doc 177000 out of 999601
Extracting regexes from doc 178000 out of 999601
Extracting regexes from doc 179000 out of 999601
Extracting regexes from doc 180000 out of 999601
Extracting regexes from doc 181000 out of 999601
Extracting regexes from doc 182000 out of 999601
Extracting regexes from doc 183000 out of 999601
Extracting regexes from doc 184000 out of 999601
Extracting regexes from doc 185000 out of 999601
Extracting regexes from doc 186000 out of 999601
Extracting regexes from doc 187000 out of 999601
Extracting regexes from doc 188000 out of 999601
Extracting regexes from doc 189000 out of 999601
Extracting regexes f

Extracting regexes from doc 338000 out of 999601
Extracting regexes from doc 339000 out of 999601
Extracting regexes from doc 340000 out of 999601
Extracting regexes from doc 341000 out of 999601
Extracting regexes from doc 342000 out of 999601
Extracting regexes from doc 343000 out of 999601
Extracting regexes from doc 344000 out of 999601
Extracting regexes from doc 345000 out of 999601
Extracting regexes from doc 346000 out of 999601
Extracting regexes from doc 347000 out of 999601
Extracting regexes from doc 348000 out of 999601
Extracting regexes from doc 349000 out of 999601
Extracting regexes from doc 350000 out of 999601
Extracting regexes from doc 351000 out of 999601
Extracting regexes from doc 352000 out of 999601
Extracting regexes from doc 353000 out of 999601
Extracting regexes from doc 354000 out of 999601
Extracting regexes from doc 355000 out of 999601
Extracting regexes from doc 356000 out of 999601
Extracting regexes from doc 357000 out of 999601
Extracting regexes f

Extracting regexes from doc 506000 out of 999601
Extracting regexes from doc 507000 out of 999601
Extracting regexes from doc 508000 out of 999601
Extracting regexes from doc 509000 out of 999601
Extracting regexes from doc 510000 out of 999601
Extracting regexes from doc 511000 out of 999601
Extracting regexes from doc 512000 out of 999601
Extracting regexes from doc 513000 out of 999601
Extracting regexes from doc 514000 out of 999601
Extracting regexes from doc 515000 out of 999601
Extracting regexes from doc 516000 out of 999601
Extracting regexes from doc 517000 out of 999601
Extracting regexes from doc 518000 out of 999601
Extracting regexes from doc 519000 out of 999601
Extracting regexes from doc 520000 out of 999601
Extracting regexes from doc 521000 out of 999601
Extracting regexes from doc 522000 out of 999601
Extracting regexes from doc 523000 out of 999601
Extracting regexes from doc 524000 out of 999601
Extracting regexes from doc 525000 out of 999601
Extracting regexes f

Extracting regexes from doc 674000 out of 999601
Extracting regexes from doc 675000 out of 999601
Extracting regexes from doc 676000 out of 999601
Extracting regexes from doc 677000 out of 999601
Extracting regexes from doc 678000 out of 999601
Extracting regexes from doc 679000 out of 999601
Extracting regexes from doc 680000 out of 999601
Extracting regexes from doc 681000 out of 999601
Extracting regexes from doc 682000 out of 999601
Extracting regexes from doc 683000 out of 999601
Extracting regexes from doc 684000 out of 999601
Extracting regexes from doc 685000 out of 999601
Extracting regexes from doc 686000 out of 999601
Extracting regexes from doc 687000 out of 999601
Extracting regexes from doc 688000 out of 999601
Extracting regexes from doc 689000 out of 999601
Extracting regexes from doc 690000 out of 999601
Extracting regexes from doc 691000 out of 999601
Extracting regexes from doc 692000 out of 999601
Extracting regexes from doc 693000 out of 999601
Extracting regexes f

Extracting regexes from doc 842000 out of 999601
Extracting regexes from doc 843000 out of 999601
Extracting regexes from doc 844000 out of 999601
Extracting regexes from doc 845000 out of 999601
Extracting regexes from doc 846000 out of 999601
Extracting regexes from doc 847000 out of 999601
Extracting regexes from doc 848000 out of 999601
Extracting regexes from doc 849000 out of 999601
Extracting regexes from doc 850000 out of 999601
Extracting regexes from doc 851000 out of 999601
Extracting regexes from doc 852000 out of 999601
Extracting regexes from doc 853000 out of 999601
Extracting regexes from doc 854000 out of 999601
Extracting regexes from doc 855000 out of 999601
Extracting regexes from doc 856000 out of 999601
Extracting regexes from doc 857000 out of 999601
Extracting regexes from doc 858000 out of 999601
Extracting regexes from doc 859000 out of 999601
Extracting regexes from doc 860000 out of 999601
Extracting regexes from doc 861000 out of 999601
Extracting regexes f

In [14]:
doc_extractions

{"b'DAF47667C8A9908562037268CE1B7F698DEBE1E6CE44224F323100FC3C95A8FC'": {'phone': []},
 "b'0884FB309AFBE66ADF5407BD0DDD63D41B7A259F50730831EDDBB3C1AF8922E0'": {'phone': []},
 "b'815B55EBD00239EA6D991B0FA33F25167D7BB92B7EAC7CEDCACAADAC6E25A186'": {'phone': []},
 "b'6E5BA30406BAE8F77E2136CAB1B7F8ADA07B8477832E15A743875C9571742F0E'": {'phone': []},
 "b'A37F98230B2C2B7C704F18D4BB760D06446999FFAC78B9B4D1C46F5C61CDA957'": {'phone': []},
 "b'B1B00C8ADF6D1BCFEE7CC2DDF0BDA806A91AB3957EF7D4D19DB547D421B885EC'": {'phone': []},
 "b'33AF920EA684B97C64A4C00E1C2A440FBDCB27852F47BA50DC7B6D575A9FA2CE'": {'phone': []},
 "b'C353301BF0772ACF6A1FAF1AF626B945D869214ABCE6377CCE9AA696DAF59162'": {'phone': []},
 "b'07360EA0567C9DB46A34522910FFA4C1EC1C0C1E19FE7AF283E0BB379E73C259'": {'phone': []},
 "b'D075C899B5D492D676494C62D6931E36FDD9AD9E921C95D8298D29D84B8CDFC3'": {'phone': []},
 "b'8044515CEB58E2D70CFD86CD9D24E24758ACDA0122283B70D3C0E6A04CB36A81'": {'phone': []},
 "b'AB4633C0F21FC4A0E407F244C0598B27AA663C0

In [None]:
# from collections import defaultdict
# tp = 0
# fp = 0
# tn = 0
# fn = 0

# total = len(results_dict.keys())

# ext_ret_dict= defaultdict(list)
# analysis_dict = defaultdict(list)

# for ind,ky in enumerate(results_dict.keys()):
#     nm = docs[ind].name
#     val =  results_dict[nm]['ext']
#     targets = results_dict[nm]['gold']
#     if val == []:
#         if target == []:
#             tn += 1
#             analysis_dict['tn'].append((nm,val))
#         else:
#             fn += 1
#             analysis_dict['fn'].append((nm,val))
#         continue
        
#     for v in list(set(val)):
#         if nm == b'6A9632D62B602B0415ACD11887AFF9B5E873922260F6C2DA5B527313179D4B22':
#             import pdb; pdb.set_trace()
#         if v in targets:
#             ext_ret_dict[nm].append(v)
#             tp +=1
#             analysis_dict['tp'].append((nm,v))
#         else:
#             ext_ret_dict[nm].append(v)
#             fp += 1
#             analysis_dict['fp'].append((nm,v))
            

In [None]:
# rec = tp/(tp+fn)
# prec = tp/(tp+fp)
# print(f'Recall: {rec}')
# print(f'Precision: {prec}')

In [None]:
# len(results_dict.keys())

In [None]:
#  analysis_dict['fp']

In [None]:
# i = 4
# print('Document:')
# print(get_text_from_doc(results_dict[analysis_dict['tp'][i][0]]['doc']))
# print('Extraction:')
# print(analysis_dict['tp'][i][1])
# print('Gold Label:')
# print(results_dict[analysis_dict['tp'][i][0]]['gold'])

In [29]:
import json

# Setting filename
out_filename = "phone_ext_test_gpn.jsonl"

# Saving file to jsonl in extractions format
with open(out_filename, 'w') as outfile:
    for k,v in doc_extractions.items():
        v['id'] = k
        v['phone'] = list(v['phone'])
        print(json.dumps(v), file=outfile)

In [26]:
z = doc_extractions.keys()

In [1]:
out_filename = "phone_ext_test_gpn_1M.jsonl"

In [30]:
import json

yld = 0
data = []
with open(out_filename) as f:
    for line in f:
        data.append(json.loads(line))
        try:
            if data[-1]['phone'] != []:
                yld += 1
        except:
            print('No phone...')   

In [31]:
yld

277265

In [None]:
yld = 0
for k,v in doc_extractions.items():
    try:
        if v['phone'] != []:
            yld += 1
    except:
        import pdb; pdb.set_trace()

In [None]:
import pandas as pd
a = pd.read_csv('/lfs/local/0/jdunnmon/data/chtap/output_phone/output_phone_shard_00.tsv',sep='\t')