### Building the Corpus

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Setting Snorkel DB location
import os
import sys

# For PostgreSQL
# NOTE(review): the connection string embeds a username and password directly
# in the notebook; consider sourcing it from the environment before sharing.
#postgres_location = 'postgresql://jdunnmon:123@localhost:5432'
postgres_location = 'postgresql://saeideh:123@localhost:5432'
#postgres_db_name = 'memex_db_snorkel_large'
#postgres_db_name = 'memex_snorkel_db_extracted_text_10K'
#postgres_db_name = 'memex_snorkel_db_extracted_text_150K'
#postgres_db_name = 'memex_db_snorkel_tsv_1M'
postgres_db_name = 'phone_db_saeideh_100k'
#postgres_db_name ='memex_1M_sse_rest'

# Build the connection URL with an explicit '/' separator. os.path.join uses
# the OS path separator (a backslash on Windows), which is not valid in a URL.
# SNORKELDB must be set before `snorkel` is imported below.
os.environ['SNORKELDB'] = '{0}/{1}'.format(postgres_location.rstrip('/'), postgres_db_name)

# Adding path above for utils
sys.path.append('..')

# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
# Set data source. The preprocessor-selection cell below handles three values:
# 'content.tsv', 'es' and 'memex_jsons'; any other value raises ValueError there.
data_source = 'content.tsv'

# Setting max number of docs to ingest (passed as max_docs to the preprocessor)
max_docs = 100000

## Creating a preprocessor based on the files

In [4]:
from snorkel_utils_phone import MemexTSVDocPreprocessor, MEMEXJsonLGZIPPreprocessor, ESTSVDocPreprocessor, retrieve_all_files

# Build `doc_preprocessor` for the configured `data_source`.
# Each branch constructs a different preprocessor class; an unrecognised
# `data_source` raises ValueError at the bottom of the chain.
# NOTE(review): all paths below are absolute, machine-specific locations.
if data_source == 'content.tsv':
    # NOTE: `data_loc` and `file_path` are assigned here but not used by this
    # branch; only `file_path_unique` is passed to the preprocessor.
    data_loc = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/data_sample'
    
    # Setting path to MEMEX source data
    file_path = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/content.tsv'

    # Setting path to unique URL MEMEX source data
    file_path_unique = '/dfs/scratch1/jdunnmon/data/memex-data/gold_labels/content_unique.tsv'
 

    # Initializing document preprocessor
    doc_preprocessor = MemexTSVDocPreprocessor(
        path=file_path_unique,
        max_docs=max_docs,
        verbose=False,
        clean_docs=True
    )
    
elif data_source == 'es':
    # Setting path to MEMEX source data
    file_path_unique = '/dfs/scratch1/jdunnmon/data/memex-data/es/es_locations.tsv'
    
    # Initializing document preprocessor
    doc_preprocessor = ESTSVDocPreprocessor(
        path=file_path_unique,
        max_docs=max_docs,
        verbose=False,
        clean_docs=True
    )

elif data_source == 'memex_jsons':
    # Location on raiders
    data_loc = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/data_sample'

    # Getting all file paths
    path_list = retrieve_all_files(data_loc)

    # Applying arbitrary conditions to file path list
    # (keeps only paths whose name ends with 'gz')
    path_list = [a for a in path_list if a.endswith('gz')]

    # Preprocessing documents from path_list
    # Set "content field" to "extracted_text" to use extracted text as raw content
    doc_preprocessor = MEMEXJsonLGZIPPreprocessor(data_loc,\
                                    file_list=path_list,encoding='utf-8', max_docs=max_docs, verbose=False, content_field='extracted_text')
else:
    raise ValueError('Invalid data source!')


In [5]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
# list(...) consumes the whole preprocessor up front, so every document is
# materialised in memory before parsing starts.
# NOTE(review): the recorded output prints "Clearing existing...", so re-running
# this cell presumably discards previously parsed documents -- confirm.
%time corpus_parser.apply(list(doc_preprocessor), parallelism=8, verbose=False)

Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Malformatted Line!
Clearing existing...
Running UDF...
CPU times: user 22.9 s, sys: 2.1 s, total: 25 s
Wall time: 4min 57s


In [6]:
from snorkel.models import Document, Sentence

# Printing number of docs/sentences
# Sanity check after parsing: count the Document and Sentence objects visible
# through the session (the document count is expected to match max_docs).
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

Documents: 100000
Sentences: 467271


### Dividing into train, dev, and test splits

In [9]:
from snorkel_utils_phone import create_test_train_splits

# Getting all documents parsed by Snorkel
# (ordered by document name so the input order is deterministic across runs)
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
# FIXME: `gold_dict` is only defined by the pickle-loading cell that appears
# *below* this one (the execution counters show it was run first); a fresh
# Restart & Run All fails here with NameError.
# NOTE(review): the second positional argument is 'location' although this
# notebook extracts phone numbers -- confirm this is the intended value.
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=gold_dict, dev_frac=0.01, test_frac=0.01,)

Train: 98000 Docs, 457891 Sentences
Dev: 796 Docs, 3595 Sentences
Test: 797 Docs, 3981 Sentences
CPU times: user 2min 52s, sys: 18.2 s, total: 3min 10s
Wall time: 4min 42s


In [8]:
import pickle

# Importing gold label dict
# NOTE: `gold_dict` is consumed by the split-creation cell above, so this cell
# has to be executed before it.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open('phone_gold_dict.pickle', 'rb') as handle:
    gold_dict = pickle.load(handle)

In [None]:
# Ad-hoc inspection of the gold dictionary, which is indexed by a URL string below.
# Only the last expression of a cell is displayed, so the `len(lst_keys)` value
# is computed but not shown.
lst_keys = list(gold_dict.keys())
len(lst_keys)
gold_dict["http://perth.backpage.com/FemaleEscorts/sexy-french-ebony-angelic-flavier-ready-for-kinky-strap-on-gfe-in-perth-cbd-1-day-only/6645008"]

In [10]:
from snorkel.models import Candidate, candidate_subclass

# Designing candidate subclasses
# Creates a candidate class named 'Phone' with a single 'phone' argument; the
# recorded candidate listing further down shows instances as Phone(Span(...)).
# `Candidate` is imported but not used in this cell.
PhoneExtraction = candidate_subclass('Phone', ['phone'])

### Set up Matchers

In [11]:
import random
import io
import codecs
import json
#from snorkel.matchers import *
from phonenumbers.python import phonenumbers
# #from models import TemporarySpan
# def find_phone_number_1(span_input):
#     patern = re.compile("(?:\+?(\d{1})?-?\(?(\d{3})\)?[\s\-.\/]?)?(\d{3})[\s\-.\/]?(\d{4})[\s\-.\/]?")
# #     result = prog.match(string)
#     span = span_input.get_span()
#     for w in span:   
#         result = patern.match(w)
#         if result:
#             return True
        
    

def find_phone_number(span_input):
    """Return True when the span text contains at least one phone number.

    Args:
        span_input: candidate span object exposing ``get_span()``, which
            returns the text to scan.

    Returns:
        bool: True if ``phonenumbers.PhoneNumberMatcher`` (using "US" as the
        default region) yields at least one match in the text, else False.
    """
    text = span_input.get_span()
    # Only the existence of a match matters, so stop at the first one instead
    # of formatting and encoding every match into a throw-away list.
    for _ in phonenumbers.PhoneNumberMatcher(text, "US"):
        return True
    return False

import re

# Loose prefix pattern: optional country digit and area code, then 3 + 4 digits
# with an optional whitespace / '-' / '.' / '/' separator. Used with .match(),
# so it only anchors at the start of the text and tolerates trailing characters.
_PHONE_PREFIX_RE = re.compile(r"(?:\+?(\d{1})?-?\(?(\d{3})\)?[\s\-.\/]?)?(\d{3})[\s\-.\/]?(\d{4})[\s\-.\/]?")

# Strict whole-string pattern: optional "+CC " prefix, then 3-3-4 digits with an
# optional whitespace / '*' / '.' / '~' / '-' separator. The hyphen is placed last
# in the class so it is a literal '-'; written as ".-~" it forms a character
# range that also accepts digits and letters as separators.
_PHONE_FULL_RE = re.compile(r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s*.~-]?\d{3}[\s*.~-]?\d{4}$")


def find_phone_number_reg(span_input):
    """Return True when the span text looks like a phone number.

    Args:
        span_input: candidate span object exposing ``get_span()``, which
            returns the text to test.

    Returns:
        bool: True if the text matches either the loose prefix pattern or the
        strict whole-string pattern, otherwise False.
    """
    text = span_input.get_span()
    return bool(_PHONE_PREFIX_RE.match(text) or _PHONE_FULL_RE.match(text))



In [12]:
from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
# NOTE(review): the wildcard import is presumably what brings
# LambdaFunctionMatcher and Union into scope (nothing else in this notebook
# imports them). Other cells use `re` without importing it, so they may also
# rely on this import -- confirm before narrowing it to explicit names.
from snorkel.matchers import *
from snorkel_utils_phone import  get_candidate_filter, CandidateExtractorFilter
# n_max=1: candidate spans are limited to n-grams of length one.
phone_ngrams = Ngrams(n_max=1)


# One matcher per detection function (phonenumbers-library based and regex
# based), then combined with Union.
phone_lambda_matcher_1 =LambdaFunctionMatcher(func=find_phone_number)
phone_lambda_matcher =LambdaFunctionMatcher(func=find_phone_number_reg)
phone_matcher_ = Union(phone_lambda_matcher_1,phone_lambda_matcher)


In [13]:
# Extractor for PhoneExtraction candidates built from the n-gram space and the
# combined matcher defined above; no candidate filter is applied
# (candidate_filter=None).
cand_extractor = CandidateExtractorFilter(PhoneExtraction ,[phone_ngrams],[phone_matcher_],candidate_filter=None)

In [14]:
# Extract candidates for each split. The enumerate index is used as the split
# id, so 0 = train, 1 = dev, 2 = test (the order of the list below).
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=8)
    # Count the candidates stored for this split.
    print("Number of candidates:", session.query(PhoneExtraction).filter(PhoneExtraction.split == k).count())

Clearing existing...
Running UDF...
CPU times: user 1min, sys: 20.3 s, total: 1min 21s
Wall time: 1min 39s
Number of candidates: 27932
Clearing existing...
Running UDF...
CPU times: user 444 ms, sys: 1.6 s, total: 2.04 s
Wall time: 5.25 s
Number of candidates: 193
Clearing existing...
Running UDF...
CPU times: user 424 ms, sys: 1.61 s, total: 2.04 s
Wall time: 5.22 s
Number of candidates: 215


## Testing the cleaning function

In [15]:
train_cand = session.query(PhoneExtraction).filter(PhoneExtraction.split == 0).all()

In [None]:
len(train_cand)

In [16]:
train_cand

[Phone(Span("b'6122034720'", sentence=465653, chars=[167,176], words=[33,33])),
 Phone(Span("b'18816971617?Jenny?\\\\n'", sentence=525666, chars=[25,45], words=[7,7])),
 Phone(Span("b'6197943678.&nbsp;Call'", sentence=288350, chars=[21,41], words=[3,3])),
 Phone(Span("b'2064769399'", sentence=546375, chars=[230,239], words=[46,46])),
 Phone(Span("b'5859395336'", sentence=134003, chars=[0,9], words=[0,0])),
 Phone(Span("b'8315855928Dont'", sentence=476681, chars=[108,121], words=[24,24])),
 Phone(Span("b'2262248153'", sentence=534140, chars=[64,73], words=[17,17])),
 Phone(Span("b'1551226'", sentence=379542, chars=[69,75], words=[18,18])),
 Phone(Span("b'502~472~8300'", sentence=550625, chars=[0,11], words=[0,0])),
 Phone(Span("b'6474064838'", sentence=511350, chars=[359,368], words=[71,71])),
 Phone(Span("b'16462873237'", sentence=271906, chars=[29,39], words=[5,5])),
 Phone(Span("b'6149994387\\\\n'", sentence=472085, chars=[118,130], words=[26,26])),
 Phone(Span("b'56998972389'", sent

In [None]:
ss = '8.3.2.8.9.7.8.2.1.0.&nbsp;Call'
p= "09874812283"
l= 'lt;~~#813.244.3521'
def phone_cleaning(c):
    """Strip everything except the ASCII digits 0-9 from a string.

    Implemented with plain string filtering so the function does not depend
    on ``re``, which no cell in this notebook imports explicitly.

    Args:
        c (str): raw text, e.g. a candidate span containing a phone number.

    Returns:
        str: the ASCII digits of ``c`` in their original order ('' if none).
    """
    return "".join(ch for ch in c if ch in "0123456789")
phone_cleaning(ss)

In [None]:
def PhoneNumber(number):
    """Format a digit string as "(AAA) EEE-LLLL".

    The first three characters become the area code, the next three the
    exchange, and everything from index 6 onward the line number. No length
    or content validation is performed.

    Args:
        number (str): the string to split and format.

    Returns:
        str: the formatted phone number.
    """
    pieces = (number[:3], number[3:6], number[6:])
    return "(%s) %s-%s" % pieces
s = "001(832)8978210"
PhoneNumber(phone_cleaning(p))
PhoneNumber(s)
phone_cleaning(s)

In [None]:
k= "68722665"
l = s.replace(")","").replace("(","")
l.isdigit()


In [None]:
PhoneNumber(k)

In [None]:
r = phone_cleaning(s)
r[3:]

In [None]:
if len(r)==11:
    rr = PhoneNumber(r[1:])
    print(rr)

In [None]:
def arrange_phone(p):
    """Format a digit string of a supported length via ``PhoneNumber``.

    Supported lengths and the number of leading characters dropped before
    formatting:
        10 -> 0, 11 -> 1, 13 -> 3
    (presumably a one-digit country prefix and a three-digit prefix such as
    "001" -- confirm against the data).

    Args:
        p (str): digit string to format.

    Returns:
        The formatted string for a supported length, otherwise an empty list.
    """
    prefix_len_by_total = {10: 0, 11: 1, 13: 3}
    skip = prefix_len_by_total.get(len(p))
    if skip is None:
        return []
    return PhoneNumber(p[skip:])
def phone_eval(phone):
    """Normalise raw span text into a formatted phone number.

    A string that is already all digits is passed to ``arrange_phone``
    unchanged; anything else is first run through ``phone_cleaning``.

    Args:
        phone (str): raw candidate text.

    Returns:
        Whatever ``arrange_phone`` returns for the digit string, or an empty
        list when no digits remain after cleaning.
    """
    digits = phone if phone.isdigit() else phone_cleaning(phone)
    if not digits.isdigit():
        # Nothing usable left after cleaning (e.g. an empty string).
        return []
    return arrange_phone(digits)
            

In [None]:
phone_eval(ss)

In [None]:
n = '8048883503/Bianca'
nn= "0809459945@4"
nnn = "215917004Elk"
phone_eval(nn)

In [None]:
# Preview: run phone_eval over every training candidate (split 0) and print
# the source document name, the running index and the normalised number.
# NOTE(review): this prints three lines per candidate; the extraction output
# above reports 27932 training candidates, so the cell output is very large.
train_cands = session.query(PhoneExtraction).filter(PhoneExtraction.split == 0).all()
print(len(train_cands))
#train_dict = {}
for i, c in enumerate(train_cands):
    # c[0] is the candidate's first (only) span; its sentence links to the document.
    doc = c[0].sentence.document.name
    print (doc)
    extracted_cand = phone_eval(c[0].get_span())
    print(i)
    print (extracted_cand) 
#     train_dict[doc]=[extracted_cand]
# print (len(train_dict.values())) 

In [None]:
c_1 = train_cands[1]
c_1
c_1.get_parent().document.name

In [None]:
ext = getattr(c_1,"phone")

In [None]:
ext

# Adding Gold labels

print (label,gold_value, extracted_cand)

In [17]:
from snorkel_utils_phone import  get_gold_phone_label_from_meta

# The fourth positional argument is the split id; it follows the numbering
# used when candidates were extracted (1 = dev, 2 = test).
# NOTE(review): the return values are stored as `missed_dev` / `missed_test`;
# what the helper returns is not visible here -- confirm.

# Adding dev gold labels using dictionary
%time missed_dev = get_gold_phone_label_from_meta(session, PhoneExtraction, 'phone', 1, annotator='gold', gold_dict = gold_dict)

# Adding test gold labels using dictionary
%time missed_test = get_gold_phone_label_from_meta(session, PhoneExtraction, 'phone', 2, annotator='gold', gold_dict= gold_dict)

Loading 193 candidate labels
[=                                       ] 0%1 ['(702) 666-8028'] (702) 666-8028
-1 ['(206) 922-9303', '(215) 528-8446', '(305) 849-8140', '(312) 600-8628', '(347) 940-1982', '(401) 324-9388', '(414) 914-3777', '(416) 554-3337', '(442) 222-0227', '(469) 510-5849', '(608) 609-5899', '(623) 500-7076', '(678) 328-9455', '(732) 621-4443', '(773) 412-2044', '(786) 504-1860', '(832) 914-9667', '(917) 676-1333'] []
[=                                       ] 1%-1 ['(626) 278-8213', '(626) 510-0929'] []
1 ['(414) 380-6004'] (414) 380-6004
[==                                      ] 2%-1 ['(206) 922-9303', '(305) 849-8140', '(312) 600-8628', '(347) 940-1982', '(401) 324-9388', '(414) 914-3777', '(416) 554-3337', '(442) 222-0227', '(469) 510-5849', '(608) 609-5899', '(623) 500-7076', '(705) 875-6845', '(732) 621-4443', '(773) 412-2044', '(786) 504-1860', '(832) 914-9667', '(917) 676-1333'] (770) 806-2034
1 ['(206) 922-9303', '(305) 849-8140', '(312) 600-8628', '(316) 3

1 ['(832) 899-1610'] (832) 899-1610
1 ['(304) 266-0221', '(907) 290-7253'] (304) 266-0221
1 ['(218) 780-9372', '(954) 526-3741'] (954) 526-3741
1 ['(206) 922-9303', '(210) 465-6677', '(210) 466-7701', '(305) 849-8140', '(312) 600-8628', '(347) 940-1982', '(401) 324-9388', '(414) 914-3777', '(416) 554-3337', '(442) 222-0227', '(469) 510-5849', '(608) 609-5899', '(623) 500-7076', '(646) 902-9223', '(732) 621-4443', '(773) 412-2044', '(786) 504-1860', '(832) 914-9667', '(917) 676-1333'] (210) 465-6677
1 ['(479) 544-5124'] (479) 544-5124
1 ['(773) 440-0288'] (773) 440-0288
1 ['(907) 440-1230'] (907) 440-1230
1 ['(530) 315-5402'] (530) 315-5402
1 ['(727) 867-6202'] (727) 867-6202
1 ['(662) 368-8412'] (662) 368-8412
1 ['(210) 912-6691', '(304) 203-4764', '(323) 392-9251', '(402) 805-2508', '(509) 389-4767', '(586) 209-9093', '(760) 885-3727', '(770) 462-4450', '(951) 334-2517'] (210) 912-6691
1 ['(781) 913-3938', '(781) 913-3989'] (781) 913-3989
1 ['(786) 763-1407'] (786) 763-1407
1 ['(901) 

1 ['(503) 719-5046'] (503) 719-5046
1 ['(219) 293-6319'] (219) 293-6319
1 ['(647) 521-2365'] (647) 521-2365
1 ['(857) 417-8843'] (857) 417-8843
1 ['(217) 775-9602'] (217) 775-9602
1 ['(306) 359-0504'] (306) 359-0504
1 ['(202) 607-1069', '(305) 979-5627', '(575) 814-5150', '(954) 607-2279'] (202) 607-1069
1 ['(254) 768-3338'] (254) 768-3338
1 ['(952) 463-5870'] (952) 463-5870
1 ['(973) 814-4020'] (973) 814-4020
1 ['(415) 968-7562'] (415) 968-7562

AnnotatorLabels created: 193
CPU times: user 32.5 s, sys: 1.88 s, total: 34.4 s
Wall time: 34.6 s
Loading 215 candidate labels
[=                                       ] 0%1 ['(801) 920-0469'] (801) 920-0469
1 ['(405) 628-2012'] (405) 628-2012
[=                                       ] 1%1 ['(207) 259-1558'] (207) 259-1558
1 ['(787) 200-8347'] (787) 200-8347
[=                                       ] 2%-1 ['(347) 581-9547'] []
1 ['(361) 980-6165'] (361) 980-6165
[==                                      ] 3%1 ['(787) 200-8347'] (787) 200-8347
1

1 ['(787) 200-8347'] (787) 200-8347
1 ['(832) 997-6621'] (832) 997-6621
1 ['(509) 399-7250', '(907) 290-7253'] (509) 399-7250
-1 ['(440) 742-3220'] (742) 322-0540
1 ['(323) 743-7002'] (323) 743-7002
1 ['(225) 916-5073'] (225) 916-5073
1 ['(613) 321-0571'] (613) 321-0571
1 ['(214) 250-2024'] (214) 250-2024
1 ['(206) 922-9303', '(305) 849-8140', '(312) 600-8628', '(347) 940-1982', '(401) 324-9388', '(414) 914-3777', '(416) 554-3337', '(442) 222-0227', '(469) 510-5849', '(480) 343-4945', '(510) 332-2186', '(608) 609-5899', '(623) 500-7076', '(732) 621-4443', '(732) 779-7920', '(773) 412-2044', '(786) 504-1860', '(832) 914-9667', '(917) 676-1333'] (480) 343-4945
1 ['(510) 307-6486'] (510) 307-6486
1 ['(787) 200-8347'] (787) 200-8347
1 ['(206) 922-9303', '(305) 849-8140', '(312) 600-8628', '(325) 262-9348', '(347) 940-1982', '(401) 324-9388', '(414) 914-3777', '(416) 554-3337', '(442) 222-0227', '(469) 510-5849', '(608) 609-5899', '(623) 500-7076', '(646) 342-4926', '(732) 621-4443', '(773)

-1 ['(418) 262-1736', '(514) 295-0213'] []
1 ['(623) 556-7514'] (623) 556-7514
1 ['(662) 368-8412'] (662) 368-8412
1 ['(787) 200-8347'] (787) 200-8347
1 ['(787) 200-8347'] (787) 200-8347
1 ['(305) 600-7671', '(305) 790-7406'] (305) 600-7671
1 ['(305) 600-7671', '(305) 790-7406'] (305) 600-7671
1 ['(206) 922-9303', '(305) 849-8140', '(312) 600-8628', '(347) 940-1982', '(401) 324-9388', '(414) 914-3777', '(416) 554-3337', '(419) 612-7900', '(442) 222-0227', '(469) 510-5849', '(608) 609-5899', '(623) 500-7076', '(732) 621-4443', '(773) 412-2044', '(786) 504-1860', '(832) 914-9667', '(917) 676-1333', '(978) 712-8055'] (419) 612-7900
1 ['(717) 542-4481'] (717) 542-4481
1 ['(973) 457-8644'] (973) 457-8644
1 ['(253) 341-7049'] (253) 341-7049
1 ['(319) 491-3900'] (319) 491-3900
1 ['(787) 200-8347'] (787) 200-8347
1 ['(469) 684-7610'] (469) 684-7610
1 ['(404) 387-0045'] (404) 387-0045
1 ['(206) 922-9303', '(267) 994-5099', '(305) 849-8140', '(312) 600-8628', '(347) 940-1982', '(401) 324-9388', 

In [20]:
from snorkel_utils_phone import remove_gold_labels
#Remove gold labels if you want -- uncomment!
#remove_gold_labels(session)

In [19]:
from snorkel_utils_phone import check_gold_perc
# The recorded output of this call is "Percent Positive: 0.90"; the return
# value is kept in `perc_pos`.
perc_pos = check_gold_perc(session)

Percent Positive: 0.90
