## Step 1: Parsing Files, Adding Candidates and Labels to Database

In [1]:
# Notebook setup: load the run configuration, export the database settings as
# environment variables, import the Snorkel/Fonduer machinery and create a
# SQLAlchemy engine for the configured database.
from __future__ import print_function
from __future__ import division
import json
# Load the run configuration from run_config.json in the working directory
with open("run_config.json") as fl:
    cfg = json.load(fl)
# Sub-section of the config holding the output path and experiment name
cfg_params = cfg['parameters']

# Set the output root and the database environment variables.
# NOTE(review): these variables are presumably read when snorkel is imported
# further down, so this section should stay above the snorkel imports -- confirm.
import os
from os.path import join
output_root = join(cfg_params['output_path'],cfg_params['experiment_name'])
os.environ['FONDUERDBNAME'] = cfg['postgres_db_name']
# Connection string = configured base location joined with the database name.
# NOTE(review): the key name 'postgres_phone' looks like a rename artifact (a
# database location/URL is expected here) -- verify against run_config.json.
os.environ['SNORKELDB'] = join(cfg['postgres_phone'],os.environ['FONDUERDBNAME'])

# For loading input files
import pandas as pd

# For running Snorkel
from snorkel.contrib.fonduer import SnorkelSession
from snorkel.contrib.fonduer.models import candidate_subclass
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser
from utils import HTMLListPreprocessor

from sqlalchemy import create_engine
# Engine on the same connection string as SNORKELDB, using the AUTOCOMMIT
# isolation level
snorkeldb = create_engine(os.environ['SNORKELDB'], isolation_level="AUTOCOMMIT")

In [2]:
# Load the labeled and unlabeled document lists from their TSV files and
# open the Snorkel session.

# Directory holding the label/split TSV files
pth_labeled = join(cfg['data_path'],'labels_and_splits')
# File name of the labeled data
fl_labeled = cfg['labeled_data_file']
# Labeled data as a dataframe
df_labeled = pd.read_csv(join(pth_labeled,fl_labeled),sep='\t')
# NOTE: the files on disk must already carry the .html extension before running.
# Cast the names to string so the '.html' suffix can be appended even when
# pandas inferred a non-string dtype for the column.
df_labeled['file name'] = df_labeled['file name'].astype('string')
# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output.
path_list_labeled = [fname + '.html' for fname in df_labeled['file name'].tolist()]

# Unlabeled data lives in the same directory
fl_unlabeled = cfg['unlabeled_data_file']
df_unlabeled = pd.read_csv(join(pth_labeled,fl_unlabeled),sep='\t')
# Same string cast as for the labeled names, so both lists are built the same
# way; the dataframe column itself is left untouched.
path_list_unlabeled = [
    fname + '.html'
    for fname in df_unlabeled['file name'].astype('string').tolist()
]

# Start the Snorkel session and create the phone candidate subclass
session = SnorkelSession()
Phone_Extraction = candidate_subclass('phone_extraction',["phone"])
# print("hi")

### Parsing Documents

In [4]:
max_docs = cfg['max_docs']

# Setting phone for raw data
data_loc = join(cfg['data_path'],'raw_data')

# Creating a list of paths for documents from both labeled and unlabeled data
path_list = path_list_labeled[:max_docs]+path_list_unlabeled[:max_docs]

# Preprocessing documents from path_list
doc_preprocessor = HTMLListPreprocessor(data_loc,\
                                file_list=path_list)

# Ingest data into Fonduer via parser
corpus_parser = OmniParser(structural=True, lingual=True, visual=False)
%time corpus_parser.apply(doc_preprocessor, parallelism=cfg['parallel'])

Clearing existing...
Running UDF...




CPU times: user 687 ms, sys: 55.8 ms, total: 743 ms
Wall time: 21.7 s


In [3]:
from snorkel.contrib.fonduer.models import Document, Phrase

# Sanity check: count the Document and Phrase rows currently in the database
n_documents = session.query(Document).count()
n_phrases = session.query(Phrase).count()
print("Documents:", n_documents)
print("Phrases:", n_phrases)

Documents: 20
Phrases: 4890


## Step 2: Dividing into Test/Train, Extracting Features, Throttling


In [4]:
# Split the parsed documents into train / dev / test sets.

# All documents parsed by Fonduer, ordered by name
docs = session.query(Document).order_by(Document.name).all()
ld   = len(docs)

# Train, dev and test sets
train_docs = set()
dev_docs   = set()
test_docs  = set()

# List of (file name with .html suffix, Fonduer document object) tuples,
# sorted by file name so the dev/test alternation below is reproducible
data = [(doc.name+'.html', doc) for doc in docs]
data.sort(key=lambda x: x[0])

# Build the lookup once as a set: the membership test inside the loop is then
# constant-time instead of scanning the whole list for every document.
unlabeled_names = set(path_list_unlabeled)

# Unlabeled documents go to the train set; every other document is treated as
# labeled and goes to dev/test in alternating fashion (dev first).
for doc_name, doc in data:
    if doc_name in unlabeled_names:
        train_docs.add(doc)
    elif len(dev_docs) <= len(test_docs):
        dev_docs.add(doc)
    else:
        test_docs.add(doc)

# Print the size of each set
print("train:",len(train_docs))
print("dev:" ,len(dev_docs))
print("test:",len(test_docs))

# Print the document names in each set
from pprint import pprint
pprint([x.name for x in train_docs])
pprint([x.name for x in dev_docs])
pprint([x.name for x in test_docs])

train: 10
dev: 5
test: 5
[u'02663026-4377-4c61-a19e-907e81e74ce0',
 u'005dd27d-91c5-4569-b285-489391dcff4f',
 u'0189ca4e-f259-4bf3-8144-0e4fa64620e0',
 u'0069a7dd-9a03-4240-9073-77744c10b467',
 u'06370eb0-2ad4-4f4d-b176-18d1fd07ac0a',
 u'0582d3b3-90d9-4ecb-8603-c389d952cc63',
 u'0166a90b-5733-4336-88a0-b5a48fcb14fd',
 u'03be9e25-a022-4269-9164-7033e2564304',
 u'0034ff21-5d7a-4edf-9150-e22c5188dde1',
 u'001a5f8b-82c5-4428-b539-0c8a0f2f87c4']
[u'0620fe76-98ce-4add-9371-e4a752446e12',
 u'be1e1310-3225-4a1c-ab2f-f62e8edaccc0',
 u'e9f9f542-64ed-433b-90ec-c38d884bbed0',
 u'2de9987a-d7d0-430c-be38-d2e67c91a4ed',
 u'8de9bb32-a6ca-4728-8bf0-1c29d2356485']
[u'd7a3157d-eef3-4481-9eb4-bfe918a1112f',
 u'7a2e6be0-1f31-4fae-8a35-999168201995',
 u'ea011c10-0808-4ac5-adbf-0a3651bf40c1',
 u'1e2d23df-121c-4439-a18a-b833a0f96ba9',
 u'af95cbf5-c629-48a7-9f4c-eaa7a23ace3f']


In [5]:

    
# NOTE(review): this import goes through snorkel.contrib.fonduer.fonduer.candidates,
# while the CandidateExtractor import in this notebook uses
# snorkel.contrib.fonduer.candidates -- confirm which package path is intended.
from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams
# Candidate space passed to the CandidateExtractor for the "phone" field.
# Presumably n_max=1 limits candidate spans to a single token and the empty
# split_tokens list disables extra token splitting -- TODO confirm against the
# OmniNgrams documentation.
Phone_ngrams = OmniNgrams(n_max=1, split_tokens=[])


In [6]:
# Importing the matchers module and defining the Google phone-number matcher function
from snorkel.matchers import *
'''This function uses Google phone library to extract phone number.'''
from bs4 import BeautifulSoup
import os

import random
import io
import codecs
import json
from snorkel.matchers import *
from phonenumbers.python import phonenumbers
from snorkel.models import TemporarySpan


def find_phone_number(span_input):
    """Tell whether the text of a span contains at least one phone number.

    The text is scanned with ``phonenumbers.PhoneNumberMatcher`` using "US"
    as the region.

    Args:
        span_input: an object exposing ``get_span()``, which returns the text
            to scan.

    Returns:
        bool: True if the matcher yields at least one match, False otherwise.
    """
    text = span_input.get_span()
    # Only the existence of a match matters, so stop at the first one instead
    # of formatting, encoding and collecting every number found.
    return any(True for _ in phonenumbers.PhoneNumberMatcher(text, "US"))
       

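##### Link to python-phonenumbers, the Python port of Google's libphonenumber library
[https://github.com/daviddrysdale/python-phonenumbers](https://github.com/daviddrysdale/python-phonenumbers)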

In [7]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor
from snorkel.matchers import *

phone_lambda_matcher =LambdaFunctionMatcher(func=find_phone_number)

candidate_extractor = CandidateExtractor(Phone_Extraction,
                                         [Phone_ngrams], [phone_lambda_matcher],candidate_filter=None)
                                         
# Extracting candidates from each split
%time candidate_extractor.apply(train_docs, split=0)
print("Number of candidates:", session.query(Phone_Extraction).filter(Phone_Extraction.split == 0).count())
%time
for i, docs in enumerate([dev_docs, test_docs]):
   candidate_extractor.apply(docs, split=i+1, parallelism=cfg['parallel'])
   print("Number of candidates:", session.query(Phone_Extraction).filter(Phone_Extraction.split == i+1).count())

Clearing existing...
Running UDF...
CPU times: user 831 ms, sys: 47.1 ms, total: 878 ms
Wall time: 1.07 s
Number of candidates: 39
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs
Clearing existing...
Running UDF...
Number of candidates: 25
Clearing existing...
Running UDF...
Number of candidates: 21


In [8]:
# Fetch every extracted candidate (no split filter is applied) and show the
# resulting list as the cell output.
cands = session.query(Phone_Extraction).all()
cands


[phone_extraction(Span("512-650-7347", sentence=62976, chars=[0,11], words=[0,0])),
 phone_extraction(Span("512-650-7347", sentence=62973, chars=[0,11], words=[0,0])),
 phone_extraction(Span("512-650-7347", sentence=62990, chars=[0,11], words=[0,0])),
 phone_extraction(Span("512-650-7347", sentence=62915, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-344-8868", sentence=62674, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-344-8868", sentence=62735, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-344-8868", sentence=62755, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-344-8868", sentence=62738, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-443-5607", sentence=66331, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-443-5607", sentence=66297, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-443-5607", sentence=66178, chars=[0,11], words=[0,0])),
 phone_extraction(Span("806-443-5607", sentence=66303, chars=[0,11], words=[