# Adding Visual Features

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.remove('snorkel.db')

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from pdb import set_trace as t

### Convert PDF to HTML

Use Adobe Acrobat (or another program of your choice) to convert the PDF to HTML with its structure preserved.

### Parse HTML and PDF

In [2]:
import os
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_pdf/'
html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'

filename = 'bc546-d'
html_file = html_path + filename + '.html'

doc_parser = HTMLParser(path=html_file)
context_parser = OmniParser(pdf_path=pdf_path, session=session)
cp = CorpusParser(doc_parser, context_parser, max_docs=1) 

%time corpus = cp.parse_corpus(name='Hardware', session=session)

# Save results
# os.system('cp snorkel.db snorkel.db\ corpus');

Extracted 2690 pdf words
Elapsed: 0.263 s
Extracted 480 html words
Elapsed: 0.009 s
Linked 480 words to 2690 bounding boxes
Elapsed: 0.026 s
Updated coordinates in snorkel.db
Elapsed: 0.153 s
CPU times: user 843 ms, sys: 68.1 ms, total: 911 ms
Wall time: 6.84 s


### Load results

In [3]:
# Disabled alternative to re-parsing: copy a previously saved database snapshot
# ('snorkel.db corpus') back over snorkel.db and look up the 'Hardware' corpus
# in it. Uncomment every line below to use it.
# import os
# from snorkel.models import Corpus
# from snorkel.utils import get_ORM_instance

# os.system('cp snorkel.db\ corpus snorkel.db');
# corpus = get_ORM_instance(Corpus, session, 'Hardware')

### Extract Candidates

In [4]:
import os
from collections import defaultdict
from snorkel.models import Corpus, candidate_subclass
from snorkel.matchers import RegexMatchSpan, Union
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Candidate Type
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
    
# CandidateSpaces
part_ngrams = OmniNgramsPart(parts_by_doc=None, n_max=3) # NOTE: no part linking right now
temp_ngrams = OmniNgramsTemp(n_max=2)

# Matchers
eeca_matcher = RegexMatchSpan(rgx='([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx='([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx='(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx='((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

temp_matcher = RegexMatchSpan(rgx=r'[0-9]+[05]', longest_match_only=False)

# Throttler
part_throttler = lambda x: x[0].parent.page == x[1].parent.page

# Extractor
ce = CandidateExtractor(Part_Temp, 
                        [part_ngrams, temp_ngrams], 
                        [parts_matcher, temp_matcher], 
                        throttler=part_throttler)

# Extract
for corpus_name in ['Hardware']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware)
CPU times: user 367 ms, sys: 0 ns, total: 367 ms
Wall time: 384 ms
Candidate Set (Hardware Candidates) contains 417 Candidates


### View Results

In [14]:
c = candidates[15]
print c.part
print c.part.char_start, c.part.char_end
print c.part.parent.page
print c.part.get_attrib_tokens('top')
print c.part.get_attrib_tokens('bottom')
print c.part.get_attrib_tokens('left')
print c.part.get_attrib_tokens('right')


from snorkel.entity_features import visual_binary_features
from snorkel.lf_helpers import get_aligned_lemmas, _bbox_from_span

print c.part.parent.document
for c in candidates:
    
    span1, span2 = c.get_arguments()
    
    feats = set()
    for f in visual_binary_features(span1, span2):
        feats.add(f)
    text1 =   span1.get_span()
    text2 = span2.get_span()
    if not  'Y_ALIGNED' in feats or span1.parent.page!=2: continue
    print '='*20
    print 'For cacndidate pair:'
    print span1.get_span()
    print span2.get_span()
    print 'Visual features are:'
    for f in feats: print f
    print 'LF is_aligned_with_lemmas:', 'min' in get_aligned_lemmas(span2)
    print 'Phrase1', span1.parent,  span1.parent.page#.text, span1.parent.bbox
    print 'Phrase2', span2.parent,  span2.parent.page#.text, span2.parent.bbox, span2.parent.page
    print _bbox_from_span(span1), _bbox_from_span(span2)
    
print len(candidates)

 Span("BC546", parent=220, chars=[0,4], words=[0,0])
0 4
1
[319]
[329]
[189]
[214]
Document bc546-d
For cacndidate pair:
BC548
30
Visual features are:
e2_ALIGNED_CENTER_min
Y_ALIGNED
HAS_COORDS
e2_ALIGNED_min
LF is_aligned_with_lemmas: True
Phrase1 Phrase(Doc: bc546-d, Table: 3, Row: 3, Col: 0, Position: 0, Text: BC548) 2
Phrase2 Phrase(Doc: bc546-d, Table: 3, Row: 3, Col: 1, Position: 0, Text: 30) 2
bbox(top=122, bottom=131, left=314, right=339) bbox(top=122, bottom=131, left=411, right=420)
For cacndidate pair:
BC547
50
Visual features are:
e1_ALIGNED_Y_emitter
HAS_COORDS
e2_ALIGNED_CENTER_min
Y_ALIGNED
e1_ALIGNED_base
e1_ALIGNED_Y_base
e1_ALIGNED_Y_breakdown
e1_ALIGNED_breakdown
e1_ALIGNED_emitter
e1_ALIGNED_Y_voltage
e1_ALIGNED_voltage
e2_ALIGNED_min
LF is_aligned_with_lemmas: True
Phrase1 Phrase(Doc: bc546-d, Table: 3, Row: 7, Col: 1, Position: 0, Text: BC546 BC547) 2
Phrase2 Phrase(Doc: bc546-d, Table: 3, Row: 4, Col: 3, Position: 0, Text: 50) 2
bbox(top=154, bottom=164, left=314

In [6]:
# Show the table of links between HTML words and PDF words held by the
# context parser (presumably the argument caps the number of rows — confirm).
context_parser.vizlink.display_links(20)

Unnamed: 0,i,html,pdf,j,offset
0,0,BC546,BC546,42,-5
1,1,-,-,36,-5
2,2,NPN,NPN,13,14
3,3,Amplifier,Amplifier,11,14
4,4,Transistors,Transistors,12,14
5,5,BC546B,BC546B,0,13
6,6,",",",",9,12
7,7,BC547A,BC547A,2,10
8,8,",",",",9,10
9,9,B,B,4,5


In [7]:
# Note: Select image and press any key to close image.
# NOTE(review): display=False is passed here, so presumably no image window is
# opened and only the per-page box counts are printed — confirm.
context_parser.vizlink.display_candidates(candidates, page_num=2, display=False)

Boxes per page: total (unique)
Page 1: 690 (25)
Page 2: 144 (16)


In [8]:
# pdf_file = context_parser.vizlink.pdf_file

# context_parser.vizlink.display_word('BC546', page_num=2)
# context_parser.vizlink.display_candidates(candidates, page_num=2)

In [9]:
# from visual_linking import display_boxes, get_box

# boxes = []
# for c in candidates:
#     boxes.append(get_box(c.part))
# boxes = list(set(boxes))
# display_boxes(pdf_file, boxes, page_num=2)

### Display Ordering of PDF Word List 

Display the word ordering on a blank (white) page. The result is hard to read because overlapping boxes and index labels are superimposed.

In [10]:
# Disabled sketch: on a white canvas, draw the box of every word on page_num and
# label it with its running index in pdf_word_list.
# NOTE(review): pdf_word_list and coordinate_map are not defined in any cell
# visible in this notebook; they must be supplied before re-enabling this cell.
# import numpy as np
# import cv2
# import math

# page_num = 2
# page_height = 792
# page_width = 612
# img = np.ones((page_height,page_width,3))*255
# font = cv2.FONT_HERSHEY_SIMPLEX
# letter_width = 3
# i = 0
# for word_id, _ in pdf_word_list:
#     if word_id[0] == page_num:
#         i += 1
#         _, top, left, bottom, right = coordinate_map[word_id]
#         cv2.rectangle(img, (left, top), (right, bottom), (0,255,0), 1)
#         cv2.putText(img, 
#                     str(i), 
#                     ((left + right)/2 - letter_width*int(math.ceil(math.log10(i))), 
#                     bottom + (top - bottom)/4), 
#                     font, 
#                     0.3, 
#                     (255,0,0), 
#                     1)
# cv2.imshow('PDF Word List Order',img)
# cv2.waitKey() # press any key to exit the opencv output 
# cv2.destroyAllWindows() 

The end.