# Adding Visual Features

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# Uncomment to delete the existing database file before the session is created.
# os.remove('snorkel.db')

import sys
# Make the tables tutorial directory importable. Requires the SNORKELHOME
# environment variable (raises KeyError when it is unset). Presumably this is
# where `visual_linking` and `hardware_utils` live -- TODO confirm.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
# Session object shared by the remaining cells (passed to VisualLinker,
# get_ORM_instance and the candidate extractor).
session = SnorkelSession()

# Debugging shortcut: calling t() drops into pdb.
from pdb import set_trace as t

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Step 1: PDF to HTML Conversion

Use Adobe Acrobat (or another program of your choice) to convert the PDF to HTML with its structure preserved.

In [2]:
# `visual_linking` is a local tutorial module (presumably found via the
# sys.path entry added in the setup cell -- TODO confirm).
from visual_linking import VisualLinker
# The linker is bound to the shared session; the cells below call its
# extract_*/link_lists/display_* methods in order.
vizlink = VisualLinker(session)

### Step 2: Parse PDF

In [3]:
# Locate the sample datasheet PDF inside the Snorkel checkout (requires the
# SNORKELHOME environment variable). os.path.join replaces hand-assembled
# separators; the trailing '' keeps pdf_path ending in a separator so existing
# `pdf_path + name` style concatenation continues to work.
pdf_path = os.path.join(os.environ['SNORKELHOME'],
                        'tutorials', 'tables', 'data', 'hardware',
                        'hardware100_pdf', '')
filename = 'bc546-d'  # document name without extension
pdf_file = os.path.join(pdf_path, filename + '.pdf')  # Path to PDF file

In [4]:
# Time the extraction of words from the PDF. Presumably this fills
# vizlink.pdf_word_list and vizlink.coordinate_map, which the next cell
# inspects -- TODO confirm against visual_linking.
%time vizlink.extract_pdf_words(pdf_file)

Extracted 2690 pdf words
CPU times: user 308 ms, sys: 29.8 ms, total: 338 ms
Wall time: 595 ms


In [5]:
from pprint import pprint

# Peek at the first five extracted PDF words.
pprint(vizlink.pdf_word_list[:5])
# Peek at five word-id -> coordinate entries. list() keeps the slice valid on
# Python 3, where dict.items() returns a non-subscriptable view; on Python 2
# the result is unchanged. Which five entries appear depends on dict ordering.
pprint(list(vizlink.coordinate_map.items())[:5])

[((1, 0), u'BC546B'),
 ((1, 0.5), u','),
 ((1, 1), u'BC547A'),
 ((1, 1.5), u','),
 ((1, 2), u'B')]
[((3, 271), (3, 645, 325, 653, 332)),
 ((4, 135), (4, 638, 314, 645, 322)),
 ((5, 31), (5, 138, 434, 148, 451)),
 ((6, 167), (6, 349, 367, 355, 385)),
 ((1, 215), (1, 575, 453, 584, 460))]


### Step 3: Parse HTML

In [6]:
# from snorkel.parser import CorpusParser
# from snorkel.parser import HTMLParser
# from snorkel.parser import OmniParser

# html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
# filename = 'bc546-d'
# html_file = html_path + filename + '.html'
# doc_parser = HTMLParser(path=html_file)
# context_parser = OmniParser()
# cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

# %time corpus = cp.parse_corpus(name='Hardware', session=session)

import os
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance

# Overwrite snorkel.db with the file named "snorkel.db corpus" (presumably a
# snapshot holding the already-parsed 'Hardware' corpus, i.e. the result of the
# commented-out parsing code above -- TODO confirm).
# shutil.copyfile replaces the shell `cp`: no shell quoting of the space in the
# file name is needed, and a missing snapshot raises immediately instead of
# being silently ignored.
import shutil
shutil.copyfile('snorkel.db corpus', 'snorkel.db')
# Look up the 'Hardware' corpus through the shared session.
corpus = get_ORM_instance(Corpus, session, 'Hardware')

In [7]:
# Time the extraction of words from the first document of the corpus.
# Presumably this fills vizlink.html_word_list, shown in the next cell --
# TODO confirm against visual_linking.
%time vizlink.extract_html_words(corpus.documents[0])

Extracted 2662 html words
CPU times: user 61.1 ms, sys: 6.62 ms, total: 67.7 ms
Wall time: 66.5 ms


In [8]:
from pprint import pprint
# Peek at the first five HTML words for comparison with the PDF word list.
pprint(vizlink.html_word_list[:5])

[((2, 0), u'BC546'),
 ((2, 1), u'-'),
 ((2, 2), u'NPN'),
 ((2, 3), u'Amplifier'),
 ((2, 4), u'Transistors')]


### Step 4: Visual Linking

In [9]:
# Time the linking of HTML words to PDF bounding boxes. searchMax=200
# presumably bounds how far the linker searches for a matching word --
# TODO confirm against visual_linking.
%time vizlink.link_lists(searchMax=200)

Linked 2662 words to 2690 bounding boxes
Updated coordinates in snorkel.db
CPU times: user 1.54 s, sys: 30.5 ms, total: 1.57 s
Wall time: 1.59 s


In [10]:
# Peek at five HTML-word-id -> PDF-word-id links. list() keeps the slice valid
# on Python 3, where dict.items() is a non-subscriptable view; the result on
# Python 2 is unchanged.
pprint(list(vizlink.links.items())[:5])

[((2, 0), (1, 31)),
 ((2, 1), (1, 23.5)),
 ((2, 2), (1, 8)),
 ((2, 3), (1, 6)),
 ((2, 4), (1, 7))]


In [11]:
# Inspect one phrase of the first document after linking: its repr, then its
# page and coordinate attributes, one per line.
phrase = corpus.documents[0].phrases[15]
print(phrase)
for attr_name in ('page', 'top', 'bottom', 'left', 'right'):
    print(getattr(phrase, attr_name))

Phrase(Doc: bc546-d, Table: 0, Row: 1, Col: 0, Position: 0, Text: Collector - Emitter Voltage)
1
(268, 268, 268, 268)
(277, 277, 277, 277)
(64, 99, 105, 133)
(96, 102, 130, 160)


In [12]:
# Fetch the HTML/PDF link table and show only its [:100] slice to keep the
# cell output short.
matches = vizlink.display_links()
print(matches[:100])

     i               html                pdf    j  offset
0    0              BC546              BC546   42      -5
1    1                  -                  -   36      -5
2    2                NPN                NPN   13      14
3    3          Amplifier          Amplifier   11      14
4    4        Transistors        Transistors   12      14
5    5             BC546B             BC546B    0      13
6    6                  ,                  ,    9      12
7    7             BC547A             BC547A    2      10
8    8                  ,                  ,    9      10
9    9                  B                  B    4       5
10  10                  ,                  ,    5       6
11  11                  C                  C    6       6
12  12                  ,                  ,    7       7
13  13             BC548B             BC548B    8       7
14  14                  ,                  ,    7       8
15  15                  C                  C   10       8
16  16        

### Step 5: Viewing Results

In [13]:
import os
from collections import defaultdict
from snorkel.models import Corpus, candidate_subclass
from snorkel.matchers import RegexMatchSpan, Union
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Candidate Type
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
    
# CandidateSpaces
part_ngrams = OmniNgramsPart(parts_by_doc=None, n_max=3) # NOTE: no part linking right now
temp_ngrams = OmniNgramsTemp(n_max=2)

# Matchers
eeca_matcher = RegexMatchSpan(rgx='([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx='([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx='(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx='((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

temp_matcher = RegexMatchSpan(rgx=r'1[4-6]0', longest_match_only=False)

# Throttler
part_throttler = lambda x: x[0].parent.page == x[1].parent.page

# Extractor
ce = CandidateExtractor(Part_Temp, 
                        [part_ngrams, temp_ngrams], 
                        [parts_matcher, temp_matcher], 
                        throttler=part_throttler)

# Extract
for corpus_name in ['Hardware']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware)
CPU times: user 978 ms, sys: 11.5 ms, total: 989 ms
Wall time: 1 s
Candidate Set (Hardware Candidates) contains 275 Candidates


In [None]:
# Inspect the part span of one candidate: its repr, character offsets, word
# offsets, the page of its parent, and its per-token coordinate attributes.
c = candidates[5]
part_span = c.part
print(part_span)
print("%s %s" % (part_span.char_start, part_span.char_end))
print(part_span.get_word_start())
print(part_span.get_word_end())
print(part_span.parent.page)
for attrib in ('top', 'bottom', 'left', 'right'):
    print(part_span.get_attrib_tokens(attrib))

ImplicitSpan("BC547A", parent=1050, words=[11,13], position=[0])
30 91
11
13
2
[]
[]
[]
[]


In [None]:
# Show the extracted candidates for page 1. Presumably this draws their
# bounding boxes over the PDF page -- TODO confirm against visual_linking.
vizlink.display_candidates(candidates, page_num=1, display=True)

In [None]:
# from visual_linking import display_boxes, get_box

# boxes = []
# for c in candidates:
#     boxes.append(get_box(c.part))
# boxes = list(set(boxes))
# display_boxes(pdf_file, boxes, page_num=2)

### Display Ordering of PDF Word List 

Draw each PDF word's bounding box, labeled with its position in the word list, on a blank page. The result is not very easy to read because the boxes and labels overlap.

In [None]:
# import numpy as np
# import cv2
# import math

# page_num = 2
# page_height = 792
# page_width = 612
# img = np.ones((page_height,page_width,3))*255
# font = cv2.FONT_HERSHEY_SIMPLEX
# letter_width = 3
# i = 0
# for word_id, _ in pdf_word_list:
#     if word_id[0] == page_num:
#         i += 1
#         _, top, left, bottom, right = coordinate_map[word_id]
#         cv2.rectangle(img, (left, top), (right, bottom), (0,255,0), 1)
#         cv2.putText(img, 
#                     str(i), 
#                     ((left + right)/2 - letter_width*int(math.ceil(math.log10(i))), 
#                     bottom + (top - bottom)/4), 
#                     font, 
#                     0.3, 
#                     (255,0,0), 
#                     1)
# cv2.imshow('PDF Word List Order',img)
# cv2.waitKey() # press any key to exit the opencv output 
# cv2.destroyAllWindows() 

The end.