# Adding Visual Features

In [1]:
%load_ext autoreload
%autoreload 2
from pdb import set_trace as t

### Step 1: Coordinate Extraction

In [114]:
# Ines's code takes PDF and yields pdf_word_list, coordinate_map
import os
import subprocess

# Build the directory with os.path.join so the result is valid whether or not
# SNORKELHOME ends with a path separator. The trailing '' keeps a separator at
# the end of pdf_path, because later cells build paths as `pdf_path + filename`.
pdf_path = os.path.join(os.environ['SNORKELHOME'],
                        'tutorials', 'tables', 'data', 'hardware', 'hardware100_pdf', '')
filename = 'bc546-d'
pdf_file = pdf_path + filename + '.pdf' # Path to PDF file

# create coordinates and id files
# An argument list (no shell) keeps paths containing spaces or shell
# metacharacters intact; check_call raises if the script exits non-zero, so a
# failed extraction stops here instead of surfacing later as a missing file.
subprocess.check_call(['./getCoordinates.sh', pdf_file])

# extract coordinates in map and words in list
# dirname/basename are kept as module-level names because the cleanup cell at
# the end of the notebook reuses them.
dirname = os.path.dirname(pdf_file)
basename = os.path.splitext(os.path.basename(pdf_file))[0]
generated_prefix = os.path.join(dirname, basename)
with open(generated_prefix + '.ids_coordinates.txt', 'r') as coordinates_file:
    coordinates = coordinates_file.readlines()
with open(generated_prefix + '.ids_words.txt', 'r') as words_file:
    words = words_file.readlines()

pdf_word_list = []   # (word_id, word) pairs, in the order of the words file
coordinate_map = {}  # word_id -> (page_nb, top, left, bottom, right), all strings
for i in range(len(words)):
    word_fields = words[i].rstrip().split('\t')
    if len(word_fields) == 1:
        # word is a white space: rstrip() removed it together with the tab,
        # leaving only the id, so there is nothing to record. Ignore it.
        continue
    # get words and ids (any other malformed line raises ValueError)
    word_id, word = word_fields
    # get ids and coordinates; line i of the coordinates file is read alongside
    # line i of the words file
    coord_word_id, page_nb, top, left, bottom, right = coordinates[i].rstrip().split('\t')
    pdf_word_list.append((word_id, word))
    coordinate_map[coord_word_id] = (page_nb, top, left, bottom, right)

### Step 2: PDF to HTML Conversion

Use Adobe Acrobat (or other program of your choice) to convert PDF -> HTML with structure.

### Step 3: HTML Parsing

In [2]:
# Payal's parser takes HTML and yields corpus object, html_word_list
import os
os.remove('snorkel.db')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
filename = 'bc546-d'
html_file = html_path + filename + '.html'
doc_parser = HTMLParser(path=html_file)
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

%time corpus = cp.parse_corpus(name='Hardware', session=session)

CPU times: user 2.98 s, sys: 219 ms, total: 3.2 s
Wall time: 7.71 s


In [8]:
from pprint import pprint

# Flatten the first parsed document into ((phrase_id, word_index), word) pairs,
# preserving phrase order and the order of words within each phrase.
html_word_list = [
    ((phrase.id, i), word)
    for phrase in corpus.documents[0].phrases
    for i, word in enumerate(phrase.words)
]

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# unlike the `print x` statement form.
print(len(html_word_list))
pprint(html_word_list[:10])

2662
[((2, 0), u'BC546'),
 ((2, 1), u'-'),
 ((2, 2), u'NPN'),
 ((2, 3), u'Amplifier'),
 ((2, 4), u'Transistors'),
 ((3, 0), u'BC546B'),
 ((3, 1), u','),
 ((3, 2), u'BC547A'),
 ((3, 3), u','),
 ((3, 4), u'B')]


### Step 4: Visual Linking

In [None]:
# Link the word list extracted from the PDF (Step 1) with the word list
# extracted from the HTML (Step 3).
from visual_linking import link_lists

# NOTE(review): the Step 5 cell indexes `links` by (phrase.id, word_index) and
# unpacks five values per entry - presumably that is the shape link_lists
# returns; confirm against visual_linking.
# %time reports how long the linking takes.
%time links = link_lists(pdf_word_list, html_word_list)

### Step 5: Updating with coordinates

In [None]:
# Payal's code walks through phrases, updating each one's five visual attributes
# (page, top, left, bottom, right)
for phrase in corpus.documents[0].phrases:
    for i in range(len(phrase.words)):
        # `links` is keyed by (phrase id, index of the word within the phrase),
        # the same key layout used when html_word_list was built.
        (page_nb, top, left, bottom, right) = links[(phrase.id, i)]
        # Each visual attribute is a per-word sequence on the phrase; slot i
        # belongs to word i.
        phrase.page_nb[i] = page_nb
        phrase.top[i]     = top
        phrase.left[i]    = left
        phrase.bottom[i]  = bottom
        phrase.right[i]   = right

### Step 6: Assess Results

In [115]:
# Use Ines's code to plot locations of words on the original PDF
import numpy as np
import cv2

In [116]:
# Kept as a string: it is compared against page fields read from the
# tab-separated text files and concatenated into image file names.
page_to_visualize = '4' # page number to visualize
# 612 x 792 corresponds to US Letter (8.5 x 11 in), not A4 (which is 595 x 842),
# presumably in PDF points (1/72 in) - confirm against the coordinate files.
# The rendered page image is resized to this size before boxes are drawn on it.
page_width = 612 # page width
page_height = 792 # page height

In [117]:
#Generate jpeg image from pdf
# Passing an argument list (no shell) keeps the PDF path intact even if it
# contains spaces or shell metacharacters. The last argument is the output
# prefix; the later cells read the image from
# pdf_path + filename + '-' + page_to_visualize + '.jpg'.
# The call's return value (pdftoppm's exit status, 0 on success) is the cell output.
subprocess.call(['pdftoppm',
                 '-f', str(page_to_visualize),
                 '-l', str(page_to_visualize),
                 '-jpeg', pdf_file, pdf_path + filename])

0

BOUNDING BOXES on PDF image

In [118]:
# Read pdf as an image
page_image_path = pdf_path + filename + '-' + page_to_visualize + '.jpg'
page_image = cv2.imread(page_image_path)
if page_image is None:
    # cv2.imread signals an unreadable/missing file by returning None rather
    # than raising; fail here with the offending path instead of inside resize.
    raise IOError('Could not read page image: {}'.format(page_image_path))
img = cv2.resize(page_image, (page_width, page_height))
# Plot bounding boxes
for page_nb, top, left, bottom, right in coordinate_map.values():
    # Coordinates are stored as strings; page_nb is compared as a string too.
    if page_nb == page_to_visualize:
        cv2.rectangle(img,
                      (int(float(left)), int(float(top))),
                      (int(float(right)), int(float(bottom))),
                      (255, 0, 0), 1)
cv2.imshow('Bounding boxes', img)
cv2.waitKey() # press any key to exit the opencv output
cv2.destroyAllWindows()

Display ordering on a black page - not very easy to read because the labels are superimposed

In [124]:
# Draw every word box of the selected page on a black canvas and label each box
# with its rank in pdf_word_list, to inspect the extracted reading order.
img = np.zeros((page_height, page_width, 3))
font = cv2.FONT_HERSHEY_SIMPLEX
i = 0  # running rank of the words drawn so far on this page
for word_id, _word in pdf_word_list:
    word_coordinates = coordinate_map.get(word_id)
    if word_coordinates is None:
        # No coordinates were recorded for this word, so it cannot be drawn.
        continue
    page_nb, top, left, bottom, right = word_coordinates
    # Filter on the stored page number (as the bounding-box cell does) rather
    # than on the first character of word_id, which cannot match multi-digit pages.
    if page_nb != page_to_visualize:
        continue
    i += 1
    cv2.rectangle(img,
                  (int(float(left)), int(float(top))),
                  (int(float(right)), int(float(bottom))),
                  (0, 255, 0), 1)
    # Label at the horizontal centre of the box, on its bottom edge.
    cv2.putText(img, str(i),
                (int((float(left) + float(right)) / 2), int(float(bottom))),
                font, 0.3, (0, 0, 255), 1)
cv2.imshow('Ordering', img)
cv2.waitKey() # press any key to exit the opencv output
cv2.destroyAllWindows()

In [112]:
# delete image, coordinates file and ids file created
# os.remove avoids building shell commands from paths, and the existence check
# makes the cell safe to re-run after the files are already gone.
generated_prefix = dirname.rstrip() + '/' + basename.rstrip()
generated_files = (
    pdf_path + filename + '-' + page_to_visualize + '.jpg',
    generated_prefix + '.ids_coordinates.txt',
    generated_prefix + '.ids_words.txt',
)
for generated_file in generated_files:
    if os.path.exists(generated_file):
        os.remove(generated_file)

0