In [1]:
import os
import subprocess
import glob
import re
import collections

from bs4 import BeautifulSoup as bs

In [2]:
# Working directories, relative to the directory the notebook is run from.
# Input: searched for '*.pdf' files to convert.
PDF_PATH = 'pdf'
# Intermediate: receives the pdftohtml XML dumps and is searched for '*.xml'.
XML_PATH = 'xml'
# Result: one tab-separated '<name>.txt' file per document with extracted code.
OUTPUT_PATH = 'output'

In [3]:
input_files = glob.glob(os.path.join(PDF_PATH, '*.pdf'))

# Make sure the destination directory exists before invoking pdftohtml.
if not os.path.isdir(XML_PATH):
    os.makedirs(XML_PATH)

# Convert to XML
for input_file in input_files:
    # Base name without directory or extension; unlike splitting on '/', this
    # is correct for nested directories and for platforms using '\\'.
    filename = os.path.splitext(os.path.basename(input_file))[0]
    cmd = ['pdftohtml', '-s', '-i', '-xml', input_file, os.path.join(XML_PATH, filename + '.xml')]
    # subprocess.call returns the exit status instead of raising (as
    # check_call does), so a single broken PDF is reported below and the
    # remaining files are still converted.
    state = subprocess.call(cmd)
    if state != 0:
        print('Failure: ' + ' '.join(cmd))

In [22]:
# A <text> element whose content starts like this is assumed to be R code
# (optionally preceded by the '> ' prompt).
CODE_LINE_RE = re.compile(r'^(> )?(#|(library)|(plot)|(summary))')

# Style assumed for code when no element matches CODE_LINE_RE. These are
# strings on purpose: BeautifulSoup returns attribute values as strings, so an
# integer fallback would never compare equal to box['font'] / box['height'].
FALLBACK_FONT, FALLBACK_HEIGHT = '4', '18'

# Boxes whose 'top' attributes differ by at most this much are considered to
# overlap vertically (same unit as the 'top' attribute of the XML).
LINE_TOLERANCE = 10


def _guess_code_style(soup):
    """Return the (font, height) attribute values most frequent on R-looking lines.

    The most common font and the most common height are picked independently
    among the <text> elements matching CODE_LINE_RE. When nothing matches,
    (FALLBACK_FONT, FALLBACK_HEIGHT) is returned.
    """
    fonts, heights = [], []
    for item in soup.find_all('text', text=CODE_LINE_RE):
        fonts.append(item['font'])
        heights.append(item['height'])
    if not fonts:
        return FALLBACK_FONT, FALLBACK_HEIGHT
    return (collections.Counter(fonts).most_common(1)[0][0],
            collections.Counter(heights).most_common(1)[0][0])


def _detect_columns(soup, page_width):
    """Return (multicolumn, separator) for the document.

    Every <text> box is classified as "left" or "right" depending on whether
    it starts before or after the first third of the page. The document is
    considered multicolumn when both groups have comparable sizes; the
    separator is then the most frequent 'left' value of the right group,
    otherwise 0.
    """
    threshold = page_width // 3  # floor division, as in Python 2 int / int
    left, right = [], []
    for item in soup('text'):
        x = int(item['left'])
        if x >= threshold:
            right.append(x)
        else:
            left.append(x)
    if len(right) * 0.5 < len(left) < len(right) * 1.5:
        return True, collections.Counter(right).most_common(1)[0][0]
    return False, 0


def _extract_code_lines(pages, font, height, multicolumn, multicolumn_sep):
    """Return the code lines found in ``pages`` as (page, left, top, text) tuples.

    ``pages`` is a list (one entry per page) of lists of <text> boxes. A box
    styled as code (matching ``font`` and ``height``) starts a line. All
    code-styled boxes with the same 'top' are concatenated, ordered by 'left'.
    The line is discarded if any other box within LINE_TOLERANCE vertically
    (and in the same column, for multicolumn documents) is not code-styled
    or has a different 'top'. 'left' and 'top' in the result are those of the
    box that started the line; page numbers start at 1.
    """
    def is_code(box):
        return box['font'] == font and box['height'] == height

    # Boxes already emitted are tracked by identity: BeautifulSoup tags compare
    # equal when their markup is equal, so a set of tags would also swallow an
    # identical line located at the same position on another page.
    done = set()
    code = []
    for page_number, page in enumerate(pages, 1):
        for box in page:
            if not is_code(box) or id(box) in done:
                continue

            top = int(box['top'])
            # Column filtering only applies to boxes narrower than the
            # separator position; wider boxes are compared to the whole page.
            restrict = multicolumn and int(box['width']) < multicolumn_sep
            box_is_left = restrict and int(box['left']) < multicolumn_sep

            line_boxes = []
            keep_it = True
            for other_box in page:
                if restrict and (int(other_box['left']) < multicolumn_sep) != box_is_left:
                    continue  # other column
                if not top - LINE_TOLERANCE <= int(other_box['top']) <= top + LINE_TOLERANCE:
                    continue  # too far away vertically to collide
                if is_code(other_box) and other_box['top'] == box['top']:
                    if id(other_box) not in done:
                        line_boxes.append(other_box)
                else:
                    # A colliding box that is not code on the very same line:
                    # the whole line is rejected.
                    keep_it = False
                    break

            if not keep_it:
                continue
            done.update(id(b) for b in line_boxes)
            line_boxes.sort(key=lambda b: int(b['left']))
            code.append((page_number, box['left'], box['top'],
                         u' '.join(b.text for b in line_boxes)))
    return code


xml_files = glob.glob(os.path.join(XML_PATH, '*.xml'))

# The result files are written here; create the directory on a fresh checkout.
if not os.path.isdir(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

for xml_file in xml_files:
    # Base name without directory or extension (robust to nested directories
    # and to platforms whose path separator is not '/').
    filename = os.path.splitext(os.path.basename(xml_file))[0]

    # Binary mode lets BeautifulSoup detect the encoding itself; the handle is
    # closed as soon as the document has been parsed.
    with open(xml_file, 'rb') as xml_handle:
        soup = bs(xml_handle, 'html.parser')

    page_tags = soup('page')
    if not page_tags:
        # Empty or truncated conversion: nothing to extract, keep going.
        print('Skipped (no <page> element): ' + xml_file)
        continue

    # Guess code properties: seek for line starting with R code
    font, height = _guess_code_style(soup)

    # Multicolumn?
    page_width = int(page_tags[0]['width'])
    multicolumn, multicolumn_sep = _detect_columns(soup, page_width)

    # Get the boxes
    pages = [list(page('text')) for page in page_tags]

    # Find interesting boxes, sorted by page/top
    code = _extract_code_lines(pages, font, height, multicolumn, multicolumn_sep)

    if len(code) > 0:
        s = u'\n'.join([u'{0[0]}\t{0[1]}\t{0[2]}\t{0[3]}'.format(c) for c in code])
        # Bytes are written explicitly (UTF-8), so the file is opened in binary
        # mode; 'with' guarantees it is closed even if a write fails.
        with open(os.path.join(OUTPUT_PATH, filename + '.txt'), 'wb') as f:
            f.write(b'page\tleft\ttop\tcode\n')
            f.write(s.encode('utf-8'))

print('finished ^^')

finished ^^
