In [9]:
import collections
import glob
import io
import math
import os
import re
import subprocess

from bs4 import BeautifulSoup as bs

In [10]:
# Directory searched for the input ``*.pdf`` files by the conversion cell.
PDF_PATH = 'pdf'
# Directory where the converted ``*.xml`` files are written and later read back.
XML_PATH = 'xml'
# Directory receiving one tab-separated ``.txt`` file per parsed document.
OUTPUT_PATH = 'output'

# When True, the PDF -> XML conversion cell does nothing and the parsing cell
# works on whatever XML files are already present in XML_PATH.
SKIP_CONVERT = True

In [11]:
# Convert every PDF found in PDF_PATH to pdftohtml's XML format (one XML file
# per PDF, written to XML_PATH). Skipped entirely when SKIP_CONVERT is set.
if not SKIP_CONVERT:
    # All the available PDF files
    input_files = glob.glob(os.path.join(PDF_PATH, '*.pdf'))

    # Convert to XML
    for input_file in input_files:
        # Base name without directory nor extension; os.path handles nested
        # directories and platform-specific separators.
        filename = os.path.splitext(os.path.basename(input_file))[0]
        cmd = ['pdftohtml', '-s', '-i', '-xml', input_file, os.path.join(XML_PATH, filename + '.xml')]
        # subprocess.call returns the exit status instead of raising on a
        # non-zero one, so a failing document is reported and the remaining
        # files are still converted.
        state = subprocess.call(cmd)
        if state != 0:
            print('Failure: ' + ' '.join(cmd))

In [13]:
# Extract the lines of (R) code from every XML file found in XML_PATH and
# write them, one tab-separated row per line of code, to OUTPUT_PATH.

# Lines matching this pattern (optionally preceded by the "> " prompt) are
# taken as samples of code to guess which font/height the code is typeset in.
R_CODE_PATTERN = re.compile(r'^(> )?(#|(library)|(plot)|(summary))')
# Fallback font/height when no sample line is found. They are strings because
# they are compared with attribute values of the parsed XML, which are strings.
DEFAULT_CODE_FONT, DEFAULT_CODE_HEIGHT = '4', '18'
# Vertical tolerance (same unit as the "top" attribute) within which two boxes
# are considered to sit on the same line.
LINE_MARGIN = 10


def most_frequent(values):
    """Return the most frequent element of the non-empty sequence `values`."""
    return collections.Counter(values).most_common(1)[0][0]


def guess_code_style(soup):
    """Guess the (font, height) attribute values used for code.

    Looks for <text> boxes whose text starts like R code and returns the most
    frequent font and the most frequent height among them, or the defaults
    when no such box exists.
    """
    fonts, heights = [], []
    for item in soup.find_all('text', text=R_CODE_PATTERN):
        fonts.append(item['font'])
        heights.append(item['height'])
    if not fonts:
        # Default if nothing appropriate is found
        return DEFAULT_CODE_FONT, DEFAULT_CODE_HEIGHT
    return most_frequent(fonts), most_frequent(heights)


def detect_columns(boxes, page_width):
    """Detect a two-column layout from the horizontal position of the boxes.

    Boxes are split between those starting in the first third of the page and
    those starting at or beyond one third of `page_width`. If both groups have
    "quite comparable" sizes, the layout is assumed to be multi-column.

    Returns a tuple (multicolumn, left, right): `left` is the most frequent
    start of the left-hand boxes and `right` the most frequent start of the
    right-hand ones; `right` is also the position used to split the page into
    two columns. For a single-column layout both values are the same margin.
    """
    threshold = page_width // 3
    left_starts, right_starts = [], []
    for item in boxes:
        x = int(item['left'])
        if x >= threshold:
            right_starts.append(x)
        else:
            left_starts.append(x)

    if len(right_starts) * 0.5 < len(left_starts) < len(right_starts) * 1.5:
        # Both lists are necessarily non-empty when this condition holds
        return True, most_frequent(left_starts), most_frequent(right_starts)

    # Single column: use the usual left margin. If no box starts in the first
    # third of the page, fall back on the other boxes (or 0 without any box)
    # instead of failing on an empty list.
    candidates = left_starts or right_starts
    margin = most_frequent(candidates) if candidates else 0
    return False, margin, margin


def extract_code(pages, font, height, multicolumn, left, right):
    """Collect the lines of code found in `pages`.

    `pages` is a list of pages, each page being the list of its <text> boxes.
    A box whose font/height match `font`/`height` is probably code. To check
    that, we verify that on the same line (+/- LINE_MARGIN), in the same
    column, there is no other box containing simple text. The boxes of an
    accepted line are concatenated from left to right, with spaces inserted to
    mimic the horizontal gaps between them.

    Returns a list of tuples (page_number, left, top, line_of_code), where
    page_number starts at 1 and left/top are the attributes of the first box
    encountered on that line.
    """
    # Boxes already emitted, tracked by identity: bs4 tags compare equal when
    # their name, attributes and contents are equal, which would wrongly merge
    # identical lines located at the same position on different pages.
    done = set()
    code = []
    for page_number, page in enumerate(pages, 1):
        for box in page:
            # Is this box an interesting one?
            if id(box) in done or box['font'] != font or box['height'] != height:
                continue

            box_top = int(box['top'])
            # Only restrict the search to one column if the box does not look
            # like it spans the whole page
            restrict_to_column = multicolumn and int(box['width']) < right
            box_in_left_column = int(box['left']) < right

            same_line = []  # Code boxes on exactly the same line
            keep_it = True  # Do we keep the current box?
            for other_box in page:
                if restrict_to_column:
                    other_in_left_column = int(other_box['left']) < right
                    if other_in_left_column != box_in_left_column:
                        continue
                # Is other_box on the same line (+/- margin)?
                if not (box_top - LINE_MARGIN <= int(other_box['top']) <= box_top + LINE_MARGIN):
                    continue
                if (other_box['font'] == font and other_box['height'] == height
                        and other_box['top'] == box['top']):
                    if id(other_box) not in done:
                        same_line.append(other_box)
                else:
                    # Something else than code shares the line: reject the box
                    keep_it = False
                    break
            if not keep_it:
                continue

            # The current box was not rejected, so we concatenate the texts
            done.update(id(item) for item in same_line)
            same_line.sort(key=lambda item: int(item['left']))

            # Average width of a character, used to convert gaps into spaces
            text_length = len(box.text)
            char_width = float(box['width']) / text_length if text_length else 0.0

            pieces = []
            previous = None
            for current in same_line:
                current_left = int(current['left'])
                if previous is not None:
                    margin = int(previous['left']) + int(previous['width'])
                elif multicolumn and current_left >= right:
                    # First box on the line, in the right column
                    margin = right
                else:
                    # First box on the line, in the left (or only) column
                    margin = left
                # Distance between current box and the margin, in characters
                spacing = current_left - margin
                spaces = int(math.ceil(spacing / char_width)) if char_width else 0
                # A negative count (overlapping boxes) yields no space at all
                pieces.append(' ' * spaces)
                pieces.append(current.text)
                previous = current
            code.append((page_number, box['left'], box['top'], ''.join(pieces)))
    return code


# All the available XML files
xml_files = glob.glob(os.path.join(XML_PATH, '*.xml'))

for xml_file in xml_files:
    # Base name without directory nor extension
    filename = os.path.splitext(os.path.basename(xml_file))[0]

    # Parser initialization (bytes are handed over so the parser deals with
    # the encoding itself; the file is closed as soon as it is parsed)
    with open(xml_file, 'rb') as xml_handle:
        soup = bs(xml_handle, 'html.parser')

    page_tags = soup('page')
    if not page_tags:
        print('Skipping %s: no <page> element found' % xml_file)
        continue

    # Guess code properties: font number and box height to consider
    font, height = guess_code_style(soup)

    # Multi-column detection
    page_width = int(page_tags[0]['width'])
    multicolumn, left, right = detect_columns(soup('text'), page_width)

    # Get all the boxes for all the pages
    pages = [list(page('text')) for page in page_tags]

    code = extract_code(pages, font, height, multicolumn, left, right)

    # If we have some code, output it!
    if code:
        rows = [u'{0[0]}\t{0[1]}\t{0[2]}\t{0[3]}'.format(c) for c in code]
        output_file = os.path.join(OUTPUT_PATH, filename + '.txt')
        with io.open(output_file, 'w', encoding='utf-8') as f:
            f.write(u'page\tleft\ttop\tcode\n')
            f.write(u'\n'.join(rows))

print('finished ^^')

finished ^^
