In [501]:
#from pdfminer.pdfpage import PDFPage - Python2

from pdfminer.pdfparser import PDFParser, PDFDocument, PDFNoOutlines
from pdfminer.converter import PDFPageAggregator, TextConverter#, XMLConverter, HTMLConverter

from pdfminer.layout import LAParams, LTTextBox, LTTextLine #, LTFigure, LTImage


from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice


We only want to keep English text. The functions below score how likely a text is to be written in each language by counting the stop words of that language it contains. The detected language is taken as the one with the highest score in languages_ratios.

In [1]:
def get_languages(text):
    '''
    Score ``text`` against every stop-word list available in nltk.

    The text is tokenized with nltk.wordpunct_tokenize(), which splits all
    punctuation into separate tokens:

    >>> wordpunct_tokenize("My name's Anna.End.")
    ['My', 'name', "'", 's', 'Anna', '.', 'End', '.']

    Tokens are lower-cased before being compared with the stop words.

    Returns a dict mapping each language name from stopwords.fileids() to the
    number of *unique* stop words of that language found in ``text``.
    The values are counts (a language "score"), not normalised ratios.
    '''
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    # The set of distinct lower-cased tokens does not depend on the language,
    # so build it once instead of once per stop-word list.
    words_set = {word.lower() for word in tokens}

    # number of unique stopwords appearing in analyzed text as included in nltk
    # (author's note: Afrikaans gets classified as Dutch)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios


#----------------------------------------------------------------------
def detect_language(text):
    """
    Pick the most likely language of ``text``.

    Delegates the scoring to get_languages() and returns a tuple
    ``(best_language, scores)``: the language with the highest score and the
    full per-language score dict. When several languages share the top score,
    the first one in the dict's iteration order is returned.
    """
    scores = get_languages(text)
    best_language = max(scores, key=lambda language: scores[language])
    return best_language, scores


In [None]:
text_parsed=[]      # unique English text chunks, in order of first appearance
seen_text = set()   # mirrors text_parsed so the duplicate check is O(1)

# All parsing happens inside the `with` block so the file is closed even if a
# page fails to parse.
with open('3489_31-8_ECape.pdf', 'rb') as fp: # from a local file
    parser = PDFParser(fp)

    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)

            # layout analyzer: an LTPage object for each page in the PDF document.
            # This object contains child objects within the page, forming a tree structure
            layout = device.get_result()

            for lt_obj in layout:
                # LTTextBox represents a group of text chunks that can be contained
                # in a rectangular area. Created by geometric analysis, it does not
                # necessarily represent a logical boundary of the text. It contains
                # a list of LTTextLine objects, each holding a list of LTChar objects
                # that represent a single text line. The characters are aligned
                # either horizontally or vertically.
                # The get_text() method returns the text content.
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_obj = lt_obj.get_text().replace('\n','')
                    if text_obj in seen_text:
                        # already stored: skip the language detection altogether
                        continue
                    language, ratios = detect_language(text_obj)
                    if language == "english":
                        #(\d{1,2} )?(Nov|Dec)?( ?- )?(\d{1,2}) (Nov|Dec) (\d{4})
                        seen_text.add(text_obj)
                        text_parsed.append(text_obj)
    finally:
        # release the layout device whether or not parsing succeeded
        device.close()
         

## Initial pipeline

### 1. information about a given gazette is in the first page (vol, no, type etc)

### 2. we look for the places in the doc (page no.) where the most relevant content is, so (for now) we parse the 'outline'

After reading the 1st page, we look for a page containing an 'outline' of sorts.

This is a major hack, as accessing the table of contents ("Outlines") does not work for the gazettes (they are boxes)
http://www.unixuser.org/~euske/python/pdfminer/programming.html#layout

We identify the page by containing a variation of the following:

GENERAL NOTICE / ALGEMENE KENNISGEWINGS
CONTENT/Table of contents (the heading is actually typeset as 'Table of ConTenTs', so the text needs to be lowercased before matching)

difficulties:
- page number in outline might not be a separate box (while in the same doc others are)
- page no. might appear at the end: 'table of contents' + entries + page no header

not sure if it's the pdf or pdfminer's fault

Idea in the long run:
extend the PDFPageInterpreter and PDFDevice class in order to process them differently / obtain other information. 

### 3. going to the identified page and, if it's in English, extracting the text

In [327]:
def get_first_page(objstack_t):
    """
    Collect the text boxes of the first page up to the gazette number.

    Layout objects are popped from the end of ``objstack_t`` (the list is
    consumed in place). Only LTTextBox objects are considered; their text is
    taken with newlines removed. Boxes containing 'Registered' are skipped.
    Collection stops right after the first kept box that contains 'No.',
    leaving any remaining objects on the stack.

    Returns the list of collected strings in the order they were popped.
    """
    collected = []
    while objstack_t:
        element = objstack_t.pop()

        # only text boxes carry the information we need
        if not isinstance(element, LTTextBox): #or isinstance(element, LTTextLine):
            continue

        content = element.get_text().replace('\n','')
        if 'Registered' in content:
            continue

        collected.append(content)
        if 'No.' in content:
            # the box with the gazette number is the last one we keep
            break

    return collected

In [401]:
def get_outline(objstack):
    """
    Scan one page's layout objects for an outline (table of contents).

    Objects are popped from the end of ``objstack`` until it is empty (the
    list is consumed in place). Only LTTextBox objects are inspected; their
    text is lower-cased with newlines replaced by spaces. The outline starts
    at the first box containing one of the heading markers below, and that
    box and every later text box are collected.

    Returns a tuple ``(is_outline_page, outline_obj, page_box, gazette_box)``:
      * is_outline_page -- 1 if a heading marker was found, else 0
      * outline_obj     -- text boxes collected from the marker onwards
      * page_box        -- last collected box containing 'page', or []
      * gazette_box     -- last collected box containing 'gazette no', or []
    """
    heading_markers = ('contents', 'provincial notices', 'page no', 'gazette no')

    found_outline = 0
    collected_boxes = []
    page_heading = []
    gazette_heading = []

    while objstack:
        candidate = objstack.pop()

        if not isinstance(candidate, LTTextBox): #or isinstance(candidate, LTTextLine):
            continue

        lowered = candidate.get_text().replace('\n',' ').lower()

        # parse everything from the outline heading onwards
        if any(marker in lowered for marker in heading_markers):
            found_outline = 1

        if found_outline != 1:
            continue

        if 'page' in lowered:
            page_heading = candidate
        if 'gazette no' in lowered:
            gazette_heading = candidate

        collected_boxes.append(candidate)

    return found_outline, collected_boxes, page_heading, gazette_heading

In [477]:
# open the pdf file; the `with` block closes it even if parsing raises
with open('39782_4-3_NationalLiquor.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize() # optional if a password is there: pdf_pwd

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # to use in-built pdftotxt.py:
    #device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
    #process_pdf(rsrcmgr, device, fp)
    #device.close()

    text_parsed = [] # a list of strings, each representing text collected from each page of the doc

    # Defaults, so later cells never see undefined or stale values when the
    # first page cannot be parsed or no outline page is found.
    first_text = []
    is_outline_page, outline_obj, page_box, gazette_box = 0, [], [], []
    num_outline_page = None

    try:
        # index of the last page (pages are counted from 0); -1 for an empty document
        num_pages = sum(1 for _ in doc.get_pages()) - 1

        # find page with content (not defined as outline in the doc, unfortunately)
        for i, page in enumerate(doc.get_pages()):
            try:
                interpreter.process_page(page)
            except Exception as e:
                # best effort: report which page failed and why, then move on
                print("parsing of page {} is not possible: {}".format(i, e))
                continue
            # receive the LTPage object for this page
            # The layout received from get_result() parses the strings into separate objects.
            # These objects have several key components: type, the coordinates
            # (startingx, startingy, endingx, endingy), and the content, e.g.
            # <LTRect 258.000,39.720,297.000,51.000>
            # Accessing the type is done by type(object) (e.g. type(object)==LTRect).
            layout = device.get_result()

            # reversed copy, so that pop() hands the objects out in layout order
            objstack = list(layout)
            objstack.reverse()

            # get the first page info: type of gazette, vol and date
            if i == 0:
                first_text = get_first_page(objstack)
                # extract specific info: vol, date, place, type of gazette
                #.....
            else:
                # find page with the outline and extract outline text and pages it points to:
                is_outline_page, outline_obj, page_box, gazette_box = get_outline(objstack)
                if is_outline_page:
                    num_outline_page = i
                    break # we found the outline so move to extracting pages from there

        # now we have to fetch the columns corresponding to the 'page no' or 'page'
        # and extract the page numbers the main info is stored at
    finally:
        device.close()

In [499]:
# horizontal extent of the 'page' heading box found by get_outline()
x0 = page_box.x0
x1 = page_box.x1

# page numbers can be misaligned -> 'Gazette no.' column overlaps
# in the heading with the column of Page numbers. What that means is that
# we cannot use coordinates of boxes to fetch 'Page no'.
# In practical terms: the 'Page no' x1 coordinate needs to be extended to 'Gazette no' x0

if gazette_box != []:
    g_x0 = gazette_box.x0
else:
    g_x0 = x1


def _to_page_number(token):
    """Return ``token`` converted to an int, or None if it is not a number."""
    try:
        return int(float(token))
    except (ValueError, OverflowError):
        return None


page_contents = []
for box in outline_obj:
    text_obj = box.get_text().replace('\n',' ').lower()

    # box lies in the page-number column (possibly widened up to 'Gazette no')
    in_page_column = box.x0 >= x0 and (box.x1 <= x1 or box.x1 < g_x0)

    if in_page_column and 'page' not in text_obj:
        # sometimes pages come as a vector, so we split the elements;
        # tokens that are not numbers are skipped instead of raising
        for token in text_obj.split():
            number = _to_page_number(token)
            if number is not None:
                page_contents.append(number)

    if '..' in text_obj:
        # dotted leader line: the page number is whatever follows the last dot
        last_el = text_obj.split('.')[-1].strip()
        x = _to_page_number(last_el) if last_el != '' else None

        if x is not None and x <= (num_pages + 1):  # sometimes gazette number parses in
            page_contents.append(x)

#unique pages
pages_to_extract = list(set(page_contents))





In [500]:
# display the unique page numbers collected from the outline
pages_to_extract

[9, 42, 43, 44, 21]

with 'pages to extract' we can



totally unstructured (to say the least):
 Extraordinary Provincial Gazette of KwaZulu-Natal
 1156_9-6_KznG1.pdf

KznElec: ELECTION TIMETABLE, outline: ok
KznSep: MUNICIPAL/Provincial NOTICE, outline: ok
KznDemarc: outline: ok, BUT it refers only to general changes, followed by unreferenced pages with data on ward changes. This data, however, can be accessed from other sources (registered 
voters in each ward etc).

NCape:
Liquor: ok

Gauteng:
outline: single vector pages split by municipalities
