In [1]:
#from pdfminer.pdfpage import PDFPage - Python2

from pdfminer.pdfparser import PDFParser, PDFDocument, PDFNoOutlines
from pdfminer.converter import PDFPageAggregator, TextConverter#, XMLConverter, HTMLConverter

from pdfminer.layout import LAParams, LTTextBox, LTTextLine #, LTFigure, LTImage


from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice


In [487]:
import pdfminer
pdfminer.__version__ 

'1.3.0'

The functions below identify the probabilities of a text being written in a given language (using stop words). The language is taken as the max in languages_ratios- we want
to keep English only (Africaans is assigned to Dutch). 

In [2]:
def get_languages(text):
    '''
    nltk.wordpunct_tokenize() splits all punctuations into separate tokens
    
    >>> wordpunct_tokenize("My name's Anna.End.")
    ['My', name', 's', 'Anna', '.', 'End', '.']
    '''
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # number of unique stopwords appearing in analyzed text as included in nltk(Africaans classified as Dutch)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios


#----------------------------------------------------------------------
def detect_language(text):
    """
    Calculate probability of given text to be written in a given language,
    returning the highest score and ratios
    """

    ratios = get_languages(text)
    
    most_rated_language = max(ratios, key=ratios.get)

    return most_rated_language, ratios


## Initial pipeline

### 1. information about a gazette can be found on the 1st page (vol, no, type etc) and header of the 2nd 

### 2. in order to pinpoint things to tag in a doc, we look up page numbers in what  could be called an 'outline'  (it is not as far as a doc structure is concerned, e.g. we cannot use .get_outlines())

After reading the 1st/2nd page, we keep skimming through pages until we hit the 'outline'.
This is a major hack, as accessing the table of contents ("Outlines") does not work for the gazettes (they are simply text boxes)
http://www.unixuser.org/~euske/python/pdfminer/programming.html#layout

We identify the page with an outline by looking up a variation of the following:

GENERAL NOTICE / ALGEMENE KENNISGEWINGS
CONTENT/Table of contents (note: undercover can be Table of ConTenTs, so to avoid confusion, we transform capital letters)

tricky inconsistencies:
- page number in the outline might not all be a separate box (can be a vector or extension of text)
- page numbers are not aligned with the header (overlapp column labelled as 'gazette no')
- page no. might appear at the end: 'table of contents' + entries + page no header, despite visually being ok. Not sure if it's the pdf or pdfminer's fault


Idea for later:
extend the PDFPageInterpreter and PDFDevice class in order to process them differently / obtain other information. 

### 3. going to the pages scraped from the outline and extracting English text/tagging

In [513]:
def get_classification_data(objstack_1, objstack_2):
     
        class_info = []
        # go over the front page
        while objstack_1:
            lt_obj=objstack_1.pop()
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    text_obj = lt_obj.get_text().replace('\n','').lower()
                        #language, ratios = detect_language(text_obj)
                    if ('gazette' in text_obj):

                        class_info.append(text_obj)
                        
                    if ('vol' in text_obj): 

                        class_info.append(text_obj)
                    
                    if ('no.' in text_obj):  # might be repeated ot the only source of info

                        class_info.append(text_obj)
                    if ('province' in text_obj): 

                        class_info.append(text_obj)
                    
                    if ('issn' in text_obj):
                        
                        class_info.append(text_obj)
                        
        # header from the 2nd page          
        for i in range(2):
            lt_obj=objstack_2.pop()
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    text_obj = lt_obj.get_text().replace('\n','').lower()
                    
                    if ('no' in text_obj) and not ('no' in class_info):
                        class_info.append("no. " + text_obj.split('no.')[1].strip())
                        continue
                        
                    for txt in text_obj.split(','):
                        class_info.append(txt.strip())
        
        return class_info

In [383]:
def get_outline(objstack):
        text_obj=[]
        start = 0 # when changed to 1, indicates the beg of the outline
        
        outline_obj = []
        is_outline_page = 0
        
        # objects corresponding to the 'Page No' and 'Gazette No' 
        page_box = []
        gazette_box = []
        
        while objstack:
            lt_obj=objstack.pop()
            
            if isinstance(lt_obj, LTTextBox): #or isinstance(lt_obj, LTTextLine):
                        text_obj = lt_obj.get_text().replace('\n',' ').lower()
                        
                        #parse all below 'content':
                        if ('contents' in text_obj) or ('provincial notices' in text_obj)\
                           or ('page no' in text_obj) or ('gazette no' in text_obj):
                                start = 1
                                is_outline_page = 1 # ind that this is the outline page
                                
                        if (start==1): # we are below 'contents' header now
                           
                            if ('page' in text_obj):
                                page_box = lt_obj
                                
                            if ('gazette no' in text_obj):
                                gazette_box = lt_obj
                                
                            outline_obj.append(lt_obj)               
                
        return is_outline_page, outline_obj, page_box, gazette_box

In [531]:
def get_notice_pages(page_box, gazette_box, num_pages):
        
        notice_info = []
        
        # page numbers can be misaligned-> 'Gazette no.' and 'Page no' columns 
        # can appear in any order and be misaligned in headings (text shifted wrt to the heading)

        page_contents = []
       
        for box in outline_obj:
            text_obj = box.get_text().replace('\n',' ').lower()

            if (box.is_hoverlap(page_box)): 
                #https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py

                if not ('page' in text_obj) and not ('no' in text_obj)\
                    and not ('..' in text_obj): # check which ones overlap horizontally
                                                # with the page box,
                                                # discard the ones merged with text (dealt with below)
                                                # and check whether we're not parsing the issue no
                                    
                    if (len(text_obj)>1): # sometimes page no's come as a vector, 
                                          # so we need to split the elements
                        temp = text_obj.split(' ')
                        for el in temp:
                            if len(el)>0 and (int(float(el)) < num_pages):
                                page_contents.append(int(float(el)))     

                    else:   
                        if int(text_obj) < num_pages:
                            page_contents.append(int(text_obj))
                 
            if ('..' in text_obj):
                
                t = text_obj.split('.')
                
                last_el = t[-1].strip()
                if last_el != '':
                    x = int(float(last_el))

                    if (x <= num_pages):  # sometimes gazette number parses in,need to see 
                                            # if it is a valid page no

                        page_contents.append(x)
                        
                # save info about what can be found there:
            if ('section' in text_obj) or ('act' in text_obj) or ('municipality' in text_obj)\
                or ('correction' in text_obj):
                    notice_info.append(text_obj.replace("..","").strip())
           
        #unique pages
        return sorted(list(set(page_contents))), notice_info


In [385]:
def pdf_to_gazette_classification(fp):
    
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize() # optional if passwo is there: pdf_pwd

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # to use in-built pdftotxt.py:
        #device = TextConverter(rsrcmgr, outfp, laparams=LAParams()) 
        #process_pdf(rsrcmgr, device, fp) 
        #device.close() 

        text_parsed = [] # a list of strings, each representing text collected from each page of the doc

        # find the number of pages- starts from 0
        num_pages = max([i for i,page in enumerate(doc.get_pages())]) + 1

        look_for_outline = 1

        # find the page with content outline (not defined as outline in the doc, unfortnately)
        for i, page in enumerate(doc.get_pages()):
            try:
                   interpreter.process_page(page)
            except Exception as e:
                    print("parsing of the first page is not possible")
                    continue
            # receive the LTPage object for this page
            #The layout from get_result() parses the strings into separate objects. 
            #These objects have several key components:type, the coordinates 
            #(startingx, startingy, endingx, endingy), and the content, e.g.
            #<LTRect 258.000,39.720,297.000,51.000>
            #Accessing the type: e.g. type(object)==LTRect)
            
            layout = device.get_result()

            objstack=list(reversed(layout._objs))

            # get the first page info: type of gazette, vol 
            if i==0:
                objstack_1 = objstack
                continue
            
            # header from the 2nd page (any 2+ page would do)
            if i==1:
                objstack_2 = objstack
                classification_data = get_classification_data(objstack_1, objstack_2)
                continue

            if(look_for_outline):
                # find page with the outline and extract outline text and pages it points to:
                is_outline_page, outline_obj, page_box, gazette_box = get_outline(objstack)
                if (is_outline_page):
                    #print(is_outline_page, i)
                    num_outline_page=i
                    # we found the outline so move to extracting pages from there
                    look_for_outline = 0
                    pages_to_extract, notice_info = get_notice_pages(page_box, gazette_box, num_pages)
                    break
             #else:
              #  print(look_for_outline)
             #now we have to fetch the columns corresponding to the 'page no' or 'page' 
             #and extract the page numbers the main info is stored at
        device.close()
        
        return classification_data, pages_to_extract, notice_info

In [387]:
set(classification_data)

{'18 march 2016', 'government gazette', 'no. 39837', 'vol. 609'}

In [534]:
# refers to the document itself:
class Issue:
  def __init__(self, publication = '', issn = 0, num_pages = 0, volume=0, gazette_title=''):
    self.publication = publication # type of gazette
    self.identifier = issn # ISSN code
    self.page_range = num_pages # number of pages
    self.edition_id = volume # volume
    self.title = gazette_title # if extraordinary

# refers to notices where the relevant data is stored: info stored in pages 
#referred to in the outline + extra info on the type of data and its place on the web 

class Document:
  def __init__(self, page_range = [] , uri = 'http://www.gpwonline.co.za', media_type = 'text' ): 
    self.page_range = page_range
    self.url = uri
    self.media_type =  media_type # {text, jpg, ...} -> some notices are Figures   


In [432]:
classification_data

['vol. 609', 'no. 39837', 'no. 39837', 'government gazette', '18 march 2016']

Next, we put the information into structured json template containing
document identifiers of all sorts (classification of a gazette).

Field ['subjects']: the entities that the notice is primarily about.

Field ['about'] is for the content of the notice (parsed info).
Depending on the type of document, it will either be 

- empty (if notices come as jpgs and cannot be processed), 
- csv/table for Liquor 
- tagged text for ... plain text


Below we use json.dump to create a human readbale json format.
However, for large no of files with large fileds and efficient way of saving data is:

import jsonpickle
json_obj = jsonpickle.encode(classification)

In [529]:
def save_to_json(classification_data, pages_to_extract, notice_info, classification):
        """
        classification_data: data about the document (date, vol, number, title)
        pages_to_extract: where the info is
        notice_info: what the notice is about, fetched from the 'outline'
        """
        from collections import defaultdict
        classification = defaultdict(list) # dict to store data and dump into json

        classification['issue'] = Issue() # instance of Issue
        # publication = '', issn = 0, num_pages = 0, volume=0, notice_title='')

        classification['document'] = Document() # instance of Document- mainly to gather
           # pages where notices are published and their types

        for x in set(classification_data):
            if ('vol' in x):
                uid_vol = x.split('.')[1].strip()

                classification['issue'].edition_id = uid_vol

            if ('no' in x):
                uid_no = x.split('.')[1].strip()
                classification['other_attributes'].append(uid_no) 

            if ('gazette' in x):
                classification['issue'].publication = x
                uid_type = x

            if ('province' in x):
                classification['other_attributes'].append(x)    

            if ('extraordinary' in x):
                classification['issue'].title = 'extraordinary'

            if ('issn' in x):
                classification['issue'].identifier = x
            
            #date
            mult_dots = re.compile(r'(.){2,}') # 2 or more
            #no = mult_dots.search(text_obj)
            
       
            if ('december' in x):
                print('left', x)
                classification['date_published'] = x

        # add info from the outline (keywords are important)
        # shoulf be parsed: save keywords?
        classification['summary'] = notice_info

        # page_range, string /^[0-9]*(-[0-9]*)?$/
        # The pages the document within the issue where to look for info

        classification['document'].page_range =  pages_to_extract 
        classification['subjects'] = [] # entities
        classification['about'] = [] # parsed info

        classification['issue'].page_range = num_pages
        #classification['source_url'] = 

        uid = uid_no + '_' + uid_vol
        # modify id's if necessary
        classification['uid'] = uid # must be unique
        classification['identifier'] = uid  # + uid_type ? can be more descriptive possibly
        
        # see comment above this function
        return json.dumps(classification, default=lambda o: o.__dict__, sort_keys=True)#, indent=4)

In [522]:
# open the pdf file
fp = open('39569_31-12_NationalRegulation.pdf', 'rb')
classification_data, pages_to_extract, notice_info = pdf_to_gazette_classification(fp)
fp.close()

classification_json = save_to_json(classification_data, pages_to_extract, notice_info)


left 31 december 2015


In [532]:
classification_data

['regulation gazette',
 'no. 10544',
 'vol. 606',
 'no. 39569',
 'issn 1682-5843',
 'no. 39569',
 'government gazette',
 '31 december 2015']

In [533]:
classification_json

'{"date_published": "31 december 2015", "document": {"media_type": "text", "page_range": [4], "url": "http://www.gpwonline.co.za"}, "identifier": "39569_606", "issue": {"edition_id": "606", "identifier": "issn 1682-5843", "page_range": 8, "publication": "regulation gazette", "title": ""}, "other_attributes": ["10544", "39569"], "subjects": [], "summary": ["higher education and training, department of/ ho\\u00ebr onderwys en opleiding, departement van 336  skills development act (97/1998): correction notice: establish, alternatively re-establishment of sector education  and training authorities (setas)"], "uid": "39569_606"}'

In [None]:


from collections import defaultdict
classification = defaultdict(list)

classification['issue'] = Issue() # instance of Issue
# publication = '', issn = 0, num_pages = 0, volume=0, notice_title='')

classification['document'] = Document() # instance of Document- mainly to gather
   # pages where notices are published and their types

for x in set(classification_data):
    if ('vol' in x):
        uid_vol = x.split('.')[1].strip()
        
        classification['issue'].edition_id = x[1]
    
    if ('no' in x):
        uid_no = x.split('.')[1].strip()
        classification['other_attributes'].append(uid_no) 
    
    if ('gazette' in x):
        classification['issue'].publication = x
        uid_type = x
    
    if ('province' in x):
        print(x)
        classification['other_attributes'].append(x)    
    
    if ('extraordinary' in x):
        classification['issue'].title = 'extraordinary'
            
    if ('issue' in x):
        classification['issue'].identifier = x
        
    #date
    mult_dots = re.compile(r'(.){2,}') # 2 or more
    #no = mult_dots.search(text_obj)
    
        classification['date_published'] = x
                
# add info from the outline (keywords are important)
classification['summary'] = notice_info

# page_range, string /^[0-9]*(-[0-9]*)?$/
# The pages the document within the issue where to look for info

classification['document'].pages_range =  pages_to_extract 
classification['subjects'] = [] # parsed notice info


classification['issue'].num_pages = num_pages
#classification['source_url'] = 

uid = '.'
# modify id's if necessary
classification['uid'] = uid # must be unique
classification['identifier'] = uid  # + uid_type ? can be more descriptive possibly

In [535]:
x

'18 march 2016'

In [536]:
import re
match=re.search(r'(\d+/*/\d+)','The  11/12/98')

- detect whether it's a single or double column text
- Extract just left column

### processing images:

LTImage type: contains bits, colorspace, height,imagemask,name,srcsize,stream, and width.
Note: PDFMiner seems work well only with jpegs ( apparently xpdf works with all images)

In [None]:
if type(b) == LTImage:
     imgbits=b.bits



In [None]:
from elasticsearch import Elasticsearch
es = Elasticsearch()

#result = es.suggest(index="gazettes", body={"my_suggestion": {"text": notice, "term": {"field":"content" }}})