In [3]:
import os
import re
import sys
from io import StringIO

import pdfquery
import spacy
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from spacy import displacy
from spacy.pipeline import EntityRuler

In [4]:
def pdf2text(path, number=None):
    """
    Given a PDF and a possible page number, extract the text.

    path: filesystem path of the PDF file to read.
    number: zero-based index of the single page to extract; None (the
        default) extracts every page.

    Returns the extracted text as one string. The string is empty when
    `number` does not correspond to any page of the document.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # an empty page set makes get_pages yield all pages
            allpages = PDFPage.get_pages(fp, set(), maxpages=0, password='', caching=True, check_extractable=False)
            for index, page in enumerate(allpages):
                # compare with None explicitly: 0 is a valid page index
                if number is None:
                    interpreter.process_page(page)
                elif index == number:
                    interpreter.process_page(page)
                    # the requested page was found, no need to parse the rest
                    break
        # read the buffer before the finally clause closes it
        return retstr.getvalue()
    finally:
        # release the converter and the buffer even if extraction failed
        device.close()
        retstr.close()

In [5]:
def pdf2textCached(filename):
    """
    Return the text of a PDF, extracting it only once.

    The extracted text is stored next to the PDF as `<filename>.txt`. When
    that file already exists it is read back instead of calling `pdf2text`
    again: this speeds up program execution.

    filename: path of the PDF file.

    Returns the cached text exactly as it was written, without inserting
    additional newlines.
    """
    textfilename = filename + '.txt'
    if not os.path.isfile(textfilename):
        text = pdf2text(filename)
        # NOTE: the cache is written as UTF-8; a cache file created earlier
        # under a different locale encoding should be deleted and rebuilt.
        with open(textfilename, 'w', encoding='utf-8') as fd:
            fd.write(text)
    # Always read the cache back so the first call and later calls return
    # the same string.
    with open(textfilename, 'r', encoding='utf-8') as fd:
        return fd.read()

In [6]:
def sentences(doc, text, what, startwith=False):
    """
    Given a document with named entities, extract the sentence
    belonging to the named entity.

    doc: parsed document exposing `ents`; every entity must provide
        `label_`, `text` and `sent`.
    text: unused; kept so that existing calls keep working.
    what: entity label to select (e.g. 'AUDIT_NUMBER').
    startwith: when true, drop whatever precedes the entity text inside
        its sentence so that the result starts with the entity.

    Returns a list of strings, one per matching entity, in document order.
    A sentence holding several matching entities appears once per entity.
    """
    result = []
    for ent in doc.ents:
        if ent.label_ != what:
            continue
        # `text_with_ws` replaces the deprecated `Span.string` attribute
        sentence = ent.sent.text_with_ws
        # make sure we start with the given text
        if startwith and not sentence.startswith(ent.text):
            sentence = sentence[sentence.index(ent.text):]
        # trim surrounding whitespace and remove newlines
        result.append(sentence.strip().replace('\n', ''))
    return result

In [7]:
def expand_audit_numbers(doc):
    """
    Label every audit number (2xxx-yyy) in the document as AUDIT_NUMBER.

    We cannot use a conventional pipe here, because spacy sometimes
    parses 2xxx-yyy as a single entity (DATE or CARDINAL). Here, we
    explicitly match against a regexp and create custom spans; as
    such, this function _must_ be the first pipe executed, otherwise
    we will get overlapping entities for the same token.

    doc: the document to annotate; its `ents` are replaced by the
        audit-number spans that were found.

    Returns the same document object.
    """
    new_ents = []
    # raw string: '\d' is not a valid escape in a regular string literal
    for match in re.finditer(r'2\d{3}-\d{3}', doc.text):
        start, end = match.span()
        span = doc.char_span(start, end, label='AUDIT_NUMBER')
        # char_span gave no span for this character range, so skip the match
        if span is not None:
            new_ents.append(span)
    doc.ents = new_ents
    return doc

In [8]:
def nlp_results(doc):
    """
    List the entities of a document as (stripped text, label) pairs,
    in the order in which `doc.ents` yields them.
    """
    pairs = []
    for entity in doc.ents:
        cleaned = entity.text.strip()
        pairs.append((cleaned, entity.label_))
    return pairs

In [9]:
def audit_numbers(doc):
    """
    Collect the distinct texts of all entities labelled AUDIT_NUMBER.

    Returns a set, so repeated audit numbers are reported once.
    """
    found = set()
    for entity in doc.ents:
        if entity.label_ == 'AUDIT_NUMBER':
            found.add(entity.text)
    return found

In [10]:
def split_pattern(string):
    """
    Turn a phrase into a token pattern: one {'LOWER': word} dict per word.

    The phrase is lower-cased, because a LOWER pattern is compared with the
    lower-cased token text, and split on any run of whitespace, so repeated,
    leading or trailing spaces never produce an empty-string entry.

    string: the phrase to match, words separated by whitespace.

    Returns a list of dicts; the list is empty for a blank phrase.
    """
    return [{'LOWER': word} for word in string.lower().split()]

In [11]:
# Entity rules later handed to the EntityRuler, declared as
# (label, token pattern) pairs and expanded into the dict form it expects.
patterns = [
    {'label': label, 'pattern': tokens}
    for label, tokens in (
        # primary criteria
        ('CORRECTIVE_ACTION', split_pattern('corrective action plan')),
        ('CORRECTIVE_ACTION', split_pattern('corrective action')),
        # secondary criteria: used to identify where the audit is
        ('CONDITION', split_pattern('observation')),
        ('CONDITION', split_pattern('condition')),
        ('CRITERIA', split_pattern('criteria')),
        ('CRITERIA', split_pattern('criteria or specific requirement')),
        ('CONTEXT', split_pattern('context')),
        ('CAUSE', split_pattern('cause')),
        ('CAUSE', split_pattern('cause of the condition')),
        ('EFFECT', split_pattern('effect')),
        ('EFFECT', split_pattern('effect or possible effect')),
        # singular or plural, matched with a regular expression
        ('RECOMMENDATION', [{'LOWER': {'REGEX': 'recommendations?'}}]),
        ('RESPONSE', split_pattern('response')),
    )
]

In [19]:
# a sample of different headers that start audit findings
headers = [
    'federal award findings and questioned costs',
    'financial statement findings',
    'findings and questioned costs – major federal award programs audit',
    'findings – financial statement audit',
    'findings related to the financial statements',
    'major federal award findings and questioned costs',
    'schedule of findings and questioned costs',
    'summary schedule of prior audit findings',
]

# Drop HEADER rules left over from an earlier run of this cell, so that
# re-running it does not append a second copy of every header pattern.
patterns[:] = [rule for rule in patterns if rule['label'] != 'HEADER']

for header in headers:
    pattern = {'label': 'HEADER', 'pattern': split_pattern(header)}
    patterns.append(pattern)

In [20]:
# Load the English model and assemble the processing pipeline used below.
# NOTE(review): create_pipe() and add_pipe() with component objects look like
# the spaCy v2 API — confirm the installed spaCy version before upgrading.
nlp = spacy.load('en_core_web_sm') # or 'en'
# Rule-based entity matcher fed with the label/pattern rules in `patterns`.
ruler = EntityRuler(nlp, overwrite_ents=True)
sentencizer = nlp.create_pipe('sentencizer')
ruler.add_patterns(patterns)
# Both of the next two components are added with first=True.
# expand_audit_numbers is added last because its docstring requires it to be
# the first pipe executed.
nlp.add_pipe(sentencizer, first=True)
nlp.add_pipe(expand_audit_numbers, first=True)
# Added without a position argument — presumably appended after the model's
# own components; verify with nlp.pipe_names.
nlp.add_pipe(ruler)

In [21]:
# Index of the single page to extract (passed to pdf2text as `number`),
# or None to process the whole PDF through the text cache.
pagenumber = None
filename = "20/13002220181.pdf"
# Compare with None explicitly: a page index of 0 must not be mistaken for
# "no page selected".
sample = pdf2textCached(filename) if pagenumber is None else pdf2text(filename, number=pagenumber)
doc = nlp(sample)

In [22]:
# Distinct AUDIT_NUMBER entity texts found in the parsed document.
# `audits` is a set, so the order in which the numbers print is arbitrary.
audits = audit_numbers(doc)
print('found the following audit numbers:', audits)

found the following audit numbers: {'2018-001', '2017-002', '2017-003', '2017-001', '2018-002', '2018-003'}


In [27]:
# Show the sentence around every audit number. There is one entry per entity,
# so a sentence that mentions two audit numbers is listed twice.
print(sentences(doc, sample, 'AUDIT_NUMBER'))

['We  consider  the  following  deficiencies  2018-001, described  in  the  accompanying  Schedule  of  Findings  and  Questioned  Costs  to  be  a  material weaknesses.', 'The results of our tests disclosed instances of noncompliance or other matters  that  are  required  to  be  reported  under  Government  Auditing  Standards  and  which  are described in the accompanying schedule of findings and questioned costs as items 2018-002 and 2018-003.', 'The results of our tests disclosed instances of noncompliance or other matters  that  are  required  to  be  reported  under  Government  Auditing  Standards  and  which  are described in the accompanying schedule of findings and questioned costs as items 2018-002 and 2018-003.', 'However  our  audit  does  not  provide  a  legal  determination  of  the  Tribe’s compliance  34 \x0cCouncil Members Native Village of Kwinhagak Basis for Qualified Opinion As described in items 2018-003 in the accompanying schedule of findings and questioned co

In [25]:
# Highlight the recognised entities inline in the document text.
# `displacy` is imported with the other dependencies at the top of the notebook.
# The former 'fine_grained' option was dropped: it belongs to the dependency
# visualizer and has no effect on style="ent".
displacy.render(doc, style="ent")

In [18]:
# Debugging aid: list the tokenizer's explanation pairs for every token whose
# text starts with 'Finding' (first item: what produced the token, second
# item: the token text).
[
    (rule, token_text)
    for rule, token_text in nlp.tokenizer.explain(sample)
    if token_text.startswith('Finding')
]

[('TOKEN', 'Findings'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Finding'),
 ('TOKEN', 'Finding'),
 ('TOKEN', 'Findings'),
 ('TOKEN', 'Finding'),
 ('TOKEN', 'Finding'),
 ('TOKEN', 'Finding'),
 ('TOKEN', 'Finding'),
 ('TOKEN', 'Finding')]