In [1]:
import os
import re
import io
import string

## Remove warnings
import warnings
warnings.filterwarnings('ignore')

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

import en_core_web_sm

In [2]:
def extract_pdf(file):
    '''
    Pdf extraction function to extract pdf page-by-page

    Parameters
    -----------
    file: str
        The file path to which where the pdf file is located.

    Returns
    -------
    text: str
        A condensed string of text containing all the words extracted from the pdf.
    '''
    ## Load functions for pdf processing.
    resource_manager = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()

    text_converter = TextConverter(resource_manager, retstr, codec=codec, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, text_converter)

    data = []

    with open(file, 'rb') as f:
        for page in PDFPage.get_pages(f, caching=True, check_extractable=False):

            page_interpreter.process_page(page)
            data.append(retstr.getvalue())

            retstr.truncate(0)
            retstr.seek(0)

    text = '##END_OF_PAGE##'.join(data)

    return text

In [77]:
text = extract_pdf('test_dataset/documents/ubm_esg_report_2021.pdf')

The PDF <_io.BufferedReader name='test_dataset/documents/ubm_esg_report_2021.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


In [78]:
print(text)

Cover

ESG Report 2021##END_OF_PAGE##Key performance indicators

Key earnings ﬁgures (in €m)

Total Output 1

Revenue 

Earnings before taxes 

Net proﬁt

Key asset and ﬁnancial ﬁgures (in €m)

Total assets

Equity 

Equity ratio

Net debt 2

Cash and cash equivalents

Key share data and staff

2021

471.0

278.3

60.1

43.7

Change

-1.6%

51.8%

-3.4%

7.3%

2020

478.6

183.3

62.3

40.8

2019

678.0

242.0

70.5

50.1

31.12.2021

Change

31.12.2020

31.12.2019

1,494.5

550.6

36.8%

381.0

423.3

8.9%

14.0%

1.65 PP

-20.5%

71.2%

1,372.0

1,316.4

482.9

35.2%

479.1

247.2

462.5

35.1%

442.4

212.4

Earnings per share (in €) 3

Earnings per share incl. hybrid capital interest 4

Share price (in €) 

Market capitalisation (in €m) 

Dividend per share (in €) 5

Payout ratio % 6

Staff 

31.12.2021

Change

31.12.2020

31.12.2019

4.50

5.65

43.30

323.5

2.25

50.0%

355

2.6%

6.0%

20.9%

20.9%

2.3%

-0.2 PP

4.7%

4.39

5.33

35.80

267.5

2.20

50.2%

339

6.16

7.10



In [79]:
text



In [80]:
page = text.split('##END_OF_PAGE##')

In [81]:
res = []
for p in page:
    p = re.sub(r'[^\x00-\x7F]+','', p)
    prev_line = ""
    for line in p.split('\n\n'):
        if (line.startswith(' ') or not prev_line.endswith('.')):
                prev_line = prev_line + ' ' + line
        else: ## If condition is not met, we start a new index.
                res.append(prev_line)
                prev_line = line

        ## Ensures that the last line is stored into the array.
        res.append(prev_line)
        res.append('##END_OF_SENTENCE##')

final_sentences = ' '.join(res).split('##END_OF_SENTENCE##')

In [82]:
final_sentences

[' Cover ',
 '  Cover ESG Report 2021\x0c ',
 '  Key performance indicators ',
 '  Key performance indicators Key earnings gures (in m) ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 Revenue  ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 Revenue  Earnings before taxes  ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 Revenue  Earnings before taxes  Net prot ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 Revenue  Earnings before taxes  Net prot Key asset and nancial gures (in m) ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 Revenue  Earnings before taxes  Net prot Key asset and nancial gures (in m) Total assets ',
 '  Key performance indicators Key earnings gures (in m) Total Output 1 Revenue  Earnings before taxes  Net prot Key asset and nancial gures (in m) Total assets Equ