In [None]:
######################################### IMPORTING PACKAGES #############################
from scipy import spatial
import pandas as pd
import os
import json
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")


import sys  
import os
from dateutil.parser import parse


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Others
import requests
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

import nltk
from nltk.stem.snowball import SnowballStemmer
# NLTK resources used below (stop-word list) and by the optional nltk sentence tokenizer.
nltk.download('punkt')
nltk.download('stopwords')

import spacy
# Load the small English pipeline without the NER component.
# The model is downloaded only when it is missing, so a normal
# "Restart & Run All" does not hit the network on every run.
try:
    nlp = spacy.load("en_core_web_sm", disable=['ner'])
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm", disable=['ner'])


# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Merge the NLTK list with scikit-learn's English stop words (result is a frozenset).
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)

# Folder that holds the PDF reports.
DATA_FOLDER = "dataset/"

In [None]:
def extract_pdf(file_path):
    """
    Extract the text of a PDF file page by page.

    Parameters
    ----------
    file_path : str
        Location of the PDF file to read.
    Return
    ------
    text : str
        Text of all pages joined with the marker '##PAGE_BREAK##'.
        If any error is raised while reading, the error is printed and an
        empty string is returned instead.
    """
    # Bound before the try block so the cleanup in `finally` never touches an
    # undefined name when the failure happens before these objects exist.
    converter = None
    fake_file_handle = None

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        password = ""
        maxpages = 0
        pagenos = set()  # presumably "no page restriction" -- confirm against pdfminer docs

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=True,
                                          check_extractable=False):

                page_interpreter.process_page(page)

                # the converter wrote this page into the buffer: keep it ...
                content.append(fake_file_handle.getvalue())

                # ... and reset the buffer so the next page starts empty
                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)

        return '##PAGE_BREAK##'.join(content)

    except Exception as e:
        # best effort: report the problem and signal failure with an empty string
        print(e)
        return ""

    finally:
        # close open handles (only those that were actually created)
        if converter is not None:
            converter.close()
        if fake_file_handle is not None:
            fake_file_handle.close()

In [None]:
# Demo: extract the raw text of the BMW report; the returned string is shown as the cell output.
extract_pdf('dataset/BMW Sustainability Report 2019.pdf')

In [None]:
# Demo: same extraction on Test.pdf, looked up relative to the working directory.
extract_pdf('Test.pdf')

In [None]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Helper Function to preprocess and clean raw PDF text.

    The clean-up steps are applied in this order:
      1. drop a leading number (header / page number),
      2. strip leading and trailing whitespace,
      3. remove whitespace around hyphens,
      4. remove the whitespace character in front of , : ; .
      5. replace runs of five or more digits with a space,
      6. remove every whitespace-free token that contains '@',
      7. replace URL-like tokens with a space,
      8. collapse every run of whitespace (spaces, newlines, form feeds, ...)
         into a single space.

    Parameters
    ----------
    line_input : str
        Text to be cleaned
    Return
    ------
    line : str
        Cleaned text; it contains no newline or form feed characters.
    """
    # removing header number; '.' does not match a newline, so this only
    # fires when everything after the number sits on a single line
    line = re.sub(r'^\s?\d+(.*)$', r'\1', line_input)
    # removing leading and trailing spaces
    line = line.strip()

    substitutions = (
        # words may be split between lines, ensure we link them back together
        (r'\s?-\s?', '-'),
        # remove space prior to punctuation
        (r'\s?([,:;\.])', r'\1'),
        # ESG contains a lot of figures that are not relevant to grammatical structure
        (r'\d{5,}', r' '),
        # remove emails
        (r'\S*@\S*\s?', ''),
        # remove mentions of URLs
        (r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' '),
        # collapse whitespace; this also joins consecutive lines with a single
        # space and removes form feeds, so no newline handling is needed afterwards
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        line = re.sub(pattern, replacement, line)

    return line

In [None]:
# Get list of URLs from text
# To be written later

In [None]:
# Demo: clean a paragraph whose sentences are broken across several lines.
preprocess_lines("To \nachieve this, we set ourselves ten ambitious goals along the \nentire value chain.\nThe BMW Group Sustainable Value Report (SVR) has been \npublished to provide stakeholders with comprehensive \ninformation about the company’s sustainability strategy \nand the progress made in integrating sustainability into \nits corporate processes.")

In [None]:
# Demo: clean raw page text that contains a form feed, a ##PAGE_BREAK## marker and table-of-contents headings.
preprocess_lines("\n\n\x0c##PAGE_BREAK##14\n\nIntroduction\n\n1\n\nFundamentals\n\n·\n\n1.1  Strategy\n1.2  Sustainability management\n1.3  Stakeholder dialogue\n1.4  Compliance and human rights\n1.5  Product safety\n\n2\nProducts and services\n\n3\nProduction and \nvalue creation\n\n4\nEmployees and society\n\nAppendix\n\nBMW Group’s position on the recommendations of the \n Task Force on Climate-related Financial Disclosures\nClimate change is one of the greatest social challenges of \nour time")

In [None]:
def remove_non_ascii(text):
    """
    Helper Function to drop every character that is not in string.printable.

    string.printable holds the ASCII digits, letters, punctuation and
    whitespace, so non-ASCII characters and ASCII control characters
    (other than whitespace) are removed; the order of the kept characters
    is unchanged.
    """
    allowed = frozenset(string.printable)  # constant-time membership test
    kept = [char for char in text if char in allowed]
    return ''.join(kept)

def not_header(line):
    """
    Helper Function to tell headers apart from normal text.

    A line counts as a header when str.isupper() is true for it, i.e. it has
    at least one cased character and all cased characters are upper case.
    Returns False for such a line and True for anything else (including an
    empty string or a string made of digits only).
    """
    if line.isupper():
        return False
    return True

In [None]:
# Demo: every character of this input is printable ASCII.
remove_non_ascii("#@#$Jjsjnfsjnf#@$")

In [None]:
# Demo: an all-caps string is treated as a header.
not_header('HELLO')

In [None]:
def extract_pages_sentences(nlp, text):
    """
    Turn raw PDF text into cleaned page texts and per-page sentence lists.

    The raw text is split into pages on '##PAGE_BREAK##'. On every page,
    characters outside string.printable are removed and blocks separated by
    a blank line are merged into paragraphs. Each page is then cleaned with
    preprocess_lines and split into sentences with spacy.

    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text
    Return
    ------
    pages_content : list of str
        A list containing the cleaned text of each page of the PDF report. Page number is the index of list + 1

    pages_sentences : list of list
        A list containing lists. Page number is the index of outer list + 1. Inner list contains the sentences
        of the page that start with an upper case letter A-Z, end with '.', '?' or '!' and contain none of
        these three characters before the end.

    """
    # NOTE(review): page boundaries are carried through the paragraph list with
    # this marker; a page whose own text contains it would be split as well.
    page_marker = '##SAME_PAGE##'

    paragraphs = []
    for raw_page in text.split('##PAGE_BREAK##'):
        # remove non ASCII characters
        page_text = remove_non_ascii(raw_page)

        current = ""
        for block in page_text.split('\n\n'):
            # a block continues the current paragraph when it starts with a
            # space or when the paragraph so far does not end with a dot
            continues = block.startswith(' ') or not current.endswith('.')
            if continues:
                current = current + ' ' + block
            else:
                # new paragraph
                paragraphs.append(current)
                current = block

        # flush the last paragraph of the page, then mark the page boundary
        paragraphs.append(current)
        paragraphs.append(page_marker)

    # one string per page; the piece after the final marker is always empty and is dropped
    page_texts = '  '.join(paragraphs).split(page_marker)[:-1]

    pages_content = []
    pages_sentences = []

    for page_text in page_texts:
        # best effort clean up of extra space, unwanted characters, urls, etc.
        cleaned = preprocess_lines(page_text)
        pages_content.append(str(cleaned).strip())

        # split the page into sentences using spacy
        candidates = [str(span).strip() for span in nlp(cleaned).sents]

        # only keep full sentences (capitalised start, single terminating
        # punctuation mark); this drops headings and table-of-contents entries
        full_sentences = [
            candidate.replace('\n', ' ')
            for candidate in candidates
            if re.match('^[A-Z][^?!.]*[?.!]$', candidate) is not None
        ]

        pages_sentences.append(full_sentences)

    return pages_content, pages_sentences  # list, list of list where page is index of outer list

In [1]:
# End-to-end run on Test.pdf: extract the raw text, then build cleaned pages and per-page sentences.
# Note: a fresh spaCy pipeline is loaded here instead of reusing the module-level `nlp`.
pages_content, pages_sentences = extract_pages_sentences(spacy.load("en_core_web_sm"),extract_pdf("Test.pdf"))
pages_content

NameError: name 'extract_pages_sentences' is not defined

In [None]:
def preprocessing(report):
    """
    Lowercase, lemmatize, remove stopwords and stem the pages of a report.

    Every page is lower-cased and split into sentences with spacy. For each
    sentence the lemmas of the allowed part-of-speech tags (noun, adjective,
    verb, adverb) are kept, stop words are removed and the remaining words are
    stemmed. The processed words of a page are joined with single spaces.

    Parameters
    ----------
    report: list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    Return
    ------
    report_pages : list of str
        A list containing processed text from each page of the PDF report. Page number is the index of list + 1

    """

    def para_to_sent(para):
        """
        Helper function to split paragraphs into well defined sentences using spacy
        """
        return [str(part).strip() for part in nlp(para).sents]

    def lemmatization(sentence, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
        """
        Helper function to lemmatize a sentence, keeping only tokens whose
        part-of-speech tag is in allowed_postags (https://spacy.io/api/annotation)
        """
        return " ".join(token.lemma_ for token in nlp(sentence) if token.pos_ in allowed_postags)

    def remove_stopwords(sentence):
        """
        Helper function to tokenize a sentence and remove stopwords from it
        """
        return [word for word in simple_preprocess(str(sentence)) if word not in stop_words]

    def stemming(words):
        """
        Helper function to stem a list of words; the stemmer works on single
        words, so it must not be fed whole sentences
        """
        stemmer = SnowballStemmer(language='english')
        return [stemmer.stem(word) for word in words]

    report_pages = []

    for page in report:
        page_words = []
        for sentence in para_to_sent(page.lower()):
            # each step consumes the output of the previous one
            lemmatized = lemmatization(sentence)
            content_words = remove_stopwords(lemmatized)
            page_words.extend(stemming(content_words))

        # join with a space so neighbouring words are not fused together
        report_pages.append(" ".join(page_words))

    return report_pages

In [None]:
# Demo: run the page-level preprocessing on the cleaned page texts produced above.
preprocessing(pages_content)

In [None]:
def lemmatization(text_list, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Lower-case and lemmatize every text of an iterable with spacy.

    Each text is lower-cased, parsed with the module-level `nlp` pipeline and
    rebuilt from the lemmas of all its tokens, joined with single spaces.
    `allowed_postags` is accepted but not used: no token is filtered out.
    See https://spacy.io/api/annotation

    Returns a list with one lemmatized string per input text.
    """
    def _lemmatize_one(raw_text):
        doc = nlp(raw_text.lower())
        return " ".join(token.lemma_ for token in doc)

    return [_lemmatize_one(raw_text) for raw_text in text_list]

In [None]:
def filter_report_highLevel(report):
    """
    Page filter to filter report for only relevant pages with decarbonisation related words.
    Two types of word filters: direct and indirect. Direct contains words that are directly related to decarbonisation while indirect contains other relevant decarbonisation information.

    All term lists are lower-cased and lemmatized with `lemmatization` and then de-duplicated.
    A term counts as present when it occurs as a substring of the lower-cased page text, so
    matching is case-insensitive and a term also matches inside longer words.

    Parameters
    ----------
    report: list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    Return
    ------
    filtered_report_direct : dict of {int : str}
        A dictionary that contains relevant pages obtained using direct filter. The key is the page number and value is the (unmodified) text on the page.

    filtered_report_indirect : dict of {int : str}
        A dictionary that contains relevant pages obtained using indirect filter. The key is the page number and value is the (unmodified) text on the page.
        A page that passes the direct filter is never added here.
    """

    # list of words used to filter
    relevant_terms_directFilter = set(["carbon","co2","environment","GHG emissions","Greenhouse Gas","carbon footprint","carbon emissions","Scope 1","Scope 2",
                               "Scope 3", "WACI","Carbon Intensity","carbon pricing","net-zero","metrics and targets","TCFD",
                                "sustainability goals","decarbonisation","climate",'energy', 'emission', 'emissions', 'renewable', 'carbon', 'fuel', 'power', 
                               'green', 'gas', 'green energy', 'sustainable', 'climate', 'sustainability', 'environmental', 'environment', 'GHG', 
                               'decarbon', 'energy consumption', 'paper consumption','water consumption', 'carbon intensity', 'waste management', 'electricity consumption', 
                                'cdp', 'global warming', 'business travel','climate solutions', 'decarbonization', 'cvar', 'climate value-at-risk','waste output'])
    relevant_terms_combinationA = ["emissions","exposure","carbon related","esg","sustainable","green","climate sensitive","impact investing", "investment framework", 'msci', 'ftse', 'responsible investing', 'responsible investment','transition']
    relevant_terms_combinationB = ["portfolio","assets","AUM","investment","financing","ratings","revenue","bond","goal","insurance", "equity", "swap", "option", "portfolio holdings", "risk management",'financial products']
    relevant_terms_combinationC = ["net zero","carbon footprint","CO2","carbon","oil","coal", "gas", "fossil fuel","green"]

    # Lemmatization lower-cases the terms and can map several of them to the same
    # string (e.g. "emission" / "emissions"); the set() keeps each term once so a
    # single word on the page is not counted twice against the threshold below.
    relevant_terms_combination_directFilter_lem = set(lemmatization(relevant_terms_directFilter))
    relevant_terms_combinationA_lem = set(lemmatization(relevant_terms_combinationA))
    relevant_terms_combinationB_lem = set(lemmatization(relevant_terms_combinationB))
    relevant_terms_combinationC_lem = set(lemmatization(relevant_terms_combinationC))

    filtered_report_direct = {}
    filtered_report_indirect = {}
    for page_number, page in enumerate(report, start=1):
        # the terms are lower case, so the page must be compared in lower case too;
        # otherwise acronyms such as "CO2", "TCFD" or "GHG" could never match
        page_lower = page.lower()

        def contains_any(terms):
            return any(term in page_lower for term in terms)

        direct_hits = sum(term in page_lower for term in relevant_terms_combination_directFilter_lem)

        # filter for pages that contain at least 3 distinct terms from the direct filter list
        if direct_hits > 2:
            filtered_report_direct[page_number] = page

        # otherwise keep pages that contain at least 1 term of combination C together
        # with at least 1 term of combination A or of combination B
        elif contains_any(relevant_terms_combinationC_lem) and (
                contains_any(relevant_terms_combinationA_lem) or contains_any(relevant_terms_combinationB_lem)):
            filtered_report_indirect[page_number] = page

    return filtered_report_direct,filtered_report_indirect

In [59]:
# Demo: filter the cleaned pages and display the pages selected by the direct filter.
filtered_report_direct, filtered_report_indirect = filter_report_highLevel(pages_content)
filtered_report_direct


{3: 'Appendix Our reporting concept TCFD Index Fuel consumption and CO2 emissions ratings Additional information on delivery figures Independent Practitioners Limited Assurance Report Imprint 133 136 138 139 140 142 3 see page 37 see page 64 see page 121 see page 46 see page 87 CONTENTS Introduction Preface An overview of the BMW Group Key sustainability indicators Transformation of the BMW Group 1 Fundamentals 1.1 Strategy 1.2 Sustainability management 1.3 Stakeholder dialogue 1.4 Compliance and human rights 1.5 Product safety 2 Products and services 2.1 Emissions of CO2 and pollutants 2.2 Electromobility 2.3 Mobility patterns 3 Production and value creation 3.1 Consumption of resources 3.2 Renewable energy 3.3 Supplier network 4 Employees and society 4.1 Health and performance 4.2 Long-term employee development 4.3 Diversity 4.4 Corporate citizenship 4 6 7 8 10 18 20 26 32 38 47 55 65 84 88 101 110 122 129',
 4: '4 Introduction Preface An overview of the BMW Group Key sustainability 