In [4]:
######################################### IMPORTING PACAKGES #############################
# Basic ML Packages
from scipy import spatial
import pandas as pd
import math
import os
import json
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Others
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

# Text pre-processing (Tokenization, Stemming, Lemmatization)
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Pdf Extraction Model
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=['ner'])

#Gensim stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

# Train Test Split
from sklearn.model_selection import train_test_split

# Tf-Idf Vectorization
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

DATA_FOLDER = "dataset/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
def extract_pdf(file_path):
    """
    Process raw PDF text to structured and processed PDF text to be worked on in Python.
    Parameters
    ----------
    file_path : Relative Location of File
    Return
    ------
    text : str
        processed PDF text if no error is throw
    """   

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec=codec, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                        pagenos, 
                                        maxpages=maxpages,
                                        password=password,
                                        caching=True,
                                        check_extractable=False):

                page_interpreter.process_page(page)

                content.append(fake_file_handle.getvalue())

                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)        

        text = '##PAGE_BREAK##'.join(content)

        # close open handles
        converter.close()
        fake_file_handle.close()
        
        return text

    except Exception as e:
        print(e)

        # close open handles
        converter.close()
        fake_file_handle.close()

        return ""

In [6]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Helper Function to preprocess and clean sentences from raw PDF text 
    Parameters
    ----------
    line_input : str
        String that contains a sentence to be cleaned
    Return
    ------
    line : str
        Cleaned sentence
    ----------
    Sub: Substitute regular expression
    Split: Remove blank space from front and rear 
    """  
    # removing header number
    line = re.sub(r'^\s?\d+(.*)$', r'\1', line_input)
    # removing trailing spaces
    line = line.strip()
    # words may be split between lines, ensure we link them back together
    line = re.sub(r'\s?-\s?', '-', line)
    # remove space prior to punctuation
    line = re.sub(r'\s?([,:;\.])', r'\1', line)
    # ESG contains a lot of figures that are not relevant to grammatical structure
    line = re.sub(r'\d{5,}', r' ', line)
    # remove emails
    line = re.sub(r'\S*@\S*\s?', '', line)
    # remove mentions of URLs
    line = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', line)
    # remove multiple spaces
    line = re.sub(r'\s+', ' ', line)
    # join next line with space
    line = re.sub(r' \n', ' ', line)
    line = re.sub(r'.\n', '. ', line)
    line = re.sub(r'\x0c', ' ', line)
    
    return line

In [7]:
def remove_non_ascii(text):
    """
    Helper Function to remove non ascii characters from text
    Printable will 
    """
    printable = set(string.printable) #Convert iterable to set
    return ''.join(filter(lambda x: x in printable, text))

def not_header(line):
    """
    Helper Function to remove headers
    Check if all the characters are in upper case
    """
    return not line.isupper()

In [8]:
def extract_pages_sentences(nlp, text):    
    """
    Extracting text from raw PDF text and store them by pages and senteces. Raw text is also cleand by removing junk, URLs, etc.
    Consecutive lines are also grouped into paragraphs and spacy is used to parse sentences.
    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text
    Return
    ------
    pages_content : list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    
    pages_sentences : list of list
        A list containing lists. Page number is the index of outer list + 1. Inner list contains sentences from each page
 
    """  
    
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
        
        # if len(text.split(' ')) < MIN_WORDS_PER_PAGE:
        #     print(f'Skipped Page: {page_number}')
        #     continue
        
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []
    all_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return pages_content, pages_sentences, all_sentences #list, list of list where page is index of outer list, list of sentences

In [9]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
# spacy_model = spacy.load("en_core_web_sm")
# list_dataset = os.listdir(DATA_FOLDER)
# corpus = []
# for file in list_dataset:
#     pages_content, pages_sentences, all_sentences = extract_pages_sentences(spacy_model,extract_pdf("Test.pdf"))
#     corpus.extend(all_sentences)
# np.shape(corpus)

In [10]:
# Store corpus in text file
# with open("corpus.txt", "w") as fp:
#     json.dump(corpus, fp)

In [11]:
corpus_data = open('corpus.txt')
corpus = json.load(corpus_data)
corpus

['To achieve this, we set ourselves ten ambitious goals along the entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) has been published to provide stakeholders with comprehensive information about the companys sustainability strategy and the progress made in integrating sustainability into its corporate processes.',
 'The Sustainable Value Report is published at the same time as the Annual Report on the date of the Annual Accounts Press Conference.',
 'The requirements of the German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report at company and Group level.',
 'This will be published jointly as an integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) within this Sustainable Value Report for BMW AG and BMW Group.',
 'In the SVR 2019 we focused on providing information that is required in order to comply with the German CSR Direc-tive Implement

In [12]:
from tabulate import tabulate

def stemming(corpus):
    stemmer = SnowballStemmer(language='english')
    revisions = [stemmer.stem(line) for line in corpus]
    return revisions

In [13]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

def lemmatization(corpus):
    lemmatizer = WordNetLemmatizer()
    revisions = [lemmatizer.lemmatize(line) for line in corpus]
    return revisions
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
tokenizer = TreebankWordTokenizer()
tokenize_output = tokenizer.tokenize(corpus[15])
stemming(tokenize_output)

['sustain', 'is', 'not', 'just', 'a', 'trend', 'for', 'us', '.']

In [15]:
lemmatization(stemming(tokenize_output))

['sustain', 'is', 'not', 'just', 'a', 'trend', 'for', 'u', '.']

In [16]:
def remove_stop_words(corpus):
    revisions = [remove_stopwords(line) for line in corpus]
    return revisions 

In [17]:
stopwords

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [18]:
remove_stop_words(corpus)

['To achieve this, set ambitious goals entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) published provide stakeholders comprehensive information companys sustainability strategy progress integrating sustainability corporate processes.',
 'The Sustainable Value Report published time Annual Report date Annual Accounts Press Conference.',
 'The requirements German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report company Group level.',
 'This published jointly integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) Sustainable Value Report BMW AG BMW Group.',
 'In SVR 2019 focused providing information required order comply German CSR Direc-tive Implementation Act (CSR RUG) Global Report-ing Initiative (GRI).',
 'We added detailed informa-tion topics strategic relevance BMW Group.',
 'Current examples measures support implementing sustainability t

In [19]:
def pre_processing(corpus):
    return lemmatization(stemming(remove_stop_words(corpus)))

In [47]:
def label_relevancy(corpus, related_words):
    related_words = pre_processing(related_words)
    corpus = pre_processing(corpus)
    related_sentences = [line for line in corpus if any(word in line for word in related_words)]
    unrelated_sentences = [line for line in corpus if (line not in related_sentences)]
    all_sentences = related_sentences + unrelated_sentences
    all_data = pd.DataFrame(all_sentences, columns=['corpus'])
    all_data['have_transition_plan'] = all_data['corpus'].isin(related_sentences)
    return related_sentences, unrelated_sentences, all_data

In [102]:
related_words = ['transition']
related_sentences, unrelated_sentences, all_data = label_relevancy(corpus, related_words)
all_data.head

<bound method NDFrame.head of                                                   corpus  have_transition_plan
0      we expand deepen collaboration cities national...                  True
1      we expand deepen collaboration cities national...                  True
2      we expand deepen collaboration cities national...                  True
3      we expand deepen collaboration cities national...                  True
4      we expand deepen collaboration cities national...                  True
...                                                  ...                   ...
33451  the report intended parties base (financial) d...                 False
33452                  our responsi-bility lies company.                 False
33453                  we assume responsibility parties.                 False
33454  we happy answer questions forward relevant dep...                 False
33455  if want stay up-to-date sustainability bmw gro...                 False

[33456 rows x 2 colum

In [104]:
all_data.to_csv('transition_data.csv')

In [114]:
from sklearn.model_selection import train_test_split

X_train, X_test= train_test_split(related_sentences, test_size=0.5)
X_train[:5]

['we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).',
 'we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).',
 'we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).',
 'we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora t

In [115]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [116]:
tf_idf_matrix.shape

(12, 31)

In [117]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in X_train]
df = pd.DataFrame(tf_idf_matrix.T.todense(), index=feature_names, columns=corpus_index)
df

Unnamed: 0,"we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..1","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..2","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..3","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..4","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..5","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..6","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..7","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..8","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..9","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..10","we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn)..11"
agora,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997
cities,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
cn,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
collaboration,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
de,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997,0.304997
deepen,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
economic,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
eu,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
european,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499
example,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499,0.152499


In [118]:
import math

def find_related_sentences(im):
    input_len = len(im)
    input_vec = vectorizer.transform(im)
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
    min_angle = min(angles)
    threshold_value = 20
    if min_angle <= threshold_value:
        index1 = math.ceil(np.argmin(angles) / input_len)
        index2 = np.argmin(angles) % input_len
        return f"Found matching sentence: {im[index2]} - {corpus[index1]}"
    else:
        return f"Can't find matching sentences"

In [138]:
def check_relevancy(sentence):
    input_vec = vectorizer.transform([sentence]) 
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
    min_angle = min(angles)
    threshold_value = 60
    return min_angle


In [120]:
find_related_sentences(['We have a low transition plan','Yes Im Jeff'])

"Can't find matching sentences"

In [136]:
# Test on 20% of the valid result
test_valid = [check_relevancy(line) for line in X_test]
test_valid

  (0, 30)	0.15249857033260467
  (0, 29)	0.15249857033260467
  (0, 28)	0.15249857033260467
  (0, 27)	0.15249857033260467
  (0, 26)	0.15249857033260467
  (0, 25)	0.30499714066520933
  (0, 24)	0.15249857033260467
  (0, 23)	0.15249857033260467
  (0, 22)	0.15249857033260467
  (0, 21)	0.15249857033260467
  (0, 20)	0.15249857033260467
  (0, 19)	0.15249857033260467
  (0, 18)	0.15249857033260467
  (0, 17)	0.30499714066520933
  (0, 16)	0.15249857033260467
  (0, 15)	0.15249857033260467
  (0, 14)	0.15249857033260467
  (0, 13)	0.15249857033260467
  (0, 12)	0.15249857033260467
  (0, 11)	0.15249857033260467
  (0, 10)	0.15249857033260467
  (0, 9)	0.15249857033260467
  (0, 8)	0.15249857033260467
  (0, 7)	0.15249857033260467
  (0, 6)	0.15249857033260467
  (0, 5)	0.15249857033260467
  (0, 4)	0.30499714066520933
  (0, 3)	0.15249857033260467
  (0, 2)	0.15249857033260467
  (0, 1)	0.15249857033260467
  (0, 0)	0.30499714066520933
[1.]
[nan]
[1.]
[nan]
[1.]
[nan]
[1.]
[nan]
[1.]
[nan]
[1.]
[nan]
[1.]
[nan]
[1.

[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

In [139]:
test_invalid = [check_relevancy(line) for line in unrelated_sentences]

In [122]:
X_test

['we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).',
 'we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).',
 'we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora traffic transition) (de), theeit urban mobility (european institute innovation tech-nology) world economic forum (us, eu, cn).',
 'we expand deepen collaboration cities national international level, example urbane mobilitt (urban mobility) platform (de), agora verkehrswende (agora t

In [43]:
input_vec = vectorizer.transform(['We have a low transition plan','Yes Im Jeff'])
input_vec

<2x4564 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [44]:
cosine = cosine_similarity(tf_idf_matrix[100], input_vec)[0]
cosine

array([0., 0.])

In [45]:
angle_list = np.rad2deg(np.arccos(cosine))
angle_list

array([90., 90.])