In [69]:
import os
import re
import io
import json
import numpy as np
import pandas as pd

## Remove warnings
import warnings
warnings.filterwarnings('ignore')

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
import en_core_web_sm

import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

In [2]:
def extract_pdf(file):
    '''
    Pdf extraction function to extract pdf page-by-page

    Parameters
    -----------
    file: str
        The file path to which where the pdf file is located.

    Returns
    -------
    text: str
        A condensed string of text containing all the words extracted from the pdf.
    '''
    ## Load functions for pdf processing.
    resource_manager = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()

    text_converter = TextConverter(resource_manager, retstr, codec=codec, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, text_converter)

    data = []

    with open(file, 'rb') as f:
        for page in PDFPage.get_pages(f, caching=True, check_extractable=False):

            page_interpreter.process_page(page)
            data.append(retstr.getvalue())

            retstr.truncate(0)
            retstr.seek(0)

    text = '##END_OF_PAGE##'.join(data)

    return text

In [4]:
def preprocess(sentence):
    '''
    Preprocessing function to clean up the extracted pdf data.

    Parameters
    ----------
    sentence: str
        The sentence to be preprocessed.

    Returns
    -------
    sentence: str
        The sentence after preprocessing.
    
    '''
    ## Removes headers and content pages.
    sentence = re.sub(r'^\s?\d+(.*)$', r'\1', sentence)
    ## Strip sentence of trailing whitespace
    sentence = sentence.strip()
    ## Link back words that have been split in-between lines.
    sentence = re.sub(r'\s?-\s?', '-', sentence)
    ## Remove space before punctuation
    sentence = re.sub(r'\s([?.!"](?:\s|$))', r'\1', sentence)
    ## Remove emails from text
    sentence = re.sub(r'\S*@\S*\s?', '', sentence)
    ## Remove URLs from text
    sentence = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*', ' ', sentence)
    ## Consolidate multiple spacing into one
    sentence = re.sub(r'\s+', ' ', sentence)
    ## join next line with space
    sentence = re.sub(r' \n', ' ', sentence)
    sentence = re.sub(r'.\n', '. ', sentence)
    sentence = re.sub(r'\x0c', ' ', sentence)
    
    return sentence

In [5]:
def extract_sentences(text, nlp_model=en_core_web_sm.load()):
    '''
    Function to extract sentences from consolidated text extracted from pdf.
    It cleans the consolidated text by removing URLs, emails and irrelevant sentences,
    groups sentences into paragraphs.

    Parameters
    ----------
    text: str
        The consolidated text to be cleaned and parsed.
    nlp_model:
        The spacy model that is used to parse sentences

    Returns
    -------
    page_sentences: list of list of str
        A list containing lists of sentences. The outer list indicates the page where the sentences come from,
        where page = index of list + 1 and the inner list contains the sentence in each page.

    all_sentences: list of str
        A list containing the full text of the cleaned consolidated text, without any page number segregation.
    '''

    pages = text.split('##END_OF_PAGE##')

    sentences = []

    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        print(f"Extracting page number: {i}")

        ## Removes non-ASCII words from paragraph.
        text = re.sub(r'[^\x00-\x7F]+','', page)

        prev_line = ""
        for line in text.split('\n\n'):
            ## Combines consecutive lines where text may have been broken up,
            ## provided that the next line has a space at the start, and the previous line
            ## does not end with a full stop.
            if (line.startswith(' ') or not prev_line.endswith('.')):
                prev_line = prev_line + ' ' + line
            else: ## If condition is not met, we start a new index.
                sentences.append(prev_line)
                prev_line = line

        ## Ensures that the last line is stored into the array.
        sentences.append(prev_line)
        sentences.append('##END_OF_SENTENCE##')

    final_sentences = ' '.join(sentences).split('##END_OF_SENTENCE##')

    page_sentences = {}
    all_sentences = []
    page = 1
    
    for line in final_sentences:
        page_sentences[page] = []
        line = preprocess(line)

        words = []
        ## parses paragraph into sentences using spacy.
        for partial in list(nlp_model(line).sents):
            p = str(partial).strip()
            words.append(p)

        w_res = []
        for w in words:
            if re.match('^[A-Z][^?!.]*[?.!]$', w) is not None:
                ## Filters for non-sentence sentences, which are
                ## sentences that do not end with ?, ., !
                w_res.append(w)

        w_res = [x.replace('\n', ' ') for x in w_res] ## Replace new line tag with blank.

        page_sentences[page].append(w_res)
        all_sentences.extend(w_res)
        page += 1

    return page_sentences, all_sentences

In [6]:
os.listdir("sample_datasets/automobile/")

['BMW Sustainability Report 2019.pdf',
 'BMW Sustainability Report 2020.pdf',
 'BMW Sustainability Report 2021.pdf',
 'Ford ESG Review 2019.pdf',
 'Ford Sustainability Report 2020.pdf',
 'Ford Sustainability Report 2021.pdf',
 'GMC Sustainability Report 2019.pdf',
 'GMC Sustainability Report 2020.pdf',
 'GMC Sustainability Report 2021.pdf',
 'Honda Sustainability Report 2019.pdf',
 'Honda Sustainability Report 2020.pdf',
 'Honda Sustainability Report 2021.pdf',
 'Hyundai Sustainability Report 2019.pdf',
 'Hyundai Sustainability Report 2020.pdf',
 'Hyundai Sustainability Report 2021.pdf',
 'Nissan Sustainability Report 2019.pdf',
 'Nissan Sustainability Report 2020.pdf',
 'Nissan Sustainability Report 2021.pdf',
 'Stellantis Sustainability Report 2019.pdf',
 'Stellantis Sustainability Report 2020.pdf',
 'Stellantis Sustainability Report 2021.pdf',
 'Subaru CSR Report 2019.pdf',
 'Subaru CSR Report 2020.pdf',
 'Subaru Sustainability Report 2021.pdf']

In [23]:
DATA_FOLDER = 'sample_datasets/automobile/'

In [34]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
for file in list_dataset:
    pages_sentences, all_sentences = extract_sentences(extract_pdf(DATA_FOLDER + file), spacy_model)
    corpus.extend(all_sentences)
np.shape(corpus)

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

(41364,)

In [35]:
#Store corpus in text file
with open("corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [36]:
corpus_data = open('corpus.txt')
corpus = json.load(corpus_data)
corpus

['To achieve this, we set ourselves ten ambitious goals along the entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) has been published to provide stakeholders with comprehensive information about the companys sustainability strategy and the progress made in integrating sustainability into its corporate processes.',
 'The Sustainable Value Report is published at the same time as the Annual Report on the date of the Annual Accounts Press Conference.',
 'The requirements of the German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report at company and Group level.',
 'This will be published jointly as an integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) within this Sustainable Value Report for BMW AG and BMW Group.',
 'In the SVR 2019 we focused on providing information that is required in order to comply with the German CSR Direc-tive Implement

In [37]:
from tabulate import tabulate

def stemming(corpus):
    stemmer = SnowballStemmer(language='english')
    revisions = [stemmer.stem(line) for line in corpus]
    return revisions

In [38]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

def lemmatization(corpus):
    lemmatizer = WordNetLemmatizer()
    revisions = [lemmatizer.lemmatize(line) for line in corpus]
    return revisions
    

[nltk_data] Downloading package wordnet to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [41]:
tokenizer = TreebankWordTokenizer()
tokenize_output = tokenizer.tokenize(corpus[15])
stemming(tokenize_output)

['howev',
 ',',
 'whatreallymattersisthatwemakeaneffectivecontribu-t',
 'to',
 'environment',
 'and',
 'climat',
 'protect',
 'here',
 'and',
 'now',
 '.']

In [42]:
lemmatization(stemming(tokenize_output))

['howev',
 ',',
 'whatreallymattersisthatwemakeaneffectivecontribu-t',
 'to',
 'environment',
 'and',
 'climat',
 'protect',
 'here',
 'and',
 'now',
 '.']

In [43]:
def remove_stop_words(corpus):
    revisions = [remove_stopwords(line) for line in corpus]
    return revisions 

In [46]:
remove_stop_words(lemmatization(stemming(tokenize_output)))

['howev',
 ',',
 'whatreallymattersisthatwemakeaneffectivecontribu-t',
 '',
 'environment',
 '',
 'climat',
 'protect',
 '',
 '',
 '',
 '.']

In [49]:
stopwords

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [50]:
remove_stop_words(corpus)

['To achieve this, set ambitious goals entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) published provide stakeholders comprehensive information companys sustainability strategy progress integrating sustainability corporate processes.',
 'The Sustainable Value Report published time Annual Report date Annual Accounts Press Conference.',
 'The requirements German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report company Group level.',
 'This published jointly integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) Sustainable Value Report BMW AG BMW Group.',
 'In SVR 2019 focused providing information required order comply German CSR Direc-tive Implementation Act (CSR RUG) Global Report-ing Initiative (GRI).',
 'We added detailed informa-tion topics strategic relevance BMW Group.',
 'Current examples measures support implementing sustainability t

In [51]:
def pre_processing(corpus):
    return lemmatization(stemming(remove_stop_words(corpus)))

In [52]:
def label_relevancy(corpus, related_words):
    related_words = pre_processing(related_words)
    corpus = pre_processing(corpus)
    related_sentences = [line for line in corpus if any(word in line for word in related_words)]
    unrelated_sentences = [line for line in corpus if (line not in related_sentences)]
    all_sentences = related_sentences + unrelated_sentences
    all_data = pd.DataFrame(all_sentences, columns=['corpus'])
    all_data['have_transition_plan'] = all_data['corpus'].isin(related_sentences)
    return related_sentences, unrelated_sentences, all_data

In [53]:
related_words = ['transition', 'progress']
related_sentences, unrelated_sentences, all_data = label_relevancy(corpus, related_words)
all_data

Unnamed: 0,corpus,have_transition_plan
0,the bmw group sustainable value report (svr) p...,True
1,"the sdgs core 2030 agenda, global action plan ...",True
2,additional criteria are: enhancing companys at...,True
3,we manage imple-mentation targets evaluation p...,True
4,we manage implementation targets evaluation pr...,True
...,...,...
41359,the boards oversight climate-related risks opp...,False
41360,describe organizations processes identifying a...,False
41361,climate change > management system risk manage...,False
41362,initiatives climate change > risks opportuniti...,False


In [None]:
all_data.to_csv('transition_data.csv')

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test= train_test_split(related_sentences, test_size=0.5)
X_train[:5]

['the pulse survey enables better understand progress known drivers engagement, help refine action plans needed.',
 'we manage imple-mentation targets evaluation progress devel-opment process applying life cycle assessment accordance iso standard 14040/44.',
 'in recent study erb institute university michigan, asked internal external stakeholders what human progress mean?',
 'while strive integrity do, understand special role integrity compliance regulations standards promote sustainable development, distribution value, progressive taxation, cash transfers investment human capital, regulation strategies development inclusive growth.',
 'this communication progress implementing principles united nations global compact supporting broader un goals.']

In [57]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [58]:
tf_idf_matrix.shape

(342, 2161)

In [59]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in X_train]
df = pd.DataFrame(tf_idf_matrix.T.todense(), index=feature_names, columns=corpus_index)
df

Unnamed: 0,"the pulse survey enables better understand progress known drivers engagement, help refine action plans needed.",we manage imple-mentation targets evaluation progress devel-opment process applying life cycle assessment accordance iso standard 14040/44.,"in recent study erb institute university michigan, asked internal external stakeholders what human progress mean?","while strive integrity do, understand special role integrity compliance regulations standards promote sustainable development, distribution value, progressive taxation, cash transfers investment human capital, regulation strategies development inclusive growth.",this communication progress implementing principles united nations global compact supporting broader un goals.,"in transition period, important remain financially viable step consumer demand finding ways encourage market growth expanding zero-emission vehicle (zev) portfolio.","we ensure subaru group aligned focus continue striving map subarus distinctive approach csr view future, progress step step goal transitioning company making things, company making people smile.","we significant progress strengthening supplier relationships shown plante morans 2021 annual working relations index, achieved score 289, 20-point improvement compared previous year.","we measure promotions, performance metrics, interview slates attrition track progress.","while differing country next, new expectations follow key trends, shift ownership experience, calls progress everyone, highest security safety standards.",...,we pioneer innovations connect people matters general motors midst transformation includes accelerated transition evs avs.,the all-electric ford transit intended help lead transition carbon neutrality help businesses achieve sustainability goals helping cities improve air quality reduce noise levels.,infor m at ion ing logistics operations transitioning alternative packaging materials products.,"for economic, environmental social reasons, automotive world transitioning era vehicle mobility.",our efforts achieve progress objectives refects commitment create long-term value responsibly.,"so measure track progress inclusion, 2019 conducted baseline inclusion survey employees follow-up sessions better understand starting point.","in terms progress hondas environmental initiatives themes applicable worldwide, corporate planning supervisory unit collects information regional operations reports meeting world environment safety strategy committee.","in 2020, completed year progress goal reduce waste intensity 40% achieve 150 landfill-free sites 2010 baseline.","and jimhackett we believe freedom movement drives human progress committed helping safely, confidently freely.","fcas investment occupational health safety, combined measures adopted, resulted progressive reduction level risk attributed group plants italy inail, italian accident disability insurance agency."
000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
013,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
033,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
061,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ys,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ything,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.193066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
df.T

Unnamed: 0,000,013,033,061,10,100,103,11,112,12,...,worth,wrong,year,yearly,years,you,ys,ything,zero,zev
"the pulse survey enables better understand progress known drivers engagement, help refine action plans needed.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
we manage imple-mentation targets evaluation progress devel-opment process applying life cycle assessment accordance iso standard 14040/44.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"in recent study erb institute university michigan, asked internal external stakeholders what human progress mean?",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"while strive integrity do, understand special role integrity compliance regulations standards promote sustainable development, distribution value, progressive taxation, cash transfers investment human capital, regulation strategies development inclusive growth.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
this communication progress implementing principles united nations global compact supporting broader un goals.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"so measure track progress inclusion, 2019 conducted baseline inclusion survey employees follow-up sessions better understand starting point.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"in terms progress hondas environmental initiatives themes applicable worldwide, corporate planning supervisory unit collects information regional operations reports meeting world environment safety strategy committee.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"in 2020, completed year progress goal reduce waste intensity 40% achieve 150 landfill-free sites 2010 baseline.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.201606,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"and jimhackett we believe freedom movement drives human progress committed helping safely, confidently freely.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
import math

def find_related_sentences(file_path):
    spacy_model = spacy.load("en_core_web_sm")
    a, im = extract_sentences(extract_pdf(file_path), spacy_model) #Get list of corpus
    im = pre_processing(im)
    input_len = len(im)
    input_vec = vectorizer.transform(im)
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
    min_angle = min(angles)
    threshold_value = 30
    index1 = math.ceil(np.argmin(angles) / input_len)
    index2 = np.argmin(angles) % input_len
    new_line = '\n'
    if min_angle <= threshold_value:
        print(f"Low carbon transition plan found: {new_line} Sentence in input report: {im[index2]} {new_line} Sentence in training dataset: {corpus[index1]}")
        return True
    else:
        print(f"No low carbon transition plan found {new_line} Closest sentences: {im[index2]}")
        return False

In [70]:
find_related_sentences("sample_datasets/automobile/BMW Sustainability Report 2019.pdf")

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

True

In [81]:
def check_relevancy(sentence):
    sentence = pre_processing(sentence)
    input_vec = vectorizer.transform(sentence) 
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
        
    min_angle = min(angles)
    threshold_value = 60
    return min_angle <= threshold_value


In [82]:
# Test on 20% of the valid result
test_valid = [check_relevancy(line) for line in related_sentences]

In [83]:
print(len(test_valid))
print(sum(test_valid))

685
0


In [30]:
test_invalid = [check_relevancy(line) for line in unrelated_sentences]

In [31]:
X_test

['these areas best bring competencies bear achieve measurable progress un sustain-able development goals (sdgs).',
 'un global compact report progress the bmw group committed implement principles united nations global compact 2001 report provides information progress achieved complying principles.',
 'these areas best bring competencies bear achieve measurable progress un sustain-able development goals (sdgs).',
 'we manage implementation targets evaluation progress development process applying life cycle assessment accordance iso standard /44.',
 'these areas best bring competencies bear achieve measurable progress un sustain-able development goals (sdgs).',
 'the sdgs core 2030 agenda, global action plan aiming ensure economic progress environ-mentally friendly socially equitable.',
 'additional criteria are: enhancing companys attractiveness employer, progress implementation diversity concept, presented supervisory board report, activities advance corporate citizen-ship bmw group.',

In [32]:
input_vec = vectorizer.transform(['We have a low transition plan','Yes Im Jeff'])
input_vec

<2x144 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [33]:
cosine = cosine_similarity(tf_idf_matrix[100], input_vec)[0]
cosine

array([0.13661966, 0.        ])

In [34]:
angle_list = np.rad2deg(np.arccos(cosine))
angle_list

array([82.1477126, 90.       ])