In [3]:
######################################### IMPORTING PACAKGES #############################
# Basic ML Packages
from scipy import spatial
import pandas as pd
import math
import os
import json
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Others
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

# Text pre-processing (Tokenization, Stemming, Lemmatization)
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Pdf Extraction Model
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=['ner'])

#Gensim stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

# Train Test Split
from sklearn.model_selection import train_test_split

# Tf-Idf Vectorization
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

DATA_FOLDER = "dataset/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [49]:
def extract_pdf(file_path):
    """
    Process raw PDF text to structured and processed PDF text to be worked on in Python.
    Parameters
    ----------
    file_path : Relative Location of File
    Return
    ------
    text : str
        processed PDF text if no error is throw
    """   

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec=codec, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                        pagenos, 
                                        maxpages=maxpages,
                                        password=password,
                                        caching=True,
                                        check_extractable=False):

                page_interpreter.process_page(page)

                content.append(fake_file_handle.getvalue())

                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)        

        text = '##PAGE_BREAK##'.join(content)

        # close open handles
        converter.close()
        fake_file_handle.close()
        
        return text

    except Exception as e:
        print(e)

        # close open handles
        converter.close()
        fake_file_handle.close()

        return ""

In [3]:
extract_pdf('Test.pdf')

'SUSTAINABLE \nVALUE\nREPORT 2019\n\n\x0c##PAGE_BREAK##2\n\n·\n\nIntroduction\n\nPreface\nAn overview of the BMW Group\nKey sustainability indicators\nTransformation of the BMW Group\n\n1\nFundamentals\n\n2\nProducts and services\n\n3\nProduction and \nvalue creation\n\n4\nEmployees and society\n\nAppendix\n\nABOUT THIS REPORT\n\nThe BMW Group aims to be the most successful and sus-\ntainable  premium  provider  of  individual  mobility.  To \nachieve this, we set ourselves ten ambitious goals along the \nentire value chain.\nThe BMW Group Sustainable Value Report (SVR) has been \npublished to provide stakeholders with comprehensive \ninformation about the company’s sustainability strategy \nand the progress made in integrating sustainability into \nits corporate processes. The Sustainable Value Report is \npublished at the same time as the Annual Report on the \ndate of the Annual Accounts Press Conference.\nThe requirements of the German CSR Directive Implemen-\ntation Act (CSR-RUG) 

In [50]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Helper Function to preprocess and clean sentences from raw PDF text 
    Parameters
    ----------
    line_input : str
        String that contains a sentence to be cleaned
    Return
    ------
    line : str
        Cleaned sentence
    ----------
    Sub: Substitute regular expression
    Split: Remove blank space from front and rear 
    """  
    # removing header number
    line = re.sub(r'^\s?\d+(.*)$', r'\1', line_input)
    # removing trailing spaces
    line = line.strip()
    # words may be split between lines, ensure we link them back together
    line = re.sub(r'\s?-\s?', '-', line)
    # remove space prior to punctuation
    line = re.sub(r'\s?([,:;\.])', r'\1', line)
    # ESG contains a lot of figures that are not relevant to grammatical structure
    line = re.sub(r'\d{5,}', r' ', line)
    # remove emails
    line = re.sub(r'\S*@\S*\s?', '', line)
    # remove mentions of URLs
    line = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', line)
    # remove multiple spaces
    line = re.sub(r'\s+', ' ', line)
    # join next line with space
    line = re.sub(r' \n', ' ', line)
    line = re.sub(r'.\n', '. ', line)
    line = re.sub(r'\x0c', ' ', line)
    
    return line

In [9]:
def remove_non_ascii(text):
    """
    Helper Function to remove non ascii characters from text
    Printable will 
    """
    printable = set(string.printable) #Convert iterable to set
    return ''.join(filter(lambda x: x in printable, text))

def not_header(line):
    """
    Helper Function to remove headers
    Check if all the characters are in upper case
    """
    return not line.isupper()

In [51]:
def extract_sentences(nlp, text):
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
 
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []
    all_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return all_sentences #list of sentences

In [2]:
os.listdir("dataset/")

['BMW Sustainability Report 2019.pdf',
 'BMW Sustainability Report 2020.pdf',
 'BMW Sustainability Report 2021.pdf',
 'Ford ESG Review 2019.pdf',
 'Ford Sustainability Report 2020.pdf',
 'Ford Sustainability Report 2021.pdf',
 'GMC Sustainability Report 2019.pdf',
 'GMC Sustainability Report 2020.pdf',
 'GMC Sustainability Report 2021.pdf',
 'Honda Sustainability Report 2019.pdf',
 'Honda Sustainability Report 2020.pdf',
 'Honda Sustainability Report 2021.pdf',
 'Hyundai Sustainability Report 2019.pdf',
 'Hyundai Sustainability Report 2020.pdf',
 'Hyundai Sustainability Report 2021.pdf',
 'Nissan Sustainability Report 2019.pdf',
 'Nissan Sustainability Report 2020.pdf',
 'Nissan Sustainability Report 2021.pdf',
 'Stellantis Sustainability Report 2019.pdf',
 'Stellantis Sustainability Report 2020.pdf',
 'Stellantis Sustainability Report 2021.pdf',
 'Subaru CSR Report 2019.pdf',
 'Subaru CSR Report 2020.pdf',
 'Subaru Sustainability Report 2021.pdf']

In [10]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
for file in list_dataset:
    all_sentences = extract_sentences(spacy_model,extract_pdf(DATA_FOLDER + file))
    corpus.extend(all_sentences)
np.shape(corpus)

(42121,)

In [11]:
#Store corpus in text file
with open("corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [12]:
corpus_data = open('corpus.txt')
corpus = json.load(corpus_data)
corpus

['To achieve this, we set ourselves ten ambitious goals along the entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) has been published to provide stakeholders with comprehensive information about the companys sustainability strategy and the progress made in integrating sustainability into its corporate processes.',
 'The Sustainable Value Report is published at the same time as the Annual Report on the date of the Annual Accounts Press Conference.',
 'The requirements of the German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report at company and Group level.',
 'This will be published jointly as an integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) within this Sustainable Value Report for BMW AG and BMW Group.',
 'In the SVR 2019 we focused on providing information that is required in order to comply with the German CSR Direc-tive Implement

In [13]:
from tabulate import tabulate

def stemming(corpus):
    stemmer = SnowballStemmer(language='english')
    revisions = [stemmer.stem(line) for line in corpus]
    return revisions

In [14]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

def lemmatization(corpus):
    lemmatizer = WordNetLemmatizer()
    revisions = [lemmatizer.lemmatize(line) for line in corpus]
    return revisions
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\65869\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
tokenizer = TreebankWordTokenizer()
tokenize_output = tokenizer.tokenize(corpus[15])
stemming(tokenize_output)

['sustain', 'is', 'not', 'just', 'a', 'trend', 'for', 'us', '.']

In [16]:
lemmatization(stemming(tokenize_output))

['sustain', 'is', 'not', 'just', 'a', 'trend', 'for', 'u', '.']

In [17]:
def remove_stop_words(corpus):
    revisions = [remove_stopwords(line) for line in corpus]
    return revisions 

In [18]:
remove_stop_words(lemmatization(stemming(tokenize_output)))

['sustain', '', '', '', '', 'trend', '', 'u', '.']

In [19]:
stopwords

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [20]:
remove_stop_words(corpus)

['To achieve this, set ambitious goals entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) published provide stakeholders comprehensive information companys sustainability strategy progress integrating sustainability corporate processes.',
 'The Sustainable Value Report published time Annual Report date Annual Accounts Press Conference.',
 'The requirements German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report company Group level.',
 'This published jointly integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) Sustainable Value Report BMW AG BMW Group.',
 'In SVR 2019 focused providing information required order comply German CSR Direc-tive Implementation Act (CSR RUG) Global Report-ing Initiative (GRI).',
 'We added detailed informa-tion topics strategic relevance BMW Group.',
 'Current examples measures support implementing sustainability t

In [21]:
def pre_processing(corpus):
    return lemmatization(stemming(remove_stop_words(corpus)))

In [52]:
def label_relevancy(corpus, related_words):
    related_words = pre_processing(related_words)
    corpus = pre_processing(corpus)
    related_sentences = [line for line in corpus if any(word in line for word in related_words)]
    unrelated_sentences = [line for line in corpus if (line not in related_sentences)]
    all_sentences = related_sentences + unrelated_sentences
    all_data = pd.DataFrame(all_sentences, columns=['corpus'])
    all_data['have_transition_plan'] = all_data['corpus'].isin(related_sentences)
    return related_sentences, unrelated_sentences, all_data

In [23]:
related_words = ['transition', 'progress']
related_sentences, unrelated_sentences, all_data = label_relevancy(corpus, related_words)
all_data

Unnamed: 0,corpus,have_transition_plan
0,the bmw group sustainable value report (svr) p...,True
1,"the sdgs core 2030 agenda, global action plan ...",True
2,additional criteria are: enhancing companys at...,True
3,we manage imple-mentation targets evaluation p...,True
4,we manage implementation targets evaluation pr...,True
...,...,...
42116,if organization identified non-compliance laws...,False
42117,the context significant fines non-monetary san...,False
42118,governance tcfd disclosure recommendations rel...,False
42119,risks opportunities identified environmental m...,False


In [24]:
all_data.to_csv('transition_data.csv')

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test= train_test_split(related_sentences, test_size=0.5)
X_train[:5]

['specialized tools track evaluation records newly instated supervisor ascertain employee progress glance, maintaining consistency human resource development.',
 'i feel down-to-earth efforts slogan change mindset, change actions, change company led real, palpable progress.',
 'these examples testimony free2move leasys sense responsibility determination contribute accelerate on-going transition electric, sustainable, forms mobility.',
 'investing all-electric future (projects completed, progress announced quarter 2022 united states examples transformation) by end 2025, gm plans 1 million units ev capacity north america.',
 'ngp2022 discloses indicators progress initiatives related identified material issues year.']

In [26]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [27]:
tf_idf_matrix.shape

(357, 2250)

In [28]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in X_train]
df = pd.DataFrame(tf_idf_matrix.T.todense(), index=feature_names, columns=corpus_index)
df

Unnamed: 0,"specialized tools track evaluation records newly instated supervisor ascertain employee progress glance, maintaining consistency human resource development.","i feel down-to-earth efforts slogan change mindset, change actions, change company led real, palpable progress.","these examples testimony free2move leasys sense responsibility determination contribute accelerate on-going transition electric, sustainable, forms mobility.","investing all-electric future (projects completed, progress announced quarter 2022 united states examples transformation) by end 2025, gm plans 1 million units ev capacity north america.",ngp2022 discloses indicators progress initiatives related identified material issues year.,"we soon offer e-transit, vehicle tailored commercial customers help lower operating costs driving electricity, new bev europe produced cologne.",transition low co2 transport mobility increased pace requires policy makers develop implement carefully coordinated plan takes account countervailing economic social challenges associated rapid transition.,"by conducting megatrend analysis, literature review peer benchmarking, identified 3 tiers key transition physical climate change risks, including increased severity extreme weather events, shifts consumer preferences increased costs ghg emissions.","during transition carbon neutrality, continue researching developing alternative powertrains fuel options vehicles, providing customers efficient, low-carbon alternatives.","from perspec tive ensuring joys nex t generation, company progress clean safe/ secure society.",...,"the committee discusses targets measures broad medium-to long-term perspectives accommodate environmental standards required future societies, evaluates progress related implementations achievements.","that gm advocated regulatory frameworks support carsharing, e-bike new urban delivery ventures, value investments broader transit transportation infrastructure.","in ecuador, gms progress food waste informing food waste reduction project universidad san francisco quito.","main opportunities identified (1) if subaru advances efforts products environmentally friendly planned global climate change mitigation/adaptation efforts progress adequately, company able maintain key markets, this management system subaru established environment committee purpose promoting sustainable growth society company, contributing global environment conservation.",it marks companys long-term commitment ongoing progress gender equality.,"we taking action zero fatal accidents*3 2030, steady progress goal.","in order prepare domestic dealerships undergo inspection transition fye2018 version ea21 guidelines, dealership visited consultant provide relevant information, exchange opinions offer forms support.","a support social solidarity economy communities: philanthropy: merger groupe psa fiat chrysler automobiles (fca), stellantis managed transition new philanthropic focus continued support multi-year projects started merger.","csr issue #1: vehicle co2 emissions measures process, main actions results 1: risk mapping designed identify, analyse classify risks as risk management system, stellantis uses company-wide risk analysis framework assess, manage report climate-related physical transition risks.","we build our vehicles and develop our services to drive human progress and protect our customers, while delivering the quality, safety, reliability and driving experience they expect."
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
years,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
you,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.302937,0.0,0.0,0.0,0.0
zev,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [29]:
df.T

Unnamed: 0,000,10,100,103,109,11,110,112,114,115,...,wrong,x50,xv,year,yearly,years,you,zero,zev,zevs
"specialized tools track evaluation records newly instated supervisor ascertain employee progress glance, maintaining consistency human resource development.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
"i feel down-to-earth efforts slogan change mindset, change actions, change company led real, palpable progress.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
"these examples testimony free2move leasys sense responsibility determination contribute accelerate on-going transition electric, sustainable, forms mobility.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
"investing all-electric future (projects completed, progress announced quarter 2022 united states examples transformation) by end 2025, gm plans 1 million units ev capacity north america.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
ngp2022 discloses indicators progress initiatives related identified material issues year.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.261565,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"we taking action zero fatal accidents*3 2030, steady progress goal.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.302937,0.0,0.0
"in order prepare domestic dealerships undergo inspection transition fye2018 version ea21 guidelines, dealership visited consultant provide relevant information, exchange opinions offer forms support.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
"a support social solidarity economy communities: philanthropy: merger groupe psa fiat chrysler automobiles (fca), stellantis managed transition new philanthropic focus continued support multi-year projects started merger.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.131131,0.0,0.0,0.0,0.000000,0.0,0.0
"csr issue #1: vehicle co2 emissions measures process, main actions results 1: risk mapping designed identify, analyse classify risks as risk management system, stellantis uses company-wide risk analysis framework assess, manage report climate-related physical transition risks.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0


In [61]:
import math

def find_related_sentences(file_path):
    spacy_model = spacy.load("en_core_web_sm")
    im = extract_sentences(spacy_model, extract_pdf(file_path)) #Get list of corpus
    im = pre_processing(im)
    input_len = len(im)
    input_vec = vectorizer.transform(im)
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
    min_angle = min(angles)
    threshold_value = 30
    index1 = math.ceil(np.argmin(angles) / input_len)
    index2 = np.argmin(angles) % input_len
    new_line = '\n'
    if min_angle <= threshold_value:
        print(f"Low carbon transition plan found: {new_line} Sentence in input report: {im[index2]} {new_line} Sentence in training dataset: {corpus[index1]}")
        return True
    else:
        print(f"No low carbon transition plan found {new_line} Closest sentences: {im[index2]}")
        return False

In [32]:
BMW_corpus = extract_sentences(spacy_model,extract_pdf("Test.pdf"))
BMW_corpus

In [35]:
BMW_corpus

['To achieve this, we set ourselves ten ambitious goals along the entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) has been published to provide stakeholders with comprehensive information about the companys sustainability strategy and the progress made in integrating sustainability into its corporate processes.',
 'The Sustainable Value Report is published at the same time as the Annual Report on the date of the Annual Accounts Press Conference.',
 'The requirements of the German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report at company and Group level.',
 'This will be published jointly as an integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) within this Sustainable Value Report for BMW AG and BMW Group.',
 'In the SVR 2019 we focused on providing information that is required in order to comply with the German CSR Direc-tive Implement

In [59]:
find_related_sentences("dataset/BMW Sustainability Report 2019.pdf")

Low carbon transition plan found: 
 Sentence in input report: further progress environmental optimisation electromobility in year review, pursued expanded approaches holistic environmental optimisation bmws electromobility. 
 Sentence in training dataset: Metrics and targets Disclosure of the metrics and targets used to assess and manage relevant climate-related risks and opportunities, provided that this information is material.


True

In [62]:
find_related_sentences("School Lecture Note.pdf")

No low carbon transition plan found 
 Closest sentences: gradient descent algorithm step 1.


False

In [47]:
def check_relevancy(sentence):
    sentence = pre_processing(sentence)
    input_vec = vectorizer.transform([sentence]) 
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
        
    min_angle = min(angles)
    threshold_value = 60
    return min_angle <= threshold_value


In [45]:
find_related_sentences(['We have a low transition plan','Yes Im Jeff'])

Can't find matching sentences 
 Closest sentences: we low transition plan


False

In [48]:
# Test on 20% of the valid result
test_valid = [check_relevancy(line) for line in unrelated_sentences]

KeyboardInterrupt: 

In [43]:
print(len(test_valid))
print(sum(test_valid))

33216
312


In [30]:
test_invalid = [check_relevancy(line) for line in unrelated_sentences]

In [31]:
X_test

['these areas best bring competencies bear achieve measurable progress un sustain-able development goals (sdgs).',
 'un global compact report progress the bmw group committed implement principles united nations global compact 2001 report provides information progress achieved complying principles.',
 'these areas best bring competencies bear achieve measurable progress un sustain-able development goals (sdgs).',
 'we manage implementation targets evaluation progress development process applying life cycle assessment accordance iso standard /44.',
 'these areas best bring competencies bear achieve measurable progress un sustain-able development goals (sdgs).',
 'the sdgs core 2030 agenda, global action plan aiming ensure economic progress environ-mentally friendly socially equitable.',
 'additional criteria are: enhancing companys attractiveness employer, progress implementation diversity concept, presented supervisory board report, activities advance corporate citizen-ship bmw group.',

In [32]:
input_vec = vectorizer.transform(['We have a low transition plan','Yes Im Jeff'])
input_vec

<2x144 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [33]:
cosine = cosine_similarity(tf_idf_matrix[100], input_vec)[0]
cosine

array([0.13661966, 0.        ])

In [34]:
angle_list = np.rad2deg(np.arccos(cosine))
angle_list

array([82.1477126, 90.       ])