In [1]:
######################################### IMPORTING PACAKGES #############################
# Basic ML Packages
from scipy import spatial
import pandas as pd
import math
import os
import json
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Others
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

# Text pre-processing (Tokenization, Stemming, Lemmatization)
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Pdf Extraction Model
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=['ner'])

#Gensim stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

# Train Test Split
from sklearn.model_selection import train_test_split

# Tf-Idf Vectorization
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

DATA_FOLDER = "dataset/"

[nltk_data] Downloading package punkt to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


âœ” Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
def extract_pdf(file_path):
    """
    Process raw PDF text to structured and processed PDF text to be worked on in Python.
    Parameters
    ----------
    file_path : Relative Location of File
    Return
    ------
    text : str
        processed PDF text if no error is throw
    """   

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec=codec, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                        pagenos, 
                                        maxpages=maxpages,
                                        password=password,
                                        caching=True,
                                        check_extractable=False):

                page_interpreter.process_page(page)

                content.append(fake_file_handle.getvalue())

                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)        

        text = '##PAGE_BREAK##'.join(content)

        # close open handles
        converter.close()
        fake_file_handle.close()
        
        return text

    except Exception as e:
        print(e)

        # close open handles
        converter.close()
        fake_file_handle.close()

        return ""

In [3]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Helper Function to preprocess and clean sentences from raw PDF text 
    Parameters
    ----------
    line_input : str
        String that contains a sentence to be cleaned
    Return
    ------
    line : str
        Cleaned sentence
    ----------
    Sub: Substitute regular expression
    Split: Remove blank space from front and rear 
    """  
    # removing header number
    line = re.sub(r'^\s?\d+(.*)$', r'\1', line_input)
    # removing trailing spaces
    line = line.strip()
    # words may be split between lines, ensure we link them back together
    line = re.sub(r'\s?-\s?', '-', line)
    # remove space prior to punctuation
    line = re.sub(r'\s?([,:;\.])', r'\1', line)
    # ESG contains a lot of figures that are not relevant to grammatical structure
    line = re.sub(r'\d{5,}', r' ', line)
    # remove emails
    line = re.sub(r'\S*@\S*\s?', '', line)
    # remove mentions of URLs
    line = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', line)
    # remove multiple spaces
    line = re.sub(r'\s+', ' ', line)
    # join next line with space
    line = re.sub(r' \n', ' ', line)
    line = re.sub(r'.\n', '. ', line)
    line = re.sub(r'\x0c', ' ', line)
    
    return line

In [4]:
def remove_non_ascii(text):
    """
    Helper Function to remove non ascii characters from text
    Printable will 
    """
    printable = set(string.printable) #Convert iterable to set
    return ''.join(filter(lambda x: x in printable, text))

def not_header(line):
    """
    Helper Function to remove headers
    Check if all the characters are in upper case
    """
    return not line.isupper()

In [5]:
def extract_pages_sentences(nlp, text):    
    """
    Extracting text from raw PDF text and store them by pages and senteces. Raw text is also cleand by removing junk, URLs, etc.
    Consecutive lines are also grouped into paragraphs and spacy is used to parse sentences.
    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text
    Return
    ------
    pages_content : list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    
    pages_sentences : list of list
        A list containing lists. Page number is the index of outer list + 1. Inner list contains sentences from each page
 
    """  
    
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
        
        # if len(text.split(' ')) < MIN_WORDS_PER_PAGE:
        #     print(f'Skipped Page: {page_number}')
        #     continue
        
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []
    all_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return pages_content, pages_sentences, all_sentences #list, list of list where page is index of outer list, list of sentences

In [6]:
def extract_sentences(nlp, text):
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
 
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []
    all_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return all_sentences #list of sentences

In [7]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
#for file in list_dataset: ##clean loop
pages_content, pages_sentences, all_sentences = extract_pages_sentences(spacy_model,extract_pdf("british-land-sustainability-accounts-2022.pdf"))
corpus.extend(all_sentences)
np.shape(corpus)

(988,)

In [517]:
#Store corpus in text file
with open("bl_corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [518]:
corpus_data = open('bl_corpus.txt')
corpus = json.load(corpus_data)
corpus

['Our ESG journey Our 2030 commitments Performance overview (KPIs) Net Zero carbon Place based approach Responsible business Performance data 2022 EPRA index SASB index Reporting criteria Assurance Reports 2 3 4 5 9 17 24 27 61 64 66 85 Read more about our TCFD disclosures in our 2022 Annual Report Against a challenging backdrop, our teams have done a fantastic job this year, working with our customers and partners to support local communities.',
 'As we go into a new year, with new uncertainties on the horizon, their dedication will be more vital than ever.',
 'Introduction This year we celebrate 20 years of sustainability reporting, and never has our environmental and social focus been more integral to the way we do business.',
 'The climate emergency is high on everyones agenda; we face a cost of living crisis and the impact of Covid 19 continues to reverberate throughout our communities.',
 'This means progressing our pathway to net zero and making a positive impact at our places i

In [519]:
from tabulate import tabulate

def stemming(corpus):
    stemmer = SnowballStemmer(language='english')
    revisions = [stemmer.stem(line) for line in corpus]
    return revisions

In [520]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

def lemmatization(corpus):
    lemmatizer = WordNetLemmatizer()
    revisions = [lemmatizer.lemmatize(line) for line in corpus]
    return revisions
    

[nltk_data] Downloading package wordnet to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [521]:
tokenizer = TreebankWordTokenizer()
tokenize_output = tokenizer.tokenize(corpus[15])
stemming(tokenize_output)

['this',
 'year',
 'mark',
 'ten',
 'year',
 'of',
 'our',
 'partnership',
 'with',
 'the',
 'nation',
 'literaci',
 'trust',
 '.']

In [522]:
lemmatization(stemming(tokenize_output))

['this',
 'year',
 'mark',
 'ten',
 'year',
 'of',
 'our',
 'partnership',
 'with',
 'the',
 'nation',
 'literaci',
 'trust',
 '.']

In [523]:
def remove_stop_words(corpus):
    revisions = [remove_stopwords(line) for line in corpus]
    return revisions 

In [524]:
remove_stop_words(corpus)

['Our ESG journey Our 2030 commitments Performance overview (KPIs) Net Zero carbon Place based approach Responsible business Performance data 2022 EPRA index SASB index Reporting criteria Assurance Reports 2 3 4 5 9 17 24 27 61 64 66 85 Read TCFD disclosures 2022 Annual Report Against challenging backdrop, teams fantastic job year, working customers partners support local communities.',
 'As new year, new uncertainties horizon, dedication vital ever.',
 'Introduction This year celebrate 20 years sustainability reporting, environmental social focus integral way business.',
 'The climate emergency high everyones agenda; face cost living crisis impact Covid 19 continues reverberate communities.',
 'This means progressing pathway net zero making positive impact places important ever.',
 'We continued deliver strong progress year.',
 'At 1 Triton Square, completed second net zero carbon development site Canada Water, piloting innovative building materials technologies deliver market leading

In [525]:
def pre_processing(corpus):
    return lemmatization(stemming(remove_stop_words(corpus)))

In [526]:
def label_relevancy(corpus, related_words):
    related_words = pre_processing(related_words)
    corpus = pre_processing(corpus)
    related_sentences = [line for line in corpus if any(word in line for word in related_words)]
    unrelated_sentences = [line for line in corpus if (line not in related_sentences)]
    all_sentences = related_sentences + unrelated_sentences
    all_data = pd.DataFrame(all_sentences, columns=['corpus'])
    all_data['have_transition_plan'] = all_data['corpus'].isin(related_sentences)
    return related_sentences, unrelated_sentences, all_data

In [527]:
related_words = ['ghg', 'sbti', 'tcfd', 'sasb']
related_sentences, unrelated_sentences, all_data = label_relevancy(corpus, related_words)
all_data

Unnamed: 0,corpus,have_transition_plan
0,our esg journey our 2030 commitments performan...,True
1,"sbti, net zero targets greenhouse gas intensit...",True
2,location based: our use 2021 uk greenhouse gas...,True
3,overview sasb table this british lands sustain...,True
4,area energy management units kwhe sqm sasb cod...,True
...,...,...
983,dnv expressly disclaims liability co-responsib...,False
984,responsibilities board directors british land ...,False
985,our responsibility plan perform work obtain li...,False
986,we responsible preparation report.,False


In [528]:
all_data.to_csv('bl_ratios.csv')

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test= train_test_split(related_sentences, test_size=0.5)
X_train[:5]

['this includes seven main ghg emissions covered kyoto greenhouse gas protocols, line common practice: carbon dioxide (co2), methane (ch4), hydrofluorocarbons (hfcs), nitrous oxide (n2o), perfluorocarbons (pfcs), sulphur hexafluoride (sf6) nitrogen trifluoride (nf3).',
 'total direct indirect (scopes 1 2 & 3) ghg emissions location market based 5.',
 'scope 1 2 ghg emissions intensities epra reporting reported fig.',
 'our esg journey our 2030 commitments performance overview (kpis) net zero carbon place based approach responsible business performance data 2022 epra index sasb index reporting criteria assurance reports 2 3 4 5 9 17 24 27 61 64 66 85 read tcfd disclosures 2022 annual report against challenging backdrop, teams fantastic job year, working customers partners support local communities.',
 'table 4 shows combined carbon-equivalent emission factors different ghgs considered.']

In [21]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [22]:
tf_idf_matrix.shape

(13, 209)

In [23]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in X_train]
df = pd.DataFrame(tf_idf_matrix.T.todense(), index=feature_names, columns=corpus_index)
df

Unnamed: 0,"this includes seven main ghg emissions covered kyoto greenhouse gas protocols, line common practice: carbon dioxide (co2), methane (ch4), hydrofluorocarbons (hfcs), nitrous oxide (n2o), perfluorocarbons (pfcs), sulphur hexafluoride (sf6) nitrogen trifluoride (nf3).",total direct indirect (scopes 1 2 & 3) ghg emissions location market based 5.,scope 1 2 ghg emissions intensities epra reporting reported fig.,"our esg journey our 2030 commitments performance overview (kpis) net zero carbon place based approach responsible business performance data 2022 epra index sasb index reporting criteria assurance reports 2 3 4 5 9 17 24 27 61 64 66 85 read tcfd disclosures 2022 annual report against challenging backdrop, teams fantastic job year, working customers partners support local communities.",table 4 shows combined carbon-equivalent emission factors different ghgs considered.,financial intensity ratio expresses absolute scope 1 2 ghg emissions relation gross rental income properties managed portfolio.,"absolute scope 1 2 ghg emissions relate managed portfolio electricity, gas use refrigerant loss air conditioning, fuel use british land owned vehicles.","a1-a5 emissions (embodied carbon construction stages new developments refurbishments carbon levy applies): includes ghg emissions new developments refurbishments, carbon levy applies, completed 2020 construction (stage 5).",location based: our use 2021 uk greenhouse gas conversion factors reflect +3% increase ghg intensity gas (vs 2020) +1% increase ghg intensity electricity (vs 2020).,the reporting years total scope 1 2 ghg emissions prepared sourced figure 4.,"includes 100% ghg emissions developments, joint venture developments developments undertaken funding.","area energy management units kwhe sqm sasb code if-re-130a 1 location figure 8, p36 gri code 302-1 if-re-130a 2 302-1 if-re-130a 3 302-1 if-re-130a 4 if-re-130a 5 103-2 activity metric energy consumption data coverage percentage floor area, property subsector total energy consumed portfolio area data coverage, percentage grid electricity, percentage renewable, property subsector lfl change energy consumption portfolio area data coverage, property subsector percentage eligible portfolio (1) obtained energy rating (2) certified energy star, property subsector description building energy management consideration integrated property investment analysis operational strategy mwh figure 13, p40 mwh figure 12, p39 % floor area (sqm) figure 14, p41 discussion analysis energy carbon management integrated policies procedures.","carbon intensity includes scope 1, 2 3 ghg emissions related energy sources plus emissions building sources waste disposal management water consumption."
100,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.282267,0.000000,0.000000
103,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.056371,0.000000
12,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.056371,0.000000
13,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.056371,0.000000
130a,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.281857,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
water,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.260303
working,0.0,0.0,0.0,0.130515,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
year,0.0,0.0,0.0,0.130515,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
years,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.386629,0.000000,0.000000,0.000000


In [24]:
df.T

Unnamed: 0,100,103,12,13,130a,14,17,2020,2021,2022,...,use,vehicles,venture,vs,waste,water,working,year,years,zero
"this includes seven main ghg emissions covered kyoto greenhouse gas protocols, line common practice: carbon dioxide (co2), methane (ch4), hydrofluorocarbons (hfcs), nitrous oxide (n2o), perfluorocarbons (pfcs), sulphur hexafluoride (sf6) nitrogen trifluoride (nf3).",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
total direct indirect (scopes 1 2 & 3) ghg emissions location market based 5.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
scope 1 2 ghg emissions intensities epra reporting reported fig.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"our esg journey our 2030 commitments performance overview (kpis) net zero carbon place based approach responsible business performance data 2022 epra index sasb index reporting criteria assurance reports 2 3 4 5 9 17 24 27 61 64 66 85 read tcfd disclosures 2022 annual report against challenging backdrop, teams fantastic job year, working customers partners support local communities.",0.0,0.0,0.0,0.0,0.0,0.0,0.130515,0.0,0.0,0.26103,...,0.0,0.0,0.0,0.0,0.0,0.0,0.130515,0.130515,0.0,0.130515
table 4 shows combined carbon-equivalent emission factors different ghgs considered.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
financial intensity ratio expresses absolute scope 1 2 ghg emissions relation gross rental income properties managed portfolio.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"absolute scope 1 2 ghg emissions relate managed portfolio electricity, gas use refrigerant loss air conditioning, fuel use british land owned vehicles.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.418395,0.242586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"a1-a5 emissions (embodied carbon construction stages new developments refurbishments carbon levy applies): includes ghg emissions new developments refurbishments, carbon levy applies, completed 2020 construction (stage 5).",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147533,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
location based: our use 2021 uk greenhouse gas conversion factors reflect +3% increase ghg intensity gas (vs 2020) +1% increase ghg intensity electricity (vs 2020).,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.344313,0.199633,0.0,...,0.172156,0.0,0.0,0.399266,0.0,0.0,0.0,0.0,0.0,0.0
the reporting years total scope 1 2 ghg emissions prepared sourced figure 4.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386629,0.0


In [25]:
import math

def find_related_sentences(file_path):
    spacy_model = spacy.load("en_core_web_sm")
    im = extract_sentences(spacy_model, extract_pdf(file_path)) #Get list of corpus
    im = pre_processing(im)
    input_len = len(im)
    input_vec = vectorizer.transform(im)
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
    min_angle = min(angles)
    threshold_value = 30
    index1 = math.ceil(np.argmin(angles) / input_len)
    index2 = np.argmin(angles) % input_len
    new_line = '\n'
    if min_angle <= threshold_value:
        print(f"Low carbon transition plan found: {new_line} Sentence in input report: {im[index2]} {new_line} Sentence in training dataset: {corpus[index1]}")
        return True
    else:
        print(f"No low carbon transition plan found {new_line} Closest sentences: {im[index2]}")
        return False

In [26]:
find_related_sentences("british-land-sustainability-accounts-2022.pdf")

Low carbon transition plan found: 
 Sentence in input report: scope 1 2 ghg emissions intensities epra reporting reported fig. 
 Sentence in training dataset: The climate emergency is high on everyones agenda; we face a cost of living crisis and the impact of Covid 19 continues to reverberate throughout our communities.


True

In [27]:
def check_relevancy(sentence):
    sentence = pre_processing(sentence)
    input_vec = vectorizer.transform(sentence) 
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
        
    min_angle = min(angles)
    threshold_value = 60
    return min_angle <= threshold_value


In [28]:
unrelated_sentences

['as new year, new uncertainties horizon, dedication vital ever.',
 'introduction this year celebrate 20 years sustainability reporting, environmental social focus integral way business.',
 'the climate emergency high everyones agenda; face cost living crisis impact covid 19 continues reverberate communities.',
 'this means progressing pathway net zero making positive impact places important ever.',
 'we continued deliver strong progress year.',
 'at 1 triton square, completed second net zero carbon development site canada water, piloting innovative building materials technologies deliver market leading, highly sustainable buildings.',
 'our approach right terms environment, reflects customers increasingly expect.',
 'the strong business case sustainable buildings underlined year leasing success 1 broadgate, sustainable buildings london fully let (or option) years ahead completion.',
 'on standing portfolio, completed net zero audits working customers invest energy efficient interventi

In [29]:
# Test on 20% of the valid result
test_valid = [check_relevancy(line) for line in related_sentences]

In [30]:
print(len(test_valid))
print(sum(test_valid))

26
0


In [31]:
test_invalid = [check_relevancy(line) for line in unrelated_sentences]

In [32]:
X_test

['emissions grouped scope 1, 2 3 accordance ghg protocol, follows: scope 1: combustion fuels, refrigerant loss.',
 'b1-b5 emissions (embodied carbon operations managed assets): currently, ghg emissions estimated industry benchmarks developed industry expert simon sturgis (see table below).',
 'scope 1 2 absolute target: science based target initiative (sbti) target reported calculating absolute percentage reduction reporting years absolute location-based scope 1 2 ghg emissions versus baseline year 2020.',
 'for sbti numerator, a1-a5 emissions pro-rated duration construction project.',
 'we use location-based method report total ghg emissions track performance 2019 baseline.',
 'overview sasb table this british lands sustainability accounts references indicators set sustainability accounting standards board (sasb) framework.',
 'sbti nzc targets greenhouse gas intensity 2.',
 'scope 1 2 ghg emissions financial intensity measures: we publish financial scope 1 2 ghg emissions intensity a

In [33]:
related_sentences

['our esg journey our 2030 commitments performance overview (kpis) net zero carbon place based approach responsible business performance data 2022 epra index sasb index reporting criteria assurance reports 2 3 4 5 9 17 24 27 61 64 66 85 read tcfd disclosures 2022 annual report against challenging backdrop, teams fantastic job year, working customers partners support local communities.',
 'sbti, net zero targets greenhouse gas intensity for second consecutive year, covid-19 related government restrictions significantly affected portfolios carbon intensity, representing majority reduction.',
 'location based: our use 2021 uk greenhouse gas conversion factors reflect +3% increase ghg intensity gas (vs 2020) +1% increase ghg intensity electricity (vs 2020).',
 'overview sasb table this british lands sustainability accounts references indicators set sustainability accounting standards board (sasb) framework.',
 'area energy management units kwhe sqm sasb code if-re-130a 1 location figure 8,

## CNN

In [515]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Embedding
from keras.preprocessing import sequence

In [529]:
df_c = pd.read_csv('bl_ratios.csv')

In [532]:
df_c = df_c.drop(columns='Unnamed: 0')

In [537]:
X = df_c['corpus']
y = df_c['have_transition_plan']

In [538]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)
X_train[:5]

906    due small population size ensure anonymity, em...
934    a review allocation male females employee band...
684    in general, offices shopping centres floor are...
922    it excludes new starters, weeks complete train...
404    water intensity data covers buildings offices ...
Name: corpus, dtype: object

In [539]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [540]:
tf_idf_matrix

<691x2470 sparse matrix of type '<class 'numpy.float64'>'
	with 8239 stored elements in Compressed Sparse Row format>

## Relation Extraction

In [73]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.6.0-py2.py3-none-any.whl (22 kB)
Collecting rich>=10.4.0
  Downloading rich-12.6.0-py3-none-any.whl (237 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
Collecting commonmark<0.10.0,>=0.9.0
  Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.0-py3-none-any.whl (5.3 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp39-cp39-win_amd64.whl (3.3 MB)
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: file

In [74]:
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
nlp = spacy.load('en_core_web_sm')

from keybert import KeyBERT

In [50]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
#for file in list_dataset: ##clean loop
pages_content, pages_sentences, all_sentences = extract_pages_sentences(spacy_model,extract_pdf("british-land-sustainability-accounts-2022.pdf"))
corpus.extend(all_sentences)
np.shape(corpus)

(988,)

In [51]:
#Store corpus in text file
with open("bl_corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [52]:
corpus_data = open('bl_corpus.txt')
corpus = json.load(corpus_data)
corpus

['Our ESG journey Our 2030 commitments Performance overview (KPIs) Net Zero carbon Place based approach Responsible business Performance data 2022 EPRA index SASB index Reporting criteria Assurance Reports 2 3 4 5 9 17 24 27 61 64 66 85 Read more about our TCFD disclosures in our 2022 Annual Report Against a challenging backdrop, our teams have done a fantastic job this year, working with our customers and partners to support local communities.',
 'As we go into a new year, with new uncertainties on the horizon, their dedication will be more vital than ever.',
 'Introduction This year we celebrate 20 years of sustainability reporting, and never has our environmental and social focus been more integral to the way we do business.',
 'The climate emergency is high on everyones agenda; we face a cost of living crisis and the impact of Covid 19 continues to reverberate throughout our communities.',
 'This means progressing our pathway to net zero and making a positive impact at our places i

In [53]:
res = pd.DataFrame(corpus, columns=['words'])

In [54]:
res

Unnamed: 0,words
0,Our ESG journey Our 2030 commitments Performan...
1,"As we go into a new year, with new uncertainti..."
2,Introduction This year we celebrate 20 years o...
3,The climate emergency is high on everyones age...
4,This means progressing our pathway to net zero...
...,...
983,DNV expressly disclaims any liability or co-re...
984,Responsibilities of the Board of Directors of ...
985,Our responsibility is to plan and perform our ...
986,We have not been responsible for the preparati...


In [55]:
res.to_csv('test.csv', index=False)

In [80]:
df_w = pd.read_csv('test.csv')

In [81]:
df_w.head()

Unnamed: 0,words
0,Our ESG journey Our 2030 commitments Performan...
1,"As we go into a new year, with new uncertainti..."
2,Introduction This year we celebrate 20 years o...
3,The climate emergency is high on everyones age...
4,This means progressing our pathway to net zero...


In [59]:
def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""
    
    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence
    
    prefix = ""
    modifier = ""

  #############################################################
    for tok in nlp(sent):
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " "+ tok.text
      
            # check: token is a modifier or not
            if tok.dep_.endswith("mod") == True:
                modifier = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " "+ tok.text
            
            ## chunk 3
            if tok.dep_.find("subj") == True:
                ent1 = modifier +" "+ prefix + " "+ tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""      
                
            ## chunk 4
            if tok.dep_.find("obj") == True:
                ent2 = modifier +" "+ prefix +" "+ tok.text
                
            ## chunk 5  
            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
  #############################################################
    return [ent1.strip(), ent2.strip()]

In [60]:
get_entities('We have a transition plan')

['We', 'transition plan']

In [62]:
entities = []
for i in tqdm(df_w['words']):
    entities.append(get_entities(i))

  0%|          | 0/988 [00:00<?, ?it/s]

In [63]:
entities

[['challenging Annual teams', 'local  communities'],
 ['new  dedication', 'new  horizon'],
 ['more  we', 'business'],
 ['living impact', '19  communities'],
 ['progressing', 'positive  places'],
 ['We', 'strong  progress'],
 ['where Canada we', 'sustainable market buildings'],
 ['also  customers', 'also  what'],
 ['1 leasing which', 'ahead  completion'],
 ['efficient  programme', 'efficient  that'],
 ['One  outcome', 'expected MEES legislation'],
 ['where Estate Sustainability we', '* developments'],
 ['local  us', 'biggest  impact'],
 ['year community activities', '24,000  people'],
 ['who', 'powerful  p23'],
 ['', 'ten National Literacy Trust'],
 ['challenging  teams', 'local  communities'],
 ['huge  places', 'huge  contribution'],
 ['new  dedication', 'new  uncertainties'],
 ['outstanding  which', 'long  term'],
 ['who', 'them'],
 ['more  we', 'pandemic'],
 ['zero carbon development', 'teak community reforestation Ghana'],
 ['third party we', 'standing energy portfolio'],
 ['', '1 M

In [64]:
def get_relation(sent):
    doc = nlp(sent)
    
    # Matcher class object 
    matcher = Matcher(nlp.vocab)
    
    #define the pattern 
    pattern = [{'DEP':'ROOT'}, {'DEP':'prep','OP':"?"},
               {'DEP':'agent','OP':"?"},  {'POS':'ADJ','OP':"?"}] 

    matcher.add("matching_1",[pattern]) 

    matches = matcher(doc)
    k = len(matches) - 1

    span = doc[matches[k][1]:matches[k][2]] 

    return(span.text)

In [65]:
relations = []
for i in tqdm(df['words']):
    relations.append(get_relation(i))

  0%|          | 0/988 [00:00<?, ?it/s]

In [66]:
relations

['done',
 'be',
 'celebrate',
 'face',
 'means',
 'continued',
 'completed',
 'is',
 'underlined',
 'completed',
 'be',
 'were delighted',
 'allow',
 'benefitted',
 'is',
 'marked',
 'done',
 'like',
 'be',
 'Prefer',
 'do',
 'been',
 'completed',
 'completed',
 'Awarded',
 'pipeline',
 'Whole',
 'Impacted by',
 'launched',
 'complexes',
 'celebrate',
 'Launched',
 'member of',
 'Launched',
 'Signatory to',
 'projects',
 'system',
 'assessments',
 'established',
 'works',
 'Supported',
 'Launched',
 'Started',
 'Launched',
 'Launched',
 'Worked with',
 'Launched',
 'levy of',
 'Launched',
 'validated',
 'sets',
 'offset embodied',
 'Overview',
 'demonstrate',
 'is',
 'means',
 'work',
 'Connect',
 'are',
 'been on',
 'advocate responsible',
 'set',
 'rated',
 'upgraded',
 'pursued',
 'built',
 'pursuing',
 'increase',
 'created',
 'use',
 'Carbon',
 'have',
 'set',
 'delivered on',
 'committed to',
 'adopted',
 'benchmark',
 'helping',
 'study',
 'assessed with most',
 'rebuilt',
 'usi

In [68]:
dict_df = {'entities': entities, 'relations': relations}

In [70]:
df = pd.DataFrame(dict_df)

In [71]:
df

Unnamed: 0,entities,relations
0,"[challenging Annual teams, local communities]",done
1,"[new dedication, new horizon]",be
2,"[more we, business]",celebrate
3,"[living impact, 19 communities]",face
4,"[progressing, positive places]",means
...,...,...
983,"[expressly person, Limited Assurance Report]",disclaims
984,"[internal Selected that, established Selected ...",have sole
985,"[Selected Information, limited assurance work]",is
986,"[We, Report]",been responsible


In [72]:
df['entities'].str.contains('local')

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
983   NaN
984   NaN
985   NaN
986   NaN
987   NaN
Name: entities, Length: 988, dtype: float64

### KeyBERT

In [75]:
bert=KeyBERT()

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [79]:
df

Unnamed: 0,entities,relations
0,"[challenging Annual teams, local communities]",done
1,"[new dedication, new horizon]",be
2,"[more we, business]",celebrate
3,"[living impact, 19 communities]",face
4,"[progressing, positive places]",means
...,...,...
983,"[expressly person, Limited Assurance Report]",disclaims
984,"[internal Selected that, established Selected ...",have sole
985,"[Selected Information, limited assurance work]",is
986,"[We, Report]",been responsible


In [546]:
kw = []
for i in tqdm(df_w['words']):
    kw.append(bert.extract_keywords(i, keyphrase_ngram_range=(3,3), stop_words='english'))

  0%|          | 0/988 [00:00<?, ?it/s]

In [137]:
df_w

Unnamed: 0,words,kw
0,Our ESG journey Our 2030 commitments Performan...,"[(kpis, 0.4649), (esg, 0.3891), (tcfd, 0.3596)..."
1,"As we go into a new year, with new uncertainti...","[(dedication, 0.467), (uncertainties, 0.3364),..."
2,Introduction This year we celebrate 20 years o...,"[(sustainability, 0.6341), (environmental, 0.3..."
3,The climate emergency is high on everyones age...,"[(covid, 0.4679), (climate, 0.4357), (crisis, ..."
4,This means progressing our pathway to net zero...,"[(pathway, 0.2842), (important, 0.2683), (posi..."
...,...,...
983,DNV expressly disclaims any liability or co-re...,"[(disclaims, 0.5645), (liability, 0.518), (dnv..."
984,Responsibilities of the Board of Directors of ...,"[(responsibilities, 0.436), (directors, 0.4093..."
985,Our responsibility is to plan and perform our ...,"[(responsibility, 0.4778), (assurance, 0.4198)..."
986,We have not been responsible for the preparati...,"[(report, 0.338), (responsible, 0.2899), (prep..."


In [547]:
kw

[[('esg journey 2030', 0.5345),
  ('2022 epra index', 0.5165),
  ('tcfd disclosures 2022', 0.5072),
  ('overview kpis net', 0.494),
  ('2030 commitments performance', 0.4932)],
 [('uncertainties horizon dedication', 0.5294),
  ('horizon dedication vital', 0.5184),
  ('year new uncertainties', 0.5093),
  ('new uncertainties horizon', 0.3787),
  ('new year new', 0.3575)],
 [('years sustainability reporting', 0.7734),
  ('20 years sustainability', 0.7292),
  ('sustainability reporting environmental', 0.6904),
  ('reporting environmental social', 0.5771),
  ('celebrate 20 years', 0.5148)],
 [('climate emergency high', 0.6828),
  ('crisis impact covid', 0.6692),
  ('living crisis impact', 0.5545),
  ('impact covid 19', 0.5098),
  ('covid 19 continues', 0.507)],
 [('making positive impact', 0.5332),
  ('positive impact places', 0.4954),
  ('impact places important', 0.4419),
  ('net zero making', 0.4281),
  ('pathway net zero', 0.4243)],
 [('deliver strong progress', 0.7612),
  ('strong prog

In [548]:
key = ['ghg', 'sbti', 'tcfd', 'sasb']

In [549]:
def func(kw, key):
    if any(any(w in word[0] for w in key) for word in kw):
        return True

In [550]:
df_w['kw'] = kw

In [552]:
df_filtered = df_w[df_w['kw'].apply(lambda x: func(x, key)) == True]

In [553]:
df_filtered

Unnamed: 0,words,kw
0,Our ESG journey Our 2030 commitments Performan...,"[(esg journey 2030, 0.5345), (2022 epra index,..."
278,Location based: Our use of the 2021 UK greenho...,"[(2021 uk greenhouse, 0.7075), (uk greenhouse ..."
385,Overview SASB table This is British Lands firs...,"[(lands sustainability accounts, 0.6194), (ove..."
465,Scope 1 and 2 GHG emissions intensities for EP...,"[(emissions intensities epra, 0.7542), (intens..."
466,"For the financial ratio, see GHG Emissions Sco...","[(emissions scope financial, 0.7036), (ghg emi..."
471,"Carbon intensity includes Scope 1, 2 and 3 GHG...","[(carbon intensity includes, 0.718), (scope gh..."
503,Scope 1 and 2 GHG emissions financial intensit...,"[(scope ghg emissions, 0.7161), (emissions fin..."
504,Financial intensity ratio expresses absolute S...,"[(financial intensity ratio, 0.668), (intensit..."
505,Absolute Scope 1 and 2 GHG emissions relate to...,"[(scope ghg emissions, 0.7984), (ghg emissions..."
511,Includes 100% of GHG emissions from our develo...,"[(ghg emissions developments, 0.6352), (emissi..."


In [554]:
df_audit = df_w[df_w['kw'].apply(lambda x: func(x, ['externally assured', 'independently assured', 'independent limited assurance'])) == True]

In [555]:
df_audit

Unnamed: 0,words,kw
263,Selected datahas been independently assured si...,"[(datahas independently assured, 0.7164), (sel..."
418,Certain key data is independently assured (see...,"[(data independently assured, 0.7789), (key da..."
942,This conclusion relates only to the Selected I...,"[(limitations explained overleaf, 0.7038), (li..."
983,DNV expressly disclaims any liability or co-re...,"[(dnv expressly disclaims, 0.839), (disclaims ..."


In [557]:
df_audit['words'].apply(lambda x: print(x))

Selected datahas been independently assured since 2007 (see earlier reports).
Certain key data is independently assured (see below).
This conclusion relates only to the Selected Information, and is to be read in the context of this Independent Limited Assurance Report, in particular the inherent limitations explained overleaf.
DNV expressly disclaims any liability or co-responsibility for any decision a person or an entity may make based on this Independent Limited Assurance Report.


263    None
418    None
942    None
983    None
Name: words, dtype: object

In [562]:
df_compensation = df_w[df_w['kw'].apply(lambda x: func(x, ['targets', 'sustainable practices'])) == True]

In [563]:
df_compensation

Unnamed: 0,words,kw
49,Our Bright Lights skills and employment progra...,"[(carbon reduction targets, 0.6085), (lights s..."
55,Top 81st percentile Science Based Targets: app...,"[(81st percentile science, 0.6119), (percentil..."
63,"From 2021, the 2030 strategy upgraded our BREE...","[(2030 strategy upgraded, 0.7276), (2021 2030 ..."
264,"SBTi, Net Zero targets and greenhouse gas inte...","[(portfolios carbon intensity, 0.5917), (affec..."
460,These targets are based on improvements in who...,"[(improvements building intensity, 0.8052), (t..."
946,SBTi NZC targets greenhouse gas intensity 2.,"[(nzc targets greenhouse, 0.7199), (sbti nzc t..."


In [159]:
df_compensation['words'].apply(lambda x: print(x))

Our Bright Lights skills and employment programme Science Based Targets initiative validated our carbon reduction targets.
Top 81st percentile Science Based Targets: approval in 2021 Our Place Based approach means understanding the most important issues and opportunities in the communities around each of our places and focusing our efforts collaboratively to make the biggest impact at each place.
From 2021, the 2030 strategy upgraded our BREEAM targets to Outstanding for Offices (from Excellent) and Excellent for Retail (from Very Good) 2.
SBTi, Net Zero targets and greenhouse gas intensity For the second consecutive year, COVID-19 and related government restrictions has significantly affected our portfolios carbon intensity, representing the majority of the reduction.
These targets are based on improvements in whole building intensity.
SBTi NZC targets greenhouse gas intensity 2.


49     None
55     None
63     None
264    None
460    None
946    None
Name: words, dtype: object

In [152]:
df_audit['words'].apply(lambda x: print(x))

On the standing portfolio, we completed our net zero audits and are working with our customers to invest into the energy efficient interventions that programme has identified.
Net zero asset audits completed Working with third party experts, we completed 29 audits across the portfolio identifying opportunities to improve the energy efficiency of our standing portfolio.
This year we completed 29 net zero audits of buildings across our portfolio identifying initiatives which would deliver at least a 25% improvement in whole building energy efficiency.
Overview Net Zero Carbon Reducing operational carbon continued Increasing energy efficiency on the standing portfolio This year, third party consultants completed net zero audits covering 29 of our major assets.
Through our partnership with anti-modern slavery charity Unseen, we also undertook independent audits of 10 of our suppliers operating in higher risk areas (with 11 more scheduled), reviewing compliance on 12 key areas of our Suppli

9      None
23     None
105    None
113    None
254    None
256    None
298    None
392    None
400    None
Name: words, dtype: object

In [125]:
def get_kw(df, key):
    kw = []
    for i in tqdm(df['words']):
        kw.append(bert.extract_keywords(i, stop_words='english'))
    
    df['kw'] = kw
    df_filtered = df_w[df_w['kw'].apply(lambda x: func(x, key)) == True]
    if not df_filtered.empty:
        print("True")
        print(df['words'])

In [126]:
get_kw(df_w, ['audited'])

  0%|          | 0/988 [00:00<?, ?it/s]

### BERT Test

In [161]:
!pip install transformers



In [162]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [164]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [465]:
df_w

Unnamed: 0,words,kw
0,Our ESG journey Our 2030 commitments Performan...,"[(overview kpis, 0.5098), (kpis net, 0.474), (..."
1,"As we go into a new year, with new uncertainti...","[(dedication vital, 0.572), (horizon dedicatio..."
2,Introduction This year we celebrate 20 years o...,"[(years sustainability, 0.7121), (sustainabili..."
3,The climate emergency is high on everyones age...,"[(climate emergency, 0.6627), (crisis impact, ..."
4,This means progressing our pathway to net zero...,"[(positive impact, 0.4841), (net zero, 0.4431)..."
...,...,...
983,DNV expressly disclaims any liability or co-re...,"[(disclaims liability, 0.7088), (expressly dis..."
984,Responsibilities of the Board of Directors of ...,"[(responsibilities board, 0.6026), (board dire..."
985,Our responsibility is to plan and perform our ...,"[(responsibility plan, 0.565), (assurance conc..."
986,We have not been responsible for the preparati...,"[(preparation report, 0.4001), (responsible pr..."


In [458]:
question = "Which standard is used to measure carbon emissions? "

In [459]:
paragraph = """
We use internal methodology to measuer carbon emissions
"""

In [460]:
encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)

In [461]:
inputs = encoding['input_ids']  #Token embeddings
sentence_embedding = encoding['token_type_ids']  #Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

In [462]:
score = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

In [463]:
start_index = torch.argmax(score.start_logits)

end_index = torch.argmax(score.end_logits)

answer = ' '.join(tokens[start_index:end_index+1])

In [464]:
corrected_answer = ''

for word in answer.split():
    
    #If it's a subword token
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word

print(corrected_answer)

 internal methodology


In [565]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tok = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [468]:
def test(question, paragraph, model, tokenizer):
    encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)
    inputs = encoding['input_ids']  #Token embeddings
    sentence_embedding = encoding['token_type_ids']  #Segment embeddings
    tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
    score = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
    
    start_index = torch.argmax(score.start_logits)

    end_index = torch.argmax(score.end_logits)

    answer = ' '.join(tokens[start_index:end_index+1])
    
    corrected_answer = ''

    for word in answer.split():
    
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word
    print(corrected_answer)
    return corrected_answer

In [474]:
df_new = df_w[['words']].drop_duplicates()

In [475]:
df_new

Unnamed: 0,words
0,Our ESG journey Our 2030 commitments Performan...
1,"As we go into a new year, with new uncertainti..."
2,Introduction This year we celebrate 20 years o...
3,The climate emergency is high on everyones age...
4,This means progressing our pathway to net zero...
...,...
983,DNV expressly disclaims any liability or co-re...
984,Responsibilities of the Board of Directors of ...
985,Our responsibility is to plan and perform our ...
986,We have not been responsible for the preparati...


In [569]:
df_new['kw2'] = df_new['words'].apply(lambda x: test("Are your emission reduction targets independently assured?", x, model, tok))

 criteria assurance reports
 progressing our pathway to net zero and making a positive impact at our places is more important than ever
 canada water
 fully let ( or under option ) four years ahead of completion
 we completed our net zero audits
 one outcome will be to raise epc ratings across our portfolio to a or b by 2030 in line with expected mees legislation
 gresb
 our place based approach and our strong local relationships
 this year marked ten years of our partnership with the national literacy trust
 certified schemes
 working with third party experts
 whole building operational energy intensity improvement vs 2019 baseline for offices 3
 breeam certification framework
 one of londons biggest rehearsal and artist development complexes

 sustainability brief for developments

 signatory to the un global compact
 independently assured
 completed first whole life carbon assessments
 our sustainability brief established the social and environmental requirements for our development

 by working with these suppliers improvement opportunities will be identified and agreed
 our supplier excellence awards we are delighted to be recognised by british land in their first supplier awards
 havent seen any other client take this approach to service partner recognition
 british land sustainability
 full assurance reports
 selected datahas been independently assured since 2007
 net zero targets
 we have also delivered initiatives which enhance energy efficiency
 independently assured
 the embodied emissions are offset after the project achieves the practical completion
 until the uk government specifies the long - term strategy for low - carbon heating ( including the role of hydrogen and all - electric buildings ) , y will include the use of renewable bio - gas .
 exploring all electric option targeting < 80
 not undertaken the full nabers accreditation
 tbc data will be available in future years
 estimated energy consumption in retail units assumes regular operations .
 th

 any new acquisitions of fully operational properties must have reached at least 80 % occupancy
 energy , carbon and water relate to the whole building including both landlord and occupier areas plus any vacant space .
 net lettable areas ( nla ) .
 in 2021 floor area figures were sourced from mid - year valuations
 must have been managed by british land for at least 12 months and have reached at least 80 % occupancy
 landlord common parts intensity only is reported until occupier data can be obtained
 only is reported until occupier data can be obtained
 independently assured ? [SEP]
 it has both negligible landlord procured common parts consumption and no appropriate denominator
 shopping villages and high street retail : energy and carbon intensity to be reported when further data is available
 shopping villages have external walkways and common areas beyond car parks but which are not enclosed
 neither common parts floor area nor car park spaces is an appropriate denominator in the

 where an estimate is not available
 sub - metered
 where sub - metering is not available
 measured
 it is estimated based on methods that reflect what equipment is being used for
 on - site renewables at our offices comprise photovoltaic panels
 energy is used on site and included in common parts data .
 low carbon technologies
 independently assured ? [SEP]
 on - site renewables in retail
 we have assumed that all electricity generated at our retail sites has been exported to the grid



 we have included energy consumption in our flexible workspace offices ( storey ) in our reporting
 generation
 where this occurs , it is acceptableto default to the higher rating
 where multiple sustainability certifications are held for the same building , the following procedure is applied for determining which certification is reported
 the default selection for reporting is the development certification
 if a further operational certification is sought for the purpose of improving an existing de

 affordable housing : constructing affordable housing , not including design fees
 public space and environment : environmental or art enhancements with a clear community benefit , regardless of land ownership
 accessibility and transport : contributions to highways , roads or public spaces outside our ownership boundary , including payments made to local authorities
 not captured in our community investment programme
 figures are based on spend in the financial year
 data is estimated by our cost consultants based on their professional knowledge and project understanding , and pro - rated monthly across the construction period .
 scope
 scoring changed to out of 45 , with 5 points available for innovation
 target score for all projects remains 40
 independently assured
 spend data is cumulative
 an organisation with a postcode within the defined s106 agreement
 local branches of national firms are included if within the defined area
 an organisation employing fewer than 250 people
 co

In [567]:
df_audit['kw2'] = df_audit['words'].apply(lambda x: test('Are your emission reduction targets independently assured?', x, model, tok))

 selected datahas been independently assured since 2007
 certain key data
 independent limited assurance report
 dnv expressly disclaims any liability or co - responsibility for any decision a person or an entity may make based on this independent limited assurance report


In [508]:
df_new = df_new[~df_new['kw2'].str.contains('SEP')]

In [511]:
df_d = df_new[df_new['kw2'].str.contains('independent|externally|external')]

In [512]:
df_d['words'].apply(lambda x: print(x))

Selected datahas been independently assured since 2007 (see earlier reports).
Overview Carbon emissions (continued) Fig.
Overview Energy use Fig.
Overview Energy use (continued) Fig.
Overview Water use Fig.
Overview Water use (continued) Fig.
Overview Overview Waste and materials Fig.
Overview Waste and materials (continued) Fig.
Overview Biodiversity Fig.
Overview Community Fig.
Overview Contributions and investment Fig.
Overview Health and safety Fig.
Overview Health and safety (continued) Fig.
Employee training proportion by category DSE Assessment and Training (previously referred to as Health and Safety training), fell significantly during the year due to an alternative focus on working from home and returning to the office assessments.
In 2022 we undertook a number of independent water audits in order to identify existing or potential issues.
It also sets out the overall principles, boundaries, scope and methodologies applied when reporting sustainability data in our 2022 Annual 

263    None
266    None
299    None
302    None
318    None
321    None
322    None
323    None
338    None
340    None
344    None
355    None
356    None
375    None
392    None
406    None
407    None
418    None
423    None
495    None
564    None
617    None
652    None
705    None
725    None
828    None
935    None
942    None
951    None
952    None
954    None
962    None
972    None
983    None
985    None
Name: words, dtype: object

In [494]:
df_c = df_new[df_new['kw'].str.contains('ghg')]

In [497]:
df_c['words'].apply(lambda x: print(x))

For the financial ratio, see GHG Emissions Scope 1 and 2 Financial Intensity Measures.
Table 4 shows the combined carbon-equivalent emission factors for the different GHGs considered.


466    None
615    None
Name: words, dtype: object