## Note: Should not be run unless you like to re-train your models

In [9]:
######################################### IMPORTING PACAKGES #############################
# Basic ML Packages
from scipy import spatial
import pandas as pd
import math
import os
import json
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Others
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

# Text pre-processing (Tokenization, Stemming, Lemmatization)
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Pdf Extraction Model
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=['ner'])

#Gensim stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

# Train Test Split
from sklearn.model_selection import train_test_split

# Tf-Idf Vectorization
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

DATA_FOLDER = "datasets/real_estate"

[nltk_data] Downloading package punkt to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
def extract_pdf(file_path):
    """
    Process raw PDF text to structured and processed PDF text to be worked on in Python.
    Parameters
    ----------
    file_path : Relative Location of File
    Return
    ------
    text : str
        processed PDF text if no error is throw
    """   

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec=codec, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                        pagenos, 
                                        maxpages=maxpages,
                                        password=password,
                                        caching=True,
                                        check_extractable=False):

                page_interpreter.process_page(page)

                content.append(fake_file_handle.getvalue())

                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)        

        text = '##PAGE_BREAK##'.join(content)

        # close open handles
        converter.close()
        fake_file_handle.close()
        
        return text

    except Exception as e:
        print(e)

        # close open handles
        converter.close()
        fake_file_handle.close()

        return ""

In [11]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Helper Function to preprocess and clean sentences from raw PDF text 
    Parameters
    ----------
    line_input : str
        String that contains a sentence to be cleaned
    Return
    ------
    line : str
        Cleaned sentence
    ----------
    Sub: Substitute regular expression
    Split: Remove blank space from front and rear 
    """  
    # removing header number
    line = re.sub(r'^\s?\d+(.*)$', r'\1', line_input)
    # removing trailing spaces
    line = line.strip()
    # words may be split between lines, ensure we link them back together
    line = re.sub(r'\s?-\s?', '-', line)
    # remove space prior to punctuation
    line = re.sub(r'\s?([,:;\.])', r'\1', line)
    # ESG contains a lot of figures that are not relevant to grammatical structure
    line = re.sub(r'\d{5,}', r' ', line)
    # remove emails
    line = re.sub(r'\S*@\S*\s?', '', line)
    # remove mentions of URLs
    line = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', line)
    # remove multiple spaces
    line = re.sub(r'\s+', ' ', line)
    # join next line with space
    line = re.sub(r' \n', ' ', line)
    line = re.sub(r'.\n', '. ', line)
    line = re.sub(r'\x0c', ' ', line)
    
    return line

In [12]:
def remove_non_ascii(text):
    """
    Helper Function to remove non ascii characters from text
    Printable will 
    """
    printable = set(string.printable) #Convert iterable to set
    return ''.join(filter(lambda x: x in printable, text))

def not_header(line):
    """
    Helper Function to remove headers
    Check if all the characters are in upper case
    """
    return not line.isupper()

In [13]:
def extract_pages_sentences(nlp, text):    
    """
    Extracting text from raw PDF text and store them by pages and senteces. Raw text is also cleand by removing junk, URLs, etc.
    Consecutive lines are also grouped into paragraphs and spacy is used to parse sentences.
    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text
    Return
    ------
    pages_content : list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    
    pages_sentences : list of list
        A list containing lists. Page number is the index of outer list + 1. Inner list contains sentences from each page
 
    """  
    
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        print(f"Extracting page number: {i}")
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
        
        # if len(text.split(' ')) < MIN_WORDS_PER_PAGE:
        #     print(f'Skipped Page: {page_number}')
        #     continue
        
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []
    all_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return pages_content, pages_sentences, all_sentences #list, list of list where page is index of outer list, list of sentences

In [14]:
def extract_sentences(nlp, text):
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
 
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []
    all_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return all_sentences #list of sentences

In [None]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
#for file in list_dataset: ##clean loop
pages_content, pages_sentences, all_sentences = extract_pages_sentences(spacy_model,extract_pdf(DATA_FOLDER + "/Befimmo 2021 Sustainability Report.pdf"))
corpus.extend(all_sentences)
np.shape(corpus)

In [8]:
#Store corpus in text file
with open("bl_corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [9]:
corpus_data = open('bl_corpus.txt')
corpus = json.load(corpus_data)
corpus

['Our ESG journey Our 2030 commitments Performance overview (KPIs) Net Zero carbon Place based approach Responsible business Performance data 2022 EPRA index SASB index Reporting criteria Assurance Reports 2 3 4 5 9 17 24 27 61 64 66 85 Read more about our TCFD disclosures in our 2022 Annual Report Against a challenging backdrop, our teams have done a fantastic job this year, working with our customers and partners to support local communities.',
 'As we go into a new year, with new uncertainties on the horizon, their dedication will be more vital than ever.',
 'Introduction This year we celebrate 20 years of sustainability reporting, and never has our environmental and social focus been more integral to the way we do business.',
 'The climate emergency is high on everyones agenda; we face a cost of living crisis and the impact of Covid 19 continues to reverberate throughout our communities.',
 'This means progressing our pathway to net zero and making a positive impact at our places i

In [10]:
from tabulate import tabulate

def stemming(corpus):
    stemmer = SnowballStemmer(language='english')
    revisions = [stemmer.stem(line) for line in corpus]
    return revisions

In [11]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

def lemmatization(corpus):
    lemmatizer = WordNetLemmatizer()
    revisions = [lemmatizer.lemmatize(line) for line in corpus]
    return revisions
    

[nltk_data] Downloading package wordnet to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
tokenizer = TreebankWordTokenizer()
tokenize_output = tokenizer.tokenize(corpus[15])
stemming(tokenize_output)

['this',
 'year',
 'mark',
 'ten',
 'year',
 'of',
 'our',
 'partnership',
 'with',
 'the',
 'nation',
 'literaci',
 'trust',
 '.']

In [13]:
lemmatization(stemming(tokenize_output))

['this',
 'year',
 'mark',
 'ten',
 'year',
 'of',
 'our',
 'partnership',
 'with',
 'the',
 'nation',
 'literaci',
 'trust',
 '.']

In [14]:
def remove_stop_words(corpus):
    revisions = [remove_stopwords(line) for line in corpus]
    return revisions 

In [15]:
remove_stop_words(corpus)

['Our ESG journey Our 2030 commitments Performance overview (KPIs) Net Zero carbon Place based approach Responsible business Performance data 2022 EPRA index SASB index Reporting criteria Assurance Reports 2 3 4 5 9 17 24 27 61 64 66 85 Read TCFD disclosures 2022 Annual Report Against challenging backdrop, teams fantastic job year, working customers partners support local communities.',
 'As new year, new uncertainties horizon, dedication vital ever.',
 'Introduction This year celebrate 20 years sustainability reporting, environmental social focus integral way business.',
 'The climate emergency high everyones agenda; face cost living crisis impact Covid 19 continues reverberate communities.',
 'This means progressing pathway net zero making positive impact places important ever.',
 'We continued deliver strong progress year.',
 'At 1 Triton Square, completed second net zero carbon development site Canada Water, piloting innovative building materials technologies deliver market leading

In [16]:
def pre_processing(corpus):
    return lemmatization(stemming(remove_stop_words(corpus)))

In [17]:
def label_relevancy(corpus, related_words):
    related_words = pre_processing(related_words)
    corpus = pre_processing(corpus)
    related_sentences = [line for line in corpus if any(word in line for word in related_words)]
    unrelated_sentences = [line for line in corpus if (line not in related_sentences)]
    all_sentences = related_sentences + unrelated_sentences
    all_data = pd.DataFrame(all_sentences, columns=['corpus'])
    all_data['have_transition_plan'] = all_data['corpus'].isin(related_sentences)
    return related_sentences, unrelated_sentences, all_data

In [18]:
related_words =  ['externally assured', 'independently assured', 'independent limited assurance']
related_sentences, unrelated_sentences, all_data = label_relevancy(corpus, related_words)
all_data

Unnamed: 0,corpus,have_transition_plan
0,selected datahas independently assured 2007 (s...,True
1,certain key data independently assured (see be...,True
2,we engaged dnv perform independent limited ass...,True
3,overview independent limited assurance report ...,True
4,"this conclusion relates selected information, ...",True
...,...,...
983,"the selection different, acceptable, measureme...",False
984,our assurance relies premise data information ...,False
985,responsibilities board directors british land ...,False
986,we responsible preparation report.,False


In [19]:
all_data.to_csv('bl_ratios.csv')

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test= train_test_split(related_sentences, test_size=0.5)
X_train[:5]

['certain key data independently assured (see below).',
 'dnv expressly disclaims liability co-responsibility decision person entity based independent limited assurance report.',
 'selected datahas independently assured 2007 (see earlier reports).']

In [21]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [22]:
tf_idf_matrix.shape

(3, 26)

In [23]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in X_train]
df = pd.DataFrame(tf_idf_matrix.T.todense(), index=feature_names, columns=corpus_index)
df

Unnamed: 0,certain key data independently assured (see below).,dnv expressly disclaims liability co-responsibility decision person entity based independent limited assurance report.,selected datahas independently assured 2007 (see earlier reports).
2007,0.0,0.0,0.385323
assurance,0.0,0.267261,0.0
assured,0.31757,0.0,0.293048
based,0.0,0.267261,0.0
below,0.417567,0.0,0.0
certain,0.417567,0.0,0.0
co,0.0,0.267261,0.0
data,0.417567,0.0,0.0
datahas,0.0,0.0,0.385323
decision,0.0,0.267261,0.0


In [24]:
df.T

Unnamed: 0,2007,assurance,assured,based,below,certain,co,data,datahas,decision,...,independently,key,liability,limited,person,report,reports,responsibility,see,selected
certain key data independently assured (see below).,0.0,0.0,0.31757,0.0,0.417567,0.417567,0.0,0.417567,0.0,0.0,...,0.31757,0.417567,0.0,0.0,0.0,0.0,0.0,0.0,0.31757,0.0
dnv expressly disclaims liability co-responsibility decision person entity based independent limited assurance report.,0.0,0.267261,0.0,0.267261,0.0,0.0,0.267261,0.0,0.0,0.267261,...,0.0,0.0,0.267261,0.267261,0.267261,0.267261,0.0,0.267261,0.0,0.0
selected datahas independently assured 2007 (see earlier reports).,0.385323,0.0,0.293048,0.0,0.0,0.0,0.0,0.0,0.385323,0.0,...,0.293048,0.0,0.0,0.0,0.0,0.0,0.385323,0.0,0.293048,0.385323


In [25]:
import math

def find_related_sentences(file_path):
    spacy_model = spacy.load("en_core_web_sm")
    im = extract_sentences(spacy_model, extract_pdf(file_path)) #Get list of corpus
    im = pre_processing(im)
    input_len = len(im)
    input_vec = vectorizer.transform(im)
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
    min_angle = min(angles)
    threshold_value = 30
    index1 = math.ceil(np.argmin(angles) / input_len)
    index2 = np.argmin(angles) % input_len
    new_line = '\n'
    if min_angle <= threshold_value:
        print(f"Low carbon transition plan found: {new_line} Sentence in input report: {im[index2]} {new_line} Sentence in training dataset: {corpus[index1]}")
        return True
    else:
        print(f"No low carbon transition plan found {new_line} Closest sentences: {im[index2]}")
        return False

In [26]:
find_related_sentences(DATA_FOLDER + "/British Land 2022 Sustainability Report.pdf")

No low carbon transition plan found 
 Closest sentences: certain key data independently assured (see below).


False

In [27]:
def check_relevancy(sentence):
    sentence = pre_processing(sentence)
    input_vec = vectorizer.transform(sentence) 
    cosine_similarities = []
    angles = []
    for i in range(tf_idf_matrix.shape[0]):
        cosine = cosine_similarity(tf_idf_matrix[i], input_vec)[0]
        angle_list = np.rad2deg(np.arccos(cosine))
        cosine_similarities.append(cosine)
        angles.extend(angle_list)
        
    min_angle = min(angles)
    threshold_value = 60
    return min_angle <= threshold_value


In [28]:
unrelated_sentences

['our esg journey our 2030 commitments performance overview (kpis) net zero carbon place based approach responsible business performance data 2022 epra index sasb index reporting criteria assurance reports 2 3 4 5 9 17 24 27 61 64 66 85 read tcfd disclosures 2022 annual report against challenging backdrop, teams fantastic job year, working customers partners support local communities.',
 'as new year, new uncertainties horizon, dedication vital ever.',
 'introduction this year celebrate 20 years sustainability reporting, environmental social focus integral way business.',
 'the climate emergency high everyones agenda; face cost living crisis impact covid 19 continues reverberate communities.',
 'this means progressing pathway net zero making positive impact places important ever.',
 'we continued deliver strong progress year.',
 'at 1 triton square, completed second net zero carbon development site canada water, piloting innovative building materials technologies deliver market leading

In [29]:
# Test on 20% of the valid result
test_valid = [check_relevancy(line) for line in related_sentences]

In [30]:
print(len(test_valid))
print(sum(test_valid))

7
0


In [31]:
test_invalid = [check_relevancy(line) for line in unrelated_sentences]

In [32]:
X_test

['our responsibility plan perform work obtain limited assurance selected information prepared accordance criteria report british land form independent limited assurance conclusion, based work performed evidence obtained.',
 'we engaged dnv perform independent limited assurance selection material 2022 data.',
 'this conclusion relates selected information, read context independent limited assurance report, particular inherent limitations explained overleaf.',
 'overview independent limited assurance report board directors the british land company plc the british land company plc (british land) commissioned dnv business assurance services uk limited (dnv, we) conduct limited assurance engagement selected information presented sustainability accounts 2022 (the report) reporting year ended 31 march 2022.']

In [33]:
related_sentences

['selected datahas independently assured 2007 (see earlier reports).',
 'certain key data independently assured (see below).',
 'we engaged dnv perform independent limited assurance selection material 2022 data.',
 'overview independent limited assurance report board directors the british land company plc the british land company plc (british land) commissioned dnv business assurance services uk limited (dnv, we) conduct limited assurance engagement selected information presented sustainability accounts 2022 (the report) reporting year ended 31 march 2022.',
 'this conclusion relates selected information, read context independent limited assurance report, particular inherent limitations explained overleaf.',
 'dnv expressly disclaims liability co-responsibility decision person entity based independent limited assurance report.',
 'our responsibility plan perform work obtain limited assurance selected information prepared accordance criteria report british land form independent limited 

## CNN

In [515]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Embedding
from keras.preprocessing import sequence

In [529]:
df_c = pd.read_csv('bl_ratios.csv')

In [532]:
df_c = df_c.drop(columns='Unnamed: 0')

In [537]:
X = df_c['corpus']
y = df_c['have_transition_plan']

In [538]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)
X_train[:5]

906    due small population size ensure anonymity, em...
934    a review allocation male females employee band...
684    in general, offices shopping centres floor are...
922    it excludes new starters, weeks complete train...
404    water intensity data covers buildings offices ...
Name: corpus, dtype: object

In [539]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [540]:
tf_idf_matrix

<691x2470 sparse matrix of type '<class 'numpy.float64'>'
	with 8239 stored elements in Compressed Sparse Row format>

## Relation Extraction

In [7]:
!pip install keybert



In [7]:
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
nlp = spacy.load('en_core_web_sm')

from keybert import KeyBERT

In [9]:
DATA_FOLDER

'datasets/real_estate'

In [10]:
DATA_FOLDER = 'datasets/real_estate'

In [11]:
DATA_FOLDER

'datasets/real_estate'

In [7]:
# Corpus has already been extract to corpus.txt, take around 15 mins to extract 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
#for file in list_dataset: ##clean loop
for file in list_dataset:
    print(file)
    print(DATA_FOLDER + '/' + file)
    pages_content, pages_sentences, all_sentences = extract_pages_sentences(spacy_model,extract_pdf(DATA_FOLDER + '/' + file))
    corpus.extend(all_sentences)
np.shape(corpus)

British Land 2022 Sustainability Report.pdf
datasets/real_estate/British Land 2022 Sustainability Report.pdf
Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting

Extracting page number: 142
Extracting page number: 143
Extracting page number: 144
Extracting page number: 145
Extracting page number: 146
Extracting page number: 147
Extracting page number: 148
Extracting page number: 149
Extracting page number: 150
Extracting page number: 151
Extracting page number: 152
Extracting page number: 153
Extracting page number: 154
Extracting page number: 155
Extracting page number: 156
Extracting page number: 157
Extracting page number: 158
Extracting page number: 159
Extracting page number: 160
Extracting page number: 161
Extracting page number: 162
Extracting page number: 163
Extracting page number: 164
Extracting page number: 165
Extracting page number: 166
Extracting page number: 167
Extracting page number: 168
Extracting page number: 169
Extracting page number: 170
Extracting page number: 171
Extracting page number: 172
Extracting page number: 173
Extracting page number: 174
Extracting page number: 175
Extracting page number: 176
Extracting page numb

Icade 2021 URD.pdf
datasets/real_estate/Icade 2021 URD.pdf
Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extrac

Klepierre 2021 URD.pdf
datasets/real_estate/Klepierre 2021 URD.pdf
Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 3

Prologis 2021-22 ESG Report.pdf
datasets/real_estate/Prologis 2021-22 ESG Report.pdf
Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extract

(15810,)

In [8]:
#Store corpus in text file
with open("auto_corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [15]:
corpus_data = open('auto_corpus.txt')
corpus = json.load(corpus_data)
corpus

['To achieve this, we set ourselves ten ambitious goals along the entire value chain.',
 'The BMW Group Sustainable Value Report (SVR) has been published to provide stakeholders with comprehensive information about the companys sustainability strategy and the progress made in integrating sustainability into its corporate processes.',
 'The Sustainable Value Report is published at the same time as the Annual Report on the date of the Annual Accounts Press Conference.',
 'The requirements of the German CSR Directive Implemen-tation Act (CSR-RUG) obligate Bayerische Motoren Werke Aktiengesellschaft (BMWAG)topublishanon-financial report at company and Group level.',
 'This will be published jointly as an integrated, separate non-financial report (hereinafterreferredtoasseparatenon-financialreport) within this Sustainable Value Report for BMW AG and BMW Group.',
 'In the SVR 2019 we focused on providing information that is required in order to comply with the German CSR Direc-tive Implement

In [16]:
res = pd.DataFrame(corpus, columns=['words'])

In [17]:
res

Unnamed: 0,words
0,"To achieve this, we set ourselves ten ambitiou..."
1,The BMW Group Sustainable Value Report (SVR) h...
2,The Sustainable Value Report is published at t...
3,The requirements of the German CSR Directive I...
4,This will be published jointly as an integrate...
...,...
42581,The table below compares the disclosure recomm...
42582,The Boards oversight of climate-related risks ...
42583,Describe the organizations processes for ident...
42584,Initiatives Climate Change > Risks and Opportu...


In [18]:
res = res.drop_duplicates()

In [19]:
res.to_csv('test.csv', index=False)

In [20]:
df_w = pd.read_csv('test.csv')

In [21]:
df_w.head()

Unnamed: 0,words
0,"To achieve this, we set ourselves ten ambitiou..."
1,The BMW Group Sustainable Value Report (SVR) h...
2,The Sustainable Value Report is published at t...
3,The requirements of the German CSR Directive I...
4,This will be published jointly as an integrate...


In [19]:
def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""
    
    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence
    
    prefix = ""
    modifier = ""

  #############################################################
    for tok in nlp(sent):
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " "+ tok.text
      
            # check: token is a modifier or not
            if tok.dep_.endswith("mod") == True:
                modifier = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " "+ tok.text
            
            ## chunk 3
            if tok.dep_.find("subj") == True:
                ent1 = modifier +" "+ prefix + " "+ tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""      
                
            ## chunk 4
            if tok.dep_.find("obj") == True:
                ent2 = modifier +" "+ prefix +" "+ tok.text
                
            ## chunk 5  
            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
  #############################################################
    return [ent1.strip(), ent2.strip()]

In [20]:
get_entities('We have a transition plan')

['We', 'transition plan']

In [21]:
entities = []
for i in tqdm(df_w['words']):
    entities.append(get_entities(i))

  0%|          | 0/31205 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
entities

In [None]:
def get_relation(sent):
    doc = nlp(sent)
    
    # Matcher class object 
    matcher = Matcher(nlp.vocab)
    
    #define the pattern 
    pattern = [{'DEP':'ROOT'}, {'DEP':'prep','OP':"?"},
               {'DEP':'agent','OP':"?"},  {'POS':'ADJ','OP':"?"}] 

    matcher.add("matching_1",[pattern]) 

    matches = matcher(doc)
    k = len(matches) - 1

    span = doc[matches[k][1]:matches[k][2]] 

    return(span.text)

In [None]:
relations = []
for i in tqdm(df_w['words']):
    relations.append(get_relation(i))

In [None]:
relations

In [None]:
dict_df = {'entities': entities, 'relations': relations}

In [None]:
df = pd.DataFrame(dict_df)

In [None]:
df

In [None]:
df['entities'].str.contains('local')

In [24]:
def keyword_filter(df, keywords):
    filtered = []
    for s in np.array(df['words']):
        sentence = s.lower()
    for k in keywords:
        if k in sentence:
            filtered.append([s, k])
    
    filtered_df = pd.DataFrame(filtered, columns=['sentence', 'keyword(s)']).groupby(['sentence']).agg({'keyword(s)': lambda x: list(x.unique())}).reset_index()
    return filtered_df

In [30]:
keyword_filter(df_w, ['ipcc', 'rcps', 'ssp', 'iea', 'nze', 'aps', 'steps', 'announced pledges', 'stated policies'])

Unnamed: 0,sentence,keyword(s)


In [31]:
df_w

Unnamed: 0,words
0,"To achieve this, we set ourselves ten ambitiou..."
1,The BMW Group Sustainable Value Report (SVR) h...
2,The Sustainable Value Report is published at t...
3,The requirements of the German CSR Directive I...
4,This will be published jointly as an integrate...
...,...
35842,Total number of hours in the reporting period ...
35843,Total number and percentage of significant inv...
35844,The table below compares the disclosure recomm...
35845,The Boards oversight of climate-related risks ...


### KeyBERT

In [9]:
bert=KeyBERT()

In [33]:
kw = []
for i in tqdm(df_w['words']):
    kw.append(bert.extract_keywords(i, keyphrase_ngram_range=(1, 1), stop_words='english'))

  0%|          | 0/23612 [00:00<?, ?it/s]

In [34]:
df_w

Unnamed: 0,words
0,Hear from our Chief Executive Officer and Chie...
1,Our purpose and strategy OUR PURPOSE To provid...
2,OUR STRATEGY To be Canadas leading energy comp...
3,Optimize our base business Expand low-emission...
4,Six strategic objectives support our purpose a...
...,...
23607,TotalEnergies is a global multi-energy company...
23608,"Its 101,000 employees are committed to making ..."
23609,"Present in more than 130 countries, TotalEnerg..."
23610,Its design enables people with motor disabilit...


In [35]:
kw

[[('sustainability', 0.5099),
  ('sustainable', 0.5035),
  ('suncors', 0.4821),
  ('strategy', 0.3831),
  ('2050', 0.305)],
 [('energy', 0.5151),
  ('purpose', 0.454),
  ('earth', 0.3696),
  ('caring', 0.3109),
  ('lives', 0.2894)],
 [('fuels', 0.3961),
  ('canadas', 0.3758),
  ('esg', 0.3645),
  ('energy', 0.3579),
  ('business', 0.3401)],
 [('strategy', 0.4374),
  ('esg', 0.405),
  ('suncors', 0.394),
  ('businesses', 0.3939),
  ('business', 0.3921)],
 [('strategic', 0.5272),
  ('objectives', 0.5031),
  ('strategy', 0.4855),
  ('shareholder', 0.3717),
  ('invested', 0.3065)],
 [('sustainability', 0.5303),
  ('suncors', 0.3859),
  ('climate', 0.3516),
  ('executive', 0.2967),
  ('pandemic', 0.2963)],
 [('strategy', 0.483),
  ('business', 0.4825),
  ('prosperity', 0.4584),
  ('purpose', 0.4158),
  ('competitive', 0.4059)],
 [('energy', 0.5031),
  ('company', 0.4426),
  ('future', 0.3839),
  ('strategy', 0.3702),
  ('evolve', 0.3002)],
 [('2050', 0.4628),
  ('emissions', 0.4301),
  ('ob

In [36]:
key = ['compensation', 'remuneration']

In [37]:
def func(kw, key):
    if any(any(w in word[0] for w in key) for word in kw):
        return True

In [38]:
df_w['kw'] = kw

In [39]:
df_filtered = df_w[df_w['kw'].apply(lambda x: func(x, key)) == True]

In [40]:
df_filtered

Unnamed: 0,words,kw
540,Our total rewards approach for employees is ro...,"[(rewards, 0.5627), (benefits, 0.4376), (compe..."
730,Both the board and management remuneration are...,"[(remuneration, 0.5114), (governance, 0.3968),..."
999,Employee costs are reported in our annual repo...,"[(salaries, 0.4211), (compensation, 0.3805), (..."
1222,"At the same time, the Presidential and Nominat...","[(committee, 0.4404), (executive, 0.4071), (re..."
1223,Remuneration Policy OMV Petrom implemented a n...,"[(remuneration, 0.5896), (petrom, 0.4376), (po..."
...,...,...
23112,The Company supports carbon compensation mecha...,"[(compensation, 0.3974), (sustainability, 0.37..."
23147,"Moreover, it is factored into our compensation...","[(compensation, 0.5924), (policy, 0.3807), (fa..."
23253,A Commitment to a Decent Wage for All In Octob...,"[(wage, 0.456), (compensation, 0.4496), (emplo..."
23255,Decent Compensation for All TotalEnergies is c...,"[(compensation, 0.5705), (totalenergies, 0.410..."


In [41]:
df_filtered[['words']].to_csv('comp_og.csv', index=False)

In [102]:
df_audit = df_w[df_w['kw'].apply(lambda x: func(x, ['externally assured', 'independently assured', 'independent limited assurance'])) == True]

In [103]:
df_audit

Unnamed: 0,words,kw
263,Selected datahas been independently assured si...,"[(datahas independently assured, 0.7164), (sel..."
418,Certain key data is independently assured (see...,"[(data independently assured, 0.7789), (key da..."
942,This conclusion relates only to the Selected I...,"[(limitations explained overleaf, 0.7038), (li..."
983,DNV expressly disclaims any liability or co-re...,"[(dnv expressly disclaims, 0.839), (disclaims ..."


In [104]:
df_audit['words'].apply(lambda x: print(x))

Selected datahas been independently assured since 2007 (see earlier reports).
Certain key data is independently assured (see below).
This conclusion relates only to the Selected Information, and is to be read in the context of this Independent Limited Assurance Report, in particular the inherent limitations explained overleaf.
DNV expressly disclaims any liability or co-responsibility for any decision a person or an entity may make based on this Independent Limited Assurance Report.


263    None
418    None
942    None
983    None
Name: words, dtype: object

In [105]:
df_compensation = df_w[df_w['kw'].apply(lambda x: func(x, ['targets', 'sustainable practices'])) == True]

In [106]:
df_compensation

Unnamed: 0,words,kw
49,Our Bright Lights skills and employment progra...,"[(carbon reduction targets, 0.6085), (lights s..."
55,Top 81st percentile Science Based Targets: app...,"[(81st percentile science, 0.6119), (percentil..."
63,"From 2021, the 2030 strategy upgraded our BREE...","[(2030 strategy upgraded, 0.7276), (2021 2030 ..."
264,"SBTi, Net Zero targets and greenhouse gas inte...","[(portfolios carbon intensity, 0.5917), (affec..."
460,These targets are based on improvements in who...,"[(improvements building intensity, 0.8052), (t..."
946,SBTi NZC targets greenhouse gas intensity 2.,"[(nzc targets greenhouse, 0.7199), (sbti nzc t..."


In [107]:
df_compensation['words'].apply(lambda x: print(x))

Our Bright Lights skills and employment programme Science Based Targets initiative validated our carbon reduction targets.
Top 81st percentile Science Based Targets: approval in 2021 Our Place Based approach means understanding the most important issues and opportunities in the communities around each of our places and focusing our efforts collaboratively to make the biggest impact at each place.
From 2021, the 2030 strategy upgraded our BREEAM targets to Outstanding for Offices (from Excellent) and Excellent for Retail (from Very Good) 2.
SBTi, Net Zero targets and greenhouse gas intensity For the second consecutive year, COVID-19 and related government restrictions has significantly affected our portfolios carbon intensity, representing the majority of the reduction.
These targets are based on improvements in whole building intensity.
SBTi NZC targets greenhouse gas intensity 2.


49     None
55     None
63     None
264    None
460    None
946    None
Name: words, dtype: object

In [108]:
df_audit['words'].apply(lambda x: print(x))

Selected datahas been independently assured since 2007 (see earlier reports).
Certain key data is independently assured (see below).
This conclusion relates only to the Selected Information, and is to be read in the context of this Independent Limited Assurance Report, in particular the inherent limitations explained overleaf.
DNV expressly disclaims any liability or co-responsibility for any decision a person or an entity may make based on this Independent Limited Assurance Report.


263    None
418    None
942    None
983    None
Name: words, dtype: object

In [109]:
def get_kw(df, key):
    kw = []
    for i in tqdm(df['words']):
        kw.append(bert.extract_keywords(i, stop_words='english'))
    
    df['kw'] = kw
    df_filtered = df_w[df_w['kw'].apply(lambda x: func(x, key)) == True]
    if not df_filtered.empty:
        print("True")
        print(df['words'])

In [110]:
get_kw(df_w, ['audited'])

  0%|          | 0/988 [00:00<?, ?it/s]

### BERT Test

In [161]:
!pip install transformers



In [162]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [164]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [465]:
df_w

Unnamed: 0,words,kw
0,Our ESG journey Our 2030 commitments Performan...,"[(overview kpis, 0.5098), (kpis net, 0.474), (..."
1,"As we go into a new year, with new uncertainti...","[(dedication vital, 0.572), (horizon dedicatio..."
2,Introduction This year we celebrate 20 years o...,"[(years sustainability, 0.7121), (sustainabili..."
3,The climate emergency is high on everyones age...,"[(climate emergency, 0.6627), (crisis impact, ..."
4,This means progressing our pathway to net zero...,"[(positive impact, 0.4841), (net zero, 0.4431)..."
...,...,...
983,DNV expressly disclaims any liability or co-re...,"[(disclaims liability, 0.7088), (expressly dis..."
984,Responsibilities of the Board of Directors of ...,"[(responsibilities board, 0.6026), (board dire..."
985,Our responsibility is to plan and perform our ...,"[(responsibility plan, 0.565), (assurance conc..."
986,We have not been responsible for the preparati...,"[(preparation report, 0.4001), (responsible pr..."


In [458]:
question = "Which standard is used to measure carbon emissions? "

In [459]:
paragraph = """
We use internal methodology to measuer carbon emissions
"""

In [460]:
encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)

In [461]:
inputs = encoding['input_ids']  #Token embeddings
sentence_embedding = encoding['token_type_ids']  #Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

In [462]:
score = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

In [463]:
start_index = torch.argmax(score.start_logits)

end_index = torch.argmax(score.end_logits)

answer = ' '.join(tokens[start_index:end_index+1])

In [464]:
corrected_answer = ''

for word in answer.split():
    
    #If it's a subword token
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word

print(corrected_answer)

 internal methodology


In [565]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tok = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [468]:
def test(question, paragraph, model, tokenizer):
    encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)
    inputs = encoding['input_ids']  #Token embeddings
    sentence_embedding = encoding['token_type_ids']  #Segment embeddings
    tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
    score = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
    
    start_index = torch.argmax(score.start_logits)

    end_index = torch.argmax(score.end_logits)

    answer = ' '.join(tokens[start_index:end_index+1])
    
    corrected_answer = ''

    for word in answer.split():
    
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word
    print(corrected_answer)
    return corrected_answer

In [474]:
df_new = df_w[['words']].drop_duplicates()

In [475]:
df_new

Unnamed: 0,words
0,Our ESG journey Our 2030 commitments Performan...
1,"As we go into a new year, with new uncertainti..."
2,Introduction This year we celebrate 20 years o...
3,The climate emergency is high on everyones age...
4,This means progressing our pathway to net zero...
...,...
983,DNV expressly disclaims any liability or co-re...
984,Responsibilities of the Board of Directors of ...
985,Our responsibility is to plan and perform our ...
986,We have not been responsible for the preparati...


In [569]:
df_new['kw2'] = df_new['words'].apply(lambda x: test("Are your emission reduction targets independently assured?", x, model, tok))

 criteria assurance reports
 progressing our pathway to net zero and making a positive impact at our places is more important than ever
 canada water
 fully let ( or under option ) four years ahead of completion
 we completed our net zero audits
 one outcome will be to raise epc ratings across our portfolio to a or b by 2030 in line with expected mees legislation
 gresb
 our place based approach and our strong local relationships
 this year marked ten years of our partnership with the national literacy trust
 certified schemes
 working with third party experts
 whole building operational energy intensity improvement vs 2019 baseline for offices 3
 breeam certification framework
 one of londons biggest rehearsal and artist development complexes

 sustainability brief for developments

 signatory to the un global compact
 independently assured
 completed first whole life carbon assessments
 our sustainability brief established the social and environmental requirements for our development

 by working with these suppliers improvement opportunities will be identified and agreed
 our supplier excellence awards we are delighted to be recognised by british land in their first supplier awards
 havent seen any other client take this approach to service partner recognition
 british land sustainability
 full assurance reports
 selected datahas been independently assured since 2007
 net zero targets
 we have also delivered initiatives which enhance energy efficiency
 independently assured
 the embodied emissions are offset after the project achieves the practical completion
 until the uk government specifies the long - term strategy for low - carbon heating ( including the role of hydrogen and all - electric buildings ) , y will include the use of renewable bio - gas .
 exploring all electric option targeting < 80
 not undertaken the full nabers accreditation
 tbc data will be available in future years
 estimated energy consumption in retail units assumes regular operations .
 th

 any new acquisitions of fully operational properties must have reached at least 80 % occupancy
 energy , carbon and water relate to the whole building including both landlord and occupier areas plus any vacant space .
 net lettable areas ( nla ) .
 in 2021 floor area figures were sourced from mid - year valuations
 must have been managed by british land for at least 12 months and have reached at least 80 % occupancy
 landlord common parts intensity only is reported until occupier data can be obtained
 only is reported until occupier data can be obtained
 independently assured ? [SEP]
 it has both negligible landlord procured common parts consumption and no appropriate denominator
 shopping villages and high street retail : energy and carbon intensity to be reported when further data is available
 shopping villages have external walkways and common areas beyond car parks but which are not enclosed
 neither common parts floor area nor car park spaces is an appropriate denominator in the

 where an estimate is not available
 sub - metered
 where sub - metering is not available
 measured
 it is estimated based on methods that reflect what equipment is being used for
 on - site renewables at our offices comprise photovoltaic panels
 energy is used on site and included in common parts data .
 low carbon technologies
 independently assured ? [SEP]
 on - site renewables in retail
 we have assumed that all electricity generated at our retail sites has been exported to the grid



 we have included energy consumption in our flexible workspace offices ( storey ) in our reporting
 generation
 where this occurs , it is acceptableto default to the higher rating
 where multiple sustainability certifications are held for the same building , the following procedure is applied for determining which certification is reported
 the default selection for reporting is the development certification
 if a further operational certification is sought for the purpose of improving an existing de

 affordable housing : constructing affordable housing , not including design fees
 public space and environment : environmental or art enhancements with a clear community benefit , regardless of land ownership
 accessibility and transport : contributions to highways , roads or public spaces outside our ownership boundary , including payments made to local authorities
 not captured in our community investment programme
 figures are based on spend in the financial year
 data is estimated by our cost consultants based on their professional knowledge and project understanding , and pro - rated monthly across the construction period .
 scope
 scoring changed to out of 45 , with 5 points available for innovation
 target score for all projects remains 40
 independently assured
 spend data is cumulative
 an organisation with a postcode within the defined s106 agreement
 local branches of national firms are included if within the defined area
 an organisation employing fewer than 250 people
 co

In [567]:
df_audit['kw2'] = df_audit['words'].apply(lambda x: test('Are your emission reduction targets independently assured?', x, model, tok))

 selected datahas been independently assured since 2007
 certain key data
 independent limited assurance report
 dnv expressly disclaims any liability or co - responsibility for any decision a person or an entity may make based on this independent limited assurance report


In [508]:
df_new = df_new[~df_new['kw2'].str.contains('SEP')]

In [511]:
df_d = df_new[df_new['kw2'].str.contains('independent|externally|external')]

In [512]:
df_d['words'].apply(lambda x: print(x))

Selected datahas been independently assured since 2007 (see earlier reports).
Overview Carbon emissions (continued) Fig.
Overview Energy use Fig.
Overview Energy use (continued) Fig.
Overview Water use Fig.
Overview Water use (continued) Fig.
Overview Overview Waste and materials Fig.
Overview Waste and materials (continued) Fig.
Overview Biodiversity Fig.
Overview Community Fig.
Overview Contributions and investment Fig.
Overview Health and safety Fig.
Overview Health and safety (continued) Fig.
Employee training proportion by category DSE Assessment and Training (previously referred to as Health and Safety training), fell significantly during the year due to an alternative focus on working from home and returning to the office assessments.
In 2022 we undertook a number of independent water audits in order to identify existing or potential issues.
It also sets out the overall principles, boundaries, scope and methodologies applied when reporting sustainability data in our 2022 Annual 

263    None
266    None
299    None
302    None
318    None
321    None
322    None
323    None
338    None
340    None
344    None
355    None
356    None
375    None
392    None
406    None
407    None
418    None
423    None
495    None
564    None
617    None
652    None
705    None
725    None
828    None
935    None
942    None
951    None
952    None
954    None
962    None
972    None
983    None
985    None
Name: words, dtype: object

In [494]:
df_c = df_new[df_new['kw'].str.contains('ghg')]

In [497]:
df_c['words'].apply(lambda x: print(x))

For the financial ratio, see GHG Emissions Scope 1 and 2 Financial Intensity Measures.
Table 4 shows the combined carbon-equivalent emission factors for the different GHGs considered.


466    None
615    None
Name: words, dtype: object