In [41]:
######################################### IMPORTING PACKAGES #############################
# Basic ML Packages
from scipy import spatial
import math
import os
import json
import string

import warnings
warnings.filterwarnings("ignore")

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Others
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

# Text pre-processing (Tokenization, Stemming, Lemmatization)
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Pdf Extraction Model
import spacy
spacy.cli.download("en_core_web_sm")

#Gensim stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

import numpy as np
import pandas as pd
import PyPDF2
import tabula
from tabula import read_pdf
import io
from functools import reduce
from pdfminer.high_level import extract_text
import pdf2image

import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences

from imblearn.over_sampling import RandomOverSampler

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
nlp = spacy.load('en_core_web_sm')

from keybert import KeyBERT

[nltk_data] Downloading package punkt to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


## Previous Functions

In [2]:
def extract_pdf(file_path):
    """
    Extract the raw text of a PDF, one chunk per page, using pdfminer3.

    Parameters
    ----------
    file_path : str
        Location of the PDF file to read.

    Return
    ------
    text : str
        Text of every page joined with the '##PAGE_BREAK##' marker.
        If anything fails while opening or parsing the file, the error is
        printed and an empty string is returned.
    """
    # Bound up front so the clean-up in ``finally`` is safe even when the
    # failure happens before the converter or buffer has been created.
    converter = None
    fake_file_handle = None

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec=codec, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=False):

                page_interpreter.process_page(page)

                # The buffer holds only the current page: grab it, then reset
                # the buffer so the next page starts from an empty string.
                content.append(fake_file_handle.getvalue())

                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)

        return '##PAGE_BREAK##'.join(content)

    except Exception as e:
        # Best effort: report the problem and let the caller continue with no text.
        print(e)
        return ""

    finally:
        # close open handles (only those that were actually created)
        if converter is not None:
            converter.close()
        if fake_file_handle is not None:
            fake_file_handle.close()

In [3]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Clean a chunk of raw PDF text with a fixed sequence of regex substitutions.

    Parameters
    ----------
    line_input : str
        Raw text to be cleaned.
    Return
    ------
    line : str
        Cleaned text.
    """
    # The leading header number is removed first, then surrounding blanks are trimmed.
    cleaned = re.sub(r'^\s?\d+(.*)$', r'\1', line_input).strip()

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        # words may be split around a hyphen, glue them back together
        (r'\s?-\s?', '-'),
        # drop the blank that precedes punctuation
        (r'\s?([,:;\.])', r'\1'),
        # long digit runs (5+) are figures with no grammatical value
        (r'\d{5,}', r' '),
        # e-mail addresses
        (r'\S*@\S*\s?', ''),
        # URLs
        (r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' '),
        # collapse every run of whitespace into one blank
        (r'\s+', ' '),
        # join a line with the following one
        (r' \n', ' '),
        (r'.\n', '. '),
        # form feed
        (r'\x0c', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

In [4]:
def remove_non_ascii(text):
    """
    Helper Function that keeps only the characters of ``text`` found in
    ``string.printable`` and drops everything else.
    """
    allowed = frozenset(string.printable)
    return ''.join(char for char in text if char in allowed)

def not_header(line):
    """
    Helper Function to spot non-header lines.
    A line counts as a header when ``str.isupper()`` is true for it;
    this returns True for every other line.
    """
    if line.isupper():
        return False
    return True

In [5]:
def extract_pages_sentences(nlp, text):
    """
    Split raw PDF text into pages and sentences. Each page is stripped of
    non-printable characters, its blank-line separated chunks are regrouped
    into paragraphs, the page is cleaned with preprocess_lines and spacy is
    used to parse sentences.
    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text, pages separated by '##PAGE_BREAK##'
    Return
    ------
    pages_content : list of str
        A list containing the cleaned text of each page. Page number is the index of list + 1

    pages_sentences : list of list
        A list containing lists. Page number is the index of outer list + 1. Inner list contains sentences from each page

    all_sentences : list of str
        Every kept sentence of every page, in page order
    """

    pages = text.split('##PAGE_BREAK##')

    lines = []
    for page_number, page in enumerate(pages, start=1):
        # Report the 1-based page number, consistent with the numbering used
        # for pages_content / pages_sentences.
        print(f"Extracting page number: {page_number}")

        # remove non ASCII characters
        page_text = remove_non_ascii(page)

        prev = ""
        for line in page_text.split('\n\n'):
            # aggregate consecutive chunks where text may be broken down:
            # only if next chunk starts with a space or previous does not end with dot.
            if line.startswith(' ') or not prev.endswith('.'):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')

    # Re-assemble one string per page; the piece after the final marker is
    # not a page and is skipped below.
    page_texts = '  '.join(lines).split('##SAME_PAGE##')

    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner

    pages_content = []
    pages_sentences = []
    all_sentences = []

    for page_text in page_texts[:-1]:  # looping through each page

        cleaned = preprocess_lines(page_text)
        pages_content.append(str(cleaned).strip())

        # split paragraphs into well defined sentences using spacy
        sentences = [str(part).strip() for part in nlp(cleaned).sents]

        # Keep only "full" sentences: start with a capital letter, end with
        # . ? or ! and contain none of those characters before the end.
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]

        pages_sentences.append(sentences)
        all_sentences.extend(sentences)
    return pages_content, pages_sentences, all_sentences  # list, list of list where page is index of outer list, list of sentences

In [6]:
def extract_sentences(nlp, text):
    """
    Return the flat list of sentences found in raw PDF text.

    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text, pages separated by '##PAGE_BREAK##'
    Return
    ------
    all_sentences : list of str
        Sentences of every page, in page order. Only sentences that start
        with a capital letter, end with . ? or ! and contain none of those
        characters before the end are kept.
    """
    page_marker = '##SAME_PAGE##'
    full_sentence = '^[A-Z][^?!.]*[?.!]$'

    chunks = []
    for raw_page in text.split('##PAGE_BREAK##'):
        paragraph = ""
        # non ASCII characters are dropped before the page is cut into chunks
        for fragment in remove_non_ascii(raw_page).split('\n\n'):
            # A new paragraph starts only when the previous one ended with a
            # dot and the fragment is not indented; otherwise keep gluing.
            starts_new_paragraph = paragraph.endswith('.') and not fragment.startswith(' ')
            if starts_new_paragraph:
                chunks.append(paragraph)
                paragraph = fragment
            else:
                paragraph = paragraph + ' ' + fragment
        # left-over paragraph, then the end-of-page marker
        chunks.extend([paragraph, page_marker])

    # One string per page; whatever follows the last marker is discarded.
    page_texts = '  '.join(chunks).split(page_marker)[:-1]

    collected = []
    for page_text in page_texts:
        cleaned = preprocess_lines(page_text)
        for span in nlp(cleaned).sents:
            candidate = str(span).strip()
            if re.match(full_sentence, candidate) is not None:
                collected.append(candidate.replace('\n', ' '))
    return collected  # list of sentences

## New Functions

In [7]:
def preprocessing(df):
    """
    Drop duplicated rows and attach KeyBERT key phrases to each remaining row.

    The text is read from the 'words' column; the two-word key phrases
    returned by KeyBERT for each text are stored in a new 'kw' column of the
    de-duplicated frame, which is returned.
    """
    unique_rows = df.drop_duplicates()
    extractor = KeyBERT()
    unique_rows['kw'] = [
        extractor.extract_keywords(document, keyphrase_ngram_range=(2, 2), stop_words='english')
        for document in tqdm(unique_rows['words'])
    ]
    return unique_rows

In [32]:
def keyword_filter(df, keywords):
    """
    Keep the rows of ``df`` whose 'kw' entries mention at least one keyword.

    Each 'kw' cell is iterated as a sequence of items whose first element is
    a phrase; a row is kept when any keyword is a substring of any phrase.

    NOTE: a second ``keyword_filter`` working on the 'sentence' column is
    defined further down in this notebook and replaces this one once its
    cell has been run.
    """
    def mentions_keyword(phrases):
        for phrase in phrases:
            for needle in keywords:
                if needle in phrase[0]:
                    return True
        return False

    keep = df['kw'].apply(mentions_keyword) == True
    return df[keep]

In [45]:
def word_embedding(df, embed_column, attribute_no, embedding_model='tfidf'):
    """
    Encode a text column with the pre-fitted TF-IDF vectoriser of an attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to encode.
    embed_column : str
        Name of the column of ``df`` to encode; its values are lower-cased first.
    attribute_no : int
        Attribute whose vectoriser is loaded: 14, 17, or 7/15 (which share
        'tfidf_15_model.sav').
    embedding_model : str
        Only 'tfidf' is supported.

    Return
    ------
    X_encoded : pandas.DataFrame
        Dense TF-IDF matrix, one row per row of ``df``.

    Raises
    ------
    ValueError
        If ``embedding_model`` is not 'tfidf' (previously this silently returned None).
    Exception
        If no vectoriser exists for ``attribute_no``.
    """
    if embedding_model != 'tfidf':
        raise ValueError(f"Unsupported embedding model: {embedding_model!r}")

    tfidf_files = {
        14: 'tfidf_14_model.sav',
        7: 'tfidf_15_model.sav',
        15: 'tfidf_15_model.sav',
        17: 'tfidf_17_model.sav',
    }
    if attribute_no not in tfidf_files:
        raise Exception(f"Wrong Model used for attribute: {attribute_no}")

    lowered = df[embed_column].apply(lambda x: x.lower())

    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load model files produced by this project.
    with open(tfidf_files[attribute_no], 'rb') as model_file:
        tfidf = pickle.load(model_file)

    encoded = tfidf.transform(lowered)
    X_encoded = pd.DataFrame(encoded.toarray())
    return X_encoded

In [98]:
def qa_filtering(df):
    """
    Extract, for every sentence, the most confident answer to "who audited /
    assured / verified the targets?" using an extractive QA model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column; each sentence is used as QA context.

    Return
    ------
    res : list of str
        One answer per row of ``df``: the answer of whichever of the three
        questions obtained the highest score (the first one on ties).
    """
    # Nothing to answer: avoid loading the model at all.
    if len(df) == 0:
        return []

    model_name = "deepset/roberta-base-squad2"
    qa_pipeline = pipeline('question-answering', model=model_name, tokenizer=model_name)

    questions = (
        'Who audited the targets?',
        'Who assured the targets?',
        'Who verified the targets?',
    )

    res = []
    for sentence in df['sentence']:
        # One inference per question; answer and score come from the same result.
        outputs = [qa_pipeline({'question': question, 'context': sentence}) for question in questions]
        best = max(outputs, key=lambda output: output['score'])
        res.append(best['answer'])
    return res

In [11]:
def retrieve_images(file_path,  output_path, keywords=[r'scope \d', 'location-based', 'market-based'], table_only=False):
    """
    Render as PNG the pages of every PDF in a folder that mention a keyword.

    Parameters
    ----------
    file_path : str
        Folder whose entries are all opened as PDF files.
    output_path : str
        Folder where pdf2image writes the PNG files.
    keywords : list of str
        Regular expressions searched (re.search) in the lower-cased text of
        each page. The default list is never modified here.
    table_only : bool
        When True, keep only the keyword pages on which tabula finds at
        least one table.

    Return
    ------
    str
        Always the empty string; the images are the result.
    """
    for file in os.listdir(file_path):
        pdf_path = os.path.join(file_path, file)

        print("Starting with file: " + file)

        # reading pdf file to filter keywords; the handle is closed as soon
        # as the page scan is done
        page_with_keywords = []
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_index, page in enumerate(pdf_reader.pages):
                text = (page.extract_text() or "").lower()
                if any(re.search(pattern, text) for pattern in keywords):
                    # pages are numbered from 1 by tabula and pdf2image
                    page_with_keywords.append(page_index + 1)

        ## Filter for only tables.
        if table_only:
            table_pages = []
            for i in page_with_keywords:
                # NOTE(review): 'header' is the string 'None', not None - kept as is, confirm intent.
                pdf = read_pdf(pdf_path, pages=i, stream=True, pandas_options={'header':'None'}, multiple_tables=True)
                if len(pdf) > 0:
                    table_pages.append(i)
            page_with_keywords = table_pages

        ##Extract images
        for i in page_with_keywords:
            pdf2image.convert_from_path(pdf_path, output_folder = output_path, fmt='png',
                                        first_page = i, last_page = i, output_file = str(file) + str(i))

        print('Finished with file: ' + file)
    return ""

In [13]:
def attribute_7(df):
    """
    Attribute 7 is produced by the attribute 15 pipeline: delegate to
    attribute_15 and return its result unchanged.
    """
    return attribute_15(df)

In [108]:
def _detect_methodologies(sentence):
    """Return the methodology labels whose trigger text occurs in ``sentence``."""
    lowered = sentence.lower()
    res = []
    # 'scope \d' is a regular expression, so it needs re.search; a plain
    # substring test would look for a literal backslash.
    if ('ghg' in lowered) or (re.search(r'scope \d', lowered) is not None):
        res.append('GHG')
    if ('sbti' in lowered) or ('science based targets' in lowered):
        res.append('SBTi')
    if ('tcfd' in lowered) or ('climate-related financial disclosures' in lowered):
        res.append('TCFD')
    if ('sasb' in lowered) or ('sustainability accounting' in lowered):
        res.append('SASB')
    return res


def attribute_14(df):
    """
    Flag the sentences that mention a reporting methodology (attribute 14).

    Candidate sentences are pre-selected with keyword_filter, encoded with
    the attribute 14 TF-IDF model and classified by three pickled models
    whose majority vote becomes the flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column.

    Return
    ------
    df_ones : pandas.DataFrame
        Columns 'sentence', 'methodologies' (string form of the list of
        labels among GHG, SBTi, TCFD, SASB) and 'flag', restricted to the
        rows flagged 1. Empty when no candidate or no flagged row exists.
    """
    output_columns = ['sentence', 'methodologies', 'flag']

    # NOTE(review): keyword_filter does plain substring matching, so the
    # keyword 'scope /d' only matches that literal text; it looks like
    # r'scope \d' was intended, which would need regex-aware filtering.
    df = keyword_filter(df, ['ghg', 'sbti', 'tcfd', 'sasb', r'scope /d'])
    if len(df) == 0:
        return pd.DataFrame(columns=output_columns)

    df = df.copy()
    # NOTE(review): 'preprocessed' is computed but the embedding below uses
    # the raw 'sentence' column - confirm which one the models were trained on.
    df['preprocessed'] = df['sentence'].apply(lambda x: pre_processing(x))
    X = word_embedding(df, 'sentence', 14)

    # NOTE(security): pickle.load executes arbitrary code; load trusted files only.
    predictions = []
    for model_path in ('lr_14_model.sav', 'rf_14_model.sav', 'ada_14_model.sav'):
        with open(model_path, 'rb') as model_file:
            predictions.append(pickle.load(model_file).predict(X))

    ## Ensemble voting
    df_combi = pd.DataFrame(predictions).transpose()
    df_combi['total'] = df_combi.mode(axis=1)[0]
    # Positional index so the votes line up with the rows.
    df = df.reset_index()
    df['flag'] = df_combi['total']

    ### return 1s only
    # Explicit copy: the column added below must not be written into a slice.
    df_ones = df[df['flag'] == 1].copy()
    # Built in one go so the column exists even when no row is flagged.
    df_ones['methodologies'] = [str(_detect_methodologies(s)) for s in df_ones['sentence']]
    return df_ones[output_columns]

In [97]:
def attribute_15(df):
    """
    Flag the sentences that describe external assurance (attribute 15) and
    extract who provided it.

    Candidate sentences are pre-selected with keyword_filter, encoded with
    the attribute 15 TF-IDF model and classified by a pickled model; the
    auditor is then extracted from each flagged sentence with qa_filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column.

    Return
    ------
    df_ones : pandas.DataFrame
        Columns 'sentence', 'auditors' and 'flag', restricted to the rows
        flagged 1. Empty when no candidate or no flagged row exists.
    """
    output_columns = ['sentence', 'auditors', 'flag']

    df = keyword_filter(df, ['assurance', 'limited assurance', 'externally verified', 'independent', 'third-party'])
    if len(df) == 0:
        return pd.DataFrame(columns=output_columns)

    df = df.copy()
    # NOTE(review): 'preprocessed' is computed but the embedding below uses
    # the raw 'sentence' column - confirm which one the model was trained on.
    df['preprocessed'] = df['sentence'].apply(lambda x: pre_processing(x))
    X = word_embedding(df, 'sentence', 15)

    # NOTE(security): pickle.load executes arbitrary code; load trusted files only.
    with open('lr_15_model.sav', 'rb') as model_file:
        lr_model = pickle.load(model_file)

    df['flag'] = lr_model.predict(X)

    ##return 1s only
    # Explicit copy: the column added below must not be written into a slice.
    df_ones = df[df['flag'] == 1].copy()
    # Skip the (expensive) QA model when nothing was flagged.
    df_ones['auditors'] = qa_filtering(df_ones) if len(df_ones) > 0 else []

    return df_ones[output_columns]

In [104]:
def attribute_17(df):
    """
    Flag the sentences that link ESG to compensation (attribute 17).

    Candidate sentences are pre-selected with keyword_filter and classified
    by two pickled models (on TF-IDF features) and a Keras LSTM model (on
    padded token sequences); the majority vote becomes the flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column.

    Return
    ------
    df_ones : pandas.DataFrame
        Columns 'sentence' and 'flag', restricted to the rows flagged 1.
        Empty when no candidate or no flagged row exists.
    """
    output_columns = ['sentence', 'flag']

    df = keyword_filter(df, ['compensation', 'remuneration'])
    if len(df) == 0:
        return pd.DataFrame(columns=output_columns)

    df = df.copy()
    # NOTE(review): 'preprocessed' is computed but both the TF-IDF features
    # and the LSTM sequences use the raw 'sentence' column - confirm.
    df['preprocessed'] = df['sentence'].apply(lambda x: pre_processing(x))
    X = word_embedding(df, 'sentence', 17)

    # NOTE(security): pickle.load executes arbitrary code; load trusted files only.
    with open('lr_17_model.sav', 'rb') as model_file:
        lr_model = pickle.load(model_file)
    lstm_model = load_model('lstm_17_model.h5')
    with open('ada_17_model.sav', 'rb') as model_file:
        ada_model = pickle.load(model_file)
    with open('tok_17_model.sav', 'rb') as model_file:
        tok = pickle.load(model_file)

    test = tok.texts_to_sequences(df['sentence'])
    test_matrix = pad_sequences(test, maxlen=100)

    lr_pred = lr_model.predict(X)
    # Flattened so the three voters are all 1-D with one entry per sentence.
    # Assumes the LSTM emits a single probability per sentence - TODO confirm.
    lstm_pred = np.where(lstm_model.predict(test_matrix) < 0.5, 0, 1).reshape(-1)
    ada_pred = ada_model.predict(X)

    ## Ensemble Voting
    df_combi = pd.DataFrame([lr_pred, lstm_pred, ada_pred]).transpose()
    df_combi['majority'] = df_combi.mode(axis=1)[0]
    # Positional index so the votes line up with the rows.
    df = df.reset_index()
    df['flag'] = df_combi['majority']

    ## Returns 1s only
    df_ones = df[df['flag'] == 1]

    return df_ones[output_columns]

In [3]:
def clean(line):
    """Strip runs of digits/dots, then every character that is neither a word character nor whitespace."""
    without_numbers = re.sub(r'[0-9\.]+', '', line)
    return re.sub(r'[^\w\s]', '', without_numbers)

def stemming(line):
    """Return the English Snowball stem of every token in ``line`` (an iterable of tokens)."""
    stem = SnowballStemmer(language='english').stem
    return list(map(stem, line))

def lemmatization(line):
    """Return the WordNet lemma of every token in ``line`` (an iterable of tokens)."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, line))

def remove_stop_words(line):
    """Run gensim's remove_stopwords on every token of ``line`` and return the results as a list of the same length."""
    return list(map(remove_stopwords, line))

def pre_processing(line):
    """
    Normalise one sentence: clean it, tokenize it (Treebank tokenizer), then
    apply stop-word removal, lemmatization and stemming in that order.
    Tokens left empty are dropped and the rest joined with single blanks.
    """
    tokens = TreebankWordTokenizer().tokenize(clean(line))
    tokens = remove_stop_words(tokens)
    tokens = lemmatization(tokens)
    tokens = stemming(tokens)

    return ' '.join(filter(lambda token: token != '', tokens))

In [92]:
def keyword_filter(df, keywords):
    """
    Keep the sentences of ``df`` that contain at least one keyword.

    Every keyword is tested as a plain substring of the lower-cased
    sentence. The result has one row per matching sentence (grouped by
    'sentence') with the matched keywords listed, without repeats, in the
    'keyword(s)' column.
    """
    matches = [
        [sentence, keyword]
        for sentence in np.array(df['sentence'])
        for keyword in keywords
        if keyword in sentence.lower()
    ]

    matches_df = pd.DataFrame(matches, columns=['sentence', 'keyword(s)'])
    per_sentence = matches_df.groupby(['sentence']).agg({'keyword(s)': lambda found: list(found.unique())})
    return per_sentence.reset_index()

## Test PDF

In [91]:
len(re.findall('tcfd', 'This sentence has no (tcfd)'))

1

In [17]:
DATA_FOLDER = 'test_dataset/documents'

In [18]:
# Corpus has already been extracted to corpus.txt; extraction takes around 15 mins for 24 reports
spacy_model = spacy.load("en_core_web_sm")
list_dataset = os.listdir(DATA_FOLDER)
corpus = []
#for file in list_dataset: ##clean loop
pages_content, pages_sentences, all_sentences = extract_pages_sentences(spacy_model,extract_pdf(DATA_FOLDER + "/ubm_esg_report_2021.pdf"))
corpus.extend(all_sentences)
np.shape(corpus)

Extracting page number: 0
Extracting page number: 1
Extracting page number: 2
Extracting page number: 3
Extracting page number: 4
Extracting page number: 5
Extracting page number: 6
Extracting page number: 7
Extracting page number: 8
Extracting page number: 9
Extracting page number: 10
Extracting page number: 11
Extracting page number: 12
Extracting page number: 13
Extracting page number: 14
Extracting page number: 15
Extracting page number: 16
Extracting page number: 17
Extracting page number: 18
Extracting page number: 19
Extracting page number: 20
Extracting page number: 21
Extracting page number: 22
Extracting page number: 23
Extracting page number: 24
Extracting page number: 25
Extracting page number: 26
Extracting page number: 27
Extracting page number: 28
Extracting page number: 29
Extracting page number: 30
Extracting page number: 31
Extracting page number: 32
Extracting page number: 33
Extracting page number: 34
Extracting page number: 35
Extracting page number: 36
Extracting 

(1118,)

In [19]:
#Store corpus in text file
with open("ubm_corpus.txt", "w") as fp:
    json.dump(corpus, fp)

In [72]:
# Reload the sentences from disk; the context manager closes the file once it is parsed.
with open('ubm_corpus.txt') as corpus_data:
    corpus = json.load(corpus_data)
# Preview only - displaying the whole corpus floods the notebook output.
corpus[:10]

['The dividend is paid in the following nancial year but is based on the previous years net prot.',
 'The dividend proposal for 2021 is subject to the approval of the Annual General Meeting.',
 'G as in Governance UBM & Sustainability 3.',
 'Important information 7.',
 'Maly-GrtnerCOODear Shareholders,Dear Stakeholders,The westernmost city in Ukraine, Uzhhorod, is closer to Vienna than Bregenz in the far west of Austria.',
 'In view of this war on our doorstep, it appears to be reasonable to ask the one or other question on the current signicance of ESG.',
 'But really?',
 'As part of our social responsibility, we made 150 rooms in our Polish hotels available as immediate assistance for the rst refugees from Ukraine.',
 'That is also ESG it stands for the S in Environment, Social and , this war has also increased the focus on energy supplies in Europe and the dependence on Russian gas, especially in Germany and Austria, our two core markets.',
 'Gas, regardless of where it originates, 

In [73]:
res = pd.DataFrame(corpus, columns=['words'])

In [74]:
res = res.drop_duplicates()

In [75]:
res

Unnamed: 0,words
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."
...,...
1113,The amounts were rounded based on the compensa...
1114,"However, rounding, typesetting and printing er..."
1115,This ESG report is published in English and Ge...
1116,"In the event of a discrepancy or deviation, th..."


In [76]:
res.to_csv('ubm.csv', index=False)

In [4]:
res = pd.read_csv('ubm.csv')

In [8]:
res = res.rename(columns={'words': 'sentence'})

In [82]:
# Keywords used to pre-select candidate sentences.
# NOTE(review): keyword_filter tests each keyword with a plain substring check,
# so 'scope /d' only matches that literal text; r'scope \d' (a regex) looks
# like the intent but would need regex-aware matching - confirm.
keywords = ['ghg', 'sbti', 'tcfd', 'sasb', r'scope /d']
keywords = [x.lower() for x in keywords]
keywords

['ghg', 'sbti', 'tcfd', 'sasb', 'scope /d']

In [83]:
keyword_filter(res, keywords)

Unnamed: 0,sentence,keyword(s)


In [42]:
# NOTE(review): `res_u` is not assigned in any cell visible in this notebook;
# this cell presumably relies on leftover kernel state and would fail on a
# fresh Restart & Run All - confirm where it comes from.
res_u['sentence'].apply(lambda x: pre_processing(x))

0     effici measur includ good insul energyeffici b...
1     energi indic real estat develop total calcul e...
2     gri in properti develop earli plan phase defin...
3     gri standard disclosur page omiss reason ungc ...
4     gri standard disclosur page omiss reason ungc ...
5     gross risk evalu basi dimens scope probabl occ...
6     in ubm appli membership un global compact comm...
7     in cours general risk manag probabl occurr sco...
8     key indic environment indic group locat energi...
9     one relat step announc ubm commit offici suppo...
10    scope consolid this report cover corpor locat ...
11    the tcfd recommend report climat relat risk ap...
12    the tcfd recommend voluntari disclosur climat ...
13    the evalu gross risk esg risk analysi base dim...
14    the fix salari manag board member base scope d...
15    the social assess base estim number involv per...
16    there number chang comparison materi analysi e...
17    ubm offici support task forc climaterel fi

In [78]:
retrieve_images(DATA_FOLDER, 'test_dataset/images', table_only=True)

Starting with file: ubm_esg_report_2021.pdf
Finished with file: ubm_esg_report_2021.pdf


''

In [46]:
res

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."
...,...
1100,The amounts were rounded based on the compensa...
1101,"However, rounding, typesetting and printing er..."
1102,This ESG report is published in English and Ge...
1103,"In the event of a discrepancy or deviation, th..."


In [107]:
df_14 = attribute_14(res)

   index  \
0      0   
1      1   
2      2   
3      3   
4      4   
5      5   
6      6   
7      7   
8      8   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  sentence  \
0                                                                                

In [94]:
df_14

Unnamed: 0,sentence,methodologies,flag
3,"In 2021 UBM applied for membership of the UN Global Compact, com-mitted to compliance with the UN Sustainable Develop-ment Goals, and became an ofcial supporter of the Task Force on Climate-related Financial Disclosures (TCFD).",['TCFD'],1
4,One related step was the announcement in 2021 of UBMs commitment as an official supporter of the Task Force on Climate-related Financial Disclosures (TCFD).,['TCFD'],1
5,The TCFD recommendations on the reporting of climate- related risks were applied for the first time in 2021 and grad-ually implemented.,['TCFD'],1
6,"The TCFD recommends the voluntary disclosure of climate- related risks based on four pillars: governance, strategy, risk management, and metrics and targets.",['TCFD'],1


In [58]:
pd.set_option('display.max_colwidth', None)

In [99]:
df_15 = attribute_15(res)

In [100]:
df_15

Unnamed: 0,sentence,auditors,flag
0,An external review with limited assurance was carried out by PwC (see page 125).,PwC,1
6,"Report on the independent assurance of non-financial reporting Independent Limited Assurance Report on the ESG Report (Translation) We performed a limited assurance engagement of the ESG report in accordance with the requirements of section 267a UGB (Austrian Company Code) and of the GRI Standards 2021 (hereinafter the ESG report) of UBM Development AG, Vienna (the Company) for the financial year 2021.","UBM Development AG, Vienna",1
9,"Transparency and reliability are also decisive for our ESG reporting, and we therefore arranged for an external lim-ited assurance audit of this ESG report (more information is provided on page 125).",external,1


In [105]:
df_17 = attribute_17(res)



In [106]:
df_17

Unnamed: 0,sentence,flag
9,The inclusion of ESG factors in the remu-neration model will be analysed in detail in the future: Plans call for the integration of key ESG performance indicators in managements variable remuneration over the medium term.,1
10,The integration of ESG in the remuneration model will be analysed in detail: ESG key performance indicators will be integrated in managements variable remuneration over the medium term.,1
