In [1]:
######################################### IMPORTING PACAKGES #############################
# Basic ML Packages
from scipy import spatial
import math
import os
import json
import string

import warnings
warnings.filterwarnings("ignore")

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Others
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

# Text pre-processing (Tokenization, Stemming, Lemmatization)
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Pdf Extraction Model
import spacy
spacy.cli.download("en_core_web_sm")

#Gensim stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

import numpy as np
import pandas as pd
import PyPDF2
import tabula
from tabula import read_pdf
import io
from functools import reduce
from pdfminer.high_level import extract_text
import pdf2image

import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences

from imblearn.over_sampling import RandomOverSampler

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


## New Functions

In [7]:
def preprocessing(df):
    df = df.drop_duplicates()
    bert=KeyBERT()
    kw = []
    for i in tqdm(df['words']):
        kw.append(bert.extract_keywords(i, keyphrase_ngram_range=(2, 2), stop_words='english'))
    df['kw'] = kw
    return df

In [8]:
def keyword_filter(df, keywords):
    def func(kw, key):
        if any(any(w in word[0] for w in key) for word in kw):
            return True
    
    df_filtered = df[df['kw'].apply(lambda x: func(x, keywords)) == True]
    return df_filtered

In [9]:
def word_embedding(df, embed_column, attribute_no, embedding_model='tfidf'):
    if embedding_model == 'tfidf': ##save fit model and transform here
        X = df[embed_column]
        X = X.apply(lambda x: x.lower())
        if attribute_no == 14:
            tfidf = pickle.load(open('models/tfidf_14_model.sav', 'rb'))
        elif (attribute_no == 7) or (attribute_no == 15):
            tfidf = pickle.load(open('models/tfidf_15_model.sav', 'rb'))
        elif attribute_no == 17:
            tfidf = pickle.load(open('models/tfidf_17_model.sav', 'rb'))
        else:
            raise Exception(f"Wrong Model used for attribute: {attribute_no}")
        x = tfidf.transform(X)
        X_encoded = pd.DataFrame(x.toarray())
        return X_encoded

In [10]:
def qa_filtering(df):
    model_name = "deepset/roberta-base-squad2"
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
    
    res = []
    q1 = 'Who audited the targets?'
    q2 = 'Who assured the targets?'
    q3 = 'Who verified the targets?'
    for i in df['sentence']:
        QA_1 = {
            'question': q1,
            'context': i
        }
        QA_2 = {
            'question': q2,
            'context': i
        }
        QA_3 = {
            'question': q3,
            'context': i
        }

        ans1 = nlp(QA_1)['answer']
        score1 = nlp(QA_1)['score']
        ans2 = nlp(QA_2)['answer']
        score2 = nlp(QA_2)['score']
        ans3 = nlp(QA_3)['answer']
        score3 = nlp(QA_3)['score']

        maxi = max([score1, score2, score3])
        if maxi == score1:
            res.append(ans1)
        elif maxi == score2:
            res.append(ans2)
        else:
            res.append(ans3)
    return res

In [1]:
def retrieve_images(file_path,  output_path, keywords=[r'scope \d', 'location-based', 'market-based'], table_only=True):
    dataset_list = os.listdir(file_path)
    for file in dataset_list:
        #reading pdf file to filter keywords
        pdfFile = open(file_path + '/' + file, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFile)
        totpages = pdfReader.numPages
        
        print("Starting with file: " + file)
        page_with_keywords = []
        for p in range(pdfReader.numPages):
            text = pdfReader.pages[p].extract_text().lower()
            if any(re.search(x, text) for x in keywords):
                if (p+1) not in page_with_keywords:
                    page_with_keywords.append(p + 1)
        
        ## Filter for only tables.
        if table_only:
            table_pages = []
            for i in page_with_keywords:
                pdf = read_pdf(file_path + '/' + file, pages=i, stream=True, pandas_options={'header':'None'}, multiple_tables=True)
                if len(pdf) > 0:
                    table_pages.append(i)
            page_with_keywords = table_pages
        
        ##Extract images
        for i in page_with_keywords:
            pdf2image.convert_from_path(file_path + '/' + file, output_folder = output_path, fmt='png', 
                                       first_page = i, last_page = i, output_file = str(file) + str(i))
        
        print('Finished with file: ' + file)
    return ""

In [12]:
def attribute_7(df):
    return attribute_15(df)

In [88]:
def attribute_14(df):
    df = keyword_filter(df, ['ghg', 'sbti', 'tcfd', 'sasb', r'scope /d'])
    df['preprocessed'] = df['sentence'].apply(lambda x: pre_processing(x))
    if df.empty:
        return df
    X = word_embedding(df, 'preprocessed', 14)
    lr_model = pickle.load(open('models/lr_14_model.sav', 'rb'))
    rf_model = pickle.load(open('models/rf_14_model.sav', 'rb'))
    svc_model = pickle.load(open('models/svc_14_model.pkl', 'rb'))
    
    lr_pred = lr_model.predict(X)
    rf_pred = rf_model.predict(X)
    svc_pred = svc_model.predict(X)
   
    ## Ensemble voting
    df_combi = pd.DataFrame([lr_pred, rf_pred, svc_pred]).transpose()
    df_combi['total'] = df_combi.mode(axis=1)[0]
    df = df.reset_index()
    df['flag'] = df_combi['total']
    
    ### return 1s only
    df_ones = df[df['flag'] == 1]
    
    for index, rows in df_ones.iterrows():
        res = []
        if ('ghg' in rows['sentence'].lower()) or (r'scope \d' in rows['sentence'].lower()):
            res.append('GHG')
        if ('sbti' in rows['sentence'].lower()) or ('science based targets' in rows['sentence'].lower()):
            res.append('SBTi')
        if ('tcfd' in rows['sentence'].lower()) or ('climate-related financial disclosures' in rows['sentence'].lower()):
            res.append('TCFD')
        if ('sasb' in rows['sentence'].lower()) or ('sustainability accounting' in rows['sentence'].lower()):
            res.append('SASB')
    
        df_ones.at[index, 'methodologies'] = str(res)
    df_ones = df_ones[['sentence', 'methodologies', 'flag']]
    return df_ones

In [123]:
def attribute_15(df, further_precision=True):
    df = keyword_filter(df, ['assurance', 'limited assurance', 'externally verified', 'independent', 'third-party'])
    df['preprocessed'] = df['sentence'].apply(lambda x: pre_processing(x))
    if df.empty:
        return df
    X = word_embedding(df, 'preprocessed', 15)
    
    ada_model = pickle.load(open('models/ada_15_model.pkl', 'rb'))
    svc = pickle.load(open('models/svc_15.pkl', 'rb'))
    tfidf_2 = pickle.load(open('models/tfidf_15_2.pkl', 'rb'))
    
    ada_pred = ada_model.predict(X)
    
    ##return 1s only
    df['flag'] = ada_pred
    df_ones = df[df['flag'] == 1]
    
    if further_precision:
        new_X = df_ones['preprocessed']
        if new_X.size != 0:
            x = tfidf_2.transform(new_X)
            new_test_X = pd.DataFrame(x.toarray())
            sv_pred = svc.predict(new_test_X)

            df_ones['further_flag'] = sv_pred
            df_verified = df_ones[df_ones['further_flag'] == 1]
        else:
            df_verified = pd.DataFrame()

        if not df_verified.empty:
            res = qa_filtering(df_verified)
            df_verified['auditors'] = res
            df_verified = df_verified[['sentence', 'auditors', 'further_flag']]
            return df_verified
        else:
            print("Unable to conduct further separation. Original separation will be used instead.")
    
    res = qa_filtering(df_ones)
    df_ones['auditors'] = res

    df_ones = df_ones[['sentence', 'auditors', 'flag']]
    return df_ones

In [91]:
def attribute_17(df):
    df = keyword_filter(df, ['compensation', 'remuneration'])
    df['preprocessed'] = df['sentence'].apply(lambda x: pre_processing(x))
    if df.empty:
        return df
    X = word_embedding(df, 'preprocessed', 17)
    
    lr_model = pickle.load(open('models/lr_17_model.sav', 'rb'))
    ada_model = pickle.load(open('models/ada_17_model.sav', 'rb'))
    
    
    ada_pred = ada_model.predict(X)
    
    df['flag'] = ada_pred
    
    ## Returns 1s only
    df_ones = df[df['flag'] == 1]
    
    df_ones = df_ones[['sentence', 'flag']]
    return df_ones

In [16]:
def clean(line):
    line = re.sub(r'[0-9\.]+', '', line) # remove digits
    line = re.sub(r'[^\w\s]','', line) # remove punctuation
    return line

def stemming(line):
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(token) for token in line]

def lemmatization(line):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in line]

def remove_stop_words(line):
    return [remove_stopwords(token) for token in line]

def pre_processing(line):
    tokenizer = TreebankWordTokenizer()

    tokenized_line = tokenizer.tokenize(clean(line))
    preprocessed_line = stemming(lemmatization(remove_stop_words(tokenized_line)))
    
    return ' '.join([token for token in preprocessed_line if token != ''])

In [17]:
def keyword_filter(df, keywords):
    filtered = []
    for s in np.array(df['sentence']):
        sentence = s.lower()
        for k in keywords:
            if k in sentence:
                filtered.append([s, k])
    
    filtered_df = pd.DataFrame(filtered, columns=['sentence', 'keyword(s)']).groupby(['sentence']).agg({'keyword(s)': lambda x: list(x.unique())}).reset_index()
    return filtered_df

## Test PDF

In [132]:
res = pd.read_csv('citycon.csv')

In [125]:
res = res.rename(columns={'words': 'sentence'})

In [24]:
keywords = ['ghg', 'sbti', 'tcfd', 'sasb', r'scope /d']
keywords = [x.lower() for x in keywords]
keywords

['ghg', 'sbti', 'tcfd', 'sasb', 'scope /d']

In [25]:
keyword_filter(res, keywords)

Unnamed: 0,sentence,keyword(s)
0,Energy indicators real estate development Tota...,[ghg]
1,"GRI Standard Disclosure Page Omission, Reason ...",[tcfd]
2,"GRI Standard Disclosure Page Omission, Reason ...","[ghg, tcfd]"
3,In 2021 UBM applied for membership of the UN G...,[tcfd]
4,One related step was the announcement in 2021 ...,[tcfd]
5,The TCFD recommendations on the reporting of c...,[tcfd]
6,The TCFD recommends the voluntary disclosure o...,[tcfd]
7,There were a number of changes in comparison w...,[ghg]
8,UBM has also been an official supporter of the...,[tcfd]


In [26]:
res_u['sentence'].apply(lambda x: pre_processing(x))

NameError: name 'res_u' is not defined

In [78]:
retrieve_images(DATA_FOLDER, 'test_dataset/images', table_only=True)

Starting with file: ubm_esg_report_2021.pdf
Finished with file: ubm_esg_report_2021.pdf


''

In [27]:
res

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."
...,...
1100,The amounts were rounded based on the compensa...
1101,"However, rounding, typesetting and printing er..."
1102,This ESG report is published in English and Ge...
1103,"In the event of a discrepancy or deviation, th..."


In [26]:
pd.set_option('display.max_colwidth', None)

In [133]:
df_14 = attribute_14(res)

In [134]:
df_14

Unnamed: 0,sentence,methodologies,flag
1,"CARBON In calculating its carbon footprint, Citycon applies the Greenhouse Gas Protocol (GHG) developed by the World Resources Institute and the World Business Council for Sustaina-ble Development.",['GHG'],1
2,"Following this, in October 2021 we became the first real estate company in Finland to join the Science Based Targets initiative (SBTi).",['SBTi'],1
3,Joining SBTi is based on their validation.,['SBTi'],1
6,We continue to excel on our climate goals: We are committed to the SBTi (Science Based Targets initiative) to reduce 50% of our scope 1 and 2 emissions by 2030 when compared to our emissions in 2018.,['SBTi'],1


In [135]:
df_15 = attribute_15(res, True)

Unable to conduct further separation. Original separation will be used instead.


In [136]:
df_15

Unnamed: 0,sentence,auditors,flag


In [137]:
df_17 = attribute_17(res)

In [138]:
df_17

Unnamed: 0,sentence,keyword(s),preprocessed
