## Import libaries

In [1]:
# # Mount on google drive
# from google.colab import drive
# drive.mount('/content/gdrive')

# General
import pandas as pd
import numpy as np
import re

# Load models
from keras.models import load_model
import pickle

# Text pre-processing
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

[nltk_data] Downloading package punkt to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Functions

In [2]:
def load_models():

  # relevance_vectorizer = pickle.load(open('gdrive/My Drive/BT4103/saved_models/relevance/vectorizer.pkl', 'rb'))
  # relevance_model = pickle.load(open('gdrive/My Drive/BT4103/saved_models/relevance/rf.pkl', 'rb'))
  # scale_vectorizer = pickle.load(open('gdrive/My Drive/BT4103/saved_models/scale/vectorizer.pkl', 'rb'))
  # scale_model = load_model('gdrive/My Drive/BT4103/saved_models/scale/nn.h5')

    relevance_vectorizer = pickle.load(open('models/relevance_vectorizer.pkl', 'rb'))
    relevance_model = pickle.load(open('models/relevance_rf.pkl', 'rb'))
    scale_vectorizer = pickle.load(open('models/scale_vectorizer.pkl', 'rb'))
    scale_model = load_model('models/scale_nn.h5')

    return relevance_vectorizer, relevance_model, scale_vectorizer, scale_model

In [3]:
def clean(line):
    line = re.sub(r'[0-9\.]+', '', line) # remove digits
    line = re.sub(r'[^\w\s]','', line) # remove punctuation
    return line

def stemming(line):
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(token) for token in line]

def lemmatization(line):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in line]

def remove_stop_words(line):
    return [remove_stopwords(token) for token in line]

def pre_processing(line):
    tokenizer = TreebankWordTokenizer()

    tokenized_line = tokenizer.tokenize(clean(line))
    preprocessed_line = stemming(lemmatization(remove_stop_words(tokenized_line)))
    
    return ' '.join([token for token in preprocessed_line if token != ''])

In [4]:
def keyword_filter(df, keywords):
    filtered = []
    for s in np.array(df['sentence']):
        sentence = s.lower()
    for k in keywords:
        if k in sentence:
            filtered.append([s, k])
    
    filtered_df = pd.DataFrame(filtered, columns=['sentence', 'keyword(s)']).groupby(['sentence']).agg({'keyword(s)': lambda x: list(x.unique())}).reset_index()
    return filtered_df

In [5]:
def predict(X, vectorizer, model, pred_type):
    X_vec = pd.DataFrame(vectorizer.transform(X['preprocessed']).todense(), columns=vectorizer.get_feature_names_out())
    if pred_type == 'relevance':
        y_pred = model.predict(X_vec)
    elif pred_type == 'scale':
        y_pred = [i+1 for i in np.argmax(model.predict(X_vec, verbose=0), axis=1)]
    df = pd.DataFrame({'sentence': X['sentence'],'preprocessed': X['preprocessed'], 'pred_label': y_pred})
    return df

In [6]:
def is_quantitative(x):
    x = x.lower()

    x = re.sub("[2][0][0-5][0-9]", "", x) #remove years
    x = re.sub("fy[0-9]+", "", x) #remove numbers that represent financial year e.g. FY21
    x = re.sub("tier\s*[0-9]", "", x) #remove numbers related to tiers
    x = re.sub("scope\s*[0-9]", "", x) #remove numbers related to scope
    x = re.sub("co2", "", x) #remove 'CO2'
    x = re.sub("cid.+", "", x) #remove 'cid'
    x = re.sub("[0-9]+[:)]|[#]+[0-9]", "", x) #remove numbers for indexing e.g. 1) or #1 or 1:

    return re.search("supplier", x) and len(re.findall(r'\d+', x)) > 0

In [21]:
def get_attribute_23_25(df):
    # load trained models
    relevance_vectorizer, relevance_model, scale_vectorizer, scale_model = load_models()

    # keyword filter and preprocessing
    df_filtered = keyword_filter(df, ['supplier', 'supply chain', 'value chain'])
    print(df_filtered)
    if df_filtered.empty:
        return df_filtered, [0], df_filtered
    df_filtered['preprocessed'] = df_filtered['sentence'].apply(lambda x: pre_processing(x))

    # predict
    relevance = predict(df_filtered, relevance_vectorizer, relevance_model, 'relevance')
    scale = predict(relevance[relevance['pred_label'] == True], scale_vectorizer, scale_model, 'scale')

    # get final results
    relevance['quantitative'] = relevance['sentence'].apply(lambda x: is_quantitative(x))
    relevant = pd.DataFrame(relevance[relevance['pred_label'] == True]['sentence'])
    attribute_23 = pd.DataFrame(relevance[(relevance['quantitative'] == True) & (relevance['pred_label'] == True)]['sentence'])
    attribute_25 = list(set(scale['pred_label']))

    print('# Relevant sentences found: ' + str(relevant.shape[0]))

    return attribute_23, attribute_25, relevant

## Predict Attribute 23, 25

In [7]:
pd.set_option('display.max_colwidth', None)
# Read dataset with raw sentences (no preprocessing)
# Path to be changed accordingly
df = pd.read_csv('gdrive/My Drive/AY2223S1/BT4103/data/labelled.csv')[['sentence', 'page']]

In [8]:
df = pd.read_csv('ubm.csv')

In [9]:
df.head()

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."


In [10]:
df = df.rename(columns={'words': 'sentence'})

In [11]:
df

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."
...,...
1100,The amounts were rounded based on the compensa...
1101,"However, rounding, typesetting and printing er..."
1102,This ESG report is published in English and Ge...
1103,"In the event of a discrepancy or deviation, th..."


In [17]:
attribute_23, attribute_25, relevant = get_attribute_23_25(df)

Empty DataFrame
Columns: [sentence, keyword(s)]
Index: []


In [22]:
attribute_23.head()

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."


In [23]:
attribute_25

[0]

In [24]:
relevant.head()

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."
