## Import libaries

In [1]:
# # Mount on google drive
# from google.colab import drive
# drive.mount('/content/gdrive')

# General
import pandas as pd
import numpy as np
import re

# Load models
from keras.models import load_model
import pickle

# Text pre-processing
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import gensim
from gensim.parsing.preprocessing import remove_stopwords
stopwords = gensim.parsing.preprocessing.STOPWORDS

[nltk_data] Downloading package punkt to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chen
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Functions

In [2]:
def load_models():

  # relevance_vectorizer = pickle.load(open('gdrive/My Drive/BT4103/saved_models/relevance/vectorizer.pkl', 'rb'))
  # relevance_model = pickle.load(open('gdrive/My Drive/BT4103/saved_models/relevance/rf.pkl', 'rb'))
  # scale_vectorizer = pickle.load(open('gdrive/My Drive/BT4103/saved_models/scale/vectorizer.pkl', 'rb'))
  # scale_model = load_model('gdrive/My Drive/BT4103/saved_models/scale/nn.h5')

    relevance_vectorizer = pickle.load(open('relevance_vectorizer.pkl', 'rb'))
    relevance_model = pickle.load(open('relevance_rf.pkl', 'rb'))
    scale_vectorizer = pickle.load(open('scale_vectorizer.pkl', 'rb'))
    scale_model = load_model('scale_nn.h5')

    return relevance_vectorizer, relevance_model, scale_vectorizer, scale_model

In [4]:
def clean(line):
    line = re.sub(r'[0-9\.]+', '', line) # remove digits
    line = re.sub(r'[^\w\s]','', line) # remove punctuation
    return line

def stemming(line):
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(token) for token in line]

def lemmatization(line):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in line]

def remove_stop_words(line):
    return [remove_stopwords(token) for token in line]

def pre_processing(line):
    tokenizer = TreebankWordTokenizer()

    tokenized_line = tokenizer.tokenize(clean(line))
    preprocessed_line = stemming(lemmatization(remove_stop_words(tokenized_line)))
    
    return ' '.join([token for token in preprocessed_line if token != ''])

In [36]:
def keyword_filter(df, keywords):
    filtered = []
    for s in np.array(df['sentence']):
        sentence = s.lower()
    for k in keywords:
        if k in sentence:
            filtered.append([s, k])
    
    filtered_df = pd.DataFrame(filtered, columns=['sentence', 'keyword(s)']).groupby(['sentence']).agg({'keyword(s)': lambda x: list(x.unique())}).reset_index()
    return filtered_df

In [6]:
def predict(X, vectorizer, model, pred_type):
    X_vec = pd.DataFrame(vectorizer.transform(X['preprocessed']).todense(), columns=vectorizer.get_feature_names_out())
    if pred_type == 'relevance':
        y_pred = model.predict(X_vec)
    elif pred_type == 'scale':
        y_pred = [i+1 for i in np.argmax(model.predict(X_vec, verbose=0), axis=1)]
    df = pd.DataFrame({'sentence': X['sentence'],'preprocessed': X['preprocessed'], 'pred_label': y_pred})
    return df

In [7]:
def is_quantitative(x):
    x = x.lower()

    x = re.sub("[2][0][0-5][0-9]", "", x) #remove years
    x = re.sub("fy[0-9]+", "", x) #remove numbers that represent financial year e.g. FY21
    x = re.sub("tier\s*[0-9]", "", x) #remove numbers related to tiers
    x = re.sub("scope\s*[0-9]", "", x) #remove numbers related to scope
    x = re.sub("co2", "", x) #remove 'CO2'
    x = re.sub("cid.+", "", x) #remove 'cid'
    x = re.sub("[0-9]+[:)]|[#]+[0-9]", "", x) #remove numbers for indexing e.g. 1) or #1 or 1:

    return re.search("supplier", x) and len(re.findall(r'\d+', x)) > 0

In [34]:
def get_attribute_23_25(df):
    # load trained models
    relevance_vectorizer, relevance_model, scale_vectorizer, scale_model = load_models()

    # keyword filter and preprocessing
    df_filtered = keyword_filter(df, ['supplier', 'supply chain', 'value chain'])
    print(df_filtered)
    df_filtered['preprocessed'] = df_filtered['sentence'].apply(lambda x: pre_processing(x))

    # predict
    relevance = predict(df_filtered, relevance_vectorizer, relevance_model, 'relevance')
    scale = predict(relevance[relevance['pred_label'] == True], scale_vectorizer, scale_model, 'scale')

    # get final results
    relevance['quantitative'] = relevance['sentence'].apply(lambda x: is_quantitative(x))
    relevant = pd.DataFrame(relevance[relevance['pred_label'] == True]['sentence'])
    attribute_23 = pd.DataFrame(relevance[(relevance['quantitative'] == True) & (relevance['pred_label'] == True)]['sentence'])
    attribute_25 = list(set(scale['pred_label']))

    print('# Relevant sentences found: ' + str(relevant.shape[0]))

    return attribute_23, attribute_25, relevant

## Predict Attribute 23, 25

In [7]:
pd.set_option('display.max_colwidth', None)
# Read dataset with raw sentences (no preprocessing)
# Path to be changed accordingly
df = pd.read_csv('gdrive/My Drive/AY2223S1/BT4103/data/labelled.csv')[['sentence', 'page']]

In [38]:
df = pd.read_csv('ubm.csv')

In [39]:
df.head()

Unnamed: 0,words
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."


In [40]:
df = df.rename(columns={'words': 'sentence'})

In [41]:
df

Unnamed: 0,sentence
0,The dividend is paid in the following nancial ...
1,The dividend proposal for 2021 is subject to t...
2,G as in Governance UBM & Sustainability 3.
3,Important information 7.
4,"Maly-GrtnerCOODear Shareholders,Dear Stakehold..."
...,...
1100,The amounts were rounded based on the compensa...
1101,"However, rounding, typesetting and printing er..."
1102,This ESG report is published in English and Ge...
1103,"In the event of a discrepancy or deviation, th..."


In [42]:
attribute_23, attribute_25, relevant = get_attribute_23_25(df)

None


TypeError: 'NoneType' object is not subscriptable

In [None]:
attribute_23.head()

Unnamed: 0,sentence
78,"Among newly selected suppliers in fiscal 2020, 100% of them met both Nissan(cid:96)s social standards and basic environmental principles."
79,"Among our new supplier nominations in 2021, we have achieved over 70% secondary PGM use, reducing roughly 480,000 t CO emissions in the coming three years."
80,"Among the achievements of 2021 were the first estimate of upstream Scope 3 emissions (7 Mt CO2 equivalent) with EcoAct, the launch of a collective climate approach with 1,000 suppliers and the adoption of a target for the Companys 400 leading suppliers."
83,"Amount of scrap steel and aluminium recycled from production process (in t) 45,000 71,000 Scrap steel 2020 2021 Scrap aluminium 2020 2021 0 10,402 INTRODUCTION FUNDAMENTALS PRODUCTS AND SERVICES PRODUCTION, OPER ATION AND SUPPLIER NETWORK EMPLOYEE AND SOCIET Y OTHER INFORMATION Scrap steel and aluminium recycling system at our Plant Dadong."
96,Annual Purchase Value purchased from Tier 1 suppliers evaluated on CSR Percentage of complaints raised by Supervisory Authorities for customer privacy/data protection infringements handled within 1 month Number of convictions of noncompliance concerning product and service information and labeling or marketing communications Number of nameplates/models on which an LCA have been performed Percentage of nameplates with ZEV offering Share of ZEV in sales mix Waste normalized/vehicle produced Percentage of waste recovered out of total waste generated VOC emissions from paint shops normalized in g/m2 painted and kg/vehicle produced Total water withdrawal Total water withdrawn normalized in m3/vehicle produced Percentage of plants that have done a RENATU evaluation CSR scores of Stellantis suppliers assessed by Ecovadis Average Human Rights scores of Stellantis Tier1 suppliers assessed by independent third party Assessed suppliers for which corrective action plans have been developed for Human Rights issues 22.


In [None]:
attribute_25

[1, 2, 3]

In [None]:
relevant.head()

Unnamed: 0,sentence
2,"A NET ZERO COMMITMENT TO SUPPORT THE PLANETS CARBON NEUTRALITY Stellantis targets to become Carbon Net Zero on the whole value chain by 2038, with single digit percent compensation of residual emissions vs 2021 level."
4,A circular economy mindset to enlarge the lifespan of a vehicle: green materials: Their wider application requires the development of robust supply chains and more research on new materials.
6,"A description of how waterrelated impacts are addressed, including how the organization works with stakeholders to steward water as a shared resource, and how it engages with suppliers or customers with significant waterrelated impacts."
8,A failure by suppliers to meet applicable environmental laws or regulations may lead to a disruption of Stellantis supply chain or to raw materials and components price increases.
11,"A growing element in our due diligence efforts, the SAQ is used to assess supplier sustainability and to support Ford in identifying social and environmental risks throughout our supply base."
