In [50]:
import re
import swifter
import numpy as np
import pandas as pd
from typing import List, Union
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

import tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Dropout, Dense, GlobalAveragePooling1D

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/train_augmented.csv')
df = df.drop(columns=['ID'])

# Some of the rows has null value at ABSTRACT. We replace the null value with the TITLE text.
df['ABSTRACT'] = df['ABSTRACT'].where(pd.notnull, df['TITLE'])
df.head()

Unnamed: 0,TITLE,ABSTRACT,Activist_Investors,Cost_Reduction,Covid_19,Digital_capabilities,Diversity___Inclusiveness,Headquarters_Relocation,International_Expansions,M_A,Management_changes,Restructuring,Rewards___benefits,Spin_offs__Split_offs,Tax_Risk,Upskilling__reskilling,Wage_Dispute
0,"Chevron, Total others slash production, cut a...",EnergyMix The world's five largest oil product...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Survey shows 4% of companies in Singapore sla...,[Source: TODAY] SINGAPORE - The livelihoods of...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,"""Dark day for Brighouse"" as two bank branches ...",[Source: examiner.co.uk] Barclays Bank and Yor...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"""Helping Hands"": BASF donates 40,000 liters of...","BASF is supplying 40,000 liters of hand saniti...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,$2B Chicago area transit packaging group will ...,A $2 billion transit packaging company will re...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [4]:
def process_text(text):
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)       
    return text

In [5]:
df['text'] = df['ABSTRACT'].swifter.apply(lambda x: process_text(x))
df.head()

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=14050.0, style=ProgressStyle(descripti…




Unnamed: 0,TITLE,ABSTRACT,Activist_Investors,Cost_Reduction,Covid_19,Digital_capabilities,Diversity___Inclusiveness,Headquarters_Relocation,International_Expansions,M_A,Management_changes,Restructuring,Rewards___benefits,Spin_offs__Split_offs,Tax_Risk,Upskilling__reskilling,Wage_Dispute,text
0,"Chevron, Total others slash production, cut a...",EnergyMix The world's five largest oil product...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,energymix the worlds five largest oil producti...
1,Survey shows 4% of companies in Singapore sla...,[Source: TODAY] SINGAPORE - The livelihoods of...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,source today singapore the livelihoods of man...
2,"""Dark day for Brighouse"" as two bank branches ...",[Source: examiner.co.uk] Barclays Bank and Yor...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,source examinercouk barclays bank and yorkshir...
3,"""Helping Hands"": BASF donates 40,000 liters of...","BASF is supplying 40,000 liters of hand saniti...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,basf is supplying liters of hand sanitizer pr...
4,$2B Chicago area transit packaging group will ...,A $2 billion transit packaging company will re...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,a billion transit packaging company will relo...


In [6]:
label_cols = ['Activist_Investors', 'Cost_Reduction', 'Covid_19', 'Digital_capabilities', 'Diversity___Inclusiveness',
              'Headquarters_Relocation', 'International_Expansions', 'M_A', 'Management_changes', 'Restructuring', 
              'Rewards___benefits', 'Spin_offs__Split_offs', 'Tax_Risk', 'Upskilling__reskilling', 'Wage_Dispute']

df['one_hot_labels'] = list(df[label_cols].values)
idx2label = dict(zip(range(15),label_cols))

labels = list(df.one_hot_labels.values)

In [7]:
# Identifying indices of 'one_hot_labels' entries that only occur once - this will allow us to stratify split our training data later
label_counts = df.one_hot_labels.astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [2654, 823]


In [8]:
one_freq_df = df.iloc[one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

df = df.drop(df.index[one_freq_idxs])

In [9]:
df_train, df_val = train_test_split(df, test_size=0.15, random_state=2021, stratify=labels)
df_test = df_val[:120]
df_val = df_val[120:]

df_train = pd.concat([df_train, one_freq_df])

In [10]:
train_sentences = df_train["text"].values
val_sentences = df_test["text"].values

train_y = df_train[label_cols].values
val_y = df_test[label_cols].values

In [105]:
class BERTClassifier:
    def __init__(self, n_labels: int, threshold:float=0.5):
        self.__n_labels = n_labels
        self.__batch_size = 32
        self.__threshold = threshold
        self.__optimizer = Adam(lr=1e-5, decay=1e-6)
        self.__checkpointer = ModelCheckpoint(filepath="../model_weights/NN_weights.hdf5", 
                                              verbose=1, save_best_only=True)
        self.__model = self.__init_model(n_labels = self.__n_labels)
        
        
    def __init_model(self, n_labels:int):
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
        preprocessor = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
        encoder_inputs = preprocessor(text_input)
        encoder = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
            trainable=True)
        outputs = encoder(encoder_inputs)
        pooled_output = outputs["pooled_output"]      # [batch_size, 1024].
        sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 1024].
        x = GlobalAveragePooling1D()(sequence_output)
        x = Dropout(0.2)(x)
        output = Dense(n_labels, activation='sigmoid', name='outputs')(x)

        model = Model(inputs=text_input, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer=self.__optimizer, metrics=['accuracy'])
        print(model.summary())
        return model
    
    
    def load_weights(self, filepath: str):
        self.__model.load_weights(filepath=filepath)
        print("Saved model weights successfully loaded...")
        
        
    def fit(self, X_train, y_train, validation_data, epochs=10):
        self.__model.fit(X_train, y_train, epochs=epochs, batch_size=self.__batch_size,
                        callbacks=[self.__checkpointer], validation_data=validation_data)

    
    def predict(self, X:Union[List, np.array]):
        preds = self.__model.predict(X)
        preds = np.where(preds > 0.5, 1, 0)
        preds_ = []
        for pred in preds:
            oth_label = 0 if np.any(pred) else 1
            preds_.append(np.append(pred, oth_label))
        return np.array(preds_)

In [28]:
model = BERTClassifier(n_labels=len(label_cols))

model.fit(X_train=df_train['text'].values, y_train=df_train[label_cols].values, epochs=10, 
          validation_data=(df_val['text'].values, df_val[label_cols].values))

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      {'input_word_ids': ( 0           input_2[0][0]                    
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      {'pooled_output': (N 109482241   keras_layer_2[0][0]              
                                                                 keras_layer_2[0][1]              
                                                                 keras_layer_2[0][2]              
_______________________________________________________________________________________

In [107]:
model.load_weights(filepath='../model_weights/NN_weights.hdf5')

Saved model weights successfully loaded...


In [103]:
# Re-adding the "Others" category to test data
df_test['Others'] = df.apply(lambda x: 0 if np.any(x[label_cols].values) else 1, axis=1)
test_label_cols = label_cols + ['Others']

idx2label[15] = 'Others'
idx2label

{0: 'Activist_Investors',
 1: 'Cost_Reduction',
 2: 'Covid_19',
 3: 'Digital_capabilities',
 4: 'Diversity___Inclusiveness',
 5: 'Headquarters_Relocation',
 6: 'International_Expansions',
 7: 'M_A',
 8: 'Management_changes',
 9: 'Restructuring',
 10: 'Rewards___benefits',
 11: 'Spin_offs__Split_offs',
 12: 'Tax_Risk',
 13: 'Upskilling__reskilling',
 14: 'Wage_Dispute',
 15: 'Others'}

## Testing the model

In [115]:
def get_predictions(text:str, threshold:float=0.5):
    pred = model.predict([text]).flatten()
    return pred

In [116]:
test_idx = 36
pred = get_predictions(text=df_test.iloc[test_idx]['text'])
print("Text: ", df_test.iloc[test_idx]['text'])
print()
print("Predicted categories: ")
print([idx2label[idx] for idx in np.where(pred == 1)[0]])
print()
print("Actual categories: ")
actual = df_test.iloc[test_idx][label_cols][df_test.iloc[test_idx][label_cols] == 1].index.tolist()
print(actual if actual else ['Others'])

Text:   click here to view this document in its original formatdiscovery announces a new spinoff of its   rated show taking viewers into the brutal winter gold mining offseason

Predicted categories: 
['Spin_offs__Split_offs']

Actual categories: 
['Spin_offs__Split_offs']


## Test Metrics

In [117]:
y_pred = model.predict(df_test['text'].values)
y_true = df_test[test_label_cols].values

In [118]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    https://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [119]:
print("Classification Report: ")
print(classification_report(y_true,y_pred,target_names=test_label_cols))
print()
print('F1 Accuracy: ', f1_score(y_true, y_pred, average='micro'))
print("Hamming Score: ", hamming_score(y_true, y_pred))

Classification Report: 
                           precision    recall  f1-score   support

       Activist_Investors       0.83      1.00      0.91         5
           Cost_Reduction       0.80      0.69      0.74        29
                 Covid_19       0.86      0.95      0.90        19
     Digital_capabilities       1.00      0.91      0.95        11
Diversity___Inclusiveness       0.88      1.00      0.93         7
  Headquarters_Relocation       0.86      0.86      0.86         7
 International_Expansions       0.88      0.78      0.82         9
                      M_A       1.00      1.00      1.00         3
       Management_changes       0.89      0.89      0.89         9
            Restructuring       0.67      0.33      0.44         6
       Rewards___benefits       1.00      1.00      1.00         5
    Spin_offs__Split_offs       0.86      1.00      0.92         6
                 Tax_Risk       1.00      0.86      0.92         7
   Upskilling__reskilling       1.00 

Next, we will take a look at the instances where the model got it wrong.

In [71]:
for idx, (y1, y2) in enumerate(zip(y_true, y_pred)):
    if not (y1==y2).all():
        print(df_test.iloc[idx]['ABSTRACT'])
        print('TRUE LABELS: ', [idx2label[idx] for idx in np.where(y1 == 1)[0]])
        print('PREDICTED LABELS: ', [idx2label[idx] for idx in np.where(y2 == 1)[0]])
        print()

Few would disagree that most multinational corporations invariably favour Singapore over Malaysia when it comes to setting up their regional base or corporate headquarters. In recent years, other Asean countries — Vietnam, Indonesia and the Philippines,.
TRUE LABELS:  ['Upskilling__reskilling']
PREDICTED LABELS:  ['Headquarters_Relocation']

[Source: Dow Jones Newswires Chinese (English)] Starbucks Corp. says it has reopened hundreds of its China stores that closed in response to the coronavirus outbreak. Chief Executive Kevin Johnson said Thursday that 85% of the coffee giant's stores in it
TRUE LABELS:  ['Cost_Reduction', 'Covid_19']
PREDICTED LABELS:  ['Cost_Reduction', 'Covid_19', 'Management_changes']

Hyatt Hotels Corp. is laying off 1,300 employees globally, beginning June 1, in an attempt to cope with the coronavirus crisis. Hyatt has also taken measures including reductions of companywide expenditures, extended salary reductions for its senior lead
TRUE LABELS:  ['Covid_19']
P

Looking at the true labels and predicted labels, it is quite clear that the most of the wrong predictions that the model makes are on text that are confusing and highly ambiguous. Overall the model performs decently on the data.