### Deep Learning Supervised Text Classification
- BiLSTM (BDA + TrDA)

Bidirectional recurrent neural networks (RNN) put two independent RNNs together. This structure allows the networks to have both backward and forward information about the sequence at every time step


### Prepare Environment

In [1]:
from numpy.random import seed
seed(156)
import tensorflow as tf
tf.random.set_seed(256)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab_Notebooks/safety_report_tc

/content/drive/MyDrive/Colab_Notebooks/safety_report_tc


In [4]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap
import json
import datetime

In [None]:
#!pip list -v
# programmatically with python
#!python -m site --user-site
#!pip list --user -v

### Load Data

In [5]:
# Pick a minority category of interest (base data from source)
focus_cats = [
    'hydraulic fluid or oil leak',
    'line strike',
    'site compliance or practice issue',
    'ppe non-compliance_144_out_df_temp',
    'mechanical or equipment issue'
    ]

fns = [
    '2209031206', # hydraulic fluid or oil leak
    '2210031444', # line strike
    '2210031122', # site compliance or practice issue
    '2210031323'  #'ppe non-compliance'
]

fn_bd_aug_names = [
    'hydraulic fluid or oil leak_4700_simple_da',
    'line strike_8091_simple_da',
    'site compliance or practice issue_8000_simple_da',
    'ppe non-compliance_8050_simple_da',
    'mechanical or equipment issue_8294_simple_da'
]

fn_tr_aug_names = [
    'hydraulic fluid or oil leak_7920_trda',
    'line strike_8064_trda',
    'site compliance or practice issue_8410_trda',
    'ppe non-compliance_',
    'mechanical or equipment issue_'
]

# Select a category to load
pick = input('Pick a category: ')
index = focus_cats.index(pick)
focus_cat = focus_cats[index]
fn = fns[index]
fn = f'01_data/prepared/{fn}_prepared_{focus_cat}_data.csv'

# Load data from a minority category of interest
df = pd.read_csv(fn)
df = df[['text', 'category']]
df = pd.read_csv(fn)
df

Pick a category: line strike


Unnamed: 0,text,category
0,foreign body entered employee l eye while grin...,0
1,drainage pipe damaged at two two m depth see s...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,other fall to lower level unspecified an emplo...,0
93854,injured by slipping or swinging object held by...,0
93855,direct exposure to electricity greater than tw...,0


### Data Preparation (Train / Test Splits)

In [6]:
# Now create the input to the model training stage
X = df.text
y = df.category # Remember, these are not necessarily the 'truth' but rule_book hits

# Apply a simple 80/20 split
# TODO - The dataset is heavily imbalanced. Treat this appropriately.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [7]:
base_train_count = len(X_train)
base_train_count

75085

In [8]:
test_trues = list(y_test[y_test==1].index.values)
test_trues.sort()
print('Test:', len(test_trues), test_trues)

train_trues = list(y_train[y_train==1].index.values)
train_trues.sort()
print('Train:', len(train_trues), train_trues)

Test: 62 [819, 1559, 1779, 2042, 2243, 2546, 2557, 3180, 3471, 3497, 3609, 3661, 4435, 5068, 5999, 6704, 7729, 7905, 8081, 8504, 8792, 9378, 9415, 9703, 9870, 9893, 9935, 10113, 10698, 12444, 12798, 13145, 14488, 14509, 17206, 17367, 17477, 17630, 18430, 18441, 18881, 18902, 18911, 19339, 20728, 21007, 21357, 22261, 22401, 22636, 22831, 22913, 26529, 30086, 40394, 43621, 67311, 78441, 78526, 80754, 86830, 93268]
Train: 264 [285, 289, 333, 511, 791, 824, 871, 999, 1252, 1334, 1361, 1582, 1610, 1621, 1675, 1677, 1681, 1746, 1755, 1786, 1798, 1804, 1952, 1971, 1984, 2025, 2033, 2067, 2070, 2168, 2210, 2276, 2303, 2337, 2377, 2457, 2492, 2614, 2617, 2674, 3016, 3046, 3114, 3355, 3357, 3381, 3434, 3469, 3579, 3644, 3688, 3701, 3706, 3777, 3974, 3997, 4061, 4141, 4535, 4636, 4757, 4910, 5158, 5359, 5470, 5510, 5541, 5576, 5610, 5631, 5670, 5820, 5974, 6114, 6243, 6501, 7736, 7850, 8019, 8031, 8033, 8079, 8102, 8185, 8255, 8359, 8425, 8556, 8746, 8912, 8918, 8922, 8986, 9104, 9110, 9151, 9206

In [9]:
# Look at training 'trues', i.e., rule-book hits
df_train_trues = df.iloc[train_trues]
df_train_trues

Unnamed: 0,text,category
285,dc cable struck allied fencing employees repor...,1
289,water line strike during drilling operations d...,1
333,mini excavator damaged underground three pvc d...,1
511,soil boring activity aggrevates poor condition...,1
791,it was notice that part of the bank had collap...,1
...,...,...
85446,struck against stationary object or equipment ...,1
86522,other fall to lower level more than thirty fee...,1
89205,direct exposure to electricity greater than tw...,1
91291,indirect exposure to electricity unspecified a...,1


### **Basic Data Augmentation (BDA) Experiments**

In [10]:
fn_bd_aug_names

['hydraulic fluid or oil leak_4700_simple_da',
 'line strike_8091_simple_da',
 'site compliance or practice issue_8000_simple_da',
 'ppe non-compliance_8050_simple_da',
 'mechanical or equipment issue_8294_simple_da']

In [11]:
# Load fabricated reports
fn_bd_name = fn_bd_aug_names[index] 
df_fab = pd.read_csv(f'01_data/fabricated/{fn_bd_name}.csv')
df_fab

Unnamed: 0,text,category
0,dc cable struck allied debate employee cover e...,1
1,dc ball over cable struck allied fencing emplo...,1
2,dc cable struck allied fencing employees repor...,1
3,dc cable allied fencing reported experiencing ...,1
4,dc cable struck allied fencing employees repor...,1
...,...,...
8086,direct exposure to electricity greater than tw...,1
8087,direct exposure to electricity greater than tw...,1
8088,exposure to electricity greater than two hundr...,1
8089,point exposure to electricity greater than two...,1


In [12]:
# Prepare data & conbine with original training data
X_train_fab = df_fab.text
X_train_new = X_train.append(X_train_fab)
X_train_list = list(X_train_new)
y_train_list = list(y_train)

In [13]:
# Complete the new training dataframe
df_shuff = pd.DataFrame(X_train_list, columns = ['text'])
y_train_list.extend([1 for i in range(len(X_train_fab))])
df_shuff['category'] = y_train_list
df_shuff

Unnamed: 0,text,category
0,ip operated air assisted door with left hand w...,0
1,twenty x spiders fell from roof when generator...,0
2,struck by discharged object or substance an em...,0
3,caught in running equipment or machinery n e c...,0
4,caught in running equipment or machinery durin...,0
...,...,...
83171,direct exposure to electricity greater than tw...,1
83172,direct exposure to electricity greater than tw...,1
83173,exposure to electricity greater than two hundr...,1
83174,point exposure to electricity greater than two...,1


In [14]:
# Shuffle the dataframe
df_shuff = df_shuff.sample(frac=1).reset_index(drop=True)
df_shuff

Unnamed: 0,text,category
0,struck by dislodged flying object particle an ...,0
1,exposure to environmental heat while cooking i...,0
2,other fall to lower level unspecified an emplo...,0
3,contact with hot objects or substances an empl...,0
4,caught in running houseclean equipment or mach...,1
...,...,...
83171,theft of generator third incident trespasser e...,0
83172,caught in running equipment or machinery unspe...,0
83173,employees reported gunshots being fired from o...,0
83174,compressed or pinched by shifting objects or e...,0


In [15]:
# Visually check a random sample of 20 (hits)
df_shuff[df_shuff['category']==1].sample(20)

Unnamed: 0,text,category
64535,the excavating inform a mini digger streetligh...,1
6396,excavator operator made contact fiber optic li...,1
35063,the contractor on site struck one of our rente...,1
67995,one conclusion of a thirty six agate line come...,1
50173,employees were attempting to line detailed cut...,1
50194,low voltage cable was struck by a three hundre...,1
15790,marked strike with release no injuries sustain...,1
39188,hgcp wg maintenance supplement labour skid loa...,1
51086,particular using a concrete have to place conc...,1
66455,skid steer with auger bit attachment drilled i...,1


In [16]:
X_train = df_shuff.text
y_train = df_shuff.category

### Transformer Data Augmentation (TrDA)

In [17]:
# Load fabricated reports
fn_tr_name = fn_tr_aug_names[index] 
df_fab = pd.read_csv(f'01_data/fabricated/{fn_tr_name}.csv')
df_fab

Unnamed: 0,text,group
0,digger burst buried pipe damaging it no injuri...,line strike
1,digger burst buried pipe resulting in a small ...,line strike
2,digger burst buried pipe connecting pipe on ea...,line strike
3,digger burst buried pipe rack in field inciden...,line strike
4,digger burst buried pipe with teeth at 110mm t...,line strike
...,...,...
8059,minidigger when excavating struck underground ...,line strike
8060,minidigger when excavating struck underground ...,line strike
8061,minidigger when excavating struck underground ...,line strike
8062,minidigger when excavating struck underground ...,line strike


In [18]:
# Prepare data and augment raw (real) training data
X_train_fab = df_fab.text
X_train_new = X_train.append(X_train_fab)
X_train_list = list(X_train_new)
y_train_list = list(y_train)

In [19]:
df_shuff = pd.DataFrame(X_train_list, columns = ['text'])
y_train_list.extend([1 for i in range(len(X_train_fab))])
df_shuff['category'] = y_train_list
df_shuff

Unnamed: 0,text,category
0,struck by dislodged flying object particle an ...,0
1,exposure to environmental heat while cooking i...,0
2,other fall to lower level unspecified an emplo...,0
3,contact with hot objects or substances an empl...,0
4,caught in running houseclean equipment or mach...,1
...,...,...
91235,minidigger when excavating struck underground ...,1
91236,minidigger when excavating struck underground ...,1
91237,minidigger when excavating struck underground ...,1
91238,minidigger when excavating struck underground ...,1


In [20]:
# Shuffle the dataframe
df_shuff = df_shuff.sample(frac=1).reset_index(drop=True)
df_shuff

Unnamed: 0,text,category
0,fitter ip hit his right middle finger with a h...,0
1,fall through surface or existing opening eleve...,0
2,caught in running equipment or machinery durin...,0
3,fall on same level due to tripping over an obj...,0
4,individual reported slight discomfort to side ...,0
...,...,...
91235,caught in running equipment or machinery durin...,0
91236,equipment operator an unmarked two after an ei...,1
91237,caught in running equipment or machinery n e c...,0
91238,diesel found spilled on row no work activity i...,0


In [21]:
# Quick visual check of a random sample of 20 (hits)
df_shuff[df_shuff['category']==1].sample(20)

Unnamed: 0,text,category
58905,excavator operator made contact with a matal p...,1
32847,minidigger came in contact with underground ca...,1
86320,dumper collided with underground cable inciden...,1
54046,whilst excavating with a mini streetlight was ...,1
54654,minidigger while excavating made contact with ...,1
44363,unmarked hand that of abandoned four steel lin...,1
67450,tie accidentally drilled into steam tracing on...,1
74419,jcb damaged two cable sleeves whilst excavatin...,1
62085,dozer connected with drain line while excavati...,1
67284,that or jump from water vehicle an employee wa...,1


## Build Model

In [22]:
# Load modelling building libraries
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn import metrics
from sklearn.utils import shuffle
from collections import Counter
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import logging
logging.basicConfig(level=logging.INFO)

In [23]:
# Create a function to prepare model input sequences and embedding dictionary
def prepare_model_input(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    all_text = np.concatenate((X_train, X_test), axis=0)
    all_text = np.array(all_text)

    # Fit tokeniser only on training text
    text = np.array(X_train)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)

    # Save the tokenizer as .pkl file
    pickle.dump(tokenizer, open(f'08_output/{time.strftime("%y%m%d%H%M")}_text_tokenizer.pkl', 'wb'))

    # Generate sequences for all text using tokenizer created only on training text
    # This converts the sentence into a sequence of integers, e.g., [2, 4, 5, 7]
    sequences = tokenizer.texts_to_sequences(all_text)

    # Get a list of all words and their sequence numbers
    word_index = tokenizer.word_index

    # Pad out the sequences with zeroes to max sequence length
    all_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    # Seperate training sequences from test
    X_train_Glove = all_text[0:len(X_train), ]
    X_test_Glove = all_text[len(X_train):, ]

    # Now gather the embeddings
    # Start with standard GloVe
    ## https://www.google.com/search?client=safari&rls=en&q=glove+embeddings&ie=UTF-8&oe=UTF-8
    embeddings_dict = {}
    f = open("03_embeddings/glove/glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_dict[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_dict))
    return (X_train_Glove, X_test_Glove, word_index, embeddings_dict)


In [24]:
# Create a function that builds the deep learning model
def build_bilstm(word_index, embeddings_dict, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5, hidden_layer = 3, lstm_node = 32):
    # Initialize a sequential model
    model = Sequential()
    
    # Make the embedding matrix using the embedding_dict
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
            
    # Add embedding layer
    print('_words', len(word_index))
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    # Add hidden layers 
    # Default will be 3 layers
    # Default lstm nodel number will be 32
    for i in range(0, hidden_layer):
        # Add a bidirectional lstm layer
        model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))

        # Add a dropout layer after each lstm layer
        model.add(Dropout(dropout))

    model.add(Bidirectional(LSTM(lstm_node, recurrent_dropout=0.2)))

    # Attention addition
    #model.add(attention(return_sequences=True))
    
    model.add(Dropout(dropout))

    # Add the fully connected layer with 256 nuerons & Relu activation
    model.add(Dense(256, activation='relu'))

    # Add the output layer with softmax activation (binary output)
    model.add(Dense(nclasses, activation='softmax'))

    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=["categorical_accuracy"])
    return model

In [25]:
X_train = df_shuff.text
y_train = df_shuff.category

print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train, X_test)
print("Done!")

Preparing model input ...
Total 400000 word vectors.
Done!


In [26]:
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Building Model!
_words 68423
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           3421200   
                                                                 
 bidirectional (Bidirectiona  (None, 500, 64)          21248     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 500, 64)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 500, 64)          24832     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 500, 64)           0         
                                                                 
 bidirectional_2 (Bidirecti

In [27]:
# Quick view of rule_book hits (should represent the focus category)
X_test[y_test == 1].sample(20)

8081     track hoe bucket struck the mainline pipeline ...
8792     concrete kibble bucket handle made contact wit...
20728    tie accidentally drilled into steam tracing on...
9935     suspended load pipe spool struck the adjacent ...
3180     bucket of excavator scraped and damaged electr...
7729     during manual excavation pickaxe point struck ...
5999     wgpacs employee was driving down the highway b...
8504     fitter cut into live service air line not redu...
14509    contractor excavator boom struck overhead line...
22636    a three hundred and eight excavator made conta...
2243     while relocating crane the boom contacted and ...
18881    two crews were excavating a trench and struck ...
22261    private water line breach private water line b...
17367    sewer line strike while drilling wood and dril...
17630    track hoe struck underground pvc water line tr...
40394    struck by dislodged flying object particle two...
21007    flare stack of treater was being picked and wa.

In [28]:
# Reminder of test set breakdown
Counter(y_test)

Counter({0: 18710, 1: 62})

In [29]:
# Verify augmented training set breakdown
Counter(y_train)

Counter({0: 74821, 1: 16419})

### Model Training & Evaluatrion

In [30]:
def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*(precision*recall))/(precision+recall)
    return {
        "mcc": mcc,
        "true positive": tp,
        "true negative": tn,
        "false positive": fp,
        "false negative": fn,
        "precision" : precision,
        "recall" : recall,
        "F1" : f1,
        "accuracy": (tp+tn)/(tp+tn+fp+fn)
    }

def compute_metrics(labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()


In [31]:
# Split training into training and validation data
X_train_vGlove, X_val_Glove, y_vtrain, y_val = train_test_split(X_train_Glove, y_train, test_size = 0.1, stratify=y_train, random_state = 1234)

# Verify stratification
num_pos_vTrain = len(y_vtrain[y_vtrain==1])
print(f'Number of positives in training (val) split: {num_pos_vTrain}')

num_pos_val = len(y_val[y_val==1])
print(f'Number of positives in validation data: {num_pos_val}')

Number of positives in training (val) split: 14777
Number of positives in validation data: 1642


In [None]:
# Train the model
print('Training for:', focus_cat)
epoch_num = int(input('Enter epoch number (default is 3): '))
history = model.fit(X_train_vGlove, y_vtrain,
                              validation_data=(X_val_Glove, y_val),
                              epochs=epoch_num,
                              batch_size=128,
                              verbose=1)

Training for: line strike
Enter epoch number (default is 3): 3
Epoch 1/3
  6/642 [..............................] - ETA: 32:54 - loss: 0.6033 - categorical_accuracy: 0.8451

In [None]:
# Plot training history
plot_graphs(history, 'loss')

In [None]:
# Print accuracy measures
print(f'\n Evaluating Model for "{focus_cat}" with {epoch_num} epochs ... \n')

predicted = model.predict(X_test_Glove)
predicted = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test, predicted))
print("\n")

In [None]:
# Create dataframe for inspection of results
tmp = pd.DataFrame(X_test, columns=['text'])
tmp['pred'] = predicted
tmp['rule_book'] = y_test.values

# Quick look at predicted positives
print('Check rule-book hit count: ', len(tmp[tmp['rule_book']==1]))

In [None]:
# Examine precision stats
# Look at predicted positives that did not have a rule-book hit
table = tmp[(tmp['pred']==1) & (tmp['rule_book']==0)]
table.to_csv(f'10_performance/model_{focus_cat}_bda_trda_fps.csv')
display(HTML(table.to_html()))
total_pos_preds = len(tmp[(tmp['pred']==1)])
total_pos_hits = len(tmp[(tmp['pred']==1) & (tmp['rule_book']==1)])
print(f'\nThere are {len(table)} entries (out of {sum(y_test==1)})...')
print(f'\nTotal no. of positive predictions is  {total_pos_preds}...')
print(f'\nTotal no. of positive hits is  {total_pos_hits }...')

In [None]:
# Take a look at predicted negatives that had a rule-book hit
table = tmp[(tmp['pred']==0) & (tmp['rule_book']==1)]
display(HTML(table.to_html()))
print(f'\nThere are {len(table)} entries (out of {sum(y_test==1)})...')

In [None]:
# Save the model
filename = f'08_output/models/{time.strftime("%y%m%d%H%M")}_{focus_cat}_bilstm_bda_trda_{epoch_num}_model.pkl'
pickle.dump(model, open(filename, 'wb'))