### Deep Learning Supervised Text Classification
- BiLSTM

Bidirectional recurrent neural networks (RNN) put two independent RNNs together. This structure allows the networks to have both backward and forward information about the sequence at every time step


### Prepare Environment

In [1]:
from numpy.random import seed
seed(156)
import tensorflow as tf
tf.random.set_seed(256)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab_Notebooks/safety_report_tc

/content/drive/MyDrive/Colab_Notebooks/safety_report_tc


In [4]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap
import json
import datetime

### Load Data

In [5]:
# Load data from a minority category of interest
focus = 'hydraulic fluid or oil leak'
data_ref = '2209031206'
fn = f'data/prepared/{data_ref}_prepared_data.csv'
df = pd.read_csv(fn)
df

Unnamed: 0,text,category
0,foreign body entered employee l eye while grin...,0
1,drainage pipe damaged at twenty-two m depth se...,0
2,robodrill spider excavator being operated when...,1
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,other fall to lower level unspecified an emplo...,0
93854,injured by slipping or swinging object held by...,0
93855,direct exposure to electricity greater than tw...,0


### Data Preparation (Train / Test Splits)

In [6]:
# Now create the input to the model training stage
X = df.text
y = df.category # Remember, these are not necessarily the 'truth' but rule_book hits

# Apply a simple 80/20 split
# TODO - The dataset is heavily imbalanced. Treat this appropriately.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [7]:
base_train_count = len(X_train)
base_train_count

75085

In [8]:
test_trues = list(y_test[y_test==1].index.values)
test_trues.sort()
print('Test:', len(test_trues), test_trues)

train_trues = list(y_train[y_train==1].index.values)
train_trues.sort()
print('Train:', len(train_trues), train_trues)

Test: 107 [17, 82, 343, 355, 514, 534, 1559, 1750, 1960, 1965, 2117, 2142, 2380, 2447, 2482, 2701, 2867, 3526, 3619, 3672, 3759, 3803, 3880, 3908, 3989, 4047, 4118, 4355, 4374, 4421, 4890, 5244, 5444, 6371, 6584, 6585, 6831, 6935, 7814, 7969, 8551, 8704, 8794, 9041, 9153, 9201, 9381, 9556, 9596, 9659, 9815, 9826, 9829, 11286, 11403, 11658, 11684, 11779, 12091, 12196, 12517, 12669, 12830, 12882, 13467, 13652, 13715, 15027, 16050, 18773, 20094, 20272, 20446, 21302, 21344, 21919, 22531, 22578, 22835, 22993, 23279, 23358, 24007, 24075, 24310, 26148, 26401, 26864, 29258, 30436, 30716, 35411, 50966, 51336, 55157, 57713, 61714, 64840, 69153, 70741, 76577, 77778, 78574, 79345, 81714, 82479, 88394]
Train: 470 [2, 24, 46, 143, 151, 235, 237, 241, 273, 404, 443, 454, 490, 518, 548, 751, 813, 897, 994, 1024, 1188, 1296, 1373, 1399, 1401, 1418, 1461, 1513, 1531, 1537, 1546, 1556, 1558, 1587, 1678, 1684, 1704, 1754, 1774, 1812, 1857, 1883, 2041, 2043, 2066, 2075, 2120, 2123, 2124, 2131, 2199, 2215, 

In [9]:
# Look at training 'trues', i.e., rule-book hits
df_train_trues = df.iloc[train_trues]
df_train_trues.to_csv('data/prepared/df_train_trues.csv')
df_train_trues

Unnamed: 0,text,category
2,robodrill spider excavator being operated when...,1
24,fuel red diesel spilled onto whiterockdirt sur...,1
46,oil leak from crane that was on site for plann...,1
143,small oil spill on twenty-one thousand and twe...,1
151,spider excavator hyd leak to soil 10ml spider ...,1
...,...,...
90259,struck by discharged object or substance an em...,1
90458,injured by slipping or swinging object held by...,1
90618,struck by swinging part of powered vehicle an ...,1
91361,caught in running equipment or machinery durin...,1


## Build Model

In [10]:
# Load modelling building libraries
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn import metrics
from sklearn.utils import shuffle
from collections import Counter
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import logging
logging.basicConfig(level=logging.INFO)

In [11]:
# Create a function to prepare model input sequences and embedding dictionary
def prepare_model_input(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    all_text = np.concatenate((X_train, X_test), axis=0)
    all_text = np.array(all_text)

    # Fit tokeniser only on training text
    text = np.array(X_train)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)

    # Save the tokenizer as .pkl file
    pickle.dump(tokenizer, open(f'output/{time.strftime("%y%m%d%H%M")}_text_tokenizer.pkl', 'wb'))

    # Generate sequences for all text using tokenizer created only on training text
    # This converts the sentence into a sequence of integers, e.g., [2, 4, 5, 7]
    sequences = tokenizer.texts_to_sequences(all_text)

    # Get a list of all words and their sequence numbers
    word_index = tokenizer.word_index

    # Pad out the sequences with zeroes to max sequence length
    all_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    # Seperate training sequences from test
    X_train_Glove = all_text[0:len(X_train), ]
    X_test_Glove = all_text[len(X_train):, ]

    # Now gather the embeddings
    # Start with standard GloVe
    ## https://www.google.com/search?client=safari&rls=en&q=glove+embeddings&ie=UTF-8&oe=UTF-8
    embeddings_dict = {}
    f = open("embeddings/glove/glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_dict[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_dict))
    return (X_train_Glove, X_test_Glove, word_index, embeddings_dict)


In [12]:
# Create a function that builds the deep learning model
def build_bilstm(word_index, embeddings_dict, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5, hidden_layer = 3, lstm_node = 32):
    # Initialize a sequential model
    model = Sequential()
    
    # Make the embedding matrix using the embedding_dict
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
            
    # Add embedding layer
    print('_words', len(word_index))
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    # Add hidden layers 
    # Default will be 3 layers
    # Default lstm nodel number will be 32
    for i in range(0, hidden_layer):
        # Add a bidirectional lstm layer
        model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))

        # Add a dropout layer after each lstm layer
        model.add(Dropout(dropout))

    model.add(Bidirectional(LSTM(lstm_node, recurrent_dropout=0.2)))

    # Attention addition
    #model.add(attention(return_sequences=True))
    
    model.add(Dropout(dropout))

    # Add the fully connected layer with 256 nuerons & Relu activation
    model.add(Dense(256, activation='relu'))

    # Add the output layer with softmax activation (binary output)
    model.add(Dense(nclasses, activation='softmax'))

    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=["categorical_accuracy"])
    return model

In [13]:
print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train, X_test)
print("Done!")

Preparing model input ...
Total 400000 word vectors.
Done!


In [14]:
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Building Model!
_words 68369
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           3418500   
                                                                 
 bidirectional (Bidirectiona  (None, 500, 64)          21248     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 500, 64)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 500, 64)          24832     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 500, 64)           0         
                                                                 
 bidirectional_2 (Bidirecti

In [15]:
# Quick view of rule_book hits (should represent the focus category)
X_test[y_test == 1].iloc[0]

'a skid steer hydraulic hose broke causing no more than five gallons of hydraulic fluid to leak onto the ground at approximately one thousand, six hundred and fifteen on two million, one hundred and fifty-two thousand and twenty-two a logistics operator was transporting a skid steer to the entrance of block 3c for relocation a hydraulic hose on the boom of the skid steer broke causing less than five gallons of hydraulic fluid to leak onto the ground the leak was immediately noticed and the machine was stopped environmental swppp was notified and clean up procedures began the actual severity and potential ratings were selected as an a1 due to minimal reversible environmental impact the equipment was shut down environmental was notified clean up procedures immediately took place all contaminated soil and diapers were picked up and disposed of into proper containments'

### Model Training & Evaluatrion

In [16]:
def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*(precision*recall))/(precision+recall)
    return {
        "mcc": mcc,
        "true positive": tp,
        "true negative": tn,
        "false positive": fp,
        "false negative": fn,
        "precision" : precision,
        "recall" : recall,
        "F1" : f1,
        "accuracy": (tp+tn)/(tp+tn+fp+fn)
    }

def compute_metrics(labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()


In [None]:
# Train the model

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

total_t0 = time.time()
history = model.fit(X_train_Glove, y_train,
                              validation_data=(X_test_Glove, y_test),
                              epochs=4,
                              batch_size=128,
                              verbose=1)

Epoch 1/4
 44/587 [=>............................] - ETA: 27:28 - loss: 0.1260 - categorical_accuracy: 0.9780

In [None]:
print(f'\nTotal training took {format_time(time.time()-total_t0)}')

In [None]:
# Plot training history
plot_graphs(history, 'loss')

In [None]:
# load the model from disk
#fname = '2209181545_hydraulic fluid or oil leak_bilstm_model'
#filename = f'output/models/{fname}.pkl'
#model = pickle.load(open(filename, 'rb'))

In [None]:
# Print accuracy measures
print("\n Evaluating Model ... \n")

predicted = model.predict(X_test_Glove)
predicted = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test, predicted))
print("\n")
logger = logging.getLogger("logger")
result = compute_metrics(y_test, predicted)
for key in (result.keys()):
    logger.info("  %s = %s", key, str(result[key]))

In [None]:
# Create dataframe for inspection of results
tmp = pd.DataFrame(X_test, columns=['text'])
tmp['pred'] = predicted
tmp['rule_book'] = y_test.values

# Quick look at predicted positives
print('Check rule-book hit count: ', len(tmp[tmp['rule_book']==1]))

In [None]:
Counter(y_test)

In [None]:
# Check a random selection of 'trues', i.e., rule-book hits...
# These should be consistent with the focis category, e.g., hydraulic fluid
table = tmp[(tmp['rule_book']==1)].sample(20)
display(HTML(table.to_html()))

In [None]:
# Take a look at predicted positives that did not have a rule-book hit
table = tmp[(tmp['pred']==1) & (tmp['rule_book']==0)]
display(HTML(table.to_html()))
table.to_csv(f'performance/model_{focus}_fps.csv')
print(f"\nThere are {len(table)} 'false' positives ...")

In [None]:
# Take a look at rule-book hits that did not have postive model hit...
table = tmp[(tmp['pred']==0) & (tmp['rule_book']==1)]
display(HTML(table.to_html()))
table.to_csv(f'performance/model_{focus}_fns.csv', index=False)
table_count = len(table)
ruley_count = sum(y_test==1)
perc_calced = round(100*(table_count / ruley_count), 1)
print(f'\nThere are {table_count } entries (out of {ruley_count}, {perc_calced}%)...')

### Inference Checks

In [None]:
#text = ['fall lower level less six feet employee miss step fell stationary semitruck injure back']
#text = ["mini excavator develop small hydraulic leak remove pavement fluid contract concrete pavement area clean client notify migrate legacy cairs incident 34326on monday july thirty-one two thousand and seventeen celanese plant clear lake brazos e employee operate mini excavator develop small hydraulic leak remove pavement fluid contract concrete pavement area clean client notify migrate legacy cairs see detail description"]
#text = ['contact hot object substances employee transport hot drip fluid use cook oil container fluid contact employee cause first second degree burn']
#text = ['strike discharge object substance lineman work aerial lift leak break hydraulic line inject fluid hand hospitalize']
text_raw = 'dozer along right way small leak hose observe dozer right way small leak hydraulic hoseno contamination grind oil dozer notify operator damage hose contact strathclyde send fitter site repair'
text_raw = 'I was walking down the yard and I lost my footing and tripped and broke my leg'
text_raw = 'Employee noticed oil fluid leaking from hydraulic line'

In [None]:
#with open('/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/text_tokenizer.pkl', 'rb') as f:
#    tokenizer = pickle.load(f)

In [None]:
# Create simple function for running inference on user input text
def inference_run():
    text_raw = input('Enter text:')
    # TODO - Apply text prep steps to user input steps
    text = [text_raw]
    text = np.array(text)
    text_all_tmp = np.concatenate((X_train, X_test), axis=0)
    text_tmp = np.array(X_train)
    tokenizer = Tokenizer(num_words=75000)
    tokenizer.fit_on_texts(text_tmp)

    sequences = tokenizer.texts_to_sequences(text)
    X_test_Glove_s = pad_sequences(sequences, maxlen=500)
    check_class = np.argmax(model.predict(X_test_Glove_s), axis=1)[0]
    if check_class == 1:
        # Use text wrap to avoid a paragraph of text printing as single line in output window
        lines = textwrap.wrap(text_raw, 80, break_long_words=False)
        for i, l in enumerate(lines):
            if i == 0:
                print(f'\n"{l}')
            elif i == len(lines)-1:
                print(f'{l}"')
            else:
                print(l)
        print('\nPredicted: ', focus_cat)
    else:
        print('\nNot classified...') 

In [None]:
inference_run()

Enter text:dozer along right way small leak hose observe dozer right way small leak hydraulic hoseno contamination grind oil dozer notify operator damage hose contact strathclyde send fitter site repair

"dozer along right way small leak hose observe dozer right way small leak
hydraulic hoseno contamination grind oil dozer notify operator damage hose
contact strathclyde send fitter site repair"

Predicted:  hydraulic fluid or oil leak


### Save Model to Local Drive

In [None]:
# Save the model
filename = f'output/models/{time.strftime("%y%m%d%H%M")}_{focus}_bilstm_model.pkl'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))



### Other Methods

#### Logistic Regression

In [None]:
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
X_train_Glove_sc = scaler.fit_transform(X_train_Glove)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train_Glove_sc, y_train)

LogisticRegression(max_iter=3000)

In [None]:
lr_clf.score(X_test_Glove, y_test)

0.7802578308118474

In [None]:
y_predicted = lr_clf.predict(X_test_Glove)

In [None]:
cnf = metrics.confusion_matrix(y_test, y_predicted)
cnf

array([[14590,  4075],
       [   50,    57]])

In [None]:
lr_recall = round(cnf[1, 1] / (cnf[1, 0] + cnf[1, 1]), 2)
print(f'Logistic regression recall = {lr_recall}')
lr_prec = round(cnf[1, 1] / (cnf[0, 1] + cnf[1, 1]), 2)
print(f'Logistic regression precision = {lr_prec}')

Logistic regression recall = 0.53
Logistic regression precision = 0.01


#### SVM

In [None]:
from sklearn.svm import OneClassSVM
# Train a one-class SVM 
svm_model =  OneClassSVM(nu = 0.005) # 0.5%
svm_model.fit(pd.DataFrame(X_train_Glove))

OneClassSVM(nu=0.005)

In [None]:
preds_svm = pd.Series(svm_model.predict(X_test_Glove)).map({1: 0, -1: 1})
preds_svm = pd.DataFrame({'prediction':preds_svm})
preds_svm['rule_book'] = y_test.reset_index(drop=True)
preds_svm[(preds_svm['prediction']==1) & (preds_svm['rule_book']==1)]

Unnamed: 0,prediction,rule_book
1227,1,1
2751,1,1
2829,1,1
6492,1,1
10474,1,1
17640,1,1
17957,1,1


In [None]:
# Check metrics
svm_ta = len(preds_svm[(preds_svm['rule_book']==1)])
svm_tp = len(preds_svm[(preds_svm['prediction']==1) & (preds_svm['rule_book']==1)])
svm_fp = len(preds_svm[(preds_svm['prediction']==1) & (preds_svm['rule_book']==0)])
svm_pp = len(preds_svm[(preds_svm['prediction']==1)])

prec = svm_tp/svm_pp
reca = svm_tp/svm_ta

print(f'Precision: {round(prec, 2)}')
print(f'Recall: {round(reca, 2)}')

Precision: 0.01
Recall: 0.07
