### Deep Learning Supervised Text Classification
- BiLSTM

Bidirectional recurrent neural networks (RNN) put two independent RNNs together. This structure allows the networks to have both backward and forward information about the sequence at every time step


### Prepare Environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 3.1 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 61.4 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.21


In [3]:
# Importing required libraries
import pandas as pd
import re
import requests
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap
import requests
import json

In [4]:
# Install required nltk resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Load Data

In [5]:
# Pick a minority category of interest
focus_cat = 'hydraulic fluid or oil leak'
fn = '/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/hydraulic fluid or oil leak_577_out_df_temp.csv'
df = pd.read_csv(fn)
df = df[['text', 'category']]

# Convert category to binary 1/0
# Remeber that 1 is a rule-book hit & not necessarily the 'truth'
df['category'] = df['category'].apply(lambda x : x.replace("*** Not Classified", "0"))
df['category'] = df['category'].apply(lambda x : x.replace(focus_cat, "1"))
df['category'] = df['category'].astype(int)
df

Unnamed: 0,text,category
0,foreign body entered employee's (l) eye while ...,0
1,drainage pipe damaged at ~2.2 m depth. see sup...,0
2,robodrill spider excavator being operated when...,1
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,"other fall to lower level, unspecified an empl...",0
93854,injured by slipping or swinging object held by...,0
93855,"direct exposure to electricity, greater than 2...",0


### Data Prepatation

In [6]:
# Main denoising function
def denoise_text(txt):
    # Strip html if any.
    soup = BeautifulSoup(txt, "html.parser")
    txt = soup.get_text()

    # Replace contractions in string of text. For ex. didn't -> did not
    txt = contractions.fix(txt)
    return txt

In [7]:
# Test the denoising function
chk_text = "<p>she didn't tell me anything </br> about what's gonna <html> happen in the end"
denoise_text(chk_text)

'she did not tell me anything  about what is going to  happen in the end'

In [18]:
# Normalization may include everal step
# Each function below fulfills a (potential) step in normalization

STOP_SENTS = ['migrated from legacy cairs']

f = open('/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/data/us2gb.json')
US2GB = json.load(f)
f.close()

def remove_stop_sentences(in_doc):
    for i, sen in enumerate(STOP_SENTS):
        if i==0:
            rep_doc = in_doc
        else:
            rep_doc = out_doc
        out_doc = rep_doc.replace(f'{sen}', '') 
    out_doc = re.sub(' +', ' ', out_doc)    
    return(out_doc)

def americanize(in_doc):
    for american_spelling, british_spelling in US2GB.items(): 
        in_doc = re.sub(f'(?<![a-zA-Z]){british_spelling}(?![a-z-Z])', american_spelling, in_doc)
    return(in_doc)

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

# Define the normalization pipeline
# Comment out steps not used
def normalize_text(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    # TODO - Experiment with the below options
    # Inuitively, stopwords should not be removed as they may impact
    # semantic meaning.
    #words = remove_stopwords(words) 
    #words = stem_words(words)
    words = lemmatize_verbs(words)
    return words


In [9]:
# Let's test the individual normalization functions
print("Remove non_ascii: ", remove_non_ascii(['h', 'ॐ', '©', '1']))
print("To lowercase: ", to_lowercase(['HELLO', 'hiDDen', 'wanT', 'GOING']))
print("Remove punctuation: ", remove_punctuation(['hello!!', 'how?', 'done,']))
print("Replace numbers: ", replace_numbers(['1', '2', '3']))
print("Remove stopwords: ", remove_stopwords(['this', 'and', 'amazing', 'not', 'no', 'yes']))
print("Stem words: ", stem_words(['beautiful', 'flying', 'waited']))
print("Lemmatize verbs: ", lemmatize_verbs(['hidden', 'walking', 'ran']))

# Now apply the pipeline (on a test)
print("Normalize text: ", normalize_text(['hidden', 'in', 'the', 'ALMIRAH', 'he', 'WAited', '2', 'ॐ', 'hours!!']))

Remove non_ascii:  ['h', '', '', '1']
To lowercase:  ['hello', 'hidden', 'want', 'going']
Remove punctuation:  ['hello', 'how', 'done']
Replace numbers:  ['one', 'two', 'three']
Remove stopwords:  ['amazing', 'yes']
Stem words:  ['beauty', 'fly', 'wait']
Lemmatize verbs:  ['hide', 'walk', 'run']
Normalize text:  ['hide', 'in', 'the', 'almirah', 'he', 'wait', 'two', 'hours']


In [10]:
# Tokenize text into words
def simple_tokenize(text):
    return nltk.word_tokenize(text)

# Check the function
sample_text = 'he did not say anything  about what is going to  happen'
print("tokenize results :", simple_tokenize(sample_text))

tokenize results : ['he', 'did', 'not', 'say', 'anything', 'about', 'what', 'is', 'going', 'to', 'happen']


In [11]:
# Now create a simple function to denoise, normalize and apply simple tokeniser to text
def text_prepare(text):
    text = denoise_text(text)
    text = ' '.join([x for x in normalize_text(simple_tokenize(text))])
    return text

In [12]:
text_prepare('I am not gonna go to the shop')

'i be not go to go to the shop'

In [None]:
tqdm.pandas()
# Remove stop sentences & more than single spaces
print('Removing stop sentences...')
df['text'] = df['text'].progress_apply(lambda x: remove_stop_sentences(x))

# Convert all text to one dictionary (let's choose american english)
print('\nAmericanizing...')
df['text'] = df['text'].progress_apply(lambda x: americanize(x))

Removing stop sentences...


100%|██████████| 93857/93857 [00:02<00:00, 41512.07it/s]


Americanizing...


 95%|█████████▍| 88924/93857 [5:47:15<18:00,  4.56it/s]

In [27]:
# Now apply the main text prep pipeline to all text:
df['text'] = [text_prepare(x) for x in tqdm(df['text'])]

# Clean up on category encoding
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
print('Done!')
df.head()

100%|██████████| 93857/93857 [02:09<00:00, 723.01it/s] 


Done!


Unnamed: 0,text,category
0,foreign body enter employee s l eye while grin...,0
1,drainage pipe damage at twenty-two m depth see...,0
2,robodrill spider excavator be operate when hos...,1
3,pressure hose make contact with light fit whil...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


## Build Model

In [28]:
# Load modelling building libraries
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import logging
logging.basicConfig(level=logging.INFO)

In [29]:
# Create a function to prepare model input sequences and embedding dictionary
def prepare_model_input(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    all_text = np.concatenate((X_train, X_test), axis=0)
    all_text = np.array(all_text)

    # Fit tokeniser only on training text
    text = np.array(X_train)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)

    # Save the tokenizer as .pkl file
    pickle.dump(tokenizer, open(f'/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/{time.strftime("%y%m%d%H%M")}_text_tokenizer.pkl', 'wb'))

    # Generate sequences for all text using tokenizer created only on training text
    # This converts the sentence into a sequence of integers, e.g., [2, 4, 5, 7]
    sequences = tokenizer.texts_to_sequences(all_text)

    # Get a list of all words and their sequence numbers
    word_index = tokenizer.word_index

    # Pad out the sequences with zeroes to max sequence length
    all_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    # Seperate training sequences from test
    X_train_Glove = all_text[0:len(X_train), ]
    X_test_Glove = all_text[len(X_train):, ]

    # Now gather the embeddings
    # Start with standard GloVe
    ## https://www.google.com/search?client=safari&rls=en&q=glove+embeddings&ie=UTF-8&oe=UTF-8
    embeddings_dict = {}
    f = open("/content/drive/MyDrive/Colab_Notebooks/glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_dict[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_dict))
    return (X_train_Glove, X_test_Glove, word_index, embeddings_dict)


In [30]:
# Create a function that builds the deep learning model
def build_bilstm(word_index, embeddings_dict, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5, hidden_layer = 3, lstm_node = 32):
    # Initialize a sequential model
    model = Sequential()
    
    # Make the embedding matrix using the embedding_dict
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
            
    # Add embedding layer
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    # Add hidden layers 
    # Default will be 3 layers
    # Default lstm nodel number will be 32
    for i in range(0, hidden_layer):
        # Add a bidirectional lstm layer
        model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))

        # Add a dropout layer after each lstm layer
        model.add(Dropout(dropout))

    model.add(Bidirectional(LSTM(lstm_node, recurrent_dropout=0.2)))
    model.add(Dropout(dropout))

    # Add the fully connected layer with 256 nuerons & Relu activation
    model.add(Dense(256, activation='relu'))

    # Add the output layer with softmax activation (binary output)
    model.add(Dense(nclasses, activation='softmax'))

    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

In [31]:
# Now create the input to the model training stage
X = df.text
y = df.category # Remember, these are not necessarily the 'truth' but rule_book hits

# Apply a simple 80/20 split
# TODO - The dataset is heavily imbalanced. Treat this appropriately.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train, X_test)
print("Done!")

Preparing model input ...
Total 400000 word vectors.
Done!


In [32]:
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Building Model!
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           3129650   
                                                                 
 bidirectional (Bidirectiona  (None, 500, 64)          21248     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 500, 64)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 500, 64)          24832     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 500, 64)           0         
                                                                 
 bidirectional_2 (Bidirectio  (None, 500

In [33]:
# Quick view of rule_book hits (should represent the focus category)
X_test[y_test == 1].iloc[0]

'a skid steer hydraulic hose break cause no more than five gallons of hydraulic fluid to leak onto the grind at approximately one thousand, six hundred and fifteen on two million, one hundred and fifty-two thousand and twenty-two a logistics operator be transport a skid steer to the entrance of block 3c for relocation a hydraulic hose on the boom of the skid steer break cause less than five gallons of hydraulic fluid to leak onto the grind the leak be immediately notice and the machine be stop environmental swppp be notify and clean up procedures begin the actual severity and potential rat be select as an a1 due to minimal reversible environmental impact the equipment be shut down environmental be notify clean up procedures immediately take place all contaminate soil and diapers be pick up and dispose of into proper containments'

### Model Training & Evaluatrion

In [34]:
def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*(precision*recall))/(precision+recall)
    return {
        "mcc": mcc,
        "true positive": tp,
        "true negative": tn,
        "false positive": fp,
        "false negative": fn,
        "precision" : precision,
        "recall" : recall,
        "F1" : f1,
        "accuracy": (tp+tn)/(tp+tn+fp+fn)
    }

def compute_metrics(labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()


In [None]:
# Train the model
history = model.fit(X_train_Glove, y_train,
                              validation_data=(X_test_Glove, y_test),
                              epochs=5,
                              batch_size=128,
                              verbose=1)

Epoch 1/5
  8/587 [..............................] - ETA: 58:35 - loss: 0.3752 - accuracy: 0.9473

In [None]:
# Plot training history
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')

In [28]:
# Print accuracy measures
print("\n Evaluating Model ... \n")

predicted = model.predict(X_test_Glove)
predicted = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test, predicted))
print("\n")
logger = logging.getLogger("logger")
result = compute_metrics(y_test, predicted)
for key in (result.keys()):
    logger.info("  %s = %s", key, str(result[key]))


 Evaluating Model ... 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18660
           1       0.60      0.86      0.71       112

    accuracy                           1.00     18772
   macro avg       0.80      0.93      0.85     18772
weighted avg       1.00      1.00      1.00     18772





In [29]:
# Deepdive into the results / take a closer look...
tmp = pd.DataFrame(X_test, columns=['text'])
tmp['pred'] = predicted
tmp['rule_book'] = y_test

# Dump results out for quick inspection in Excel (if required/desired)
tmp.to_csv('/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/my_tmp_out.csv')

# Quick look at predicted positives
tmp[tmp['pred']==1]

Unnamed: 0,text,pred,rule_book
17588,hydraulic fluid leak to asphalt from hiab truc...,1,1
10656,hydraulic oil leak from coil tube vehicle stor...,1,1
9358,hyd hose failure result in 500ml of hyd oil re...,1,1
1513,a miniexcavator be travel on a plant road when...,1,1
19531,small hydraulic leak at b2200 from a crane at ...,1,1
...,...,...,...
2432,an spill of approximately twenty-three gallons...,1,0
23646,an estimate 100ml of oil leak from a boat moto...,1,1
7554,ee notice approx one cup of hydraulic fluid un...,1,1
5904,a crane operator be sit idle in a sixty ton re...,1,1


In [30]:
# Take a look at predicted positives that did not have a rule-book hit
table = tmp[(tmp['pred']==1) & (tmp['rule_book']==0)]
display(HTML(table.to_html()))
print(f'\nThere are {len(table)} entries (out of {sum(y_test==1)})...')

Unnamed: 0,text,pred,rule_book
11937,a small amount of engine oil approximately five litres spill onto the haul road from a moxy which suffer engine failure a small amount of engine oil approximately five litres spill onto the haul road from a moxy which suffer engine failure the spill be quickly contain use absorbent pad and sock as well as a large plant nappy once the moxy be remove from catchment to pad c29 a small amount of contaminate stone be remove from the haul road place in hazardous waste bag and remove from site along with use spill kitsthe break down moxy remain on pad c29 outside afton catchment until a new engine can be fit and a large plant nappy capacity up to 28l have be place underneath the belly plate the remain oil in the machine be contain within the engine and we be confident that no further spillage will occur the plant nappy will be inspect 22nd decemberno harm to the environment have occur as a result of this incident there be no watercourses close to where the incident occur or where the moxy be currently situate on c29 pad outside afton catchment the spill be quickly contain use absorbent pad and sock as well as a large plant nappy once the moxy be remove from catchment to pad c29 a small amount of contaminate stone be remove from the haul road place in hazardous waste bag and remove from site along with use spill kit,1,0
1035,"approximately 200ml of thinner spill over the concrete pavement migrate from legacy cairs incident 40170one of our technician be involve in the paint work for the fade fire extinguisher box near to column txe30 he be have one litre of red colour paint container with lid and an open top plastic container with about 200ml of thinner which be use to clean his paint brush and roller the thinner container be place on top of the concrete pavement at about one thousand, four hundred and ten hours due to heavy wind the thinner container flip over and cause the thinner to spill over the spillage be contain in the concrete pave area and do not escape to the nearby drain he then immediately clean the spillage the incident be immediately report to eastman hsse officer migrate from legacy cairs see detail description",1,0
14776,approximately twenty-three litres hydraulic oil spill onto grind from oil seal of terex cc28001 crane at the north side of pr platform while rais approximately twenty-three litres hydraulic oil spill onto grind from oil seal of terex cc28001 crane at the north side of pr platform while rais ata hsse be inform irt be call and the contaminate grind be clean with adsorbents and dispose to oily skip investigation be commence,1,0
12746,excess concrete and washout spill on pad follow pile cap concrete pour excess concrete and washout spill on pad follow pile cap concrete pour concrete washwater have not be contain within an icb as per environmental management plan and have spill onto pad there have be negligible impact on the environment and no impact on nearby watercourses which be monitor daily dry excess concrete and washout have be scrap up and use as backfill around the pile cap foundation all foundation team will receive a tbt on appropriate handle of concrete and disposal of washwaters their potential impact on the environment if not handle correctly and of the particularly sensitive nature of the afton catchment,1,0
9400,"glycol line strike and release wood s subcontractor aardvark drill be complete a geotechnical borehole investigation on site aardvark be in theprocess of drill borehole bh1905 target depth of fifteen mbgs which be locate on the northern west platform of therutherford go station during augur they encounter a glycol line within six inch below grind surface the glycol line isroughly one fourteen inch thick and comprise of a rubber material upon encounter the glycol line they immediately stop augur deploy the spill kit along the fence line to the west and place bentonite clay base absorbant material on the liquid the spill occur at approximately 1058am and it be immediately contain use spill sock and bentonite hole plug by1105am this effectively contain and stop any glycol from run off to adjacent surface the valves be shut off toprevent any further release of the glycol liquid from the line a shop vac be use to vacuum and clean any residual liquid onthe concrete surface both the glycol liquid and the bentonite be place into fifty-five gal sealable steel drum for offsite disposalan estimate seventy-five l of the hydrualic fluid be recover and contain in steel drum and the remain twenty-five l be absorb bythe bentonite the cleanup efforts be complete by one thousand, one hundred and fifty be a visible liquid wet stain can be observe but there be noremaining fluids on the platform surface edit be in the process of secure a subcontractor to repair the glycol line",1,0
6806,two thousand and thirty litres of hydraulic oil spill onto concrete pad from twenty-five tonne franna crane when extend boom from hose burst spill contain and immediately clean up use spill kit spoil no environmental impact migrate from legacy cairs incident 30250primary sedimentation tank louvre removal project2030 litres of hydraulic oil spill onto concrete pad from twenty-five tonne franna crane p d rig when extend boom from hose burst spill contain and immediately clean up use spill kit spoilhydraulics technician call to site to repair hose no environmental impact from spill release incl stormwaterimpact on florafauna no issue with the hydraulics be identify during prestart check no load be be lift at the time migrate from legacy cairs see detail description,1,0
12309,cow infringement during commission of vru compressor control of work infringement during dynamic commission of the vru compressor a wcc be issue for dynamic commission under sanction to test include the lube oil system it be discover that a gearbox cover have be remove while the lube oil pump be energise which be not part of the work scope and not mention on the permit all dynamic commission work on the vru compressor stop the stt have be remove and equipment reisolated,1,0
12153,minor oil spillage oil spillage approximately one litre due to disconnect hose on rubber duck due to faulty bolt approximately one litre of oil spill onto layby absorbent pad utilise and spillage clean up immediately no spillage of oil beyond the layby fitter repair the hose promptly,1,0
6696,wgpsn work party be task with replace a pipe spool within a crude oil line a small amount of oil be expect in the line but when the flanges be split a large volume of crude oil spill from the line migrate from legacy cairs incident 30370on tuesday 10th may wgpsn personnel be task with replace a pipe spool within a crude oil line at f201 t201 with use of a mobile craneas ithad not be possible to fully process release the pipe line the use of temporary bunding and a veolia environmental service vacuum tanker be in place to capture any product still within the linewhen the flanges be split and the pipework partially lift it become clear that large volumes of thick crude oil be still in the line the two temporary bunds quickly begin fill up the vacuum tanker be unable to cope with the amount of crude spill from the line and a replacement tanker be order migrate from legacy cairs see detail description,1,0
7499,leak oxygen hose find in workshop while prepare to use oxy butane weld gear migrate from legacy cairs incident 28554while set up the oxy butane cut gear a hiss noise be hear the operative use leak detection spray to establish if the hose be leak and two leak be find the equipment be isolate and the hose ventedoperative barriered off the area and report the fault to the bp workshop manager the equipment be operate and maintain by bpnear miss report to asset hse who be carry out an investigation migrate from legacy cairs see detail description,1,0



There are 64 entries (out of 112)...


In [31]:
# Take a look at rule-book hits that did not have postive model hit...
table = tmp[(tmp['pred']==0) & (tmp['rule_book']==1)]
display(HTML(table.to_html()))
table_count = len(table)
ruley_count = sum(y_test==1)
perc_calced = round(100*(table_count / ruley_count), 1)
print(f'\nThere are {table_count } entries (out of {ruley_count}, {perc_calced}%)...')

Unnamed: 0,text,pred,rule_book
1774,a tie operative work in the sonar cab on boat five suffer a foreign body enter his eye after the client open a oil line directly above where he be work and the oil contain within the line drop down onto his head and into his eye migrate from legacy cairs incident 38436a wood group tie operative have be task with insulate the pipework within the sonar cab of boat five within the ddh build at barrowinfurness the pipework that be to be insulate be low to the floor which mean the tie operative have to lie down on the floor to install the laggingas the operative be lie on the floor a bae operative enter the cab and ask the ip if he would mind if he step over him the ip agree the bae operative then proceed to open an oil line that be sit directly above the ip as he open the line lube oil which also contain metal fill escape from the line and drip down onto the ip s head at the same time also bounce off the floor and this be when the liquid enter the ip s eyethe ip felt discomfort straight away and go to the level two eyewash station and wash his eye out immediately after wash his eye the ip report to sick bay as he still felt something in his eye the sick bay medics wash through his eye again and use a magnetas they suspect metal fill have enter his eye after the wash and magnet the ip report that he do not feel anything in his eye anymorethe ip be advise to stay away from dusty areas until his eye felt betterinvestigation be ongoing with both wg and the client migrate from legacy cairs see detail description,0,1
81643,fall onto or against object on same level nec an employee be function test the wench line for leak slip on the oil rig floor and fell onto a sheave amputate two finger on his leave hand the sheave be partially guard at the time,0,1
78010,catch in or compress by equipment or object unspecified an employee be check the hydraulic line of a skid steer for leak when two of their finger become catch in the hydraulics and be smash one finger have to be amputate to the nail,0,1
61015,strike by swing or slip object other than handheld nec an employee be adjust the wire length set on the machine when the hydraulic hose on the machine fail at the crimp connection cause the hose to whip and strike the employee s lower leg the employee sustain lacerations to the leg and a break ankle,0,1
39597,explosion or fire on water vehicle an employee be operate a crane to load grain onto an oceangoing ship when a hydraulic hose fail and cause a fire after contact a muffler the employee exit the cab of the crane and land on a catwalk break the right leg,0,1
2316,residual hydraulic oil from underneath the vehicle s engine drip onto the grind during heavy rainfall the oily sheen be clean up and do not enter any water course migrate from legacy cairs incident 37883the she will own pick up truck be drive by wood secondee staff the vehicle be park on the road between ler one and the process area during heavy rainfall other wood personnel notice an oily sheen on the grind and contact the she will area tech to alert them of a spill the vehicle have recently be for service and it be suspect that residual oil remain on the splash guard which be spread by the heavy rain fallthe area be cordoned off combine purpose spill mat be use to clean up the spill but be not effective bioversal be use to treat the spill and it be purposefully wash into the vdrain in a control manner where it be collect by the waste contractor at the time there be heavy rainfall and this be the best option to ensure the oily water do not enter any drain or water course migrate from legacy cairs see detail description,0,1
64639,fall on same level due to slip an employee be level out a mold when the employee slip on hydraulic fluid and fell on a press bolt injure the upper right arm and require hospitalization,0,1
14309,lube oil spill due to disturb flange not torqued up less than ten litres release to bunded deck area no loss to sea this incident occur on an enermec workscope where amecfw be assist with bolt torqueing we have record this incident for the purpose of capture and share lessons learnedon 8th oct two thousand and sixteen an enermec technician be part of a work party change mesh screen on the lube oil system in preparation for a screen flush the screen to be change have be identify and the work party progress with the change of screen ready for the flush the follow day as the technician be attempt to remove a screen he encounter diffculty he disturb the closest flange on the same line upstream to allow more movement of the flange to free the screen it be that stage the technician do not fully communicate this action to all members of the amecfw work party he also do not follow the procedure in place to place a tag on the flange he disturb to assist with the removal of the screen to indicate it require torquing this would of allow the oncoming amecfw team to identify this flange and torque to the correct value assumptions be make from enermech that all flanges would be torqued however only flanges identify with the correct tag be torqued incident be under investigation,0,1
5111,"a leak be find at crude oil storage tank a cost a during routine area check at 945am on the 19th december two thousand and sixteen leak rate estimate at approximately 50litresminute migrate from legacy cairs incident 31963the leak be find during routine area check at 945am on the 19th december two thousand and sixteen the last area check have be carry out at 500am on the 19th december and nothing untoward be foundthe latest estimate of flow be approximately 50l per minute analysis carry out on this fluid suggest that the release be that of the crude that be store in cost a this be currently flow to the gully around the base of the cost a and be capture within the contaminate drain system this be rout to ballast tank a until require equipment be available to circulate the release oil back to cost athe tank manufacturers have be contact to ascertain the likely source of the leak which could be either the tank floor or the annular ring on report this to the health and safety executive it be relay that a similar release in another location result in a catastrophic failure of the tank on receive information from the tank manufacturer it will be better understand the nature of the failure of the tank and should additional pollution response be require wood group shall inform the necessary external respondersarrangements be currently be make to have the tank content removedupdate discharge of cost a complete on three hundred and eleven thousand, two hundred and sixteen to tanker migrate from legacy cairs see detail description",0,1
29212,ignition of vapors gas or liquids an employee suffer a skin burn when hydraulic fluid catch on fire and land on his forearm during a weld process,0,1



There are 16 entries (out of 112, 14.3%)...


### Inference Checks

In [32]:
#text = ['fall lower level less six feet employee miss step fell stationary semitruck injure back']
#text = ["mini excavator develop small hydraulic leak remove pavement fluid contract concrete pavement area clean client notify migrate legacy cairs incident 34326on monday july thirty-one two thousand and seventeen celanese plant clear lake brazos e employee operate mini excavator develop small hydraulic leak remove pavement fluid contract concrete pavement area clean client notify migrate legacy cairs see detail description"]
#text = ['contact hot object substances employee transport hot drip fluid use cook oil container fluid contact employee cause first second degree burn']
#text = ['strike discharge object substance lineman work aerial lift leak break hydraulic line inject fluid hand hospitalize']
text_raw = 'dozer along right way small leak hose observe dozer right way small leak hydraulic hoseno contamination grind oil dozer notify operator damage hose contact strathclyde send fitter site repair'
text_raw = 'I was walking down the yard and I lost my footing and tripped and broke my leg'
text_raw = 'Employee noticed oil fluid leaking from hydraulic line'

In [28]:
#with open('/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/text_tokenizer.pkl', 'rb') as f:
#    tokenizer = pickle.load(f)

In [33]:
# Create simple function for running inference on user input text
def inference_run():
    text_raw = input('Enter text:')
    # TODO - Apply text prep steps to user input steps
    text = [text_raw]
    text = np.array(text)
    text_all_tmp = np.concatenate((X_train, X_test), axis=0)
    text_tmp = np.array(X_train)
    tokenizer = Tokenizer(num_words=75000)
    tokenizer.fit_on_texts(text_tmp)

    sequences = tokenizer.texts_to_sequences(text)
    X_test_Glove_s = pad_sequences(sequences, maxlen=500)
    check_class = np.argmax(model.predict(X_test_Glove_s), axis=1)[0]
    if check_class == 1:
        # Use text wrap to avoid a paragraph of text printing as single line in output window
        lines = textwrap.wrap(text_raw, 80, break_long_words=False)
        for i, l in enumerate(lines):
            if i == 0:
                print(f'\n"{l}')
            elif i == len(lines)-1:
                print(f'{l}"')
            else:
                print(l)
        print('\nPredicted: ', focus_cat)
    else:
        print('\nNot classified...') 

In [35]:
inference_run()

Enter text:strike discharge object substance lineman work aerial lift leak break hydraulic line inject fluid hand hospitalize

"strike discharge object substance lineman work aerial lift leak break hydraulic
line inject fluid hand hospitalize"

Predicted:  hydraulic fluid or oil leak


In [36]:
# Save the model
filename = f'/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/{time.strftime("%y%m%d%H%M")}_model.pkl'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

