### Deep Learning Supervised Text Classification
- BiLSTM

Bidirectional recurrent neural networks (RNN) put two independent RNNs together. This structure allows the networks to have both backward and forward information about the sequence at every time step


### Prepare Environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 16.2 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 49.0 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.21


In [3]:
# Importing required libraries
import pandas as pd
import re
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap

In [4]:
# Install required nltk resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Load Data

In [5]:
# Pick a minority category of interest
focus_cat = 'hydraulic fluid or oil leak'
fn = '/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/hydraulic fluid or oil leak_577_out_df_temp.csv'
df = pd.read_csv(fn)
df = df[['text', 'category']]

# Convert category to binary 1/0
# Remeber that 1 is a rule-book hit & not necessarily the 'truth'
df['category'] = df['category'].apply(lambda x : x.replace("*** Not Classified", "0"))
df['category'] = df['category'].apply(lambda x : x.replace(focus_cat, "1"))
df['category'] = df['category'].astype(int)
df

Unnamed: 0,text,category
0,foreign body entered employee's (l) eye while ...,0
1,drainage pipe damaged at ~2.2 m depth. see sup...,0
2,robodrill spider excavator being operated when...,1
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,"other fall to lower level, unspecified an empl...",0
93854,injured by slipping or swinging object held by...,0
93855,"direct exposure to electricity, greater than 2...",0


### Data Prepatation

In [6]:
# Main denoising function
def denoise_text(txt):
    # Strip html if any.
    soup = BeautifulSoup(txt, "html.parser")
    txt = soup.get_text()

    # Replace contractions in string of text. For ex. didn't -> did not
    txt = contractions.fix(txt)
    return txt

In [7]:
# Test the denoising function
chk_text = "<p>she didn't tell me anything </br> about what's gonna <html> happen in the end"
denoise_text(chk_text)

'she did not tell me anything  about what is going to  happen in the end'

In [11]:
# Normalization may include everal step
# Each function below fulfills a (potential) step in normalization

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

# Define the normalization pipeline
# Comment out steps not used
def normalize_text(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    # TODO - Experiment with the below options
    # Inuitively, stopwords should not be removed as they may impact
    # semantic meaning.
    #words = remove_stopwords(words) 
    #words = stem_words(words)
    words = lemmatize_verbs(words)
    return words


In [12]:
# Let's test the individual normalization functions
print("Remove non_ascii: ", remove_non_ascii(['h', 'ॐ', '©', '1']))
print("To lowercase: ", to_lowercase(['HELLO', 'hiDDen', 'wanT', 'GOING']))
print("Remove punctuation: ", remove_punctuation(['hello!!', 'how?', 'done,']))
print("Replace numbers: ", replace_numbers(['1', '2', '3']))
print("Remove stopwords: ", remove_stopwords(['this', 'and', 'amazing', 'not', 'no', 'yes']))
print("Stem words: ", stem_words(['beautiful', 'flying', 'waited']))
print("Lemmatize verbs: ", lemmatize_verbs(['hidden', 'walking', 'ran']))

# Now apply the pipeline (on a test)
print("Normalize text: ", normalize_text(['hidden', 'in', 'the', 'ALMIRAH', 'he', 'WAited', '2', 'ॐ', 'hours!!']))

Remove non_ascii:  ['h', '', '', '1']
To lowercase:  ['hello', 'hidden', 'want', 'going']
Remove punctuation:  ['hello', 'how', 'done']
Replace numbers:  ['one', 'two', 'three']
Remove stopwords:  ['amazing', 'yes']
Stem words:  ['beauty', 'fly', 'wait']
Lemmatize verbs:  ['hide', 'walk', 'run']
Normalize text:  ['hide', 'in', 'the', 'almirah', 'he', 'wait', 'two', 'hours']


In [10]:
# Tokenize text into words
def simple_tokenize(text):
    return nltk.word_tokenize(text)

# Check the function
sample_text = 'he did not say anything  about what is going to  happen'
print("tokenize results :", simple_tokenize(sample_text))

tokenize results : ['he', 'did', 'not', 'say', 'anything', 'about', 'what', 'is', 'going', 'to', 'happen']


In [14]:
# Now create a simple function to denoise, normalize and apply simple tokeniser to text
def text_prepare(text):
    text = denoise_text(text)
    text = ' '.join([x for x in normalize_text(simple_tokenize(text))])
    return text

In [16]:
text_prepare('I am not gonna go to the shop')

'i be not go to go to the shop'

In [13]:
# Apply to all text:
df['text'] = [text_prepare(x) for x in tqdm(df['text'])]
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
df.head()

100%|██████████| 93857/93857 [02:02<00:00, 768.63it/s] 


Unnamed: 0,text,category
0,foreign body enter employee s l eye while grin...,0
1,drainage pipe damage at twenty-two m depth see...,0
2,robodrill spider excavator be operate when hos...,1
3,pressure hose make contact with light fit whil...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


## Build Model

In [17]:
# Load modelling building libraries
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import logging
logging.basicConfig(level=logging.INFO)

In [18]:
# Create a function to prepare model input sequences and embedding dictionary
def prepare_model_input(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    all_text = np.concatenate((X_train, X_test), axis=0)
    all_text = np.array(all_text)

    # Fit tokeniser only on training text
    text = np.array(X_train)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)

    # Save the tokenizer as .pkl file
    pickle.dump(tokenizer, open(f'/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/{time.strftime("%y%m%d%H%M")}_text_tokenizer.pkl', 'wb'))

    # Generate sequences for all text using tokenizer created only on training text
    # This converts the sentence into a sequence of integers, e.g., [2, 4, 5, 7]
    sequences = tokenizer.texts_to_sequences(all_text)

    # Get a list of all words and their sequence numbers
    word_index = tokenizer.word_index

    # Pad out the sequences with zeroes to max sequence length
    all_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    # Seperate training sequences from test
    X_train_Glove = all_text[0:len(X_train), ]
    X_test_Glove = all_text[len(X_train):, ]

    # Now gather the embeddings
    # Start with standard GloVe
    ## https://www.google.com/search?client=safari&rls=en&q=glove+embeddings&ie=UTF-8&oe=UTF-8
    embeddings_dict = {}
    f = open("/content/drive/MyDrive/Colab_Notebooks/glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_dict[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_dict))
    return (X_train_Glove, X_test_Glove, word_index, embeddings_dict)


In [19]:
# Create a function that builds the deep learning model
def build_bilstm(word_index, embeddings_dict, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5, hidden_layer = 3, lstm_node = 32):
    # Initialize a sequential model
    model = Sequential()
    
    # Make the embedding matrix using the embedding_dict
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
            
    # Add embedding layer
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    # Add hidden layers 
    # Default will be 3 layers
    # Default lstm nodel number will be 32
    for i in range(0, hidden_layer):
        # Add a bidirectional lstm layer
        model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))

        # Add a dropout layer after each lstm layer
        model.add(Dropout(dropout))

    model.add(Bidirectional(LSTM(lstm_node, recurrent_dropout=0.2)))
    model.add(Dropout(dropout))

    # Add the fully connected layer with 256 nuerons & Relu activation
    model.add(Dense(256, activation='relu'))

    # Add the output layer with softmax activation (binary output)
    model.add(Dense(nclasses, activation='softmax'))

    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

In [20]:
# Now create the input to the model training stage
X = df.text
y = df.category # Remember, these are not necessarily the 'truth' but rule_book hits

# Apply a simple 80/20 split
# TODO - The dataset is heavily imbalanced. Treat this appropriately.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train, X_test)
print("Done!")

Preparing model input ...
Total 400000 word vectors.
Done!


In [21]:
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Building Model!
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           3134700   
                                                                 
 bidirectional (Bidirectiona  (None, 500, 64)          21248     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 500, 64)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 500, 64)          24832     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 500, 64)           0         
                                                                 
 bidirectional_2 (Bidirectio  (None, 500

In [24]:
# Quick view of rule_book hits (should represent the focus category)
X_test[y_test == 1].iloc[0]

'hydraulic fluid leak to asphalt from hiab truck contractor hesl hiab truck deliver and unload container for runway chip seal work have be leak hydraulic fluid from ram hosesoperator and assistant know of the leak and place spill absorbent pad on the vehicle and under the vehicle on the asphalt they also place a spill tray under one ram some over splash occur adjacent the tray in addition as the truck leave location remnant fluid drip onto the asphalt for approximately 3lm obvious fluid clean with spill material and sand place over the areathe follow morning absorbent material spread along the drip area material bag and dispose of in designate bin for removal from site approximately 50ml to asphalt'

### Model Training & Evaluatrion

In [25]:
def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*(precision*recall))/(precision+recall)
    return {
        "mcc": mcc,
        "true positive": tp,
        "true negative": tn,
        "false positive": fp,
        "false negative": fn,
        "precision" : precision,
        "recall" : recall,
        "F1" : f1,
        "accuracy": (tp+tn)/(tp+tn+fp+fn)
    }

def compute_metrics(labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()


In [None]:
# Train the model
history = model.fit(X_train_Glove, y_train,
                              validation_data=(X_test_Glove, y_test),
                              epochs=5,
                              batch_size=128,
                              verbose=1)

Epoch 1/5
 37/587 [>.............................] - ETA: 47:45 - loss: 0.1347 - accuracy: 0.9673

In [None]:
# Plot training history
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')

In [None]:
# Print accuracy measures
print("\n Evaluating Model ... \n")

predicted = model.predict(X_test_Glove)
predicted = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test, predicted))
print("\n")
logger = logging.getLogger("logger")
result = compute_metrics(y_test, predicted)
for key in (result.keys()):
    logger.info("  %s = %s", key, str(result[key]))

In [29]:
# Deepdive into the results / take a closer look...
tmp = pd.DataFrame(X_test, columns=['text'])
tmp['pred'] = predicted
tmp['rule_book'] = y_test

# Dump results out for quick inspection in Excel (if required/desired)
tmp.to_csv('/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/my_tmp_out.csv')

# Quick look at predicted positives
tmp[tmp['pred']==1]

Unnamed: 0,text,pred,rule_book
4168,three hundred and thirty-six excavator pp104 e...,1,1
2971,third party own operate crane small hydraulic ...,1,1
8397,hydraulic hose component part failure minor re...,1,1
20952,spill grade 200ml hydraulic fluid kp70 load pr...,1,1
24446,hydraulic fluid leak soil 50ml approx fail hos...,1,1
...,...,...,...
11716,twenty twenty tons franna hydraulic oil leak f...,1,1
3633,metso crusher experience hydraulic fluid leak ...,1,1
8551,wheel excavator develop hydrauilic hose failur...,1,1
85984,strike discharge object substance employee tro...,1,1


In [43]:
# Take a look at predicted positives that did not have a rule-book hit
table = tmp[(tmp['pred']==1) & (tmp['rule_book']==0)]
display(HTML(table.to_html()))
print(f'\nThere are {len(table)} entries (out of {sum(y_test==1)})...')

Unnamed: 0,text,pred,rule_book
4566,begin unload crane delivery trailer pin hole leak develop brake line brake fluid leak pavement leak fluid contain clean immediately properly report client personnel migrate legacy cairs incident 33541on tuesday march twenty-one two thousand and seventeen four hundred pm infinity construction crew begin unload crane delivery trailer pin hole leak develop brake line brake fluid leak pavement leak fluid contain clean immediately properly report client personnel migrate legacy cairs see detail description,1,0
3311,hydraulic breaker attach mini excavator hydraulic line failure fluid lose approx twelve cup onto grind plastic place equipment fluid clean dispose properly migrate legacy cairs incident 35835on thursday october twenty-six two thousand and seventeen one hundred and fifty pm celanese plant clear lake hydraulic breaker attach mini excavator hydraulic line failure fluid lose approx twelve cup onto grind plastic place equipment fluid clean dispose properly migrate legacy cairs see detail description,1,0
14462,dozer along right way small leak hose observe dozer right way small leak hydraulic hoseno contamination grind oil dozer notify operator damage hose contact strathclyde send fitter site repair,1,0
26460,"cp20220024 env nanjv 90l hydraulic oil release one thousand, two hundred shovel one thousand, two hundred shovel load material oil line swing motor fail release 90l hydraulic oil work stop supervisor contact spill pad trays place grind cleanup initiate",1,0
13242,small bore tube fit make correctly put service hydraulic supply line wing valve associate bb18 b75 develop leak local tree valves fusible plug upstream wing valve connection investigation operations personnel two fittings find make correctly per parker alok guidelines 4part sign process hookup complete mc one documentation provide geg qedi state construct leak test accordance ans ti066 ansti019 ie ready servicevery minor amount hydraulic oil spill qedi receive little information regard incident reason report late apache originally say go investigate later decide go report investigate qedi offer assistance require qedi operations manger hse advisor decide record learn purpose qedi provide apache qedi learn relevant,1,0
14421,rear dump tractor trailer tip offload process offload classfive aggregate reinforce crane stand rear dump tractor trailer approximately fifty elevate preparation dump load driver claim felt right rear trailer give way driver attempt dump load tip side could finish trailer break away truck design truck remain place hydraulic cylinder release approximately four gallons hydraulic oil contain remediate work stop assess injuries spill contain remediate use spill kit soil overexcavated place overpack drum,1,0
12427,oil spillage occur due failure hydraulic hose concrete pour slab second floor mb307 concrete pump hydraulic oil spillage cause rupture one hose jvsl apply emergency procedure call fire brigade spillage contain oil absorber waste correctly dispose surface clean reinstate previous status,1,0
32,"kp fifteen row 200ml hyd fluid grade 30t excavator hydraulic spill occur approximately one thousand, one hundred and ten nacap excavator p1986 kp fifteen 200ml grade spill kit locate siteall contaminate materials absorbent pad dispose appropriately contaminate waste bag",1,0



There are 8 entries (out of 127)...


In [47]:
# Take a look at rule-book hits that did not have postive model hit...
table = tmp[(tmp['pred']==0) & (tmp['rule_book']==1)]
display(HTML(table.to_html()))
table_count = len(table)
ruley_count = sum(y_test==1)
perc_calced = round(100*(table_count / ruley_count), 1)
print(f'\nThere are {table_count } entries (out of {ruley_count}, {perc_calced}%)...')

Unnamed: 0,text,pred,rule_book
4373,cat three hundred and thirty-six excavator pp104 experience hydraulic leak grade operations operator immediately shut machine spill response equipmentteam location assist migrate legacy cairs incident 33737while three hundred and thirty-six cat excavator operate kp106 side cast location construction right way operator lose hydraulic power immediately shut machine notify supervisor mobilise spill kit equipment per sop spill contain contaminate soil place bag appropriate environmental disposal wg coordinator attend location conduct initial investigation maintenance logistics team inform send mechanic assess status excavator migrate legacy cairs see detail description,0,1
24575,diesel leak soil approximately 50ml filler cap diesel spill soil approx 50ml spider excavator two diesel filler cap open diesel flow leak contain contaminate material remove disposal filler cap secure,0,1
78010,catch compress equipment object unspecified employee check hydraulic line skid steer leak two finger become catch hydraulics smash one finger amputate nail,0,1
2370,hydraulic hose fit clients crane leak one gallon onto grind replacement hose spill report clean per client protocols migrate legacy cairs incident 37857the crane elevator group call replace leak hose clients crane hose replace crane start test hose hose pressurize fit another hose part system fail cause leak spill report clean per clients protocols migrate legacy cairs see detail description,0,1
9074,approximately fifty-six gallons oil release facility flare boom facility shut monthly compliance test operations initiate monthly esd check sdv make sure close time time control room operator notice slop tank vent scrubber level reach high level slop tank vent scrubber pump could enable oilywater mixture seep vent boom pump enable level return normal,0,1
73780,strike discharge object substance employee use new retraction tool disassemble air cycle machine unit process remove retain nut retraction tool thread shaft adapter break cause bubble burst hydraulic fluid rupture approximately six thousand psi fluid cutlacerated employee leave hand middle forefingers hospitalize require surgery,0,1
87314,catch compress equipment object unspecified employee work vertical lathe machine oil leak employee use finger wipe oil finger get catch hydraulic position switch pin result amputation fingertip nail,0,1
4077,d6 dozer reverse spotter notice trail oil leak grade five litres engine oil go grade migrate legacy cairs incident 34030d6 dozer reverse spotter notice trail oil leak grade five litres engine oil go grade operations time level stone stock pile area adjacent wpa angorethe operator notify shut machine catch pan place leak prevent release gradethe mechanics inform assess d6 dozerthe area question clean contaminate material place disposable bag appropriate environmental disposalupon inspection identify engine sump plug dislodge hit stonemechanics attend scene repair dozer return work shop inspection migrate legacy cairs see detail description,0,1
1774,tie operative work sonar cab boat five suffer foreign body enter eye client open oil line directly work oil contain within line drop onto head eye migrate legacy cairs incident 38436a wood group tie operative task insulate pipework within sonar cab boat five within ddh build barrowinfurness pipework insulate low floor mean tie operative lie floor install laggingas operative lie floor bae operative enter cab ask ip would mind step ip agree bae operative proceed open oil line sit directly ip open line lube oil also contain metal fill escape line drip onto ip head time also bounce floor liquid enter ip eyethe ip felt discomfort straight away go level two eyewash station wash eye immediately wash eye ip report sick bay still felt something eye sick bay medics wash eye use magnetas suspect metal fill enter eye wash magnet ip report feel anything eye anymorethe ip advise stay away dusty areas eye felt betterinvestigation ongoing wg client migrate legacy cairs see detail description,0,1
21325,non preventable company vehicle minor damage injury site sterling solar project forty mile north lake havasu city document soil classification leave one locations rear differential truck hit piece rock start leak oil able fix damage spot oil leakage observe since contact donlen receive direction report incident supervisor contact donlen receive instructions get vehicle repair,0,1



There are 51 entries (out of 127, 40.2%)...


### Inference Checks

In [128]:
#text = ['fall lower level less six feet employee miss step fell stationary semitruck injure back']
#text = ["mini excavator develop small hydraulic leak remove pavement fluid contract concrete pavement area clean client notify migrate legacy cairs incident 34326on monday july thirty-one two thousand and seventeen celanese plant clear lake brazos e employee operate mini excavator develop small hydraulic leak remove pavement fluid contract concrete pavement area clean client notify migrate legacy cairs see detail description"]
#text = ['contact hot object substances employee transport hot drip fluid use cook oil container fluid contact employee cause first second degree burn']
#text = ['strike discharge object substance lineman work aerial lift leak break hydraulic line inject fluid hand hospitalize']
text_raw = 'dozer along right way small leak hose observe dozer right way small leak hydraulic hoseno contamination grind oil dozer notify operator damage hose contact strathclyde send fitter site repair'
text_raw = 'I was walking down the yard and I lost my footing and tripped and broke my leg'
text_raw = 'Employee noticed oil fluid leaking from hydraulic line'

In [28]:
#with open('/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/text_tokenizer.pkl', 'rb') as f:
#    tokenizer = pickle.load(f)

In [131]:
# Create simple function for running inference on user input text
def inference_run():
    text_raw = input('Enter text:')
    # TODO - Apply text prep steps to user input steps
    text = [text_raw]
    text = np.array(text)
    text_all_tmp = np.concatenate((X_train, X_test), axis=0)
    text_tmp = np.array(X_train)
    tokenizer = Tokenizer(num_words=75000)
    tokenizer.fit_on_texts(text_tmp)

    sequences = tokenizer.texts_to_sequences(text)
    X_test_Glove_s = pad_sequences(sequences, maxlen=500)
    check_class = np.argmax(model.predict(X_test_Glove_s), axis=1)[0]
    if check_class == 1:
        # Use text wrap to avoid a paragraph of text printing as single line in output window
        lines = textwrap.wrap(text_raw, 80, break_long_words=False)
        for i, l in enumerate(lines):
            if i == 0:
                print(f'\n"{l}')
            elif i == len(lines)-1:
                print(f'{l}"')
            else:
                print(l)
        print('\nPredicted: ', focus_cat)
    else:
        print('\nNot classified...') 

In [137]:
inference_run()

Enter text:Employee noticed oil fluid leaking from hydraulic line

"Employee noticed oil fluid leaking from hydraulic line

Predicted:  hydraulic fluid or oil leak


In [55]:
# Save the model
filename = f'/content/drive/MyDrive/Colab_Notebooks/safety_report_tc/output/{time.strftime("%y%m%d%H%M")}_model.pkl'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

