In [2]:
# Set random seeds so that stochastic steps are repeatable between runs.
from IPython.display import display, HTML
from numpy.random import seed
# Seed NumPy's global random number generator.
seed(156)
import tensorflow as tf
# Seed TensorFlow's global random number generator.
tf.random.set_seed(256)

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Make the project folder on the mounted Drive the working directory; the
# relative paths used below (01_data/, 08_output/) are resolved against it.
%cd /content/drive/MyDrive/Colab_Notebooks/safety_report_tc

/content/drive/MyDrive/Colab_Notebooks/safety_report_tc


In [4]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 34.2 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 57.1 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24


In [5]:
# Importing required libraries
import pandas as pd
import re
import requests
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap
import requests
import json

In [6]:
# Install required nltk resources:
#   stopwords          -> stop-word list used by remove_stopwords
#   wordnet / omw-1.4  -> WordNet data used by the WordNetLemmatizer
#   punkt              -> tokenizer models used by nltk.word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Minority categories of interest (base data from source).
# `focus_cats` and `fns` are parallel lists: fns[i] is the stem of the base-data
# CSV (read from 08_output/) that belongs to focus_cats[i].
focus_cats = [
    'hydraulic fluid or oil leak',
    'line strike'
    ]

fns = [
    'hydraulic fluid or oil leak_577_out_df_temp',
    'line strike_358_out_df_temp'
]

# File stem of the fabricated (decoder-generated) data file.
# This is used at the end for denoising & normalising the fabricated data before
# use in TC modelling.
# NOTE(review): unlike `fns`, this is a single string rather than a list with one
# entry per category, so indexing it returns a single character.
fnfs = 'hydraulic fluid or oil leak_7920'

In [10]:
# Select a category to load
pick = input('Pick a category: ').strip()
if pick not in focus_cats:
    # Fail with an explicit message instead of list.index()'s generic error.
    raise ValueError(f'Unknown category {pick!r}; choose one of {focus_cats}')
index = focus_cats.index(pick)
focus_cat = focus_cats[index]
fn = fns[index]
# `fnfs` may be a single file stem (str) or a per-category list of stems.
# Indexing a plain string would return one character, not a file stem.
fnf = fnfs if isinstance(fnfs, str) else fnfs[index]

# Load the base data and keep only the columns used below
df = pd.read_csv(f'08_output/{fn}.csv')
df = df[['text', 'category']].copy()

# Convert category to binary 1/0
# Remember that 1 is a rule-book hit & not necessarily the 'truth'
# (regex=False: "*** Not Classified" contains regex metacharacters and must be
# matched literally.)
df['category'] = (
    df['category']
    .str.replace("*** Not Classified", "0", regex=False)
    .str.replace(focus_cat, "1", regex=False)
    .astype(int)
)
df

Pick a category: line strike


Unnamed: 0,text,category
0,foreign body entered employee's (l) eye while ...,0
1,drainage pipe damaged at ~2.2 m depth. see sup...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,"other fall to lower level, unspecified an empl...",0
93854,injured by slipping or swinging object held by...,0
93855,"direct exposure to electricity, greater than 2...",0


Inspired by
https://github.com/pashupati98/text-classification

## Data Preparation

In [11]:
# Boilerplate sentences that are stripped from every report by
# remove_stop_sentences().
STOP_SENTS = ['migrated from legacy cairs', 'migrated from cairs']

# Spelling map consumed by americanize(), which treats each key as the American
# spelling and each value as the British spelling.
# The context manager guarantees the file is closed even if parsing fails.
with open('01_data/us2gb.json', encoding='utf-8') as f:
    US2GB = json.load(f)

def remove_stop_sentences(in_doc, stop_sents=None):
    """Remove every configured stop sentence from a document.

    Args:
        in_doc: Text to clean.
        stop_sents: Optional iterable of literal substrings to delete.
            Defaults to the module-level STOP_SENTS list.

    Returns:
        The text with all stop sentences removed and every run of spaces
        collapsed to a single space. Leading/trailing spaces are not stripped.
    """
    if stop_sents is None:
        stop_sents = STOP_SENTS

    # Start from the input so an empty stop list still yields a result
    # (previously the result variable was unbound in that case).
    out_doc = in_doc
    for sen in stop_sents:
        out_doc = out_doc.replace(sen, '')

    # Deleting a sentence can leave consecutive spaces behind.
    out_doc = re.sub(' +', ' ', out_doc)
    return out_doc

def americanize(in_doc, mapping=None):
    """Replace British spellings in a document with their American equivalents.

    Args:
        in_doc: Text to convert.
        mapping: Optional dict whose keys are American spellings and whose
            values are the corresponding British spellings. Defaults to the
            module-level US2GB map.

    Returns:
        The text with every stand-alone British spelling replaced. A spelling
        is only replaced when it is not directly preceded or followed by an
        ASCII letter, so words that merely contain it are left alone.
    """
    if mapping is None:
        mapping = US2GB

    for american_spelling, british_spelling in mapping.items():
        # The lookahead mirrors the lookbehind ([a-zA-Z]); the spelling is
        # escaped so it is always matched literally.
        pattern = f'(?<![a-zA-Z]){re.escape(british_spelling)}(?![a-zA-Z])'
        in_doc = re.sub(pattern, american_spelling, in_doc)
    return in_doc

# Main denoising function
def denoise_text(txt):
    """Denoise one raw report.

    The text is parsed as HTML and reduced to its text content, contractions
    are expanded, everything is lower-cased and the configured stop sentences
    are removed.

    Args:
        txt: Raw report text, possibly containing HTML markup.

    Returns:
        The denoised, lower-case text.
    """
    # Strip html if any: keep only the text content of the parsed document.
    plain = BeautifulSoup(txt, "html.parser").get_text()

    # Expand contractions (for ex. didn't -> did not), then change to lower case.
    lowered = contractions.fix(plain).lower()

    # Remove any defined stop sentences.
    cleaned = remove_stop_sentences(lowered)

    # British to American spelling conversion is currently switched off.
    #cleaned = americanize(cleaned)
    return cleaned

In [12]:
# Test the denoising function on a sample that contains HTML tags, contractions,
# mixed case, British spellings and one of the stop sentences.
chk_text = "<p>she didn't TELL me anything </br> about what's gonna <html> happen in the end but did want to optimise the colours migrated from cairs"
denoise_text(chk_text)

'she did not tell me anything about what is going to happen in the end but did want to optimise the colours '

In [13]:
# Normalization may include several steps
# Each function below fulfills a (potential) step in normalization

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list.

    Each token is NFKD-normalised first, so an accented letter is reduced to
    its base letter; characters with no ASCII form are dropped. A token made
    up only of such characters becomes an empty string (it is not removed).
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token converted to lower case."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Remove punctuation from a list of tokens.

    Every token is first split on '.', then all characters that are neither
    word characters nor whitespace are deleted from each piece. Pieces that
    end up empty are discarded, so the result can be longer or shorter than
    the input list.
    """
    pieces = (
        re.sub(r'[^\w\s]', '', part)
        for token in words
        for part in token.split('.')
    )
    return [piece for piece in pieces if piece != '']

def replace_numbers(words):
    """Spell out tokens that consist only of digits.

    Tokens for which str.isdigit() is true are replaced by their textual
    representation from inflect (e.g. '2' -> 'two'); all other tokens are
    passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokens.

    Args:
        words: List of tokens; comparison is case-sensitive, so tokens are
            expected to be lower-cased already.

    Returns:
        A new list containing the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # Load the stop-word list once per call and use a set for O(1) lookups,
    # instead of re-reading the corpus and scanning it for every token.
    stop_set = set(stopwords.words('english'))
    return [word for word in words if word not in stop_set]

def stem_words(words):
    """Return the Lancaster stem of every token in the list."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return the WordNet lemma of every token, treating each one as a verb."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

# Define the normalization pipeline
# Optional steps are switched on through the keyword flags
def normalize_text(words, lema=False, stem=False, stop=False):
    """Run the normalization pipeline over a list of tokens.

    Args:
        words: List of tokens to normalize.
        lema: If True, lemmatize the tokens as verbs (applied last).
        stem: If True, stem the tokens.
        stop: If True, remove English stop words.

    Returns:
        The normalized list of tokens.
    """
    # Steps that always run, in this order.
    for step in (remove_non_ascii, to_lowercase, remove_punctuation, replace_numbers):
        words = step(words)

    # TODO - Experiment with the below options
    # Intuitively, stopwords should not be removed as they may impact
    # semantic meaning.
    if stop:
        words = remove_stopwords(words)
    if stem:
        words = stem_words(words)
    if lema:
        words = lemmatize_verbs(words)

    return words


In [14]:
# Let's test the individual normalization functions
print("Remove non_ascii: ", remove_non_ascii(['h', 'ॐ', '©', '1']))
print("To lowercase: ", to_lowercase(['HELLO', 'hiDDen', 'wanT', 'GOING']))
print("Remove punctuation: ", remove_punctuation(['hello!!', 'how?', 'done,', 'hello.you', 'You my friend.Are a fool.']))
print("Replace numbers: ", replace_numbers(['1', '2', '3']))
print("Remove stopwords: ", remove_stopwords(['this', 'and', 'amazing', 'not', 'no', 'yes']))
print("Stem words: ", stem_words(['beautiful', 'flying', 'waited']))
print("Lemmatize verbs: ", lemmatize_verbs(['hidden', 'walking', 'ran']))

# Now apply the pipeline (on a test)
# Called with the default flags, so stop-word removal, stemming and
# lemmatization are all skipped here.
print("Normalize text: ", normalize_text(['hidden', 'in', 'the', 'ALMIRAH', 'he', 'WAited', '2', 'ॐ', 'hours!!']))

Remove non_ascii:  ['h', '', '', '1']
To lowercase:  ['hello', 'hidden', 'want', 'going']
Remove punctuation:  ['hello', 'how', 'done', 'hello', 'you', 'You my friend', 'Are a fool']
Replace numbers:  ['one', 'two', 'three']
Remove stopwords:  ['amazing', 'yes']
Stem words:  ['beauty', 'fly', 'wait']
Lemmatize verbs:  ['hide', 'walk', 'run']
Normalize text:  ['hidden', 'in', 'the', 'almirah', 'he', 'waited', 'two', 'hours']


In [15]:
# Tokenize text into words
def simple_tokenize(text):
    """Split `text` into a list of word tokens using nltk.word_tokenize."""
    return nltk.word_tokenize(text)

# Check the function (the sample deliberately contains double spaces)
sample_text = 'he did not say anything  about what is going to  happen'
print("tokenize results :", simple_tokenize(sample_text))

tokenize results : ['he', 'did', 'not', 'say', 'anything', 'about', 'what', 'is', 'going', 'to', 'happen']


In [16]:
# Now create a simple function to denoise, normalize and apply simple tokeniser to text
def text_prepare(text):
    """Prepare one report for modelling.

    The text is denoised, tokenized, passed through the normalization pipeline
    with its default flags, and the resulting tokens are joined back together
    with single spaces.

    Args:
        text: Raw report text.

    Returns:
        The prepared text as a single space-separated string.
    """
    tokens = simple_tokenize(denoise_text(text))
    return ' '.join(normalize_text(tokens))

In [17]:
# After tokenizing and punctuation removal a possessive "'s" is left behind as a
# stand-alone "s" token; check that replacing " s " with a space removes it.
re.sub(" s ", " ", text_prepare("I am not gonna go to the shop's to visualise"))

'i am not going to go to the shop to visualise'

In [18]:
# Now apply the main text prep pipeline to all text:
# Takes about 5 mins to run on Google CoLab
# (tqdm shows a progress bar while the reports are processed one by one)
df['text'] = [text_prepare(x) for x in tqdm(df['text'])]

# Clean up on category encoding
# LabelEncoder maps the distinct category values to consecutive integer codes;
# the fitted encoder is kept in `le`.
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
print('Done!')
df.head()

100%|██████████| 93857/93857 [01:35<00:00, 980.77it/s] 

Done!





Unnamed: 0,text,category
0,foreign body entered employee s l eye while gr...,0
1,drainage pipe damaged at two two m depth see s...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


In [19]:
# Drop stand-alone "s" tokens (left over from possessives) across the whole column
df['text'] = df['text'].str.replace(' s ', ' ', regex=False)
df.head()

Unnamed: 0,text,category
0,foreign body entered employee l eye while grin...,0
1,drainage pipe damaged at two two m depth see s...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


In [20]:
# Render the rows flagged with the focus category (category == 1) as an HTML
# table for manual inspection of the prepared text.
table = df[df['category']==1]
display(HTML(table.to_html()))

Unnamed: 0,text,category
251,operative cut into compressed air line which was not isolated correctly near miss incident a pipefitter was cutting into an existing compressed airline pipe work using a grinder to complete tiein works to connect newly installed pipework to the building compressed airline as the pipe fitter was performing the cut they heard air being released from the pipe the air was being released away from him as the cut was on the back underside of the pipe ensuring the escaping compressed air was directed away from the individual the operative was wearing controlled area coveralls and a full face kamira impact resistant respirator no injuries were sustained the safe system of work ssow issued by the plant safe system controller ssc contained a lock out box key for an isolation the wood supervisor confirmed the isolation paper work was for the line he was supposed to work on it appears the wrong valve was isolated by the plant team work was stopped and the relevant plant and clients personnel were notified plant personnel checked the ssow and isolation paper work which appeared in good order plant personnel later identified the valve for the compressed air line and stopped the release of air a level two investigation has commenced,1
285,dc cable struck allied fencing employees reported experiencing an electrical shock while installing posts with a pneumatic driving device subcontractor employees were driving a fence post utilizing a pneumatic driver while driving the post both employees reported receiving a shock all work was immediately stopped the area barricaded and all power in the block deenergized the employees were observed onsite and for precautionary reasons transported to a local medical facility for further medical evaluation and released the initial investigation discovered that the post came into contact with an energized dc feeder cable the line location was marked above ground via flagging and on provided drawings the subcontractor did not follow the site daily work request process investigation is in progress,1
289,water line strike during drilling operations during geotechnical drilling of boreholes at the site the driller observed flowing water through the drill pipe after drilling to fifteen four six m the drill response indicated that it was cobbled and boulder thirteen to fifteen which is not an unusual condition the driller immediately contacted the wood e i site lead who advised to mix a heavy batch of mud to see if it would abate the flow as it was thought to initially be an artesian flow however this process saw no returns of the mud and the flows became heavier a large amount of water continued to flow to surface which indicated to crew that damage to an underground pipe may have occurred water appeared clear in nature with no odour the scene was immediately frozen scene was frozen client was notified action to control the water flow was initiated vacuum trucks,1
333,mini excavator damaged underground three pvc drainage line during excavation activities wood civil crew was conducting an excavation in the seven hundred block and using a mini excavator when the bucket damaged an unidentified three pvc water drainage line on the edge of the excavation at approximately fourteen deep work was immediately stopped and operations and wood personnel were notified the civil crew was properly permitted and a valid excavation permit was in place prior to starting the project the client had ground penetrating radar conducted in the area to be excavated which did not identify the pvc pipe in addition concrete was removed from the excavation area and probing was conducted per program requirements two centers thoughout excavation area down to eighteen by the wood civil group that also did not identify the underground pipe client drawings also did not identify the three pvc line in the excavation area work was stopped and operations notified personnel were taken for client protocol screening,1
435,damage to a nonconductive nonactive burried line while excavating a dc crossover the excavator operator struck and severed a buried fiber line a spotter was hand digging with a shovel but the line was not located at the location that the drawing showed the excavator bucket was scraping the side wall when the line was severed this was six feet eight inches away when compared to the drawing work was immediately stopped and supervisors for wood hcc and wood renewables were contacted incident was investigated by wood hcc,1
511,soil boring activity aggrevates poor condition of an existing water line causing a leak me soil boring crew was onsite following a plan to take four additional bores in an area that had previously been scanned using groundpenetrating radar and radio detection technology customer provided drawings reviewed as well as bored before the gpr rd scans of the area identified the location of two previously installed lines that were bedded in sand and both were daylighted utilizing hydroexcavation methods through another subcontractor badger they proceeded to use the auger bit to break through the surface rock which extended approximately sixty-eight into the ground the team then switched to the core bit and hydraulically pushed it approximately another sixty-eight into the hardpacked clay when they noticed water coming out of the coupling the task was ceased and wood management was immediately notified that there was water coming out of the hole badger was then called onsite in order to daylight the source of the seeping water using hydroexcavation only to find multiple pin holes in a corrodeddeteriorating three carbon steel potable water line that was approximately two feet below the surface and had been packed in hard clay approximately twelve inches beyond the depth in which the boring took place the normal process for burying underground utilities is to bed and encase them in sand along with tracer wires etc to allow them to be located as with the other two lines that gpr rd were able to identify however with the age and deterioration of the potable water line below the boring it is small diameter as well as the presence of the hardpacked clay one of the hardest materials for gpr rd to penetrate the scans were unsuccessful it is also likely that with the deteriorated pipe being encased with clay it was not known to have been already leaking until a void was created to allow the water to escape the task was ceased and wood management was immediately notified that 
there was water coming out of the hole badger was then called onsite in order to daylight the source of the seeping water using hydroexcavation only to find multiple pin holes in a corrodeddeteriorating three carbon steel potable water line that was approximately two feet below the surface and had been packed in hard clay approximately twelve inches beyond the depth in which the boring took place the normal process for burying underground utilities is to bed and encase them in sand along with tracer wires etc to allow them to be located as with the other two lines that gpr rd were able to identify however with the age and deterioration of the potable water line below the boring it is small diameter as well as the presence of the hardpacked clay one of the hardest materials for gpr rd to penetrate the scans were unsuccessful it is also likely that with the deteriorated pipe being encased with clay it was not known to have been already leaking until a void was created to allow the water to escape the repair was complete before end of shift yesterday,1
791,it was notice that part of the bank had collapsed near the line and the weight of the earthen bank broke the line team had exposed a three seldom used poly gas line as part of bell hole excavation on sixteen november the line belonged to painted pegasus upon returning to work today eighteen nov it was notice that part of the bank had collapsed near the line and the weight of the earthen bank broke the line there was no release as the line was not in use nor did the team reenter the excavation the broken line was reported to the owner he closed valves as precaution and began line repair,1
819,plate compactor was damaged when operator turned the bucket in the down position operator was warming up a front end loader and activated the controls to move the hydraulics a pedestal plate compactor was in the bucket from the day before which he noticed during his walk around inspection when he turned the bucket in the down position the plate compactor fell out and was damaged operator was taken for a drug screen and not allowed to run any equipment for the remaining of the shift superintendent held a standown with the crew and discussed complacency,1
824,excavator made contact with two empty one inch conduit during excavation for catch basin grate causing damage to conduit crew was tasked with exposing a grate cover for a catch basin when the incident occurred the operator was using a jd thirteen mini excavator to scratch the finish gravel when he hooked two one inch conduit lines upon lifting the bucket the two lines broke both conduit lines were empty the conduit was noted to be approximately forty-five inches below finish grade work was stopped the operator reported the incident to his immediate supervisor the superintendent was notified of the damaged conduit and location and contacted the wood heavy civil safety department the general contractor and client were notified of the incident the scene was secured and safety was ensured,1
871,project newly installed and not inservice fiberoptic cable was struck by excavator bucket operator digging a trench came in contact with a fiberoptic line that was believed to be buried deeper than it actually was causing damage to the line the operator and spotter were excavating an electrical trench in block four near pcs four hundred and twenty-three when a newly installed and not inservice fiberoptic cable was severed line strike by the excavator bucket slightly below two feet from final grade the work authorization and excavation permits were signed off and no linescables were identified on the permits stop work enforced all digging across site was stopped to be reevaluated the active permits and process reviewed stand down on trenching and excavating took place witness statements jsa excavation permits pictures taken,1


In [None]:
# Save the prepared dataset
# The file name carries a yymmddHHMM timestamp and the selected focus category;
# the DataFrame index is not written.
filename = f'01_data/prepared/{time.strftime("%y%m%d%H%M")}_prepared_{focus_cat}_data.csv'
df.to_csv(filename, index=False)

### Fabricated Data Prep

In [None]:
# Load fabricated reports
# `fnf` is the fabricated-data file stem chosen in the category selection cell.
df_fab = pd.read_csv(f'01_data/fabricated/{fnf}.csv')
df_fab

Unnamed: 0,text,group
0,there was a leak of oil when hydraulic fluid e...,hydraulic fluid or oil leak
1,there was a leak from a hose on a pump truck w...,hydraulic fluid or oil leak
2,there was a leak to grade one hundred and fift...,hydraulic fluid or oil leak
3,there was a leak of hydraulic fluid from a ren...,hydraulic fluid or oil leak
4,there was a leak from a jlg cat three thousand...,hydraulic fluid or oil leak
...,...,...
7915,oil fluid witnessed falling object resulting i...,hydraulic fluid or oil leak
7916,oil fluid witnessed on three out of four an em...,hydraulic fluid or oil leak
7917,oil fluid witnessed nearmisses at separation t...,hydraulic fluid or oil leak
7918,oil fluid witnessed a hydraulic leak from a na...,hydraulic fluid or oil leak


In [None]:
# Prepare the fabricated reports exactly like the base data: run the
# text_prepare pipeline, then drop the stand-alone "s" tokens that possessives
# leave behind (the base data gets the same clean-up after text_prepare).
df_fab['text'] = [text_prepare(x) for x in tqdm(df_fab['text'])]
df_fab['text'] = df_fab['text'].apply(lambda x: re.sub(' s ', ' ', x))

100%|██████████| 7920/7920 [00:09<00:00, 846.80it/s]


In [None]:
# Shuffle the fabricated rows and renumber the index from 0.
# No random_state is passed, so the resulting order depends on the state of
# NumPy's global random number generator when this cell runs.
df_fab = df_fab.sample(frac=1).reset_index(drop=True)
df_fab

Unnamed: 0,text,group
0,crew noticed drip coming from hose fitting on ...,hydraulic fluid or oil leak
1,diesel failure on skid steer resulted in 1l of...,hydraulic fluid or oil leak
2,operative witnessed discharge of hydraulic flu...,hydraulic fluid or oil leak
3,subcontractor witnessed loss of hydraulic flui...,hydraulic fluid or oil leak
4,operative discovered leak in hose on roller dr...,hydraulic fluid or oil leak
...,...,...
7915,there was a flow of oil from the gully to the ...,hydraulic fluid or oil leak
7916,hyd fluid failure of excavator hydraulic fitti...,hydraulic fluid or oil leak
7917,apprentice noticed spillage of hydraulic fluid...,hydraulic fluid or oil leak
7918,apprentice witnessed drip from a bucket lift i...,hydraulic fluid or oil leak


In [None]:
# Save the prepared fabricated dataset
# NOTE(review): the file name is built from `focus_cat` (the category picked
# interactively), not from the fabricated file that was loaded -- confirm both
# refer to the same category before relying on the name.
filename = f'01_data/prepared/{time.strftime("%y%m%d%H%M")}_prepared_fabricated_{focus_cat}_data.csv'
df_fab.to_csv(filename, index=False)