In [3]:
# Set random seeds up front so later stochastic steps are repeatable on a fresh run.
from IPython.display import display, HTML
from numpy.random import seed
seed(156)  # seeds NumPy's global random generator
import tensorflow as tf
tf.random.set_seed(256)  # seeds TensorFlow's global random generator

In [4]:
# Mount Google Drive inside the Colab runtime; the project folder used below lives on it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Switch to the project folder so the relative paths used in this notebook
# (e.g. 08_output/, 01_data/) resolve against it.
%cd /content/drive/MyDrive/Colab_Notebooks/safety_report_tc

/content/drive/MyDrive/Colab_Notebooks/safety_report_tc


In [6]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 5.7 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 3.7 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24


In [7]:
# Importing required libraries
import pandas as pd
import re
import requests
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap
import requests
import json

In [8]:
# Install required nltk resources
# Fetch the NLTK data packages this notebook relies on:
#   stopwords        - word list read by remove_stopwords
#   wordnet, omw-1.4 - presumably the data behind WordNetLemmatizer (lemmatize_verbs)
#   punkt            - presumably the model behind nltk.word_tokenize (simple_tokenize)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Pick a minority category of interest (base data from source)
# Categories that can be picked for processing.
# focus_cats and fns are parallel lists: fns[i] is the file stem of the base
# data belonging to focus_cats[i], so the two must be kept in the same order.
focus_cats = [
    'hydraulic fluid or oil leak',
    'line strike',
    'site compliance or practice issue',
    'ppe non-compliance',
    'mechanical or equipment issue'
    ]

# File stems (without the .csv extension) of the base data, one per category.
fns = [
    'hydraulic fluid or oil leak_577_out_df_temp',
    'line strike_326_out_df_temp',
    'site compliance or practice issue_400_out_df_temp',
    'ppe non-compliance_144_out_df_temp',
    'mechanical or equipment issue_471_out_df_temp'
]

# File stem of the fabricated (decoder-generated) data file.
# It is used at the end of the notebook to denoise & normalise the fabricated
# data before it is used in TC modelling.
# NOTE: unlike focus_cats/fns this is a single string, not a list with one
# entry per category; indexing it yields a single character, not a filename.
fnfs = 'hydraulic fluid or oil leak_7920'

In [10]:
# Select a category to load
# Ask which category to process; it must be one of the entries in focus_cats.
pick = input('Pick a category: ').strip()
if pick not in focus_cats:
    raise ValueError(f'Unknown category {pick!r}; choose one of: {focus_cats}')
index = focus_cats.index(pick)
focus_cat = focus_cats[index]
fn = fns[index]
# fnfs may be a single filename (a str). Indexing a str returns one character
# rather than a filename, so only index it when it is a real sequence of names.
fnf = fnfs if isinstance(fnfs, str) else fnfs[index]

# Load the base data and keep only the two columns used downstream.
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the frame returned by read_csv.
df = pd.read_csv(f'08_output/{fn}.csv')
df = df[['text', 'category']].copy()

# Convert category to binary 1/0
# Remember that 1 is a rule-book hit & not necessarily the 'truth'
df['category'] = (
    df['category']
    .str.replace("*** Not Classified", "0", regex=False)
    .str.replace(focus_cat, "1", regex=False)
    .astype(int)
)
df

Pick a category: mechanical or equipment issue


Unnamed: 0,text,category
0,foreign body entered employee's (l) eye while ...,0
1,drainage pipe damaged at ~2.2 m depth. see sup...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,"other fall to lower level, unspecified an empl...",0
93854,injured by slipping or swinging object held by...,0
93855,"direct exposure to electricity, greater than 2...",0


Inspired by
https://github.com/pashupati98/text-classification

## Data Preparation

In [11]:
# Boilerplate phrases that are deleted from every report by remove_stop_sentences.
# The longer phrase is listed first and is therefore removed first.
STOP_SENTS = ['migrated from legacy cairs', 'migrated from cairs']

# Spelling map consumed by americanize(), which iterates it as
# (american_spelling, british_spelling) pairs.
# The context manager closes the file even if json.load raises.
with open('01_data/us2gb.json') as f:
    US2GB = json.load(f)

def remove_stop_sentences(in_doc, stop_sents=None):
    """Delete every stop sentence from a document and collapse repeated spaces.

    Args:
        in_doc: The text to clean.
        stop_sents: Iterable of literal phrases to delete, applied in order.
            Defaults to the module-level STOP_SENTS.

    Returns:
        The text with all stop sentences removed and each run of spaces
        reduced to a single space. An empty stop list is handled: only the
        space collapsing is applied.
    """
    if stop_sents is None:
        stop_sents = STOP_SENTS

    # Start from the input so the result is defined even when there is
    # nothing to remove.
    out_doc = in_doc
    for sen in stop_sents:
        out_doc = out_doc.replace(f'{sen}', '')

    # Removing a phrase can leave two spaces side by side; squeeze them.
    return re.sub(' +', ' ', out_doc)

def americanize(in_doc, mapping=None):
    """Replace British spellings in a document with their American spellings.

    Args:
        in_doc: The text to convert.
        mapping: Dict iterated as (american_spelling, british_spelling) pairs.
            Defaults to the module-level US2GB.

    Returns:
        The text with every stand-alone British spelling replaced. A spelling
        is only replaced when it is not directly preceded or followed by an
        ASCII letter, so it is never rewritten inside a longer word.
    """
    if mapping is None:
        mapping = US2GB

    for american_spelling, british_spelling in mapping.items():
        # re.escape keeps any regex metacharacter in a spelling literal.
        # Both look-arounds use the same letter class [a-zA-Z].
        pattern = f'(?<![a-zA-Z]){re.escape(british_spelling)}(?![a-zA-Z])'
        # A callable replacement inserts the spelling verbatim (no backslash
        # or group-reference processing).
        in_doc = re.sub(pattern, lambda _match: american_spelling, in_doc)
    return in_doc

# Main denoising function
def denoise_text(txt):
    """Strip markup and boilerplate from a raw report.

    Steps, in order: drop HTML tags, expand contractions
    (e.g. "didn't" -> "did not"), lower-case, remove the stop sentences.

    British -> American spelling conversion (americanize) is intentionally
    not applied at the moment.
    """
    # Keep only the text content of any HTML present
    plain = BeautifulSoup(txt, "html.parser").get_text()

    # Expand contractions before lower-casing
    expanded = contractions.fix(plain)

    # Lower-case first: the stop sentences are written in lower case
    lowered = expanded.lower()

    return remove_stop_sentences(lowered)

In [12]:
# Test the denoising function
# The sample mixes HTML tags, contractions, upper case, British spellings and a
# stop sentence, so one call exercises every step of denoise_text.
chk_text = "<p>she didn't TELL me anything </br> about what's gonna <html> happen in the end but did want to optimise the colours migrated from cairs"
denoise_text(chk_text)

'she did not tell me anything about what is going to happen in the end but did want to optimise the colours '

In [13]:
# Normalization may include several steps.
# Each function below implements one (potential) step of the normalization.

def remove_non_ascii(words):
    """Return a new list in which every token is reduced to ASCII characters.

    Each token is NFKD-normalised first, so an accented letter decomposes into
    its base letter plus combining marks; whatever still cannot be encoded as
    ASCII is dropped. A token made up only of such characters becomes ''.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token converted to lower case."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from tokens, splitting each token on full stops.

    A token is first split on '.', then every character that is neither a
    word character nor whitespace is deleted from each piece. Pieces that end
    up empty are discarded, so the result can be longer or shorter than the
    input list.
    """
    cleaned = []
    for token in words:
        pieces = (re.sub(r'[^\w\s]', '', part) for part in token.split('.'))
        cleaned.extend(piece for piece in pieces if piece != '')
    return cleaned

def replace_numbers(words):
    """Spell out every all-digit token in words (e.g. '2' -> 'two').

    Tokens for which str.isdigit() is False are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: List of tokens. Matching is exact and case-sensitive, exactly
            as with the underlying NLTK list.

    Returns:
        A new list holding the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # Fetch the stop-word list once per call instead of once per token, and
    # keep it in a set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in words."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return every token in words lemmatized as a verb (WordNet, pos='v')."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

# Define the normalization pipeline
# Comment out steps not used
def normalize_text(words, lema=False, stem=False, stop=False):
    """Run the normalization pipeline over a list of tokens.

    Always applied, in this order: non-ASCII removal, lower-casing,
    punctuation removal, number spelling.

    Optional steps (all off by default), applied in this order when enabled:
        stop: drop English stop words
        stem: Lancaster stemming
        lema: verb lemmatization

    TODO - Experiment with the optional steps. Intuitively, stop words should
    not be removed as they may impact semantic meaning.
    """
    mandatory_steps = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
    )
    for step in mandatory_steps:
        words = step(words)

    if stop:
        words = remove_stopwords(words)
    if stem:
        words = stem_words(words)
    if lema:
        words = lemmatize_verbs(words)

    return words


In [14]:
# Let's test the individual normalization functions
# Each helper is called directly on a small hand-made token list so its
# effect can be read off the printed result.
print("Remove non_ascii: ", remove_non_ascii(['h', 'ॐ', '©', '1']))
print("To lowercase: ", to_lowercase(['HELLO', 'hiDDen', 'wanT', 'GOING']))
print("Remove punctuation: ", remove_punctuation(['hello!!', 'how?', 'done,', 'hello.you', 'You my friend.Are a fool.']))
print("Replace numbers: ", replace_numbers(['1', '2', '3']))
print("Remove stopwords: ", remove_stopwords(['this', 'and', 'amazing', 'not', 'no', 'yes']))
print("Stem words: ", stem_words(['beautiful', 'flying', 'waited']))
print("Lemmatize verbs: ", lemmatize_verbs(['hidden', 'walking', 'ran']))

# Now apply the pipeline (on a test)
# Called with the default flags, so stop-word removal, stemming and
# lemmatization are all skipped here.
print("Normalize text: ", normalize_text(['hidden', 'in', 'the', 'ALMIRAH', 'he', 'WAited', '2', 'ॐ', 'hours!!']))

Remove non_ascii:  ['h', '', '', '1']
To lowercase:  ['hello', 'hidden', 'want', 'going']
Remove punctuation:  ['hello', 'how', 'done', 'hello', 'you', 'You my friend', 'Are a fool']
Replace numbers:  ['one', 'two', 'three']
Remove stopwords:  ['amazing', 'yes']
Stem words:  ['beauty', 'fly', 'wait']
Lemmatize verbs:  ['hide', 'walk', 'run']
Normalize text:  ['hidden', 'in', 'the', 'almirah', 'he', 'waited', 'two', 'hours']


In [15]:
# Tokenize text into words
def simple_tokenize(text):
    """Split a string into a list of word tokens with NLTK's word tokenizer."""
    tokens = word_tokenize(text)
    return tokens

# Quick check; the sample deliberately contains double spaces
sample_text = 'he did not say anything  about what is going to  happen'
print("tokenize results :", simple_tokenize(sample_text))

tokenize results : ['he', 'did', 'not', 'say', 'anything', 'about', 'what', 'is', 'going', 'to', 'happen']


In [16]:
# Now create a simple function to denoise, normalize and apply simple tokeniser to text
def text_prepare(text):
    """Denoise, tokenize and normalize a raw report.

    Returns the normalized tokens joined back into one space-separated string.
    """
    tokens = simple_tokenize(denoise_text(text))
    return ' '.join(normalize_text(tokens))

In [17]:
# Try the full preparation on one sentence, then delete any stand-alone " s "
# token from the result (the sample contains the possessive "shop's").
re.sub(" s ", " ", text_prepare("I am not gonna go to the shop's to visualise"))

'i am not going to go to the shop to visualise'

In [18]:
# Now apply the main text prep pipeline to all text:
# Takes about 5 mins to run on Google CoLab
# Overwrites the raw text in place: after this cell df['text'] holds the
# prepared text, so re-running the cell would prepare already-prepared text.
df['text'] = [text_prepare(x) for x in tqdm(df['text'])]

# Clean up on category encoding
# The column was already converted to integers when the data was loaded, so
# this presumably leaves the 0/1 values unchanged - TODO confirm.
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
print('Done!')
df.head()

100%|██████████| 93857/93857 [01:46<00:00, 880.14it/s] 


Done!


Unnamed: 0,text,category
0,foreign body entered employee s l eye while gr...,0
1,drainage pipe damaged at two two m depth see s...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


In [19]:
# Drop every stand-alone " s " token from the prepared text.
# Literal (non-regex) replacement of ' s ' by a single space.
df['text'] = df['text'].str.replace(' s ', ' ', regex=False)
df.head()

Unnamed: 0,text,category
0,foreign body entered employee l eye while grin...,0
1,drainage pipe damaged at two two m depth see s...,0
2,robodrill spider excavator being operated when...,0
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


In [20]:
# Show every row flagged as the focus category (category == 1).
# Rendering through to_html() displays the full text of each report instead
# of the truncated preview a plain DataFrame display gives.
table = df[df['category']==1]
display(HTML(table.to_html()))

Unnamed: 0,text,category
149,"crane cable damaged when big block was pulled into anti two block by crane operator the following events occurred after crane operations ceased at two thousand, one hundred and thirty hrs two thousand, eight hundred and twenty after backloading twenty lifts crane operator drew in the big block so that it was tight against the anti two block he also stated he could not see just went by feel crane operator then placed the crane in the boom cradle in the morning 1000am two thousand, nine hundred and twenty crane mechanic observed that the big block was driven into the antitwo blocksideways which caused damage to the main hoist cable1100am two thousand, nine hundred and twenty mechanics are working on cutting back the cable will cut out the bad section twenty place the load line back in service",1
172,non preventable company vehicle minor damage no injuries vehicle was parked overnight and an employee noticed the smell of gas employee found that the fuel line was damaged internal reporting process initiated,1
231,during the maneuver of a cherry picker in the ll area the driver damaged an energized cable during the installation of the steel structure in the ll area a cherry picker driver while rotating the boom of the machine hit against an energized cable two hundred and twenty volt positioned in the below part of the structure this caused the damage of the cable and consequently a black out in the area ll the circuit breaker worked properly the driver stopped the work and informed immediately the wood supervisor and his foreman the driver stopped the work and informed immediately the wood supervisor and his foreman the cable was immediately replaced by zeppelin,1
240,an installed post was found twisted and bent in circuit one a post previously installed in the array was damaged the damage was reported by a sun solar supervisor there are no witness accounts at this time however the investigation is ongoing the damged post is located along the south side of the main road in circuit one in the first position on row six sun solar reported the damage immediately upon finding it to both the quality and safety department the post was flagged with red barricade tape for identification purposes however in its present postion the post does not present a hazard to traffic or pedestrians the post will be released for repair by the conclusion of this investigation sun solar reported the damage immediately upon finding it to both the quality and safety department the post was flagged with red barricade tape,1
256,damage found on vehicle unkown who caused vehicle was parked on the street overnight at home in the morning wood driver found damage to the driver side door and the mirror casing is cracked driver does not know what caused this damage driver called supervisor and donlen to report damage supervisor and driver prepared investigation forms and provided photos to hsse advisor,1
282,a pvc pipe and slip flange joint separated during startup of the plant pvc glued pipe joint separated during startup of new system installation a blind was left in a different part of the line and caused an over pressurization of the system resulting in in the pipe slipping out of the glued slip flange joint investigation into whether or not the joint was ever glued is underway initial because of failure was determined to be a blind removed blind and repaired the pipe to return to service investigation into whether or not the joint was actually glued is being conducted,1
310,burnt out cable for dvd drive within the pc used for rig control and monitoring the pc was off and would not turn on power was on to the rig and pc and all of the cables were plugged in everything looked ok on inspection in the workshop a very charred and melted connecter cable that plugs into the power supply socket of the integral dvd drive unit was found pc not damaged and risk of fire was minimal a new power cable has been installed which bypasses power to the dvd drive and the pc is now back up and running heart raised to report incident,1
320,fa small fragment of cutting disk hit ip in right upper thigh whilst cutting small steelwork bullets fa ip was utilising grinder with a cutting disk to cut through small steelwork bullets i e spacers small fragment of cutting disk 40ml shattered and hit ip in upper right thigh cutting operation was ongoing for ten fifteen seconds disk was new and visually inspected by ip prior to use grinder daily checks also completed by ip at start of shift ip received slight graze to right thigh the cutting disk shard did not pierce the material of ip coveralls ip immediately stopped the job and reported incident to supervisor hse advisor ip initially proactively checked other similar type cutting disks available on rack and whole batch was immediately quarantined by warehouse stores grinder was immediately quarantined to be returned to atr for appropriate mechanical checks cutting disk supplier informed regarding potential fault with this batch of cutting disks,1
353,power outage city of houston power grid failed which affected the horseshoe building at the park ten campus the outage continued until nine hundred and thirty pm next mornings employees encountered multiple monitors a couple of desktops powerstrips and network connections damaged due to the electrical surge all building personnel were requested to evacuate the building after forty-five min as per the houston erp the tcs houston bcp was activated and the situation was monitored until building operations returned to normal campus security was notified and a patrol guard posted at the building due to all doors systems had been deactivated,1
396,slip and fall at customer facility a wood employee was walking on a metal grate mezzanine reviewing a conveyor installation when he slipped on some oil and fell initially he thought the was fine and it seemed that way upon waking up the next morning he did not feel right and decided he needed to go to an urgent care facility to get checked out they examined him and took xrays but they found no significant damage bottom line he was just bruised a little swollen and stiff he returned to work on his next scheduled work day oil on floor where employee slipped and fell was immediately cleaned up by gm personnel follow up with employee is being conducted by hsse department,1


In [21]:
# Save the prepared dataset
# The filename starts with a yymmddHHMM timestamp, so each run writes a new
# file rather than overwriting an earlier one (unless run within the same minute).
# index=False keeps the DataFrame index out of the CSV.
filename = f'01_data/prepared/{time.strftime("%y%m%d%H%M")}_prepared_{focus_cat}_data.csv'
df.to_csv(filename, index=False)

### Fabricated Data Prep

In [None]:
# Load fabricated reports
# fnf is the file stem (no extension) of the fabricated-data CSV, set when
# the category was picked.
df_fab = pd.read_csv(f'01_data/fabricated/{fnf}.csv')
df_fab

Unnamed: 0,text,group
0,there was a leak of oil when hydraulic fluid e...,hydraulic fluid or oil leak
1,there was a leak from a hose on a pump truck w...,hydraulic fluid or oil leak
2,there was a leak to grade one hundred and fift...,hydraulic fluid or oil leak
3,there was a leak of hydraulic fluid from a ren...,hydraulic fluid or oil leak
4,there was a leak from a jlg cat three thousand...,hydraulic fluid or oil leak
...,...,...
7915,oil fluid witnessed falling object resulting i...,hydraulic fluid or oil leak
7916,oil fluid witnessed on three out of four an em...,hydraulic fluid or oil leak
7917,oil fluid witnessed nearmisses at separation t...,hydraulic fluid or oil leak
7918,oil fluid witnessed a hydraulic leak from a na...,hydraulic fluid or oil leak


In [None]:
# Run the fabricated reports through the same text_prepare pipeline as the base data.
df_fab['text'] = [text_prepare(x) for x in tqdm(df_fab['text'])]

100%|██████████| 7920/7920 [00:09<00:00, 846.80it/s]


In [None]:
# Shuffle all rows (frac=1 samples the whole frame) and renumber the index from 0.
# NOTE(review): no random_state is passed, so the order presumably depends on
# the state of the global NumPy generator at this point - confirm if an exact
# order matters.
df_fab = df_fab.sample(frac=1).reset_index(drop=True)
df_fab

Unnamed: 0,text,group
0,crew noticed drip coming from hose fitting on ...,hydraulic fluid or oil leak
1,diesel failure on skid steer resulted in 1l of...,hydraulic fluid or oil leak
2,operative witnessed discharge of hydraulic flu...,hydraulic fluid or oil leak
3,subcontractor witnessed loss of hydraulic flui...,hydraulic fluid or oil leak
4,operative discovered leak in hose on roller dr...,hydraulic fluid or oil leak
...,...,...
7915,there was a flow of oil from the gully to the ...,hydraulic fluid or oil leak
7916,hyd fluid failure of excavator hydraulic fitti...,hydraulic fluid or oil leak
7917,apprentice noticed spillage of hydraulic fluid...,hydraulic fluid or oil leak
7918,apprentice witnessed drip from a bucket lift i...,hydraulic fluid or oil leak


In [None]:
# Save the prepared fabricated dataset
# The filename starts with a yymmddHHMM timestamp and is written without the index.
# NOTE(review): the filename is built from focus_cat (the category picked
# earlier), not from the 'group' column of the fabricated data itself -
# confirm both refer to the same category before relying on the name.
filename = f'01_data/prepared/{time.strftime("%y%m%d%H%M")}_prepared_fabricated_{focus_cat}_data.csv'
df_fab.to_csv(filename, index=False)