In [1]:
# Set random seeds
# Both generators are seeded before anything else runs so that a fresh
# "Restart & Run All" starts from the same random state every time.
# Seed NumPy's global random generator.
from numpy.random import seed
seed(156)
# Seed TensorFlow's global random generator.
import tensorflow as tf
tf.random.set_seed(256)

In [2]:
# Mount Google Drive inside the Colab runtime; the working directory selected
# in the following cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab_Notebooks/safety_report_tc

/content/drive/MyDrive/Colab_Notebooks/safety_report_tc


In [4]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 4.0 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 53.9 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24


In [5]:
# Importing required libraries
import pandas as pd
import re
import requests
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm.std import tqdm
from IPython.display import display, HTML
import time
import textwrap
import requests
import json

In [6]:
# Install required nltk resources
# 'stopwords' backs nltk.corpus.stopwords (used by remove_stopwords);
# 'wordnet' / 'omw-1.4' are presumably needed by WordNetLemmatizer and
# 'punkt' by word_tokenize - confirm against the installed nltk version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Pick a minority category of interest
focus_cat = 'hydraulic fluid or oil leak'
# Rule-book output file for the chosen category; only the report text and the
# assigned category are kept.
fn = 'output/hydraulic fluid or oil leak_577_out_df_temp.csv'
df = pd.read_csv(fn)
df = df[['text', 'category']]

# Convert category to binary 1/0
# Remember that 1 is a rule-book hit & not necessarily the 'truth'
# The two string replacements rewrite the labels to "0" / "1"; astype(int)
# then raises if any row still carries a different category string.
df['category'] = df['category'].apply(lambda x : x.replace("*** Not Classified", "0"))
df['category'] = df['category'].apply(lambda x : x.replace(focus_cat, "1"))
df['category'] = df['category'].astype(int)
df

Unnamed: 0,text,category
0,foreign body entered employee's (l) eye while ...,0
1,drainage pipe damaged at ~2.2 m depth. see sup...,0
2,robodrill spider excavator being operated when...,1
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0
...,...,...
93852,fall on water vehicle an employee was using an...,0
93853,"other fall to lower level, unspecified an empl...",0
93854,injured by slipping or swinging object held by...,0
93855,"direct exposure to electricity, greater than 2...",0


Inspired by
https://github.com/pashupati98/text-classification

## Data Preparation

In [8]:
# Boilerplate phrases that remove_stop_sentences deletes from every report.
# They are written in lower case because denoise_text lower-cases the text
# before removing them.
STOP_SENTS = ['migrated from legacy cairs', 'migrated from cairs']

# Spelling map consumed by americanize(), which reads each key as the American
# spelling and each value as the British one.
# The context manager closes the file even when json.load raises.
with open('data/us2gb.json') as f:
    US2GB = json.load(f)

def remove_stop_sentences(in_doc, stop_sents=None):
    """Delete boilerplate phrases from a document and collapse repeated spaces.

    Args:
        in_doc: Text to clean. Matching is a literal, case-sensitive
            substring match.
        stop_sents: Optional iterable of phrases to delete. When omitted,
            the module-level STOP_SENTS list is used.

    Returns:
        The text with every listed phrase removed and each run of spaces
        reduced to a single space. Leading/trailing spaces are not stripped.
    """
    if stop_sents is None:
        stop_sents = STOP_SENTS

    # Start from the input so that an empty phrase list still yields a result
    # (the previous loop left the output unbound in that case).
    out_doc = in_doc
    for sen in stop_sents:
        out_doc = out_doc.replace(f'{sen}', '')
    return re.sub(' +', ' ', out_doc)

def americanize(in_doc, us2gb=None):
    """Replace British spellings in a document with their American equivalents.

    Args:
        in_doc: Text to convert.
        us2gb: Optional mapping of American spelling -> British spelling.
            When omitted, the module-level US2GB mapping is used.

    Returns:
        The text with every stand-alone British spelling replaced. A spelling
        only matches when it is not directly preceded or followed by an ASCII
        letter, so it is never rewritten inside a longer word.
    """
    if us2gb is None:
        us2gb = US2GB

    for american_spelling, british_spelling in us2gb.items():
        # The look-ahead mirrors the look-behind ([a-zA-Z]); the earlier
        # "[a-z-Z]" class treated "-" as a letter and missed upper case A-Y.
        # re.escape keeps the spelling a literal rather than a regex fragment.
        pattern = f'(?<![a-zA-Z]){re.escape(british_spelling)}(?![a-zA-Z])'
        in_doc = re.sub(pattern, american_spelling, in_doc)
    return in_doc

# Main denoising function
def denoise_text(txt):
    """Clean one raw document before tokenisation.

    The steps run in this order: strip HTML markup, expand contractions
    (e.g. "didn't" -> "did not"), lower-case, then remove the configured
    stop sentences.

    Args:
        txt: Raw document text, possibly containing HTML.

    Returns:
        The cleaned, lower-cased text.
    """
    plain = BeautifulSoup(txt, "html.parser").get_text()
    expanded = contractions.fix(plain)
    lowered = expanded.lower()
    cleaned = remove_stop_sentences(lowered)

    # British -> American spelling conversion is currently switched off.
    #cleaned = americanize(cleaned)
    return cleaned

In [9]:
# Test the denoising function
# The sample mixes HTML tags, contractions, upper case and a stop sentence so
# that every active step of denoise_text is exercised at once.
chk_text = "<p>she didn't TELL me anything </br> about what's gonna <html> happen in the end but did want to optimise the colours migrated from cairs"
denoise_text(chk_text)

'she did not tell me anything about what is going to happen in the end but did want to optimise the colours '

In [10]:
# Normalization may include several steps
# Each function below fulfills a (potential) step in normalization

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list.

    Each token is NFKD-normalised and then encoded to ASCII with unencodable
    characters ignored. Tokens that contain no ASCII characters become empty
    strings and remain in the returned list.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token converted to lower case."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens left empty.

    Any character that is neither a word character nor whitespace is removed.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out every all-digit token; leave all other tokens untouched.

    A token is converted only when str.isdigit() is true for it, so mixed
    tokens such as '30t' pass through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: Iterable of tokens. Comparison is exact, so tokens should
            already be lower-cased.

    Returns:
        A new list containing only the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # Fetch the stop-word list once per call and keep it in a set; previously
    # the list was re-fetched and linearly scanned for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token, preserving order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token as a verb (pos='v') with WordNet, preserving order."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

# Define the normalization pipeline
# Comment out steps not used
def normalize_text(words, lema=False, stem=False, stop=False):
    """Run the normalization pipeline over a list of tokens.

    Always applied, in order: non-ASCII removal, lower-casing, punctuation
    removal, number spelling. Optional steps follow, each behind its flag.

    Args:
        words: List of tokens.
        lema: Lemmatize verbs when true (applied last).
        stem: Stem tokens when true.
        stop: Remove English stop words when true (first optional step).

    Returns:
        The normalized list of tokens.

    NOTE(review): punctuation is stripped before numbers are spelled out, so
    a decimal such as '2.2' appears to turn into '22' and then 'twenty-two'
    (compare the raw and prepared "drainage pipe" rows) - confirm this is
    acceptable.
    """
    for step in (remove_non_ascii, to_lowercase, remove_punctuation, replace_numbers):
        words = step(words)

    # TODO - Experiment with the options below.
    # Intuitively, stop words should stay, since removing them may change the
    # semantic meaning.
    if stop:
        words = remove_stopwords(words)
    if stem:
        words = stem_words(words)
    if lema:
        words = lemmatize_verbs(words)
    return words


In [11]:
# Let's test the individual normalization functions
# Each call feeds a tiny hand-made token list through one step so its effect
# can be read directly from the printed output.
print("Remove non_ascii: ", remove_non_ascii(['h', 'ॐ', '©', '1']))
print("To lowercase: ", to_lowercase(['HELLO', 'hiDDen', 'wanT', 'GOING']))
print("Remove punctuation: ", remove_punctuation(['hello!!', 'how?', 'done,']))
print("Replace numbers: ", replace_numbers(['1', '2', '3']))
print("Remove stopwords: ", remove_stopwords(['this', 'and', 'amazing', 'not', 'no', 'yes']))
print("Stem words: ", stem_words(['beautiful', 'flying', 'waited']))
print("Lemmatize verbs: ", lemmatize_verbs(['hidden', 'walking', 'ran']))

# Now apply the pipeline (on a test)
# Called with the default flags, so stop-word removal, stemming and
# lemmatization are all skipped here.
print("Normalize text: ", normalize_text(['hidden', 'in', 'the', 'ALMIRAH', 'he', 'WAited', '2', 'ॐ', 'hours!!']))

Remove non_ascii:  ['h', '', '', '1']
To lowercase:  ['hello', 'hidden', 'want', 'going']
Remove punctuation:  ['hello', 'how', 'done']
Replace numbers:  ['one', 'two', 'three']
Remove stopwords:  ['amazing', 'yes']
Stem words:  ['beauty', 'fly', 'wait']
Lemmatize verbs:  ['hide', 'walk', 'run']
Normalize text:  ['hidden', 'in', 'the', 'almirah', 'he', 'waited', 'two', 'hours']


In [12]:
# Tokenize text into words
def simple_tokenize(text):
    """Split a text string into a list of word tokens using nltk.word_tokenize."""
    return nltk.word_tokenize(text)

# Check the function
# The sample contains doubled spaces to show that they do not produce empty tokens.
sample_text = 'he did not say anything  about what is going to  happen'
print("tokenize results :", simple_tokenize(sample_text))

tokenize results : ['he', 'did', 'not', 'say', 'anything', 'about', 'what', 'is', 'going', 'to', 'happen']


In [13]:
# Now create a simple function to denoise, normalize and apply simple tokeniser to text
def text_prepare(text):
    """Denoise, tokenize and normalize one document.

    Args:
        text: Raw document text.

    Returns:
        The normalized tokens joined back into a single space-separated string.
    """
    tokens = simple_tokenize(denoise_text(text))
    return ' '.join(normalize_text(tokens))

In [14]:
# Punctuation removal turns a possessive "'s" into a stand-alone "s" token;
# this check shows that dropping " s " from the prepared text cleans it up.
re.sub(" s ", " ", text_prepare("I am not gonna go to the shop's to visualise"))

'i am not going to go to the shop to visualise'

In [15]:
# Now apply the main text prep pipeline to all text:
# Takes about 5 mins to run on Google CoLab
# NOTE: df['text'] is overwritten in place, so re-running this cell feeds
# already-prepared text through the pipeline again.
df['text'] = [text_prepare(x) for x in tqdm(df['text'])]

# Clean up on category encoding
# The column was already cast to int earlier; LabelEncoder re-maps the
# sorted unique values to 0..n-1.
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
print('Done!')
df.head()

100%|██████████| 93857/93857 [01:54<00:00, 817.93it/s] 

Done!





Unnamed: 0,text,category
0,foreign body entered employee s l eye while gr...,0
1,drainage pipe damaged at twenty-two m depth se...,0
2,robodrill spider excavator being operated when...,1
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


In [16]:
# Drop the stand-alone "s" tokens that remain once apostrophes are stripped
# from possessives (e.g. "employee s l eye" -> "employee l eye").
df['text'] = [re.sub(' s ', ' ', prepared) for prepared in df['text']]
df.head()

Unnamed: 0,text,category
0,foreign body entered employee l eye while grin...,0
1,drainage pipe damaged at twenty-two m depth se...,0
2,robodrill spider excavator being operated when...,1
3,pressure hose made contact with light fitting ...,0
4,nacap 30t hitachi hyd hose split resultong in ...,0


In [None]:
# Save the prepared dataset
# The file name is prefixed with a yymmddHHMM timestamp, so each run writes a
# new file instead of overwriting the previous one (unless re-run within the
# same minute).
filename = f'data/prepared/{time.strftime("%y%m%d%H%M")}_prepared_data.csv'
df.to_csv(filename, index=False)

### Fabricated Data Prep

In [17]:
# Load fabricated reports
# NOTE(review): the path hard-codes the category name instead of using
# focus_cat - keep the two in sync if the focus category changes.
df_fab = pd.read_csv('data/fabricated/hydraulic fluid or oil leak_7920.csv')
df_fab

Unnamed: 0,text,group
0,there was a leak of oil when hydraulic fluid e...,hydraulic fluid or oil leak
1,there was a leak from a hose on a pump truck w...,hydraulic fluid or oil leak
2,there was a leak to grade one hundred and fift...,hydraulic fluid or oil leak
3,there was a leak of hydraulic fluid from a ren...,hydraulic fluid or oil leak
4,there was a leak from a jlg cat three thousand...,hydraulic fluid or oil leak
...,...,...
7915,oil fluid witnessed falling object resulting i...,hydraulic fluid or oil leak
7916,oil fluid witnessed on three out of four an em...,hydraulic fluid or oil leak
7917,oil fluid witnessed nearmisses at separation t...,hydraulic fluid or oil leak
7918,oil fluid witnessed a hydraulic leak from a na...,hydraulic fluid or oil leak


In [18]:
# Run the fabricated reports through the same preparation pipeline as the real data.
# NOTE(review): unlike the main dataset, no ' s ' clean-up is applied
# afterwards - confirm that this is intentional.
df_fab['text'] = [text_prepare(x) for x in tqdm(df_fab['text'])]

100%|██████████| 7920/7920 [00:09<00:00, 846.80it/s]


In [21]:
# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
# NOTE(review): no random_state is passed, so the order presumably depends on
# the global random state at the time the cell runs - confirm reproducibility.
df_fab = df_fab.sample(frac=1).reset_index(drop=True)
df_fab

Unnamed: 0,text,group
0,crew noticed drip coming from hose fitting on ...,hydraulic fluid or oil leak
1,diesel failure on skid steer resulted in 1l of...,hydraulic fluid or oil leak
2,operative witnessed discharge of hydraulic flu...,hydraulic fluid or oil leak
3,subcontractor witnessed loss of hydraulic flui...,hydraulic fluid or oil leak
4,operative discovered leak in hose on roller dr...,hydraulic fluid or oil leak
...,...,...
7915,there was a flow of oil from the gully to the ...,hydraulic fluid or oil leak
7916,hyd fluid failure of excavator hydraulic fitti...,hydraulic fluid or oil leak
7917,apprentice noticed spillage of hydraulic fluid...,hydraulic fluid or oil leak
7918,apprentice witnessed drip from a bucket lift i...,hydraulic fluid or oil leak


In [23]:
# Save the prepared fabricated dataset
# Timestamped like the main dataset; the focus category is embedded in the
# file name (spaces included).
filename = f'data/prepared/{time.strftime("%y%m%d%H%M")}_prepared_fabricated_{focus_cat}_data.csv'
df_fab.to_csv(filename, index=False)

### Training and Test Split

In [None]:
# Now create the input to the model training stage
# train_test_split is imported here because no earlier cell brings it into
# scope; without this import the split below raises NameError.
from sklearn.model_selection import train_test_split

X = df.text
y = df.category # Remember, these are not necessarily the 'truth' but rule_book hits

# Apply a simple 80/20 split
# random_state is fixed so that the same rows land in train/test on every run.
# TODO - The dataset is heavily imbalanced. Treat this appropriately.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [None]:
# Collect, in ascending order, the index labels of the positive (category == 1)
# rows in each split and print how many there are.
test_trues = sorted(y_test[y_test == 1].index.values)
print('Test:', len(test_trues), test_trues)

train_trues = sorted(y_train[y_train == 1].index.values)
print('Train:', len(train_trues), train_trues)

In [None]:
# Look at training 'trues', i.e., rule-book hits
# train_trues holds index *labels*, so rows are selected with the label-based
# .loc accessor. Positional .iloc only matched while df kept its default
# 0..n-1 index and would silently pick the wrong rows after any filtering.
df_train_trues = df.loc[train_trues]
# Timestamped file name; the index is written as well so that each row can be
# traced back to its position in df.
fn_trues = f'data/prepared/{time.strftime("%y%m%d%H%M")}_df_train_trues_{focus_cat}.csv'
df_train_trues.to_csv(fn_trues)
df_train_trues