In [99]:
import numpy as np
import pandas as pd
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer as stemmer

from tqdm.notebook import tqdm

In [100]:
incidents_fn = "data/20220413_D1_Incidents.csv"
rule_book_fn = "data/rule_book.csv"

In [3]:
rules = pd.read_csv(rule_book_fn)
incidents = pd.read_csv(incidents_fn, dtype=str)

In [4]:
incidents.head(3)

Unnamed: 0,tblID,IRN,IncidentNumber,IncidentDate,TimeofIncident,BusinessUnit,BusinessGroup,ServiceLine,Project,Office,...,JFCAction,OHdefinition,MTCcount,RWCcount,LTIcount,FAcount,TransferTimeStamp,SourceFileName,FullDescription,ImmediateAction
0,175,39878,20200472,26/02/2020,14:15,Projects,Renewable Energy & Power,Power & Industrials EPC,YCI M1 Project,YCI M1 Project,...,,,,,,1.0,,CAIRS_Incidents.csv,Employee was grinding with a four-inch grinder...,Employee was transported to on-site medical an...
1,176,39877,20200471,26/02/2020,16:15,Consulting,Energy Optimisation & Innovation,EMEA,E&II Sustainable Infrastructure,E&II Sustainable Infrastructure,...,,,,,,,,CAIRS_Incidents.csv,See supporting information,"I stopped the drillers from installing, phoned..."
2,177,39876,20200470,27/02/2020,10:15,Operations,Asia Pacific,APAC East,Esso PNG LNG,Hides,...,,,,,,,,CAIRS_Incidents.csv,At approximately 10:15 on the 27th of Feb a hy...,"All contaminated materials, absorbent pads, we..."


In [5]:
incidents.rename(columns={'IncidentNumber': 'incident_id'}, inplace=True)
incidents['text'] = (
        incidents['ShortDescription'].astype(str).fillna('') + ' ' + 
        incidents['FullDescription'].astype(str).fillna('') + ' ' + 
        incidents['ImmediateAction'].astype(str).fillna('')
).str.lower()

incidents = incidents[['incident_id', 'text']]

In [6]:
incidents.text[0]

"foreign body entered employee's (l) eye while grinding. employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye. employee was wearing all required ppe. investigation is pending upon employee’s return from offsite medical evaluation. it will be noted that the site has been dealing with strong winds throughout the day. foreign body was removed using first aid measures. employee was transported to on-site medical and then transferred to an off-site eye specialist for further treatment. foreign body was removed using first aid measures. employee was released to return to work without restrictions."

In [101]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

test = incidents.text.head(1).apply(lemmatize_text)
print(list(test))

[['foreign', 'body', 'entered', "employee's", '(l)', 'eye', 'while', 'grinding.', 'employee', 'wa', 'grinding', 'with', 'a', 'four-inch', 'grinder', 'on', 'an', 'iron', 'support', 'in', 'the', 'compressor', 'building', 'when', 'he', 'felt', 'discomfort', 'to', 'his', 'left', 'eye.', 'employee', 'wa', 'wearing', 'all', 'required', 'ppe.', 'investigation', 'is', 'pending', 'upon', 'employee’s', 'return', 'from', 'offsite', 'medical', 'evaluation.', 'it', 'will', 'be', 'noted', 'that', 'the', 'site', 'ha', 'been', 'dealing', 'with', 'strong', 'wind', 'throughout', 'the', 'day.', 'foreign', 'body', 'wa', 'removed', 'using', 'first', 'aid', 'measures.', 'employee', 'wa', 'transported', 'to', 'on-site', 'medical', 'and', 'then', 'transferred', 'to', 'an', 'off-site', 'eye', 'specialist', 'for', 'further', 'treatment.', 'foreign', 'body', 'wa', 'removed', 'using', 'first', 'aid', 'measures.', 'employee', 'wa', 'released', 'to', 'return', 'to', 'work', 'without', 'restrictions.']]


In [329]:
lem.lemmatize('employee’s')

'employee’s'

In [11]:
stem = stemmer()
def stem_text(text):
    return [stem.stem(w) for w in w_tokenizer.tokenize(text)]
   
test = incidents.text.head(1).apply(stem_text)
print(list(test))

[['foreign', 'bodi', 'enter', "employee'", '(l)', 'eye', 'while', 'grinding.', 'employe', 'wa', 'grind', 'with', 'a', 'four-inch', 'grinder', 'on', 'an', 'iron', 'support', 'in', 'the', 'compressor', 'build', 'when', 'he', 'felt', 'discomfort', 'to', 'hi', 'left', 'eye.', 'employe', 'wa', 'wear', 'all', 'requir', 'ppe.', 'investig', 'is', 'pend', 'upon', 'employee’', 'return', 'from', 'offsit', 'medic', 'evaluation.', 'it', 'will', 'be', 'note', 'that', 'the', 'site', 'ha', 'been', 'deal', 'with', 'strong', 'wind', 'throughout', 'the', 'day.', 'foreign', 'bodi', 'wa', 'remov', 'use', 'first', 'aid', 'measures.', 'employe', 'wa', 'transport', 'to', 'on-sit', 'medic', 'and', 'then', 'transfer', 'to', 'an', 'off-sit', 'eye', 'specialist', 'for', 'further', 'treatment.', 'foreign', 'bodi', 'wa', 'remov', 'use', 'first', 'aid', 'measures.', 'employe', 'wa', 'releas', 'to', 'return', 'to', 'work', 'without', 'restrictions.']]


In [13]:
def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok!='-']
    return tokens

In [14]:
test_tokens = tokenize(incidents.text[0])
test_tokens

['foreign',
 'body',
 'entered',
 'employee',
 "'s",
 '(',
 'l',
 ')',
 'eye',
 'while',
 'grinding',
 '.',
 'employee',
 'was',
 'grinding',
 'with',
 'a',
 'four-inch',
 'grinder',
 'on',
 'an',
 'iron',
 'support',
 'in',
 'the',
 'compressor',
 'building',
 'when',
 'he',
 'felt',
 'discomfort',
 'to',
 'his',
 'left',
 'eye',
 '.',
 'employee',
 'was',
 'wearing',
 'all',
 'required',
 'ppe',
 '.',
 'investigation',
 'is',
 'pending',
 'upon',
 'employee',
 '’',
 's',
 'return',
 'from',
 'offsite',
 'medical',
 'evaluation',
 '.',
 'it',
 'will',
 'be',
 'noted',
 'that',
 'the',
 'site',
 'has',
 'been',
 'dealing',
 'with',
 'strong',
 'winds',
 'throughout',
 'the',
 'day',
 '.',
 'foreign',
 'body',
 'was',
 'removed',
 'using',
 'first',
 'aid',
 'measures',
 '.',
 'employee',
 'was',
 'transported',
 'to',
 'on-site',
 'medical',
 'and',
 'then',
 'transferred',
 'to',
 'an',
 'off-site',
 'eye',
 'specialist',
 'for',
 'further',
 'treatment',
 '.',
 'foreign',
 'body',
 '

In [15]:
def translate_to_regex(rule_part):
    """
    converts rule_book rules to regex format;
    drops trailing separators in process
    """
    if isinstance(rule_part, str):
        rule_part = re.sub(r"\s{2,}", " ", rule_part)
        rule_part = re.sub(r'\\\\b', r'\\b', rule_part)
        
        return r'(\b' + r'\b)|(\b'.join([s for s in rule_part.split('_ ') if s]) + r'\b)'
    else:
        return ''

In [16]:
rules['keyword'] = rules['keyword'].apply(lambda x: x.replace('*', '.*')+r'\b')
rules['rules_pre'] = rules['rules_pre'].apply(translate_to_regex)
rules['rules_post'] = rules['rules_post'].apply(translate_to_regex)
rules['rules_all'] = rules['rules_all'].apply(translate_to_regex)

rules['voids'] = rules['voids'].apply(translate_to_regex)
print('rules translated')

rules translated


In [17]:
rules

Unnamed: 0,group,keyword,rules_pre,rules_post,rules_all,voids
0,eye injury or irritation,eye.*\b,(\bsomething.*in\b)|(\bswell.*\b)|(\bswollen\b...,,(\birritation\b)|(\bforeign body\b)|(\bforeign...,
1,site compliance or practice issue,permit\b,(\bwithout.*work\b)|(\bnot captured\b),(\bnot in place\b)|(\bwere not captured\b),(\bduring.* audit\b)|(\bfailed.* section\b)|(\...,


In [18]:
def check_presence(pattern, string):
    if pattern:
        return bool(re.search(pattern, string))
    else:
        return False

In [19]:
def find_pattern(tokens, keyword, check_pre, check_post, check_all, check_void, window):
    """
    for a list of tokens finds specified keyword and returns True
    if the neighbourhood of this keyword satisfies pre-, post- or all- context rules
    and doesn't contain anything forbidden
    :param tokens: list of tokens
    :param keyword: pattern which a token should match
    :param check_pre: pattern which several previous tokens (concatenated) should match
    :param check_post: pattern which several subsequent tokens (concatenated) should match
    :param check_all: pattern which previous tokens + keyword + subsequent tokens should match
    :param check_void: pattern which previous tokens + keyword + subsequent tokens should NOT match
    :param window: N of pre and post tokens to consider
    :return: True/False - whether at least one matching part was found
    """
    # Extract contexts of keyword (if any found)
    # Four parts: [(pre), (keyword), (post), (all)]
    matches = [(
        ' '.join(tokens[max(0, i - window - 1): i]), tokens[i],
        ' '.join(tokens[i + 1:i + window + 1]),
        ' '.join(tokens[max(0, i - window - 1):i + window + 1])
        ) for i, tok in enumerate(tokens) if re.match(keyword, tok)]
    
    # do tests
    final_match = any([
        (check_presence(check_pre, pre) or check_presence(check_all, all_) or check_presence(check_post, post))
        and not check_presence(check_void, all_) for (pre, kw, post, all_) in matches
    ])
    return final_match

In [20]:
tokens = test_tokens
keyword = rules.keyword[0]
check_pre = rules.rules_pre[0]
check_post = rules.rules_post[0]
check_all = rules.rules_all[0]
check_void = rules.voids[0]
window = 5

In [23]:
matches = [(
        ' '.join(tokens[max(0, i - window - 1): i]), # pre-context
            tokens[i],                               # keyword
        ' '.join(tokens[i + 1:i + window + 1]),      # post-context
        ' '.join(tokens[max(0, i - window - 1):i + window + 1]) # All context
        ) for i, tok in enumerate(tokens) if re.match(keyword, tok)]

In [24]:
matches

[("entered employee 's ( l )",
  'eye',
  'while grinding . employee was',
  "entered employee 's ( l ) eye while grinding . employee was"),
 ('he felt discomfort to his left',
  'eye',
  '. employee was wearing all',
  'he felt discomfort to his left eye . employee was wearing all'),
 ('and then transferred to an off-site',
  'eye',
  'specialist for further treatment .',
  'and then transferred to an off-site eye specialist for further treatment .')]

In [111]:
find_pattern(
    tokens = test_tokens, 
    keyword = rules.keyword[0], 
    check_pre = rules.rules_pre[0], 
    check_post = rules.rules_post[0], 
    check_all = rules.rules_all[0], 
    check_void = rules.voids[0], 
    window = 5
)

True

In [119]:
[(check_presence(check_pre, pre) or check_presence(check_all, all_) or check_presence(check_post, post))
        and not check_presence(check_void, all_) for (pre, kw, post, all_) in matches]

[False, True, False]

In [125]:
for (pre, kw, post, all_) in matches:
    #print(pre)
    #print(kw)
    #print(post)
    print(all_)

entered employee 's ( l ) eye while grinding . employee was
he felt discomfort to his left eye . employee was wearing all
and then transferred to an off-site eye specialist for further treatment .


In [129]:
check_all

'(\\birritation\\b)|(\\bforeign body\\b)|(\\bforeign object\\b)|(\\birritation (.*) eye\\b)|(\\bpain in (.*) eye\\b)|(\\bfell (.*) eye\\b)|(\\bfall (.*) eye\\b)|(\\bdebris (.*) eye\\b)|(\\bpain to\\b)'

In [28]:
# Sentence tokenizer
print(sent_tokenize(incidents.text[0]))

["foreign body entered employee's (l) eye while grinding.", 'employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.', 'employee was wearing all required ppe.', 'investigation is pending upon employee’s return from offsite medical evaluation.', 'it will be noted that the site has been dealing with strong winds throughout the day.', 'foreign body was removed using first aid measures.', 'employee was transported to on-site medical and then transferred to an off-site eye specialist for further treatment.', 'foreign body was removed using first aid measures.', 'employee was released to return to work without restrictions.']


In [107]:
syn_dict = {
    'worker_syns':['employee', 'ip', 'worker', 'man', 'woman', 'he', 'she', 'his', 'her'],
    'pain_syns':['pain', 'discomfort', 'soreness', 'hurt', 'something in', 
             'swelling', 'swollen', 'dust', 'particle', 'irritation', 'foreign', 'fb'],
    'eye_syns':['eye', 'pupil', 'iris']
}

In [43]:
chk_text = 'employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.'
pattern = '(\\bemployee (.*) discomfort (.*) eye\\b)'
check_presence(pattern, chk_text)

True

In [44]:
sents = sent_tokenize(incidents.text[0])
for sent in sents:
    if check_presence(pattern, sent):
        print(f'{check_presence(pattern, sent)}: {sent}')

True: employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.


In [48]:
chk_text = 'the ip felt extreme soreness in his eye'
finds_list = []
finds_pats = []
incid_nums = []
for worker_syn in syn_dict.get('worker_syns'):
    for pain_syn in syn_dict.get('pain_syns'): 
        for eye_syn in syn_dict.get('eye_syns'):   
            pattern = f'(\\b{worker_syn} (.*) {pain_syn} (.*) {eye_syn}\\b)'
            check = check_presence(pattern, chk_text)
            if check: 
                print(f'{check}: {pattern}')
                finds_list.append(check)
                finds_pats.append(pattern)
                #incid_nums.append(irn)

True: (\bip (.*) soreness (.*) eye\b)


In [108]:
pos_1st = syn_dict.get('worker_syns')
pos_2nd = syn_dict.get('pain_syns')
pos_3rd = syn_dict.get('eye_syns')

# Connections between synonyms 1-2, 2-3
connect = ['(.*)', '(.*)']

# Shuffle rules
srs = ([2, 1, 4, 3, 0], [])

finds_list = []
finds_pats = []
incid_nums = []

for row in tqdm(range(len(incidents))):
    par_text = incidents.text[row].lower()
    sen_toks = sent_tokenize(par_text)    
    irn = incidents.incident_id[row]
    for chk_text in sen_toks:
        for first_syn in pos_1st:
            locals()["first_syn"] = first_syn
            for second_syn in pos_2nd: 
                for third_syn in pos_3rd:  
                    x = f'{first_syn} {connect[0]} {second_syn} {connect[1]} {third_syn}'
                    pattern = f'(\\b{x}\\b)'
                    check = check_presence(pattern, chk_text)
                    if check: 
                        print(f'{check}: {pattern}')
                        finds_list.append(check)
                        finds_pats.append(pattern)
                        incid_nums.append(irn)
                    else:
                        for sr in srs:
                            if len(sr) == 0: continue
                            a = x.split()
                            # Adjust the word sequence using shuffle rule
                            pattern = f'(\\b{a[sr[0]]} {a[sr[1]]} {a[sr[2]]} {a[sr[3]]} {a[sr[4]]}\\b)'
                            rev_check = check_presence(pattern, chk_text)
                            if rev_check: 
                                print(f'{rev_check}: {pattern}')
                                finds_list.append(check)
                                finds_pats.append(pattern)
                                incid_nums.append(irn)
                    

  0%|          | 0/27158 [00:00<?, ?it/s]

True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bhe (.*) discomfort (.*) eye\b)
True: (\bhe (.*) foreign (.*) eye\b)
True: (\bhis (.*) foreign (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bhis (.*) discomfort (.*) eye\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bhis (.*) discomfort (.*) eye\b)
True: (\bforeign (.*) eye (.*) his\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bforeign (.*) eye (.*) he\b)
True: (\bhe (.*) particle (.*) eye\b)
True: (\bhis (.*) particle (.*) eye\b)
True: (\bworker (.*) pain (.*) eye\b)
True: (\bhe (.*) pain (.*) eye\b)
True: (\bworker (.*) irritation (.*) eye\b)
True: (\bhe (.*) irritation (.*) eye\b)
True: (\bworker (.*) pain (.*) eye\b)
True: (\bhe (.*) pain (.*) eye\b)
True: (\bhe (.*) pain (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bempl

KeyboardInterrupt: 

In [55]:
len(list(dict.fromkeys(incid_nums)))

390

In [106]:
# Inspect findings
treat = False
for inc in list(dict.fromkeys(incid_nums)):
    par_text = incidents.text[incidents.incident_id == inc].values[0]    
    sen_toks = sent_tokenize(par_text) 
    print(inc)
    for sen in sen_toks: 
        if treat:
            tokenized_words = word_tokenize(sen)
            tokenized_sentence = []
            for word in tokenized_words:
                tokenized_sentence.append(lemmatizer.lemmatize(word))
                #tokenized_sentence.append(stem.stem(word))
            tokenized_sentence = " ".join(tokenized_sentence)                
            print(tokenized_sentence)
        else:
            print(sen)    
    print('\n')    

20200472
foreign body entered employee's (l) eye while grinding.
employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.
employee was wearing all required ppe.
investigation is pending upon employee’s return from offsite medical evaluation.
it will be noted that the site has been dealing with strong winds throughout the day.
foreign body was removed using first aid measures.
employee was transported to on-site medical and then transferred to an off-site eye specialist for further treatment.
foreign body was removed using first aid measures.
employee was released to return to work without restrictions.


20200412
foreign body removed from employees (r) eye after grinding operation.
employee was using a grinder with a wire wheel attachment to clean and buff recent welds.
upon completion the employee removed his face shield and immediately felt discomfort to his right eye.
employee reported incident to supervisi

20163208
a foreign object was blown into the right eye of an employee while looking at sight glass level.
this incident upgraded to restricted work case 4/6/16, he was released with restrictions.
next follow up 4/7/16.
migrated from legacy cairs incident #30132an employee was trying to see the level in a sight glass on the heater treater.
his safety glasses were tinted yellow and he felt by lowering them he could see the level in the sight glass better.
he lowered his safety glasses and a gust of wind blew a foreign object into his right eye.
he tried to get it out by rubbing and then flushing.
the medic on the facility removed a small object.this incident upgraded to restricted work case 4/6/16, he was released with restrictions.
next follow up 4/7/16.
migrated from legacy cairs - see detailed description


20163162
scratch to eye whilst cutting penetrations in hvac ducting using a shark saw utility leg shaft at the 101m level.
migrated from legacy cairs incident #30086work party were

20170987
employee felt foreign body enter right eye.
employee was in the buggy driving to the parking lot at the end of the shift.
as the buggy turned left, employee felt something enter his right eye.
employee was wearing safety glasses properly at the time of incident.
employee was immediately seen by the site nurse where a flush was performed.
employee felt minor relief, no foreign body was found during the flush.


20170888
foreign body in left eye.
while the worker was grinding the pipe supports, he noticed an irritation in his left eye.he directly stopped the work and went to the clinic.
visit to contractor clinic.


20170875
suspected foreign body entered ip's right eye at 14:30 whilst returning to the worksite, the ip was passing under a structural beam, underdeck of dp level 1. the ip felt a foreign body enter his right eye.
the ip immediately used an eye wash to flush the eye out.
he then attended the medic.
the medic indicated no foreign body to be found.
no medical treatmen

ip was wearing standard safety glasses at the time of the event.


20160930
ee got a foreign body in her eye while filing the top of posts.
employee reported to safety on 5-9-16 that she had gotten a foreign body in her eye on saturday (5-7-2016) while she was filing the top of posts.
she stated that she successfully removed the foreign body immediately after it entered her eye.
she also stated she felt no discomfort on sunday, but felt slight irritation on 5-9-16                                     visually inspected employees eye and saw no sign of irritation, weeping, or a foreign body.
flushed the employees left eye as a precaution.
instructed the employee to return to safety should she feel any additional discomfort, and given safety contact information if she needs after hours assistance.


20160834
employee was cutting off a pile using a metabo when he felt something get in his eye.
employee was cutting a pile to install a splice plate when he felt something in his left eye empl

In [93]:
stem.stem('Be particles in hands')

'be particles in hand'

In [110]:
org_string = "This is employee's hand"
pattern = r"'s"

# Replace all occurrences of character 's with an empty string
org_string = re.sub(pattern, '', org_string )
print(org_string)

This is employee hand
