In [266]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
import numpy as np

In [93]:
incidents_fn = "data/20220413_D1_Incidents.csv"
rule_book_fn = "data/rule_book.csv"

In [94]:
rules = pd.read_csv(rule_book_fn)
incidents = pd.read_csv(incidents_fn, dtype=str)

In [95]:
incidents.head(3)

Unnamed: 0,tblID,IRN,IncidentNumber,IncidentDate,TimeofIncident,BusinessUnit,BusinessGroup,ServiceLine,Project,Office,...,JFCAction,OHdefinition,MTCcount,RWCcount,LTIcount,FAcount,TransferTimeStamp,SourceFileName,FullDescription,ImmediateAction
0,175,39878,20200472,26/02/2020,14:15,Projects,Renewable Energy & Power,Power & Industrials EPC,YCI M1 Project,YCI M1 Project,...,,,,,,1.0,,CAIRS_Incidents.csv,Employee was grinding with a four-inch grinder...,Employee was transported to on-site medical an...
1,176,39877,20200471,26/02/2020,16:15,Consulting,Energy Optimisation & Innovation,EMEA,E&II Sustainable Infrastructure,E&II Sustainable Infrastructure,...,,,,,,,,CAIRS_Incidents.csv,See supporting information,"I stopped the drillers from installing, phoned..."
2,177,39876,20200470,27/02/2020,10:15,Operations,Asia Pacific,APAC East,Esso PNG LNG,Hides,...,,,,,,,,CAIRS_Incidents.csv,At approximately 10:15 on the 27th of Feb a hy...,"All contaminated materials, absorbent pads, we..."


In [96]:
incidents.rename(columns={'IncidentNumber': 'incident_id'}, inplace=True)
incidents['text'] = (
        incidents['ShortDescription'].astype(str).fillna('') + ' ' + 
        incidents['FullDescription'].astype(str).fillna('') + ' ' + 
        incidents['ImmediateAction'].astype(str).fillna('')
).str.lower()

incidents = incidents[['incident_id', 'text']]

In [315]:
incidents.text[0]

"foreign body entered employee's (l) eye while grinding. employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye. employee was wearing all required ppe. investigation is pending upon employee’s return from offsite medical evaluation. it will be noted that the site has been dealing with strong winds throughout the day. foreign body was removed using first aid measures. employee was transported to on-site medical and then transferred to an off-site eye specialist for further treatment. foreign body was removed using first aid measures. employee was released to return to work without restrictions."

In [326]:
import nltk
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

test = incidents.text.head(1).apply(lemmatize_text)
print(list(test))

[['foreign', 'body', 'entered', "employee's", '(l)', 'eye', 'while', 'grinding.', 'employee', 'wa', 'grinding', 'with', 'a', 'four-inch', 'grinder', 'on', 'an', 'iron', 'support', 'in', 'the', 'compressor', 'building', 'when', 'he', 'felt', 'discomfort', 'to', 'his', 'left', 'eye.', 'employee', 'wa', 'wearing', 'all', 'required', 'ppe.', 'investigation', 'is', 'pending', 'upon', 'employee’s', 'return', 'from', 'offsite', 'medical', 'evaluation.', 'it', 'will', 'be', 'noted', 'that', 'the', 'site', 'ha', 'been', 'dealing', 'with', 'strong', 'wind', 'throughout', 'the', 'day.', 'foreign', 'body', 'wa', 'removed', 'using', 'first', 'aid', 'measures.', 'employee', 'wa', 'transported', 'to', 'on-site', 'medical', 'and', 'then', 'transferred', 'to', 'an', 'off-site', 'eye', 'specialist', 'for', 'further', 'treatment.', 'foreign', 'body', 'wa', 'removed', 'using', 'first', 'aid', 'measures.', 'employee', 'wa', 'released', 'to', 'return', 'to', 'work', 'without', 'restrictions.']]


In [329]:
lem.lemmatize('employee’s')

'employee’s'

In [331]:
# import these modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
stem = PorterStemmer()
def stem_text(text):
    return [stem.stem(w) for w in w_tokenizer.tokenize(text)]
   
test = incidents.text.head(1).apply(stem_text)
print(list(test))

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm
[['foreign', 'bodi', 'enter', "employee'", '(l)', 'eye', 'while', 'grinding.', 'employe', 'wa', 'grind', 'with', 'a', 'four-inch', 'grinder', 'on', 'an', 'iron', 'support', 'in', 'the', 'compressor', 'build', 'when', 'he', 'felt', 'discomfort', 'to', 'hi', 'left', 'eye.', 'employe', 'wa', 'wear', 'all', 'requir', 'ppe.', 'investig', 'is', 'pend', 'upon', 'employee’', 'return', 'from', 'offsit', 'medic', 'evaluation.', 'it', 'will', 'be', 'note', 'that', 'the', 'site', 'ha', 'been', 'deal', 'with', 'strong', 'wind', 'throughout', 'the', 'day.', 'foreign', 'bodi', 'wa', 'remov', 'use', 'first', 'aid', 'measures.', 'employe', 'wa', 'transport', 'to', 'on-sit', 'medic', 'and', 'then', 'transfer', 'to', 'an', 'off-sit', 'eye', 'specialist', 'for', 'further', 'treatment.', 'foreign', 'bodi', 'wa', 'remov', 'use', 'first', 'aid', 'measures.', 'employe', 'wa', 'releas', 'to', 'retu

In [98]:
def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok!='-']
    return tokens

In [99]:
test_tokens = tokenize(incidents.text[0])

In [100]:
def translate_to_regex(rule_part):
    """
    converts rule_book rules to regex format;
    drops trailing separators in process
    """
    if isinstance(rule_part, str):
        rule_part = re.sub(r"\s{2,}", " ", rule_part)
        rule_part = re.sub(r'\\\\b', r'\\b', rule_part)
        
        return r'(\b' + r'\b)|(\b'.join([s for s in rule_part.split('_ ') if s]) + r'\b)'
    else:
        return ''

In [101]:
rules['keyword'] = rules['keyword'].apply(lambda x: x.replace('*', '.*')+r'\b')
rules['rules_pre'] = rules['rules_pre'].apply(translate_to_regex)
rules['rules_post'] = rules['rules_post'].apply(translate_to_regex)
rules['rules_all'] = rules['rules_all'].apply(translate_to_regex)

rules['voids'] = rules['voids'].apply(translate_to_regex)
print('rules translated')

rules translated


In [102]:
rules

Unnamed: 0,group,keyword,rules_pre,rules_post,rules_all,voids
0,eye injury or irritation,eye.*\b,(\bsomething.*in\b)|(\bswell.*\b)|(\bswollen\b...,,(\birritation\b)|(\bforeign body\b)|(\bforeign...,
1,site compliance or practice issue,permit\b,(\bwithout.*work\b)|(\bnot captured\b),(\bnot in place\b)|(\bwere not captured\b),(\bduring.* audit\b)|(\bfailed.* section\b)|(\...,


In [105]:
def check_presence(pattern, string):
    if pattern:
        return bool(re.search(pattern, string))
    else:
        return False

In [109]:
def find_pattern(tokens, keyword, check_pre, check_post, check_all, check_void, window):
    """
    for a list of tokens finds specified keyword and returns True
    if the neighbourhood of this keyword satisfies pre-, post- or all- context rules
    and doesn't contain anything forbidden
    :param tokens: list of tokens
    :param keyword: pattern which a token should match
    :param check_pre: pattern which several previous tokens (concatenated) should match
    :param check_post: pattern which several subsequent tokens (concatenated) should match
    :param check_all: pattern which previous tokens + keyword + subsequent tokens should match
    :param check_void: pattern which previous tokens + keyword + subsequent tokens should NOT match
    :param window: N of pre and post tokens to consider
    :return: True/False - whether at least one matching part was found
    """
    # Extract contexts of keyword (if any found)
    # Four parts: [(pre), (keyword), (post), (all)]
    matches = [(
        ' '.join(tokens[max(0, i - window - 1): i]), tokens[i],
        ' '.join(tokens[i + 1:i + window + 1]),
        ' '.join(tokens[max(0, i - window - 1):i + window + 1])
        ) for i, tok in enumerate(tokens) if re.match(keyword, tok)]
    
    # do tests
    final_match = any([
        (check_presence(check_pre, pre) or check_presence(check_all, all_) or check_presence(check_post, post))
        and not check_presence(check_void, all_) for (pre, kw, post, all_) in matches
    ])
    return final_match

In [115]:
tokens = test_tokens
keyword = rules.keyword[0]
check_pre = rules.rules_pre[0]
check_post = rules.rules_post[0]
check_all = rules.rules_all[0]
check_void = rules.voids[0]
window = 5

In [116]:
matches = [(
        ' '.join(tokens[max(0, i - window - 1): i]), tokens[i],
        ' '.join(tokens[i + 1:i + window + 1]),
        ' '.join(tokens[max(0, i - window - 1):i + window + 1])
        ) for i, tok in enumerate(tokens) if re.match(keyword, tok)]

In [117]:
matches

[("entered employee 's ( l )",
  'eye',
  'while grinding . employee was',
  "entered employee 's ( l ) eye while grinding . employee was"),
 ('he felt discomfort to his left',
  'eye',
  '. employee was wearing all',
  'he felt discomfort to his left eye . employee was wearing all'),
 ('and then transferred to an off-site',
  'eye',
  'specialist for further treatment .',
  'and then transferred to an off-site eye specialist for further treatment .')]

In [111]:
find_pattern(
    tokens = test_tokens, 
    keyword = rules.keyword[0], 
    check_pre = rules.rules_pre[0], 
    check_post = rules.rules_post[0], 
    check_all = rules.rules_all[0], 
    check_void = rules.voids[0], 
    window = 5
)

True

In [119]:
[(check_presence(check_pre, pre) or check_presence(check_all, all_) or check_presence(check_post, post))
        and not check_presence(check_void, all_) for (pre, kw, post, all_) in matches]

[False, True, False]

In [125]:
for (pre, kw, post, all_) in matches:
    #print(pre)
    #print(kw)
    #print(post)
    print(all_)

entered employee 's ( l ) eye while grinding . employee was
he felt discomfort to his left eye . employee was wearing all
and then transferred to an off-site eye specialist for further treatment .


In [129]:
check_all

'(\\birritation\\b)|(\\bforeign body\\b)|(\\bforeign object\\b)|(\\birritation (.*) eye\\b)|(\\bpain in (.*) eye\\b)|(\\bfell (.*) eye\\b)|(\\bfall (.*) eye\\b)|(\\bdebris (.*) eye\\b)|(\\bpain to\\b)'

In [283]:
syn_dict = {
    'worker_syns':['employee', 'ip', 'worker', 'man', 'woman'],
    'pain_syns':['pain', 'discomfort', 'soreness', 'hurt', 'something in', 
             'swelling', 'swollen', 'dust', 'particle', 'irritation', 'foreign'],
    'eye_syns':['eye', 'pupil', 'iris']
}

In [206]:
chk_text = 'the employee felt discomfort in his eye'
pattern = '(\\bemployee (.*) discomfort in (.*) eye\\b)'
check_presence(pattern, chk_text)

True

In [261]:
chk_text = 'the ip felt extreme soreness in his eye'
finds_list = []
finds_pats = []
incid_nums = []
for worker_syn in worker_syns:
    for pain_syn in pain_syns: 
        for eye_syn in eye_syns:   
            pattern = f'(\\b{worker_syn} (.*) {pain_syn} (.*) {eye_syn}\\b)'
            check = check_presence(pattern, chk_text)
            if check: 
                print(f'{check}: {pattern}')
                finds_list.append(check)
                finds_pats.append(pattern)
                incid_nums.append(irn)

True: (\bip (.*) soreness (.*) eye\b)


In [284]:
syn_dict.get('worker_syns')

['employee', 'ip', 'worker', 'man', 'woman']

In [292]:
srs = ([2, 1, 4, 3, 0], [])

In [293]:
srs[1]

[]

In [332]:
pos_1st = syn_dict.get('worker_syns')
pos_2nd = syn_dict.get('pain_syns')
pos_3rd = syn_dict.get('eye_syns')

# Connections between synonyms 1-2, 2-3
connect = ['(.*)', '(.*)']

# Shuffle rules
srs = ([2, 1, 4, 3, 0], [])

finds_list = []
finds_pats = []
incid_nums = []

for row in tqdm(range(len(incidents))):
    chk_text = incidents.text[row].lower()
    lem_text = lem.lemmatize(chk_text)
    irn = incidents.incident_id[row]
    for first_syn in pos_1st:
        locals()["first_syn"] = first_syn
        for second_syn in pos_2nd: 
            for third_syn in pos_3rd:  
                x = f'{first_syn} {connect[0]} {second_syn} {connect[1]} {third_syn}'
                pattern = f'(\\b{x}\\b)'
                check = check_presence(pattern, chk_text)
                if check: 
                    print(f'{check}: {pattern}')
                    finds_list.append(check)
                    finds_pats.append(pattern)
                    incid_nums.append(irn)
                else:
                    for sr in srs:
                        if len(sr) == 0: continue
                        a = x.split()
                        # Adjust the word sequence using shuffle rule
                        pattern = f'(\\b{a[sr[0]]} {a[sr[1]]} {a[sr[2]]} {a[sr[3]]} {a[sr[4]]}\\b)'
                        rev_check = check_presence(pattern, chk_text)
                        if rev_check: 
                            print(f'{rev_check}: {pattern}')
                            finds_list.append(check)
                            finds_pats.append(pattern)
                            incid_nums.append(irn)
                    

  0%|          | 0/27158 [00:00<?, ?it/s]

True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bforeign (.*) eye (.*) employee\b)
True: (\bdiscomfort (.*) eye (.*) ip\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bforeign (.*) eye (.*) employee\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) particle (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bpain (.*) eye (.*) ip\b)
True: (\birritation (.*) eye (.*) ip\b)
True: (\bworker (.*) pain (.*) eye\b)
True: (\bworker (.*) irritation (.*) eye\b)
True: (\bpain (.*) eye (.*) ip\b)
True: (\bworker (.*) pain (.*) eye\b)
True: (\bpain (.*) eye (.*) man\b)
True: (\bemployee (.*) something in (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bip (.*) particle (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bemployee (.*) di

True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bemployee (.*) dust (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\birritation (.*) eye (.*) employee\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bdiscomfort (.*) eye (.*) ip\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bdiscomfort (.*) eye (.*) employee\b)
True: (\bip (.*) pain (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) particle (.*) eye\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True:

True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bemployee (.*) particle (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\birritation (.*) eye (.*) employee\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bdiscomfort (.*) eye (.*) worker\b)
True: (\birritation (.*) eye (.*) worker\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) dust (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bworker (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (

True: (\bparticle (.*) eye (.*) ip\b)
True: (\bworker (.*) particle (.*) eye\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) discomfort (.*) iris\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bip (.*) something in (.*) iris\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bip (.*) foreign (.*) iris\b)
True: (\birritation (.*) eye (.*) ip\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bip (.*) irritation (.*) eye\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bip (.*) particle (.*) eye\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bswelling (.*) eye (.*) ip\b)
True: (\birritation (.*) eye (.*) ip\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) something in (.*) eye\b)
True: (\bemployee (.*) dust (.*) eye\b)
True: (\bip (.*) dust (.*) eye\b)
True: (\bworker (.*) dust (.*) eye\b

True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) something in (.*) eye\b)
True: (\bpain (.*) eye (.*) ip\b)
True: (\bdust (.*) eye (.*) ip\b)
True: (\birritation (.*) eye (.*) ip\b)
True: (\bworker (.*) pain (.*) eye\b)
True: (\bworker (.*) dust (.*) eye\b)
True: (\bworker (.*) irritation (.*) eye\b)
True: (\bemployee (.*) irritation (.*) eye\b)
True: (\bdust (.*) eye (.*) worker\b)
True: (\bworker (.*) irritation (.*) eye\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bforeign (.*) eye (.*) ip\b)
True: (\bemployee (.*) discomfort (.*) eye\b)
True: (\bemployee (.*) particle (.*) eye\b)
True: (\bemployee (.*) foreign (.*) eye\b)
True: (\bip (.*) foreign (.*) eye\b)
True: (\bip (.*) discomfort (.*) eye\b)
True: (\bip (.*) something in (.*) eye\b)
True: (\bdiscom

In [312]:
len(list(dict.fromkeys(incid_nums)))

543