In [47]:
import numpy as np
import pandas as pd
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer as stemmer

from tqdm.notebook import tqdm

In [48]:
incidents_fn = "data/source/20220413_D1_Incidents.csv"
rule_book_fn = "data/rule_book.csv"

In [49]:
rules = pd.read_csv(rule_book_fn)
incidents = pd.read_csv(incidents_fn, dtype=str)

In [50]:
incidents.head(3)

Unnamed: 0,tblID,IRN,IncidentNumber,IncidentDate,TimeofIncident,BusinessUnit,BusinessGroup,ServiceLine,Project,Office,...,JFCAction,OHdefinition,MTCcount,RWCcount,LTIcount,FAcount,TransferTimeStamp,SourceFileName,FullDescription,ImmediateAction
0,175,39878,20200472,26/02/2020,14:15,Projects,Renewable Energy & Power,Power & Industrials EPC,YCI M1 Project,YCI M1 Project,...,,,,,,1.0,,CAIRS_Incidents.csv,Employee was grinding with a four-inch grinder...,Employee was transported to on-site medical an...
1,176,39877,20200471,26/02/2020,16:15,Consulting,Energy Optimisation & Innovation,EMEA,E&II Sustainable Infrastructure,E&II Sustainable Infrastructure,...,,,,,,,,CAIRS_Incidents.csv,See supporting information,"I stopped the drillers from installing, phoned..."
2,177,39876,20200470,27/02/2020,10:15,Operations,Asia Pacific,APAC East,Esso PNG LNG,Hides,...,,,,,,,,CAIRS_Incidents.csv,At approximately 10:15 on the 27th of Feb a hy...,"All contaminated materials, absorbent pads, we..."


In [51]:
incidents.rename(columns={'IncidentNumber': 'incident_id'}, inplace=True)
incidents['text'] = (
        incidents['ShortDescription'].astype(str).fillna('') + ' ' + 
        incidents['FullDescription'].astype(str).fillna('') + ' ' + 
        incidents['ImmediateAction'].astype(str).fillna('')
).str.lower()

incidents = incidents[['incident_id', 'text']]

In [60]:
sample100 = incidents.sample(n=100)
sample100.index = range(100)

In [17]:
incidents.text[0]

"foreign body entered employee's (l) eye while grinding. employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye. employee was wearing all required ppe. investigation is pending upon employee’s return from offsite medical evaluation. it will be noted that the site has been dealing with strong winds throughout the day. foreign body was removed using first aid measures. employee was transported to on-site medical and then transferred to an off-site eye specialist for further treatment. foreign body was removed using first aid measures. employee was released to return to work without restrictions."

In [18]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

test = incidents.text.head(1).apply(lemmatize_text)
print(list(test))

[['foreign', 'body', 'entered', "employee's", '(l)', 'eye', 'while', 'grinding.', 'employee', 'wa', 'grinding', 'with', 'a', 'four-inch', 'grinder', 'on', 'an', 'iron', 'support', 'in', 'the', 'compressor', 'building', 'when', 'he', 'felt', 'discomfort', 'to', 'his', 'left', 'eye.', 'employee', 'wa', 'wearing', 'all', 'required', 'ppe.', 'investigation', 'is', 'pending', 'upon', 'employee’s', 'return', 'from', 'offsite', 'medical', 'evaluation.', 'it', 'will', 'be', 'noted', 'that', 'the', 'site', 'ha', 'been', 'dealing', 'with', 'strong', 'wind', 'throughout', 'the', 'day.', 'foreign', 'body', 'wa', 'removed', 'using', 'first', 'aid', 'measures.', 'employee', 'wa', 'transported', 'to', 'on-site', 'medical', 'and', 'then', 'transferred', 'to', 'an', 'off-site', 'eye', 'specialist', 'for', 'further', 'treatment.', 'foreign', 'body', 'wa', 'removed', 'using', 'first', 'aid', 'measures.', 'employee', 'wa', 'released', 'to', 'return', 'to', 'work', 'without', 'restrictions.']]


In [19]:
lemmatizer.lemmatize('employee’s')

'employee’s'

In [20]:
stem = stemmer()
def stem_text(text):
    return [stem.stem(w) for w in w_tokenizer.tokenize(text)]
   
test = incidents.text.head(1).apply(stem_text)
print(list(test))

[['foreign', 'bodi', 'enter', "employee'", '(l)', 'eye', 'while', 'grinding.', 'employe', 'wa', 'grind', 'with', 'a', 'four-inch', 'grinder', 'on', 'an', 'iron', 'support', 'in', 'the', 'compressor', 'build', 'when', 'he', 'felt', 'discomfort', 'to', 'hi', 'left', 'eye.', 'employe', 'wa', 'wear', 'all', 'requir', 'ppe.', 'investig', 'is', 'pend', 'upon', 'employee’', 'return', 'from', 'offsit', 'medic', 'evaluation.', 'it', 'will', 'be', 'note', 'that', 'the', 'site', 'ha', 'been', 'deal', 'with', 'strong', 'wind', 'throughout', 'the', 'day.', 'foreign', 'bodi', 'wa', 'remov', 'use', 'first', 'aid', 'measures.', 'employe', 'wa', 'transport', 'to', 'on-sit', 'medic', 'and', 'then', 'transfer', 'to', 'an', 'off-sit', 'eye', 'specialist', 'for', 'further', 'treatment.', 'foreign', 'bodi', 'wa', 'remov', 'use', 'first', 'aid', 'measures.', 'employe', 'wa', 'releas', 'to', 'return', 'to', 'work', 'without', 'restrictions.']]


In [21]:
def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok!='-']
    return tokens

In [22]:
test_tokens = tokenize(incidents.text[0])
test_tokens

['foreign',
 'body',
 'entered',
 'employee',
 "'s",
 '(',
 'l',
 ')',
 'eye',
 'while',
 'grinding',
 '.',
 'employee',
 'was',
 'grinding',
 'with',
 'a',
 'four-inch',
 'grinder',
 'on',
 'an',
 'iron',
 'support',
 'in',
 'the',
 'compressor',
 'building',
 'when',
 'he',
 'felt',
 'discomfort',
 'to',
 'his',
 'left',
 'eye',
 '.',
 'employee',
 'was',
 'wearing',
 'all',
 'required',
 'ppe',
 '.',
 'investigation',
 'is',
 'pending',
 'upon',
 'employee',
 '’',
 's',
 'return',
 'from',
 'offsite',
 'medical',
 'evaluation',
 '.',
 'it',
 'will',
 'be',
 'noted',
 'that',
 'the',
 'site',
 'has',
 'been',
 'dealing',
 'with',
 'strong',
 'winds',
 'throughout',
 'the',
 'day',
 '.',
 'foreign',
 'body',
 'was',
 'removed',
 'using',
 'first',
 'aid',
 'measures',
 '.',
 'employee',
 'was',
 'transported',
 'to',
 'on-site',
 'medical',
 'and',
 'then',
 'transferred',
 'to',
 'an',
 'off-site',
 'eye',
 'specialist',
 'for',
 'further',
 'treatment',
 '.',
 'foreign',
 'body',
 '

In [23]:
def translate_to_regex(rule_part):
    """
    converts rule_book rules to regex format;
    drops trailing separators in process
    """
    if isinstance(rule_part, str):
        rule_part = re.sub(r"\s{2,}", " ", rule_part)
        rule_part = re.sub(r'\\\\b', r'\\b', rule_part)
        
        return r'(\b' + r'\b)|(\b'.join([s for s in rule_part.split('_ ') if s]) + r'\b)'
    else:
        return ''

In [24]:
rules['keyword'] = rules['keyword'].apply(lambda x: x.replace('*', '.*')+r'\b')
rules['rules_pre'] = rules['rules_pre'].apply(translate_to_regex)
rules['rules_post'] = rules['rules_post'].apply(translate_to_regex)
rules['rules_all'] = rules['rules_all'].apply(translate_to_regex)

rules['voids'] = rules['voids'].apply(translate_to_regex)
print('rules translated')

rules translated


In [25]:
rules

Unnamed: 0,group,keyword,rules_pre,rules_post,rules_all,voids
0,eye injury or irritation,eye.*\b,(\bsomething.*in\b)|(\bswell.*\b)|(\bswollen\b...,,(\birritation\b)|(\bforeign body\b)|(\bforeign...,
1,site compliance or practice issue,permit\b,(\bwithout.*work\b)|(\bnot captured\b),(\bnot in place\b)|(\bwere not captured\b),(\bduring.* audit\b)|(\bfailed.* section\b)|(\...,


In [26]:
def check_presence(pattern, string):
    if pattern:
        return bool(re.search(pattern, string))
    else:
        return False

In [27]:
def find_pattern(tokens, keyword, check_pre, check_post, check_all, check_void, window):
    """
    for a list of tokens finds specified keyword and returns True
    if the neighbourhood of this keyword satisfies pre-, post- or all- context rules
    and doesn't contain anything forbidden
    :param tokens: list of tokens
    :param keyword: pattern which a token should match
    :param check_pre: pattern which several previous tokens (concatenated) should match
    :param check_post: pattern which several subsequent tokens (concatenated) should match
    :param check_all: pattern which previous tokens + keyword + subsequent tokens should match
    :param check_void: pattern which previous tokens + keyword + subsequent tokens should NOT match
    :param window: N of pre and post tokens to consider
    :return: True/False - whether at least one matching part was found
    """
    # Extract contexts of keyword (if any found)
    # Four parts: [(pre), (keyword), (post), (all)]
    matches = [(
        ' '.join(tokens[max(0, i - window - 1): i]), tokens[i],
        ' '.join(tokens[i + 1:i + window + 1]),
        ' '.join(tokens[max(0, i - window - 1):i + window + 1])
        ) for i, tok in enumerate(tokens) if re.match(keyword, tok)]
    
    # do tests
    final_match = any([
        (check_presence(check_pre, pre) or check_presence(check_all, all_) or check_presence(check_post, post))
        and not check_presence(check_void, all_) for (pre, kw, post, all_) in matches
    ])
    return final_match

In [28]:
tokens = test_tokens
keyword = rules.keyword[0]
check_pre = rules.rules_pre[0]
check_post = rules.rules_post[0]
check_all = rules.rules_all[0]
check_void = rules.voids[0]
window = 5

In [29]:
matches = [(
        ' '.join(tokens[max(0, i - window - 1): i]), # pre-context
            tokens[i],                               # keyword
        ' '.join(tokens[i + 1:i + window + 1]),      # post-context
        ' '.join(tokens[max(0, i - window - 1):i + window + 1]) # All context
        ) for i, tok in enumerate(tokens) if re.match(keyword, tok)]

In [30]:
matches

[("entered employee 's ( l )",
  'eye',
  'while grinding . employee was',
  "entered employee 's ( l ) eye while grinding . employee was"),
 ('he felt discomfort to his left',
  'eye',
  '. employee was wearing all',
  'he felt discomfort to his left eye . employee was wearing all'),
 ('and then transferred to an off-site',
  'eye',
  'specialist for further treatment .',
  'and then transferred to an off-site eye specialist for further treatment .')]

In [31]:
find_pattern(
    tokens = test_tokens, 
    keyword = rules.keyword[0], 
    check_pre = rules.rules_pre[0], 
    check_post = rules.rules_post[0], 
    check_all = rules.rules_all[0], 
    check_void = rules.voids[0], 
    window = 5
)

True

In [32]:
[(check_presence(check_pre, pre) or check_presence(check_all, all_) or check_presence(check_post, post))
        and not check_presence(check_void, all_) for (pre, kw, post, all_) in matches]

[False, True, False]

In [33]:
for (pre, kw, post, all_) in matches:
    #print(pre)
    #print(kw)
    #print(post)
    print(all_)

entered employee 's ( l ) eye while grinding . employee was
he felt discomfort to his left eye . employee was wearing all
and then transferred to an off-site eye specialist for further treatment .


In [34]:
check_all

'(\\birritation\\b)|(\\bforeign body\\b)|(\\bforeign object\\b)|(\\birritation (.*) eye\\b)|(\\bpain in (.*) eye\\b)|(\\bfell (.*) eye\\b)|(\\bfall (.*) eye\\b)|(\\bdebris (.*) eye\\b)|(\\bpain to\\b)'

In [35]:
# Sentence tokenizer
print(sent_tokenize(incidents.text[0]))

["foreign body entered employee's (l) eye while grinding.", 'employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.', 'employee was wearing all required ppe.', 'investigation is pending upon employee’s return from offsite medical evaluation.', 'it will be noted that the site has been dealing with strong winds throughout the day.', 'foreign body was removed using first aid measures.', 'employee was transported to on-site medical and then transferred to an off-site eye specialist for further treatment.', 'foreign body was removed using first aid measures.', 'employee was released to return to work without restrictions.']


In [87]:
chk_text = 'employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.'
pattern = '(\\bemployee (.*) discomfort (.*) eye\\b)'
check_presence(pattern, chk_text)

True

In [41]:
sents = sent_tokenize(incidents.text[0])
for sent in sents:
    if check_presence(pattern, sent):
        print(f'{check_presence(pattern, sent)}: {sent}')

True: employee was grinding with a four-inch grinder on an iron support in the compressor building when he felt discomfort to his left eye.


In [42]:
chk_text = 'the ip felt extreme soreness in his eye'
finds_list = []
finds_pats = []
incid_nums = []
for worker_syn in syn_dict.get('worker_syns'):
    for pain_syn in syn_dict.get('pain_syns'): 
        for eye_syn in syn_dict.get('eye_syns'):   
            pattern = f'(\\b{worker_syn} (.*) {pain_syn} (.*) {eye_syn}\\b)'
            check = check_presence(pattern, chk_text)
            if check: 
                print(f'{check}: {pattern}')
                finds_list.append(check)
                finds_pats.append(pattern)
                #incid_nums.append(irn)

True: (\bip (.*) soreness (.*) eye\b)
True: (\bip (.*) in (.*) eye\b)
True: (\bthe (.*) soreness (.*) eye\b)
True: (\bthe (.*) in (.*) eye\b)
True: (\bthe (.*) felt (.*) eye\b)


## Load Synonyms & Rules

In [528]:
syn_csv = pd.read_csv('synonyms.csv')

# Load the dictionary of synonyms
syn_dict = {}
for r in range(len(syn_csv)):
    syn_toks = syn_csv.keywords[r].split(',')
    syn_dict.update({syn_csv.syn[r]:syn_toks})
    
#syn_dict

rul_csv = pd.read_csv('rules.csv')
#rul_csv

In [510]:
#syn_csv

In [511]:
#syn_dict

In [512]:
def get_matches(keyword, tokens, span):
    # If the keyword matches a token, i will be where it occurs in the token string
    # You then apply the span before and after to create a keyword in context snipet
    matches = [(' '.join(tokens[max(0, i - span - 1):i + span + 1])) for i, tok in enumerate(tokens) if re.match(keyword, tok)]
    return(matches)

In [513]:
tokens = test_tokens
test = get_matches('eye*', test_tokens, 5)
print(len(test))
print(test)

3
["entered employee 's ( l ) eye while grinding . employee was", 'he felt discomfort to his left eye . employee was wearing all', 'and then transferred to an off-site eye specialist for further treatment .']


In [514]:
' hellp '.strip()

'hellp'

In [564]:
def rule_book_scan(incidents, rules):
    
    finds_list = []
    finds_pats = []
    incid_nums = []
    incid_cats = []
    
    #for r in range(len(rules)):
    for r in range(4, len(rules), 1):
        
        rul_syns = tokenize(re.sub(r", ", " ", rules.syns[r]))
        
        pos_1st = syn_dict.get(rul_syns[0])
        pos_2nd = syn_dict.get(rul_syns[1])
        pos_3rd = syn_dict.get(rul_syns[2])
     
        # Connections between synonyms 1-2, 2-3
        connect = ['(.*)', '(.*)']

        # Shuffle rules
        srs = ([2, 1, 4, 3, 0], [])

        cat = rules.rule[r]
        
        span = rules.span[r]
        
        shuffle = rules.shuffle[r]
        
        search_keyword = rules.keyword[r]

        console_str = f'Checking rule {r+1} of {len(rules)} ({cat})'
        print(console_str)
        print('='*len(console_str))
        for row in tqdm(range(len(incidents))):
            par_text = incidents.text[row].lower()
            # Check sentence by sentence, don't use span
            # TODO: Check impact of using span, should it be used or not?
            sen_toks = sent_tokenize(par_text)    
            irn = incidents.incident_id[row]
            for chk_text in sen_toks:
                
                # TODO: remove plurals, workers or worker's >>> worker
                for first_syn in pos_1st:
                    locals()["first_syn"] = first_syn.strip()
                    for second_syn in pos_2nd: 
                        for third_syn in pos_3rd:                              
                            
                            if search_keyword == '-':
                                
                                x = f'{first_syn.strip()}{connect[0]}{second_syn.strip()}{connect[1]}{third_syn.strip()}'
                                pattern = f'({x})'
                                print(pattern)
                                check = check_presence(pattern, chk_text)
                                    
                                if check: 
                                    print(f'{check}: {pattern}')
                                    finds_list.append(check)
                                    finds_pats.append(pattern)
                                    incid_nums.append(irn)
                                    incid_cats.append(cat)
                                    
                                else:
                                    for sr in srs:
                                        if len(sr) == 0: continue
                                        a = x.split()
                                        if shuffle == False:
                                                break
                                        # Adjust the word sequence using shuffle rule
                                        pattern = f'({a[sr[0]]}{a[sr[1]]}{a[sr[2]]}{a[sr[3]]}{a[sr[4]]})'
                                        rev_check = check_presence(pattern, chk_text)
                                        if rev_check: 
                                            print(f'{rev_check}: {pattern} ---Shuffled')
                                            finds_list.append(rev_check)
                                            finds_pats.append(pattern)
                                            incid_nums.append(irn)
                                            incid_cats.append(cat)
                                
                            else:
                                test_tokens = tokenize(chk_text)
                                kwics = get_matches(search_keyword, test_tokens, span)
                            
                                for kwic in kwics:
                                    x = f'{first_syn.strip()}{connect[0]}{second_syn.strip()}{connect[1]}{third_syn.strip()}'
                                    pattern = f'({x})'
                                    check = check_presence(pattern, kwic)

                                    if check: 
                                        print(f'{check}: {pattern}')
                                        finds_list.append(check)
                                        finds_pats.append(pattern)
                                        incid_nums.append(irn)
                                        incid_cats.append(cat)

                                    else:
                                        for sr in srs:
                                            if len(sr) == 0: continue
                                            a = x.split()
                                            if shuffle == False:
                                                break
                                            # Adjust the word sequence using shuffle rule
                                            #print('shuffling')
                                            pattern = f'({a[sr[0]]}{a[sr[1]]}{a[sr[2]]}{a[sr[3]]}{a[sr[4]]})'
                                            rev_check = check_presence(pattern, chk_text)
                                            if rev_check: 
                                                print(f'{rev_check}: {pattern} ---Shuffled')
                                                finds_list.append(rev_check)
                                                finds_pats.append(pattern)
                                                incid_nums.append(irn)
                                                incid_cats.append(cat)
                                            
        print('\n')
    out_df = pd.DataFrame(data=incid_nums, columns=['incid_nums'])
    out_df['finds_pats'] = finds_pats
    out_df['finds_list'] = finds_list
    out_df['incid_cats'] = incid_cats
    return(out_df)

In [565]:
finds_df = rule_book_scan(sample100, rul_csv)

Checking rule 5 of 6 (vehicle incident)


  0%|          | 0/100 [00:00<?, ?it/s]

(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(involved(.*)excavator(.*)collision)
(involved(.*)excavator(.*)accident)
(involved(.*)truck(.*)collision)
(involved(.*)truck(.*)accident)
(involved(.*)forklift(.*)collision)
(involved(.*)forklift(.*)accident)
(involved(.*)car(.*)collision)
(involved(.*)car(.*)accident)
(involved(.*)vehicle(.*)collision)
(involved(.*)vehicle(.*)accident)
(involved(.*)digger(.*)collision)
(involved(.*)digger(.*)accident)
(involved(.*)van(.*)collision)
(involved(.*)van(.*)accident)
(involved(.*)bus(.*)collision)
(involved(.*)bus(.*)accident)
(involved(.*)utility(.*)collision)
(involved(.*)utility(.*)accident)
(participated in(.*)dozer(.*)collision)
(participated in(.*)dozer(.*)accident)
(participated in(.*)excavator(.*)collision)
(participated in(.*)excavator(.*)accident)
(participated in(.*)truck(.*)collision)
(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(partici

(involved(.*)bus(.*)collision)
(involved(.*)bus(.*)accident)
(involved(.*)utility(.*)collision)
(involved(.*)utility(.*)accident)
(participated in(.*)dozer(.*)collision)
(participated in(.*)dozer(.*)accident)
(participated in(.*)excavator(.*)collision)
(participated in(.*)excavator(.*)accident)
(participated in(.*)truck(.*)collision)
(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(participated in(.*)car(.*)collision)
(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(in

(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(involved(.*)excavator(.*)collision)
(involved(.*)excavator(.*)accident)
(involved(.*)truck(.*)collision)
(involved(.*)truck(.*)accident)
(involved(.*)forklift(.*)collision)
(involved(.*)forklift(.*)accident)
(involved(.*)car(.*)collision)
(involved(.*)car(.*)accident)
(involved(.*)vehicle(.*)collision)
(involved(.*)vehicle(.*)accident)
(involved(.*)digger(.*)collision)
(involved(.*)digger(.*)accident)
(involved(.*)van(.*)collision)
(involved(.*)van(.*)accident)
(involved(.*)bus(.*)collision)
(involved(.*)bus(.*)accident)
(involved(.*)utility(.*)collision)
(involved(.*)utility(.*)accident)
(participated in(.*)dozer(.*)collision)
(participated in(.*)dozer

(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(involved(.*)excavator(.*)collision)
(involved(.*)excavator(.*)accident)
(involved(.*)truck(.*)collision)
(involved(.*)truck(.*)accident)
(involved(.*)forklift(.*)collision)
(involved(.*)forklift(.*)accident)
(involved(.*)car(.*)collision)
(involved(.*)car(.*)accident)
(involved(.*)vehicle(.*)collision)
(involved(.*)vehicle(.*)accident)
(involved(.*)digger(.*)collision)
(involved(.*)digger(.*)accident)
(involved(.*)van(.*)collision)
(involved(.*)van(.*)accident)
(involved(.*)bus(.*)collision)
(in

(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(participated in(.*)car(.*)collision)
(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(involved(.*)excavator(.*)collision)
(involved(.*)excavator(.*)accident)
(involved(.*)truck(.*)collision)
(involved(.*)truck(.*)accident)
(involved(.*)forklift(.*)collision)
(involved(.*)forklift(.*)accident)
(involved(.*)car(.*)collision)
(involved(.*)car(.*)accident)
(involved(.*)vehicle(.*)collision)
(involved(.*)vehicle(.*)accident)


(participated in(.*)dozer(.*)accident)
(participated in(.*)excavator(.*)collision)
(participated in(.*)excavator(.*)accident)
(participated in(.*)truck(.*)collision)
(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(participated in(.*)car(.*)collision)
(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(involved(.*)excavator(.*)collision)
(involved(.*)excavator(.*)accident)
(involved(.*)truck(.*)collision)
(involved(.*)truck(.*)accident)
(involved(.*)forklift(.*)collision)

(involved(.*)utility(.*)collision)
(involved(.*)utility(.*)accident)
(participated in(.*)dozer(.*)collision)
(participated in(.*)dozer(.*)accident)
(participated in(.*)excavator(.*)collision)
(participated in(.*)excavator(.*)accident)
(participated in(.*)truck(.*)collision)
(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(participated in(.*)car(.*)collision)
(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(involved(.*)excavator(.*)collision)
(involved(.*)excavator(.*)a

(involved(.*)bus(.*)collision)
(involved(.*)bus(.*)accident)
(involved(.*)utility(.*)collision)
(involved(.*)utility(.*)accident)
(participated in(.*)dozer(.*)collision)
(participated in(.*)dozer(.*)accident)
(participated in(.*)excavator(.*)collision)
(participated in(.*)excavator(.*)accident)
(participated in(.*)truck(.*)collision)
(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(participated in(.*)car(.*)collision)
(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)utility(.*)collision)
(participated in(.*)utility(.*)accident)
(involved(.*)dozer(.*)collision)
(involved(.*)dozer(.*)accident)
(in

(involved(.*)digger(.*)collision)
(involved(.*)digger(.*)accident)
(involved(.*)van(.*)collision)
(involved(.*)van(.*)accident)
(involved(.*)bus(.*)collision)
(involved(.*)bus(.*)accident)
(involved(.*)utility(.*)collision)
(involved(.*)utility(.*)accident)
(participated in(.*)dozer(.*)collision)
(participated in(.*)dozer(.*)accident)
(participated in(.*)excavator(.*)collision)
(participated in(.*)excavator(.*)accident)
(participated in(.*)truck(.*)collision)
(participated in(.*)truck(.*)accident)
(participated in(.*)forklift(.*)collision)
(participated in(.*)forklift(.*)accident)
(participated in(.*)car(.*)collision)
(participated in(.*)car(.*)accident)
(participated in(.*)vehicle(.*)collision)
(participated in(.*)vehicle(.*)accident)
(participated in(.*)digger(.*)collision)
(participated in(.*)digger(.*)accident)
(participated in(.*)van(.*)collision)
(participated in(.*)van(.*)accident)
(participated in(.*)bus(.*)collision)
(participated in(.*)bus(.*)accident)
(participated in(.*)uti

  0%|          | 0/100 [00:00<?, ?it/s]

IndexError: list index out of range

In [561]:
chk = 'the country gm and regional gm were on their way to their apartment when they were involved in a motor vehicle collision.'
check_presence('hello(.*)vehicle(.*)collision', chk)
#check_presence('\bregional (.*) were (.*) way\b', chk)

False

In [554]:
#bool(re.search('\bregional (.*) were (.*) way\b', chk))
bool(re.search('involved (.*) motor (.*) collision', chk))

True

In [524]:
#finds_df

In [525]:
# How many incidents were classified
len(list(dict.fromkeys(finds_df[finds_df.finds_list == True].incid_nums)))

8

In [526]:
def deepdive_results(dat, focus='finds'):
    for inc in dat.incident_id:
        sub_df = finds_df[finds_df.incid_nums == inc]
        sub_df = sub_df.drop(['finds_pats'], axis=1)
        cats = list(dict.fromkeys(sub_df.incid_cats))
        
        if focus == 'misses':
            bool_chk = len(cats) > 0
        else:
            bool_chk = len(cats) == 0
            
        if bool_chk:
            continue
        else:    
            par_text = incidents.text[incidents.incident_id == inc].values[0]   
            sen_toks = sent_tokenize(par_text)

            print(f'{inc} {cats}')
            print('='*len(inc))
            for sen in sen_toks: 
                print(sen)    
        print('\n')   

In [527]:
deepdive_results(sample100, focus='finds')

20200556 ['vehicle incident']
dozer backed into bucket of excavator as the dozer operator was grading the rightaway, he went to track back on the road and saw the excavator but thought the bucket was on the ground.
the excavator bucket was raised because that operator was waiting to move in to move dirt.
the dozer then backed into the bucket of the excavator causing the back window to break and dent the air conditioning cover work was stopped immediately and safety department, foreman, and superintendents notified.
employee exited equipment and was taken for post accident drug and alcohol screening.
all results were negative.


20203461 ['vehicle incident']
chain-link post impacted by utility resulting in damage to the utility and post driver reversing a utility in the plant at train 1 has struck a chain-link post resulting in damage to the post and ute front drivers’ panel.
driver attended the site clinic for d&a testing.
negative


20160897 ['vehicle incident']
vanhout concrete truck

In [502]:
deepdive_results(sample100, focus='misses')

20201961 []
client process safety breach - blinds installed and subsequently removed without blinds board being updated at 0720hrs on tuesday 21st july 2020, a wood t&i supervisor completed the details for a hot work permit for the task of "install irr line section; weld and fabricate" and attended the client operations permit office to have the permit issued.
the permit was subsequently issued by the client operator, however as the permit did not indicate that any blinds needed to be installed, the blinds board in the permit office was not updated as required by client procedure.
the work crew subsequently removed the redundant line section and installed a blind at each end of the remaining pipework.
on monday 3rd august 2020, the supervisor applied for and was issued a new permit to install the new section.
the crew installed the new pipe section and removed the previously installed blinds.
at the end of the task, the crew returned the permit to the permit office to close-out, at whi

In [93]:
stem.stem('Be particles in hands')

'be particles in hand'

In [110]:
org_string = "This is employee's hand"
pattern = r"'s"

# Replace all occurrences of character 's with an empty string
org_string = re.sub(pattern, '', org_string )
print(org_string)

This is employee hand
