In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

In [24]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bwils\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bwils\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bwils\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
injury = pd.read_csv('severeinjury.csv', encoding='latin-1')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


We are going to look at OSHA recorded severe injuries between January 2015 and Spetmeber 2020. Interested to see if there is a pattern in what injuries lead to hospitalization and or amputation.

In [4]:
injury.head()

Unnamed: 0,ID,UPA,EventDate,Employer,Address1,Address2,City,State,Zip,Latitude,...,Nature,NatureTitle,Part of Body,Part of Body Title,Event,EventTitle,Source,SourceTitle,Secondary Source,Secondary Source Title
0,2015010015,931176,1/1/2015,FCI Otisville Federal Correctional Institution,Two Mile Drive,,OTISVILLE,NEW YORK,10963.0,41.46,...,111,Fractures,513,Lower leg(s),1214,Injured by physical contact with person while ...,5721,Co-worker,5772.0,Inmate or detainee in custody
1,2015010016,930267,1/1/2015,Kalahari Manufacturing LLC,171 Progress Drive,,LAKE DELTON,WISCONSIN,53940.0,43.59,...,1522,Second degree heat (thermal) burns,519,"Leg(s), n.e.c.",317,"Ignition of vapors, gases, or liquids",7261,"Welding, cutting, and blow torches",,
2,2015010018,929823,1/1/2015,Schneider National Bulk Carrier,420 CORAOPOLIS ROAD,,CORAOPOLIS,PENNSYLVANIA,15108.0,40.49,...,10,"Traumatic injuries and disorders, unspecified",9999,Nonclassifiable,4331,Other fall to lower level less than 6 feet,8421,"Semi, tractor-trailer, tanker truck",741.0,Ladders-fixed
3,2015010019,929711,1/1/2015,PEPSI BOTTLING GROUP INC.,4541 HOUSTON AVE.,,MACON,GEORGIA,31206.0,32.77,...,1972,"Soreness, pain, hurt-nonspecified injury",510,"Leg(s), unspecified",640,Caught in or compressed by equipment or object...,8623,Pallet jack-powered,8420.0,"Truck-motorized freight hauling and utility, u..."
4,2015010020,929642,1/1/2015,North American Pipe Corporation,210 South Arch Street,,JANESVILLE,WISCONSIN,53545.0,42.67,...,111,Fractures,4429,"Finger(s), fingernail(s), n.e.c.",6411,Caught in running equipment or machinery durin...,350,"Metal, woodworking, and special material machi...",,


There are a number of columns that we can elminate to make the data more manageable. Since most labor laws are made on the state and federal level we will drop location data except for state. Further inspection could be done on the dropped columns, but we are going to focus our search for now. The source data noted the the Lat and Long columns may be unreliable so we will be dropping them.

We can also drop the UPA ID as it is a duplicate ID, and the Final Narrative. We could run NLP on the FInal Narrative, but that is beyond the scope of this review. 

We will also drop titles columns after constructing dictionaries for labeling them later.

In [5]:
injury.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59469 entries, 0 to 59468
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      59469 non-null  int64  
 1   UPA                     59469 non-null  int64  
 2   EventDate               59469 non-null  object 
 3   Employer                59469 non-null  object 
 4   Address1                59457 non-null  object 
 5   Address2                5488 non-null   object 
 6   City                    59457 non-null  object 
 7   State                   59469 non-null  object 
 8   Zip                     59455 non-null  float64
 9   Latitude                59441 non-null  float64
 10  Longitude               59441 non-null  float64
 11  Primary NAICS           59467 non-null  object 
 12  Hospitalized            59469 non-null  float64
 13  Amputation              59467 non-null  float64
 14  Inspection              20129 non-null

Before we drop them we will first fill in our missing fields. 
- Because we don't intend to use them in our analysis we will ignore the following: Address1, Address2, City, Zip, Latitude, Longitude. 
- Primary NAICS will be filled with 0 as will hospitalization and amputation. 
- Inspections will be converted to a binary column as the report numbers don't provide us with further information.
- Secondary Souce will be filled with 0 and Secondary Source Title will be filled with None

In [6]:
fill = {'Primary NAICS':0, 'Hospitalized':0, 'Amputation':0, 'Inspection':0, 
        'Secondary Source':0, 'Secondary Source Title':'None'}
injury.fillna(value=fill, inplace=True)

In [7]:
injury.loc[injury['Inspection']!=0, 'Inspection'] = 1
injury.head()

Unnamed: 0,ID,UPA,EventDate,Employer,Address1,Address2,City,State,Zip,Latitude,...,Nature,NatureTitle,Part of Body,Part of Body Title,Event,EventTitle,Source,SourceTitle,Secondary Source,Secondary Source Title
0,2015010015,931176,1/1/2015,FCI Otisville Federal Correctional Institution,Two Mile Drive,,OTISVILLE,NEW YORK,10963.0,41.46,...,111,Fractures,513,Lower leg(s),1214,Injured by physical contact with person while ...,5721,Co-worker,5772.0,Inmate or detainee in custody
1,2015010016,930267,1/1/2015,Kalahari Manufacturing LLC,171 Progress Drive,,LAKE DELTON,WISCONSIN,53940.0,43.59,...,1522,Second degree heat (thermal) burns,519,"Leg(s), n.e.c.",317,"Ignition of vapors, gases, or liquids",7261,"Welding, cutting, and blow torches",0.0,
2,2015010018,929823,1/1/2015,Schneider National Bulk Carrier,420 CORAOPOLIS ROAD,,CORAOPOLIS,PENNSYLVANIA,15108.0,40.49,...,10,"Traumatic injuries and disorders, unspecified",9999,Nonclassifiable,4331,Other fall to lower level less than 6 feet,8421,"Semi, tractor-trailer, tanker truck",741.0,Ladders-fixed
3,2015010019,929711,1/1/2015,PEPSI BOTTLING GROUP INC.,4541 HOUSTON AVE.,,MACON,GEORGIA,31206.0,32.77,...,1972,"Soreness, pain, hurt-nonspecified injury",510,"Leg(s), unspecified",640,Caught in or compressed by equipment or object...,8623,Pallet jack-powered,8420.0,"Truck-motorized freight hauling and utility, u..."
4,2015010020,929642,1/1/2015,North American Pipe Corporation,210 South Arch Street,,JANESVILLE,WISCONSIN,53545.0,42.67,...,111,Fractures,4429,"Finger(s), fingernail(s), n.e.c.",6411,Caught in running equipment or machinery durin...,350,"Metal, woodworking, and special material machi...",0.0,


In [8]:
injury['Final Narrative']

0        Three correctional facility guards were escort...
1        Employee in the Machine Shop received second d...
2        A truck driver fell approximately 4 feet while...
3        An employee's leg was pinned between a truck a...
4        An employee working on the Line 6 Auto-Beller ...
                               ...                        
59464    A postal employee fell down a flight of 5 stai...
59465    A trainer was training a new employee on how t...
59466    The grinding blade on a deactivated meat grind...
59467    An employee fell while climbing a poplar tree,...
59468    An employee was climbing from one catwalk to a...
Name: Final Narrative, Length: 59469, dtype: object

In [9]:
nar1 = injury['Final Narrative'][0]
nar1

"Three correctional facility guards were escorting a restrained federal prison inmate when he became disruptive, requiring the use of force. \nTwo guards and the inmate fell onto the Lieutenant's right leg, fracturing his fibula. He was transported to the hospital and released the following day."

In [10]:
sents = nltk.sent_tokenize(nar1)
sents

['Three correctional facility guards were escorting a restrained federal prison inmate when he became disruptive, requiring the use of force.',
 "Two guards and the inmate fell onto the Lieutenant's right leg, fracturing his fibula.",
 'He was transported to the hospital and released the following day.']

In [11]:
words = nltk.word_tokenize(nar1)
len(words)

51

In [12]:
average_tokens = round(len(words)/len(sents))
average_tokens

17

In [13]:
unique_tokens = set(words)
len(unique_tokens)

41

In [37]:
stop_words = set(stopwords.words('english'))
final_tokens=[]
for each in words:
    if each not in stop_words:
        final_tokens.append(each)
final_tokens, len(final_tokens)


(['Three', 'correctional', 'facility', 'guards', 'escorting', 'restrained', 'federal', 'prison', 'inmate', 'became', 'disruptive', ',', 'requiring', 'use', 'force', '.', 'Two', 'guards', 'inmate', 'fell', 'onto', 'Lieutenant', "'s", 'right', 'leg', ',', 'fracturing', 'fibula', '.', 'He', 'transported', 'hospital', 'released', 'following', 'day', '.'], 36)

In [38]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in final_tokens]
lemmatized_words

['Three', 'correctional', 'facility', 'guard', 'escort', 'restrain', 'federal', 'prison', 'inmate', 'become', 'disruptive', ',', 'require', 'use', 'force', '.', 'Two', 'guard', 'inmate', 'fell', 'onto', 'Lieutenant', "'s", 'right', 'leg', ',', 'fracture', 'fibula', '.', 'He', 'transport', 'hospital', 'release', 'follow', 'day', '.']

In [55]:
def prep_narrative(narrative):
    stop_words = set(stopwords.words('english'))
    sents = nltk.sent_tokenize(narrative)
    prepped_narrative = []
    for sentence in sents:
        words = nltk.word_tokenize(narrative)
        final_tokens=[]
        for each in words:
            if each not in stop_words:
                lemma = lemmatizer.lemmatize(each, pos='v') 
                final_tokens.append(lemma)
        prepped_narrative.extend(final_tokens)
    return prepped_narrative

In [52]:
prepped = prep_narrative(injury['Final Narrative'][0])
prepped

['Three', 'correctional', 'facility', 'guard', 'escort', 'restrain', 'federal', 'prison', 'inmate', 'become', 'disruptive', ',', 'require', 'use', 'force', '.', 'Two', 'guard', 'inmate', 'fell', 'onto', 'Lieutenant', "'s", 'right', 'leg', ',', 'fracture', 'fibula', '.', 'He', 'transport', 'hospital', 'release', 'follow', 'day', '.', 'Three', 'correctional', 'facility', 'guard', 'escort', 'restrain', 'federal', 'prison', 'inmate', 'become', 'disruptive', ',', 'require', 'use', 'force', '.', 'Two', 'guard', 'inmate', 'fell', 'onto', 'Lieutenant', "'s", 'right', 'leg', ',', 'fracture', 'fibula', '.', 'He', 'transport', 'hospital', 'release', 'follow', 'day', '.', 'Three', 'correctional', 'facility', 'guard', 'escort', 'restrain', 'federal', 'prison', 'inmate', 'become', 'disruptive', ',', 'require', 'use', 'force', '.', 'Two', 'guard', 'inmate', 'fell', 'onto', 'Lieutenant', "'s", 'right', 'leg', ',', 'fracture', 'fibula', '.', 'He', 'transport', 'hospital', 'release', 'follow', 'day', '.

In [56]:
injury['lemmatized'] = injury['Final Narrative'].apply(prep_narrative)

In [57]:
injury['lemmatized']

0        [Three, correctional, facility, guard, escort,...
1        [Employee, Machine, Shop, receive, second, deg...
2        [A, truck, driver, fell, approximately, 4, fee...
3        [An, employee, 's, leg, pin, truck, power, pal...
4        [An, employee, work, Line, 6, Auto-Beller, rea...
                               ...                        
59464    [A, postal, employee, fell, flight, 5, stairs,...
59465    [A, trainer, train, new, employee, use, crimp,...
59466    [The, grind, blade, deactivate, meat, grind, m...
59467    [An, employee, fell, climb, poplar, tree, ,, l...
59468    [An, employee, climb, one, catwalk, another, ....
Name: lemmatized, Length: 59469, dtype: object

In [59]:
injury['lemmatized'].to_pickle('Narrative_lemmatized.pkl')

In [25]:
unigram=[]
bigram=[]
trigram=[]
fourgram=[]
tokenized_text = []
for sentence in sents:
    sentence = sentence.lower()
    sequence = word_tokenize(sentence) 
    for word in sequence:
        if word =='.':
            sequence.remove(word) 
        else:
            unigram.append(word)
    tokenized_text.append(sequence) 
    bigram.extend(list(ngrams(sequence, 2)))  
#unigram, bigram, trigram, and fourgram models are created
    trigram.extend(list(ngrams(sequence, 3)))
    fourgram.extend(list(ngrams(sequence, 4)))

In [36]:
unigram = removal(unigram)
bigram = removal(bigram)
trigram = removal(trigram)             
fourgram = removal(fourgram)
freq_uni = nltk.FreqDist(unigram)
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)
print ("Most common bigrams: ", freq_uni.most_common(5))

Most common n-grams without stopword removal and without add-1 smoothing: 

Most common bigrams:  [('the', 5), ('guards', 2), ('inmate', 2), ('he', 2), (',', 2)]
Most common bigrams:  [(('three', 'correctional'), 1), (('correctional', 'facility'), 1), (('facility', 'guards'), 1), (('guards', 'were'), 1), (('were', 'escorting'), 1)]

Most common trigrams:  [(('three', 'correctional', 'facility'), 1), (('correctional', 'facility', 'guards'), 1), (('facility', 'guards', 'were'), 1), (('guards', 'were', 'escorting'), 1), (('were', 'escorting', 'a'), 1)]

Most common fourgrams:  [(('three', 'correctional', 'facility', 'guards'), 1), (('correctional', 'facility', 'guards', 'were'), 1), (('facility', 'guards', 'were', 'escorting'), 1), (('guards', 'were', 'escorting', 'a'), 1), (('were', 'escorting', 'a', 'restrained'), 1)]
