In [7]:
# Mounting Google Drive
# Colab-specific step: makes the Drive contents available under /content/drive
# so the project folder and CSV can be reached by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Setting the current working directory
# (relative file names used later, e.g. the dataset CSV, resolve against this folder)
import os
os.chdir('/content/drive/MyDrive/AI_ML/Projects/Capstone-NLP')

### Importing the Packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
%tensorflow_version 1.x

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
# Seeds NumPy's global random number generator only.
# NOTE(review): the estimator config logged further down reports
# '_tf_random_seed': None, so TensorFlow-side randomness is not fixed by this call.
np.random.seed(10)

# Report the runtime versions so the recorded outputs can be tied to an environment.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)

TensorFlow 1.x selected.
Version:  1.15.2
Eager mode:  False
Hub version:  0.11.0


Using TensorFlow backend.


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

## Load the dataset

In [10]:
#Function 1.1 - Load the dataset

def load_dataset(filename):
  """Read a CSV file into a pandas DataFrame.

  Args:
    filename: Path of the CSV file to load (anything ``pd.read_csv`` accepts).

  Returns:
    A pandas DataFrame holding the contents of ``filename``.
  """
  # Read the file the caller asked for. Previously a hard-coded file name was
  # read here and the ``filename`` argument was silently ignored.
  data = pd.read_csv(filename)
  return data

#Function 1.2 - Dataset Cleansing

def data_cleansing_info(data):
  """Drop the stray index column and normalise column names, in place.

  The frame passed in is modified directly (callers rely on this), so the
  function is written to be safe to run more than once on the same frame.

  Args:
    data: pandas DataFrame as loaded from the accidents CSV.

  Returns:
    A tuple ``(first five rows, per-column null counts)`` of the cleaned frame.
  """
  # errors="ignore" keeps a re-run of the notebook cell from raising KeyError
  # once the column has already been removed.
  data.drop("Unnamed: 0", axis=1, inplace=True, errors="ignore")
  # rename() skips keys that are absent, so it is already re-run safe.
  data.rename(columns={'Data':'Date', 'Countries':'Country', 'Accident Level' : 'AccLevel' ,  'Genre':'Gender', 'Employee or Third Party':'Employee type'}, inplace=True)
  return data.head(5), data.isnull().sum()

In [11]:
# Load the accidents CSV by (relative) file name and preview the first rows.
data = load_dataset("IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Data,Countries,Local,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee or Third Party,Critical Risk,Description
0,0,1/1/2016 0:00,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...
1,1,1/2/2016 0:00,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...
2,2,1/6/2016 0:00,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...
3,3,1/8/2016 0:00,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...
4,4,1/10/2016 0:00,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...


## Preparing the data for our model

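The dataset is preprocessed before modelling the 5 accident levels:
- Data cleansing
- Column name corrections
- Replace Roman numerals with integers (note: in the cells shown below, the accident levels are still the Roman numerals `I`–`V`)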

In [12]:
data_cleansing_info(data)

(             Date  ...                                        Description
 0   1/1/2016 0:00  ...  While removing the drill rod of the Jumbo 08 f...
 1   1/2/2016 0:00  ...  During the activation of a sodium sulphide pum...
 2   1/6/2016 0:00  ...  In the sub-station MILPO located at level +170...
 3   1/8/2016 0:00  ...  Being 9:45 am. approximately in the Nv. 1880 C...
 4  1/10/2016 0:00  ...  Approximately at 11:45 a.m. in circumstances t...
 
 [5 rows x 10 columns], Date                        0
 Country                     0
 Local                       0
 Industry Sector             0
 AccLevel                    0
 Potential Accident Level    0
 Gender                      0
 Employee type               0
 Critical Risk               0
 Description                 0
 dtype: int64)

In [13]:
data.head() #Column names corrected

Unnamed: 0,Date,Country,Local,Industry Sector,AccLevel,Potential Accident Level,Gender,Employee type,Critical Risk,Description
0,1/1/2016 0:00,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...
1,1/2/2016 0:00,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...
2,1/6/2016 0:00,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...
3,1/8/2016 0:00,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...
4,1/10/2016 0:00,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Date                      425 non-null    object
 1   Country                   425 non-null    object
 2   Local                     425 non-null    object
 3   Industry Sector           425 non-null    object
 4   AccLevel                  425 non-null    object
 5   Potential Accident Level  425 non-null    object
 6   Gender                    425 non-null    object
 7   Employee type             425 non-null    object
 8   Critical Risk             425 non-null    object
 9   Description               425 non-null    object
dtypes: object(10)
memory usage: 33.3+ KB


In [15]:
data.columns

Index(['Date', 'Country', 'Local', 'Industry Sector', 'AccLevel',
       'Potential Accident Level', 'Gender', 'Employee type', 'Critical Risk',
       'Description'],
      dtype='object')

In [121]:
data["AccLevel"].value_counts()

I      316
II      40
III     31
IV      30
V        8
Name: AccLevel, dtype: int64

In [29]:
# Accident-level labels as a NumPy array; the bare `genres` on the last line
# echoes the whole array as the cell output.
genres = data["AccLevel"].values
genres

array(['I', 'I', 'I', 'I', 'IV', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I',
       'IV', 'I', 'I', 'III', 'I', 'I', 'I', 'I', 'I', 'II', 'II', 'I',
       'I', 'I', 'I', 'I', 'II', 'I', 'I', 'III', 'V', 'I', 'I', 'I', 'I',
       'I', 'I', 'II', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'III', 'I',
       'III', 'I', 'I', 'III', 'II', 'I', 'I', 'I', 'I', 'II', 'IV', 'I',
       'I', 'I', 'I', 'I', 'IV', 'I', 'II', 'I', 'I', 'I', 'I', 'III',
       'I', 'I', 'I', 'II', 'IV', 'I', 'I', 'III', 'I', 'I', 'I', 'I',
       'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'II',
       'I', 'I', 'III', 'I', 'I', 'IV', 'I', 'I', 'I', 'II', 'I', 'I',
       'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I',
       'IV', 'I', 'II', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'V', 'II', 'I',
       'I', 'I', 'I', 'I', 'IV', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I',
       'I', 'I', 'I', 'IV', 'IV', 'I', 'I', 'I', 'I', 'I', 'I', 'V',
       'III', 'I', 'I', 'IV', 'I', 'III', 'III', 'IV', '

In [37]:
# Model inputs: the free-text accident descriptions (kept as a pandas Series).
descriptions = data['Description']
# Targets: the accident levels as a NumPy array (the name `genres` does not
# describe the content; it holds the AccLevel column).
genres = data['AccLevel'].values
type(descriptions)

pandas.core.series.Series

In [48]:
# Turn the 1-D label array into a column of single-label rows. -1 lets NumPy
# infer the row count instead of hard-coding the dataset size (425), so the
# cell keeps working if rows are added to or removed from the dataset.
genres = genres.reshape((-1, 1))

In [49]:
type(genres)

numpy.ndarray

In [50]:
# Positional 80/20 split: the first 80% of rows go to train, the remainder to test.
# NOTE: the train_test_split cells further down rebind train_descriptions,
# train_genres, test_descriptions and test_genres.
train_size = int(0.8 * len(descriptions))

train_descriptions, test_descriptions = (
    descriptions.iloc[:train_size].astype('str'),
    descriptions.iloc[train_size:].astype('str'),
)
train_genres, test_genres = genres[:train_size], genres[train_size:]

In [135]:
# Features: an independent copy of the description Series.
X  = descriptions.copy()
# Targets: the same array object as `genres` (no copy is made).
y = genres

In [137]:
# Split into train+val and test
# (20% held out for test, stratified on the labels, fixed seed)
X_trainval, test_descriptions, y_trainval, test_genres = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=69,
)

In [138]:
# Split train into train-val
# (10% of the train+val portion becomes validation, stratified, fixed seed)
# NOTE(review): X_val / y_val are not referenced by any later cell visible here.
train_descriptions, X_val, train_genres, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.1,
    stratify=y_trainval,
    random_state=21,
)

In [123]:
# ## create train weights
# from sklearn.utils.class_weight import compute_sample_weight
# weight = compute_sample_weight(class_weight='balanced', y=train_genres)
# train_descriptions['weight'] = weight.astype('float32')

In [124]:
# ## create test weights
# test_descriptions['weight'] = np.ones(len(test_genres)).astype('float32') ## set them all to 1

In [139]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each label row holds a single accident level (the labels were reshaped to one
# column earlier), so the binarizer produces one indicator row per sample.
# Fit on the training labels only and keep that result directly: the previous
# version discarded the fit_transform output and transformed the same data again.
encoder = MultiLabelBinarizer()
train_encoded = encoder.fit_transform(train_genres)
test_encoded = encoder.transform(test_genres)
num_classes = len(encoder.classes_)
print(num_classes)
# Print all accident levels known to the encoder and the encoded label of one
# training sample (index 16)
print(encoder.classes_)
print(train_encoded[16])

5
['I' 'II' 'III' 'IV' 'V']
[1 0 0 0 0]


In [140]:
# Text feature column backed by the Universal Sentence Encoder module from TF Hub.
# `key` names the entry looked up in the feature dict supplied by the input
# functions; trainable=False is passed so the module is not fine-tuned.
description_embeddings = hub.text_embedding_column(
    key="descriptions",
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    trainable=False,
)

In [None]:
#description_embeddings = hub.text_embedding_column(key="sentence", module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

In [128]:
#embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

In [129]:
#embeddings = embed(["The quick brown fox jumps over the lazy dog.","I am a sentence for which I would like to get its embedding"])

In [130]:
#print(session.run(embeddings))

In [141]:
# Classification head for the estimator, sized to the number of encoded classes.
# tf.contrib is used here, which ties this cell to the TF 1.x runtime selected above.
# NOTE(review): the encoded targets printed earlier are one-hot (one accident level
# per sample), yet the percentages printed by the final prediction cell do not sum
# to 100% — presumably because a multi-label head scores each class independently.
# Confirm whether a multi-class head was intended.
multi_label_head = tf.contrib.estimator.multi_label_head(
    num_classes,
    loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE
)

In [142]:
# Training features: the dict key must match the `key` given to the text
# embedding column. Builtin `str` replaces the `np.str` alias, which is
# deprecated since NumPy 1.20 and removed in 1.24 (it was the same type).
features = {
  "descriptions": np.array(train_descriptions).astype(str)
}
labels = np.array(train_encoded).astype(np.int32)

# Shuffled mini-batches of 32, cycling over the training data for 25 epochs.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    features, labels, shuffle=True, batch_size=32, num_epochs=25)

# Two hidden layers (64 and 10 units) on top of the sentence embeddings.
estimator = tf.estimator.DNNEstimator(
    head=multi_label_head,
    hidden_units=[64, 10],
    feature_columns=[description_embeddings],
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpn346xas6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbe1cb32650>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpn346xas6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbe1cb32650>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [143]:
# Train on train_input_fn (built above with batch_size=32 and num_epochs=25);
# no explicit step limit is passed here.
estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpn346xas6/model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpn346xas6/model.ckpt.


INFO:tensorflow:loss = 0.69845223, step = 1


INFO:tensorflow:loss = 0.69845223, step = 1


INFO:tensorflow:global_step/sec: 45.1487


INFO:tensorflow:global_step/sec: 45.1487


INFO:tensorflow:loss = 0.6721833, step = 101 (2.226 sec)


INFO:tensorflow:loss = 0.6721833, step = 101 (2.226 sec)


INFO:tensorflow:global_step/sec: 56.8662


INFO:tensorflow:global_step/sec: 56.8662


INFO:tensorflow:loss = 0.6478019, step = 201 (1.757 sec)


INFO:tensorflow:loss = 0.6478019, step = 201 (1.757 sec)


INFO:tensorflow:Saving checkpoints for 240 into /tmp/tmpn346xas6/model.ckpt.


INFO:tensorflow:Saving checkpoints for 240 into /tmp/tmpn346xas6/model.ckpt.


INFO:tensorflow:Loss for final step: 0.6632383.


INFO:tensorflow:Loss for final step: 0.6632383.


<tensorflow_estimator.python.estimator.canned.dnn.DNNEstimator at 0x7fbe1cac5210>

In [144]:
# Define our eval input_fn and run eval
# Builtin `str` replaces the `np.str` alias (deprecated since NumPy 1.20,
# removed in 1.24); shuffle=False keeps the test rows in their original order.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    {"descriptions": np.array(test_descriptions).astype(str)},
    test_encoded.astype(np.int32),
    shuffle=False)
estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore










INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2021-04-03T12:06:19Z


INFO:tensorflow:Starting evaluation at 2021-04-03T12:06:19Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /tmp/tmpn346xas6/model.ckpt-240


INFO:tensorflow:Restoring parameters from /tmp/tmpn346xas6/model.ckpt-240


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2021-04-03-12:06:27


INFO:tensorflow:Finished evaluation at 2021-04-03-12:06:27


INFO:tensorflow:Saving dict for global step 240: auc = 0.8616955, auc_precision_recall = 0.62739146, average_loss = 0.6432119, global_step = 240, loss = 0.6432119


INFO:tensorflow:Saving dict for global step 240: auc = 0.8616955, auc_precision_recall = 0.62739146, average_loss = 0.6432119, global_step = 240, loss = 0.6432119


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 240: /tmp/tmpn346xas6/model.ckpt-240


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 240: /tmp/tmpn346xas6/model.ckpt-240


{'auc': 0.8616955,
 'auc_precision_recall': 0.62739146,
 'average_loss': 0.6432119,
 'global_step': 240,
 'loss': 0.6432119}

In [162]:
# Ad-hoc example: one free-text accident description to classify, wrapped in a
# list because the prediction cell converts it with np.array(raw_test).
raw_test = ["Due to the overheating of 2 bars in row 5 of cell 7 a spark is produced, which is projected and manages to reach the Chief of guard who was in the corridor, producing a first degree burn in the neck."]

In [160]:
# Generate predictions
# The feature key matches the one used for training; no labels are supplied.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    {"descriptions": np.array(raw_test)},
    shuffle=False,
)
results = estimator.predict(predict_input_fn)

In [161]:
# Display predictions
# For every predicted example, list the four highest-scoring accident levels
# (best first) with their score as a percentage.
for acc_level in results:
  probabilities = acc_level['probabilities']
  # argsort is ascending: reverse it, then keep the first four indices.
  top_4 = probabilities.argsort()[::-1][:4]
  for level in top_4:
    text_acc_level = encoder.classes_[level]
    percentage = round(probabilities[level] * 100, 2)
    print(text_acc_level + ': ' + str(percentage) + '%')
  print('')

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.






INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /tmp/tmpn346xas6/model.ckpt-240


INFO:tensorflow:Restoring parameters from /tmp/tmpn346xas6/model.ckpt-240


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


I: 53.64%
II: 47.96%
IV: 47.71%
V: 47.69%

