In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import torch

# Loading DataFrame and removing unwanted columns

In [12]:
# Read the accident reports and discard the columns this notebook does not use.
unwanted_columns = ["Unnamed: 0", "Data", "Countries", "Local"]
df = pd.read_csv("Accident_description.csv").drop(columns=unwanted_columns)
df

Unnamed: 0,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee or Third Party,Critical Risk,Description
0,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...
1,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...
2,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...
3,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...
4,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...
...,...,...,...,...,...,...,...
420,Mining,I,III,Male,Third Party,Others,"Being approximately 5:00 a.m. approximately, w..."
421,Mining,I,II,Female,Employee,Others,The collaborator moved from the infrastructure...
422,Metals,I,II,Male,Employee,Venomous Animals,During the environmental monitoring activity i...
423,Metals,I,II,Male,Employee,Cut,The Employee performed the activity of strippi...


# Brief Data Analysis

In [8]:
# For every column except the last one, show how often each value occurs,
# followed by the number of distinct values in that column.
for column_name in df.columns[:-1]:
    column = df[column_name]
    print(column.value_counts())
    print(len(column.unique()))

Industry Sector
Mining    241
Metals    134
Others     50
Name: count, dtype: int64
3
Accident Level
I      316
II      40
III     31
IV      30
V        8
Name: count, dtype: int64
5
Potential Accident Level
IV     143
III    106
II      95
I       49
V       31
VI       1
Name: count, dtype: int64
6
Genre
Male      403
Female     22
Name: count, dtype: int64
2
Employee or Third Party
Third Party             189
Employee                179
Third Party (Remote)     57
Name: count, dtype: int64
3
Critical Risk
Others                                       232
Pressed                                       24
Manual Tools                                  20
Chemical substances                           17
Venomous Animals                              16
Cut                                           14
Projection                                    13
Bees                                          10
Fall                                           9
Vehicles and Mobile Equipment               

# Encoding categorical columns:

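* **Industry Sector:** One-hot encoding (planned; the cell below currently label-encodes this column)
* **Accident Level:** Label encoding
* **Potential Accident Level:** Label encoding
* **Genre:** One-hot encoding
* **Employee or Third Party:** Label encoding
* **Critical Risk:** Label encoding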

In [10]:
# Names of all columns except the last one.
df.columns[:-1]

Index(['Industry Sector', 'Accident Level', 'Potential Accident Level',
       'Genre', 'Employee or Third Party', 'Critical Risk'],
      dtype='object')

In [13]:
# Columns that are turned into integer codes with LabelEncoder. They are listed
# explicitly (rather than taken as "everything but the last column") so that
# re-running this cell never touches "Description": after the first run the
# last column is "Genre_Male", so a positional slice would pick up the text.
label_encoded_columns = [
    "Industry Sector",
    "Accident Level",
    "Potential Accident Level",
    "Employee or Third Party",
    "Critical Risk",
]

for col in label_encoded_columns:
    le = LabelEncoder()
    # Assign with df[col] = ... so the column is replaced and takes the integer
    # dtype of the codes. df.loc[:, col] = ... writes into the existing object
    # column in place and leaves the codes stored with dtype "object".
    df[col] = le.fit_transform(df[col])
# After the loop `le` is the encoder fitted on "Critical Risk" (the last column
# in the list).

# One-hot encode "Genre", dropping the first level so that a single indicator
# column remains. The guard makes the cell safe to run more than once.
if "Genre" in df.columns:
    df = pd.get_dummies(df, columns=["Genre"], drop_first=True)

df

Unnamed: 0,Industry Sector,Accident Level,Potential Accident Level,Employee or Third Party,Critical Risk,Description,Genre_Male
0,1,0,3,1,20,While removing the drill rod of the Jumbo 08 f...,True
1,1,0,3,0,21,During the activation of a sodium sulphide pum...,True
2,1,0,2,2,15,In the sub-station MILPO located at level +170...,True
3,1,0,0,1,16,Being 9:45 am. approximately in the Nv. 1880 C...,True
4,1,3,3,1,16,Approximately at 11:45 a.m. in circumstances t...,True
...,...,...,...,...,...,...,...
420,1,0,2,1,16,"Being approximately 5:00 a.m. approximately, w...",True
421,1,0,1,0,16,The collaborator moved from the infrastructure...,False
422,0,0,1,0,31,During the environmental monitoring activity i...,True
423,0,0,1,0,6,The Employee performed the activity of strippi...,True


# Learned Embedding on Critical Risk feature

In [17]:
# Size the embedding table from the number of distinct "Critical Risk" values;
# each of them is mapped to a vector of length `emb_size`.
emb_size = 5
n_unique_classes = df["Critical Risk"].nunique()
emb_layer = torch.nn.Embedding(n_unique_classes, emb_size)

In [20]:
# Adam (library-default hyper-parameters) over the embedding layer's
# parameters, paired with a mean-squared-error loss.
optimizer = torch.optim.Adam(params=emb_layer.parameters())
loss_fn = torch.nn.MSELoss()

  _torch_pytree._register_pytree_node(


In [None]:
# Index tensor to feed to the embedding layer.
# NOTE(review): this cell was left as an unfinished `X =`, which is a
# SyntaxError. The integer "Critical Risk" codes are presumably the intended
# input, since the embedding layer is sized from that column -- confirm.
X = torch.as_tensor(df["Critical Risk"].to_numpy(dtype="int64"))

In [None]:
for i in range(10):
    # TODO: presumably the training step goes here (forward pass, loss,
    # backward pass, optimizer step). The loop body was left empty, which is a
    # SyntaxError; `pass` keeps the notebook runnable until it is written.
    pass

In [4]:
from collections import Counter
import torch
import torch.nn as nn

# Toy corpus: a single space-separated string (casing is kept as written).
sentences = "i am new to PyTorch i am having fun"
words = sentences.split(" ")

# Count the tokens, then order the vocabulary from most to least frequent
# (tokens with equal counts keep their first-seen order).
word_counts = Counter(words)
print(f"Vocab: {word_counts}")
vocab = [token for token, _ in word_counts.most_common()]
print(f"Vocab: {vocab}")
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

# Give every vocabulary entry its position in the ordered list as its index.
word2idx = dict(zip(vocab, range(vocab_size)))
print(f"Word2idx: {word2idx}")

# Replace each token of the corpus by its index.
encoded_sentences = [word2idx[token] for token in words]
print(f"Encoded sentences: {encoded_sentences}")

# Look the indices up in a freshly initialised embedding table whose vectors
# have `emb_dim` components; the result has one row per token.
emb_dim = 3
emb_layer = nn.Embedding(vocab_size, emb_dim)
word_vectors = emb_layer(torch.tensor(encoded_sentences, dtype=torch.long))
print(f"Word vectors: {word_vectors}")
print(word_vectors.shape)

Vocab: Counter({'i': 2, 'am': 2, 'new': 1, 'to': 1, 'PyTorch': 1, 'having': 1, 'fun': 1})
Vocab: ['i', 'am', 'new', 'to', 'PyTorch', 'having', 'fun']
Vocab size: 7
Word2idx: {'i': 0, 'am': 1, 'new': 2, 'to': 3, 'PyTorch': 4, 'having': 5, 'fun': 6}
Encoded sentences: [0, 1, 2, 3, 4, 0, 1, 5, 6]
Word vectors: tensor([[ 0.6345,  1.7582, -1.4964],
        [-2.0281, -1.4347, -0.0276],
        [ 0.5465,  0.2519, -0.9080],
        [ 0.8139,  2.0662, -1.0541],
        [-1.2938, -1.6526, -1.5017],
        [ 0.6345,  1.7582, -1.4964],
        [-2.0281, -1.4347, -0.0276],
        [-0.0497, -0.9410,  0.1442],
        [-0.8747,  1.5118,  1.0076]], grad_fn=<EmbeddingBackward0>)
torch.Size([9, 3])
