# Import Libraries

In [1]:
#!pip install transformers==4.6.1
#!pip install transformers-interpret


In [2]:
import pandas as pd
import numpy as np

import numpy as np
import pandas as pd
import random,os
import sklearn
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
from tqdm import tqdm
from sklearn import metrics
import re
import math

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torch.optim.lr_scheduler import *
import transformers
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer


# Improt Dataset

In [3]:
fmfDataset = pd.read_excel("data_set2.xlsx")

# Clean the data

In [4]:
df = fmfDataset[["Parsed Event Description", "Broad Category 1"]]
len(df)

5630

In [5]:
df = df.dropna()
len(df)

5630

In [6]:
# dropping ALL duplicate values
df.drop_duplicates(subset ="Parsed Event Description", keep = False, inplace = True)
len(df)

5236

In [7]:
if torch.cuda.is_available():     
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
print( 'device set to =>', device)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
device set to => cuda


In [8]:
def remove(sentences):
    sentences = re.sub('[^., A-Za-z0-9]+', '', sentences)
    sentences = ".".join([" ".join(sentance.split(" ")[4:]) if "IT WAS REPORTED" in sentance or "IT IS REPORT" in sentance or "CUSTOMER REPORT" in sentance else sentance for sentance in sentences.split(".")])

    last_sentence = sentences.split(".")[-1] 
    if "INJURY OR MEDICAL INTERVENTION" in last_sentence:
        sentences = "".join(sentences.split(".")[:-1])

    return sentences

In [9]:
df["cleaned text"] = df["Parsed Event Description"].apply(remove)

In [10]:
class Config:    
    max_len = 128
    output_dim = 768
    classes = 6
    batch_size = 8
    epochs = 10
    lr = 0.00001 #0.00020892962347716094
    take_mean = True 
    return_index = 0
    text_column = "cleaned text"
    label_column = "label"

In [11]:
df["Broad Category 1"].unique()

array(['Components - missing/not working/detached',
       'Contamination - Syringe/needle', 'Components - Damage / breakage',
       'Labelling  - Incorrect/missing label/markings',
       'Safety feature - Failed/needlestick injury',
       'Packaging - Contaminated/damaged'], dtype=object)

In [12]:
score_map = dict(zip(range(6), ['Components - missing/not working/detached', 'Components - Damage / breakage', 'Contamination - Syringe/needle', 'Labelling  - Incorrect/missing label/markings', 'Safety feature - Failed/needlestick injury', 'Packaging - Contaminated/damaged']))
inverse_score_map = dict(zip(['Components - missing/not working/detached', 'Components - Damage / breakage', 'Contamination - Syringe/needle', 'Labelling  - Incorrect/missing label/markings', 'Safety feature - Failed/needlestick injury', 'Packaging - Contaminated/damaged'], range(6)))


In [13]:
df["label"] = df['Broad Category 1'].map(inverse_score_map)
df.head()


Unnamed: 0,Parsed Event Description,Broad Category 1,cleaned text,label
0,IT WAS REPORTED IN 10 BD¿ SYRINGES WITH NEEDLE...,Components - missing/not working/detached,10 BD SYRINGES WITH NEEDLES LUER SLIP SYRINGES...,0
1,ON (B)(6) 2016 I HAD AN INJECTION INTO MY LEFT...,Contamination - Syringe/needle,ON B6 2016 I HAD AN INJECTION INTO MY LEFT EYE...,2
10,"CUSTOMER REPORTED THE FOLLOWING PROBLEM: ""I HA...",Components - missing/not working/detached,PROBLEM I HAVE ORDERED THIS PRODUCT MANY TIMES...,0
11,"(B)(4). THIS DEVICE CASE, WHICH DOES NOT INVOL...",Components - Damage / breakage,"B4. THIS DEVICE CASE, WHICH DOES NOT INVOLVE A...",1
12,"(B)(4). THIS DEVICE CASE, WHICH DOES NOT INVOL...",Components - Damage / breakage,"B4. THIS DEVICE CASE, WHICH DOES NOT INVOLVE A...",1


In [14]:
df["rev_label"] = df['label'].map(score_map)
df.head()


Unnamed: 0,Parsed Event Description,Broad Category 1,cleaned text,label,rev_label
0,IT WAS REPORTED IN 10 BD¿ SYRINGES WITH NEEDLE...,Components - missing/not working/detached,10 BD SYRINGES WITH NEEDLES LUER SLIP SYRINGES...,0,Components - missing/not working/detached
1,ON (B)(6) 2016 I HAD AN INJECTION INTO MY LEFT...,Contamination - Syringe/needle,ON B6 2016 I HAD AN INJECTION INTO MY LEFT EYE...,2,Contamination - Syringe/needle
10,"CUSTOMER REPORTED THE FOLLOWING PROBLEM: ""I HA...",Components - missing/not working/detached,PROBLEM I HAVE ORDERED THIS PRODUCT MANY TIMES...,0,Components - missing/not working/detached
11,"(B)(4). THIS DEVICE CASE, WHICH DOES NOT INVOL...",Components - Damage / breakage,"B4. THIS DEVICE CASE, WHICH DOES NOT INVOLVE A...",1,Components - Damage / breakage
12,"(B)(4). THIS DEVICE CASE, WHICH DOES NOT INVOL...",Components - Damage / breakage,"B4. THIS DEVICE CASE, WHICH DOES NOT INVOLVE A...",1,Components - Damage / breakage


In [15]:
modelGlobVar = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
tokenizer = AutoTokenizer.from_pretrained(modelGlobVar)

train, val = train_test_split(df, test_size = 0.15, random_state = 0)


Downloading:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/198 [00:00<?, ?B/s]

In [16]:
class DatasetCreation(Dataset):
    def __init__( self, text_column, label_column, tokenizer):
        self.text_column = text_column
        self.label_column = label_column
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.text_column)
        
    def __getitem__(self, idx):
        text_data = self.tokenizer.encode_plus(
            self.text_column[idx],
            add_special_tokens = True,
            truncation = True,
            return_attention_mask = True,
            padding= 'max_length',
            max_length = Config.max_len,
        )            #pad_to_max_length = True,
        input_ids =text_data['input_ids']
        masks = text_data['attention_mask']
        label_column = self.label_column[idx]
        
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'label': torch.tensor(label_column, dtype=torch.long),
            'attention_mask': torch.tensor(masks, dtype=torch.long )
        }

In [17]:
train_data = DatasetCreation(text_column= train["Parsed Event Description"].values,label_column = train["label"].values,tokenizer = tokenizer)
val_data = DatasetCreation(text_column= val["Parsed Event Description"].values ,label_column = val["label"].values,tokenizer = tokenizer)

In [18]:
trainDataLoader = DataLoader(train_data, batch_size=Config.batch_size, shuffle=True)
valDataLoader = DataLoader(val_data, batch_size=Config.batch_size)

In [19]:
class ClassifierModel(nn.Module):
    def __init__( self, dropout):
        super(ClassifierModel, self).__init__()
        self.bert = AutoModel.from_pretrained(modelGlobVar) 
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(Config.output_dim, Config.classes)
        self.relu = nn.ReLU()
        
    def forward( self, input_id, mask):
        pooled_out = self.bert(input_ids= input_id, attention_mask=mask)['pooler_output']    
        dropuout_output = self.dropout(pooled_out)
        linear_output = self.linear(dropuout_output)
        final_layer = self.relu(linear_output)
        return final_layer


In [20]:
modelClassifier = ClassifierModel(dropout = 0.20)
modelClassifier = modelClassifier.to(device)

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [21]:
def evaluation_metrics(y_true, y_pred, strMethod):
    precision = metrics.precision_score(y_true, y_pred, average="micro")
    cm = metrics.confusion_matrix(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred, average="micro")
    
    balanced_accuracy = metrics.balanced_accuracy_score(y_true, y_pred, adjusted=False)
    f1_score = metrics.f1_score(y_true, y_pred, average= "micro") 
    
    print(f'\n{strMethod}: \nBalanced Accuracy {balanced_accuracy}, \nPrecision {precision} \nRecall {recall} \nF1 Score {f1_score}')
    print("Confusion Matrix: ", cm)

    return f1_score

In [22]:
def training(model, train_dl, val_dl, lr, epochs):
    optimizer = torch.optim.Adam(modelClassifier.parameters(), lr = lr) # play with weight decay and scheduler and optimizer , weight_decay = 0
    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
    
    prev_loss = 9999
    loss = 999
    prev_f1 = -999
    f1_score = -999
    for epoch_num in range(epochs):

        list_label = []
        list_prob = []
        total_train_loss = 0 # accuracy metrics

        
        for item in tqdm(train_dl):
            train_label = item['label'].to(device)
            mask = item['attention_mask'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)
            
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_train_loss += batch_loss.item()            

            train_label = train_label.cpu().detach().numpy()
            sigmoid_prob = output.argmax(dim=1).cpu().detach().numpy()

            list_label.append(np.array(train_label))
            list_prob.append(np.array(sigmoid_prob))

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        
        list_label = np.concatenate(list_label, axis=0)
        list_prob = np.concatenate(list_prob, axis=0)

        total_val_loss = 0
        list_val_label = []
        list_val_prob = []
        lst = []
        j = 0
        with torch.no_grad():
            for item in val_dl:
                val_label = item['label'].to(device)
                mask = item['attention_mask'].to(device)
                input_id = item['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                bach_loss = criterion(output, val_label)
                total_val_loss += batch_loss.item()
                
                val_label = val_label.cpu().detach().numpy()
                sigmoid_prob = output.argmax(dim=1).cpu().detach().numpy()

                for i in range(len(val_label)):
                    if(val_label[i] != sigmoid_prob[i]):
                        lst.append([j, val_label[i], sigmoid_prob[i]])
                    j += 1

                list_val_label.append(np.array(val_label))
                list_val_prob.append(np.array(sigmoid_prob))

        list_val_label = np.concatenate(list_val_label, axis=0)
        list_val_prob = np.concatenate(list_val_prob, axis=0)

        print( f'\nEpochs: {epoch_num + 1} | Train Loss: {total_train_loss / len(train_data): .3f} | Validation Loss: {total_val_loss / len(val_data)}')

        evaluation_metrics(list_label, list_prob, "training")        
        f1_score = evaluation_metrics(list_val_label, list_val_prob, "validation")
        loss = total_val_loss

        if(prev_loss > loss and prev_f1 < f1_score):
            prev_loss = loss
            prev_f1 = f1_score
            modelClassifier.bert.save_pretrained("savedAgain")

            val["predictions"] = list_val_prob

        val["predicted_classes"] = val ['predictions'].map(score_map)
        val.to_csv("val_predictions.csv")

training(modelClassifier, trainDataLoader, valDataLoader, Config.lr, Config.epochs)


100%|██████████| 557/557 [01:44<00:00,  5.35it/s]



Epochs: 1 | Train Loss:  0.095 | Validation Loss: 0.09426839797551395

training: 
Balanced Accuracy 0.528185996525078, 
Precision 0.7460674157303371 
Recall 0.7460674157303371 
F1 Score 0.746067415730337
Confusion Matrix:  [[1132  183   48   24    0    2]
 [ 218  956   30    8    0    0]
 [ 154  114  931   47    3   10]
 [  40   30    8  298    0    0]
 [  84   33    6    1    0    0]
 [  20   22   40    5    0    3]]

validation: 
Balanced Accuracy 0.5858410613541211, 
Precision 0.8358778625954199 
Recall 0.8358778625954199 
F1 Score 0.8358778625954199
Confusion Matrix:  [[197  16   8   1   0   0]
 [ 44 175   5   0   0   0]
 [  5   2 226   1   0   0]
 [  7   0   1  59   0   0]
 [  8   9   2   0   0   0]
 [  2   3  15   0   0   0]]


100%|██████████| 557/557 [01:48<00:00,  5.13it/s]



Epochs: 2 | Train Loss:  0.047 | Validation Loss: 0.055510127931150774

training: 
Balanced Accuracy 0.699579434691412, 
Precision 0.8847191011235955 
Recall 0.8847191011235955 
F1 Score 0.8847191011235955
Confusion Matrix:  [[1240  107   31    9    0    2]
 [ 122 1062   21    4    0    3]
 [   5   14 1227    3    0   10]
 [   7    2    1  364    0    2]
 [  70   30   15    2    1    6]
 [   4    5   37    1    0   43]]

validation: 
Balanced Accuracy 0.6859142250638716, 
Precision 0.8587786259541985 
Recall 0.8587786259541985 
F1 Score 0.8587786259541985
Confusion Matrix:  [[185  25   8   1   0   3]
 [ 28 188   6   0   2   0]
 [  1   3 227   2   0   1]
 [  2   0   0  65   0   0]
 [  6  10   1   0   1   1]
 [  0   2   8   1   0   9]]


100%|██████████| 557/557 [01:55<00:00,  4.84it/s]



Epochs: 3 | Train Loss:  0.025 | Validation Loss: 0.002902252699127634

training: 
Balanced Accuracy 0.8875557889629907, 
Precision 0.9462921348314607 
Recall 0.9462921348314607 
F1 Score 0.9462921348314607
Confusion Matrix:  [[1310   49   19    5    4    2]
 [  62 1131    9    1    7    2]
 [   1    6 1237    0    1   14]
 [   0    1    0  375    0    0]
 [  13   13    1    0   94    3]
 [   4    3   18    0    1   64]]

validation: 
Balanced Accuracy 0.743080145397506, 
Precision 0.8447837150127226 
Recall 0.8447837150127226 
F1 Score 0.8447837150127226
Confusion Matrix:  [[177  35   7   1   1   1]
 [ 27 184   5   0   8   0]
 [  4   4 223   1   0   2]
 [  4   1   1  61   0   0]
 [  3   6   0   0  10   0]
 [  1   1   9   0   0   9]]


100%|██████████| 557/557 [01:55<00:00,  4.82it/s]



Epochs: 4 | Train Loss:  0.014 | Validation Loss: 0.0014562246155079084

training: 
Balanced Accuracy 0.9472378897140953, 
Precision 0.9710112359550562 
Recall 0.9710112359550562 
F1 Score 0.9710112359550562
Confusion Matrix:  [[1333   33   12    5    6    0]
 [  26 1175    5    1    5    0]
 [   2    2 1249    0    0    6]
 [   0    1    0  375    0    0]
 [   5    7    0    1  110    1]
 [   0    0   11    0    0   79]]

validation: 
Balanced Accuracy 0.7291448939573448, 
Precision 0.8562340966921119 
Recall 0.8562340966921119 
F1 Score 0.8562340966921119
Confusion Matrix:  [[186  26   6   1   1   2]
 [ 33 182   6   0   3   0]
 [  2   3 226   1   0   2]
 [  4   0   0  63   0   0]
 [  5   7   0   0   7   0]
 [  0   2   8   1   0   9]]


100%|██████████| 557/557 [01:55<00:00,  4.84it/s]



Epochs: 5 | Train Loss:  0.008 | Validation Loss: 0.0006280578121912616

training: 
Balanced Accuracy 0.9779653412802674, 
Precision 0.9840449438202247 
Recall 0.9840449438202247 
F1 Score 0.9840449438202247
Confusion Matrix:  [[1361   13   11    2    1    1]
 [  16 1187    4    0    5    0]
 [   2    2 1251    0    0    4]
 [   1    1    0  374    0    0]
 [   1    2    0    0  121    0]
 [   0    0    5    0    0   85]]

validation: 
Balanced Accuracy 0.7324714617600517, 
Precision 0.851145038167939 
Recall 0.851145038167939 
F1 Score 0.851145038167939
Confusion Matrix:  [[184  28   6   1   1   2]
 [ 33 181   5   0   5   0]
 [  1   4 225   1   0   3]
 [  5   0   0  62   0   0]
 [  6   5   0   0   8   0]
 [  0   2   8   1   0   9]]


100%|██████████| 557/557 [01:55<00:00,  4.83it/s]



Epochs: 6 | Train Loss:  0.006 | Validation Loss: 0.0005875761471404374

training: 
Balanced Accuracy 0.992167929217076, 
Precision 0.9898876404494382 
Recall 0.9898876404494382 
F1 Score 0.9898876404494382
Confusion Matrix:  [[1369   10    7    1    2    0]
 [  15 1194    3    0    0    0]
 [   3    1 1254    0    0    1]
 [   0    1    0  375    0    0]
 [   0    0    0    0  124    0]
 [   0    0    1    0    0   89]]

validation: 
Balanced Accuracy 0.7609961443892118, 
Precision 0.8524173027989822 
Recall 0.8524173027989822 
F1 Score 0.8524173027989822
Confusion Matrix:  [[161  49   7   1   2   2]
 [ 15 197   6   1   5   0]
 [  0   3 227   1   0   3]
 [  0   1   0  65   0   1]
 [  2   8   1   0   8   0]
 [  0   1   6   1   0  12]]


100%|██████████| 557/557 [01:55<00:00,  4.83it/s]



Epochs: 7 | Train Loss:  0.004 | Validation Loss: 0.015006768680710828

training: 
Balanced Accuracy 0.9858224116877098, 
Precision 0.9914606741573033 
Recall 0.9914606741573033 
F1 Score 0.9914606741573033
Confusion Matrix:  [[1372    8    3    2    1    3]
 [   7 1201    2    1    1    0]
 [   2    1 1255    0    0    1]
 [   0    0    0  376    0    0]
 [   1    1    0    0  122    0]
 [   0    0    4    0    0   86]]

validation: 
Balanced Accuracy 0.8016307250056268, 
Precision 0.8435114503816794 
Recall 0.8435114503816794 
F1 Score 0.8435114503816794
Confusion Matrix:  [[181  30   5   1   3   2]
 [ 27 182   4   0  11   0]
 [  6   6 214   1   0   7]
 [  6   1   0  59   0   1]
 [  1   4   0   0  14   0]
 [  0   1   4   1   1  13]]


100%|██████████| 557/557 [01:55<00:00,  4.83it/s]



Epochs: 8 | Train Loss:  0.004 | Validation Loss: 0.0002290087935109516

training: 
Balanced Accuracy 0.9866489102927017, 
Precision 0.990561797752809 
Recall 0.990561797752809 
F1 Score 0.990561797752809
Confusion Matrix:  [[1375    7    3    1    2    1]
 [   8 1198    2    1    2    1]
 [   2    2 1251    0    0    4]
 [   0    1    0  375    0    0]
 [   0    2    0    0  122    0]
 [   0    0    3    0    0   87]]

validation: 
Balanced Accuracy 0.7384152435262018, 
Precision 0.8486005089058524 
Recall 0.8486005089058524 
F1 Score 0.8486005089058524
Confusion Matrix:  [[188  26   5   1   0   2]
 [ 34 182   5   0   3   0]
 [ 11   3 217   1   0   2]
 [  4   0   0  62   0   1]
 [  6   6   0   0   7   0]
 [  1   1   6   1   0  11]]


100%|██████████| 557/557 [01:55<00:00,  4.84it/s]



Epochs: 9 | Train Loss:  0.003 | Validation Loss: 0.00017725305679744779

training: 
Balanced Accuracy 0.9932613847313888, 
Precision 0.9930337078651685 
Recall 0.9930337078651685 
F1 Score 0.9930337078651685
Confusion Matrix:  [[1376    7    2    1    2    1]
 [   6 1200    2    0    3    1]
 [   2    1 1256    0    0    0]
 [   1    0    0  375    0    0]
 [   0    1    1    0  122    0]
 [   0    0    0    0    0   90]]

validation: 
Balanced Accuracy 0.7011638598956943, 
Precision 0.8409669211195929 
Recall 0.8409669211195929 
F1 Score 0.8409669211195929
Confusion Matrix:  [[170  44   4   1   1   2]
 [ 22 195   4   1   2   0]
 [  0  10 221   1   0   2]
 [  4   1   0  61   0   1]
 [  2  11   0   0   6   0]
 [  0   2   9   1   0   8]]


100%|██████████| 557/557 [01:55<00:00,  4.83it/s]



Epochs: 10 | Train Loss:  0.003 | Validation Loss: 0.0005360650332317552

training: 
Balanced Accuracy 0.991955532608606, 
Precision 0.9912359550561798 
Recall 0.9912359550561798 
F1 Score 0.9912359550561798
Confusion Matrix:  [[1370   10    3    1    4    1]
 [  13 1198    1    0    0    0]
 [   1    1 1257    0    0    0]
 [   1    1    0  374    0    0]
 [   1    1    0    0  122    0]
 [   0    0    0    0    0   90]]

validation: 
Balanced Accuracy 0.7553534278961419, 
Precision 0.8473282442748091 
Recall 0.8473282442748091 
F1 Score 0.8473282442748091
Confusion Matrix:  [[199  16   5   1   0   1]
 [ 52 161   5   0   6   0]
 [  6   1 223   1   0   3]
 [  4   0   0  63   0   0]
 [  6   4   0   0   9   0]
 [  4   0   4   1   0  11]]


In [23]:
modelLoaded = AutoModelForSequenceClassification.from_pretrained("savedAgain")
multiclass_explainer = SequenceClassificationExplainer(model=modelLoaded, tokenizer=tokenizer)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at savedAgain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
multiclass_explainer

<transformers_interpret.explainers.sequence_classification.SequenceClassificationExplainer at 0x7f934da1c650>

In [25]:
textSen = "WHILE USING A UNSPECIFIED BDÂ¿ SYRINGE WITH NEEDLE, Â¿WHEN TRYING TO PUT THE SHIELD ONTO THE NEEDLE, THE NEEDLE WENT THROUGH THE SHIELD AND POKED HERÂ¿. IT IS UN-CLEAR IF THE STICK WAS CLEAN OR DIRTY, IT IS UNKNOWN WHOM THE NEEDLE WAS USED ON."


In [26]:
word_attributions = multiclass_explainer(text=textSen) #" Components - Damage / breakage" # first and last
html = multiclass_explainer.visualize()


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (0.52),LABEL_1,-0.12,"[CLS] while using a unspec ##ified bd ##a ¿ syringe with needle , a ¿ when trying to put the shield onto the needle , the needle went through the shield and po ##ke ##d her ##a ¿ . it is un - clear if the stick was clean or dir ##ty , it is unknown whom the needle was used on . [SEP]"
,,,,


# DONE!