<a href="https://colab.research.google.com/github/Deawsp/CodiEsp/blob/main/icd10_multi_label_text_classification_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing python libraries and preparing the environment

In [1]:
# Installing the transformer library
!pip install -q transformers

[K     |████████████████████████████████| 1.8MB 18.2MB/s 
[K     |████████████████████████████████| 890kB 54.6MB/s 
[K     |████████████████████████████████| 3.2MB 52.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
# setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

# Importing and Pre-Processing the domain data

In [None]:
# # # mount colab to google drive
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import zipfile
my_zipfolder = '/content/gdrive/MyDrive/icd10_multi_label_classification/train.csv.zip'
with zipfile.ZipFile(my_zipfolder, 'r') as zip_ref:
  zip_ref.extractall('working_directory')


In [None]:
# my_ziptrain = '/content/working_directory/train.csv.zip'
# with zipfile.ZipFile(my_ziptrain, 'r') as zip_reftrain:
#   zip_reftrain.extractall('data')


import os   
import shutil 
os.mkdir("data")
shutil.move("/content/working_directory/train.csv", "/content/data") 



'/content/data/train.csv'

In [None]:
df = pd.read_csv('/content/data/train.csv')
df['list'] = df[df.columns[2:]].values.tolist() #Create new column call list 
new_df = df[['text', 'list']].copy()
new_df.head()

Unnamed: 0,text,list
0,['A 62-year-old woman with a history of arteri...,"[1, 0, 1, 1, 0, 0, 0]"
1,['A 4-year-old patient presented with tirednes...,"[1, 0, 1, 0, 0, 1, 0]"
2,['A 72-year-old male diagnosed with primary op...,"[1, 0, 0, 0, 0, 0, 0]"
3,['A 49-year-old male smoker of 12 cigarettes a...,"[1, 0, 0, 0, 0, 0, 0]"
4,['A 39-year-old male patient presented to the ...,"[1, 0, 0, 0, 0, 0, 0]"


#Preparing the dataset and dataloader

In [None]:
# Sections of config
# Defining some key variables that will be used later on in the triaining
MAX_LEN =  512 # change to 512 instead of 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 50
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [None]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (339, 2)
TRAIN Dataset: (271, 2)
TEST Dataset: (68, 2)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

#Creating the Neural Network for Fine Tuning

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 7)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

#Fine Tuning the Model

In [None]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids, )

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
         
            # logger.info(f'Epoch:{epoch}, Loss: {loss.item()}'
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
import logging
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")

# a function  to create and save logs in the log files
def log(path, file):

    """[Create a log file to record the experiment's logs]
    
    Arguments:
        path {string} -- path to the directory
        file {string} -- file name
    
    Returns:
        [obj] -- [logger that record logs]
    """

    # check if the file exist
    log_file = os.path.join(path, file)

    if not os.path.isfile(log_file):
        open(log_file, "w+").close()

    console_logging_format = "%(message)s"
    file_logging_format = "%(message)s"
    
    
    

    # in case we want loglevel and time 
    # console_logging_format = "%(levelname)s %(message)s"
    # file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    # configure logger
    logging.basicConfig(level=logging.INFO, format=console_logging_format)
    logger = logging.getLogger()
    
    # create a file handler for output file
    handler = logging.FileHandler(log_file)

    # set the logging level for log file
    handler.setLevel(logging.INFO)
    
    # create a logging format
    formatter = logging.Formatter(file_logging_format)
    handler.setFormatter(formatter)
    

    # add the handlers to the logger
    logger.addHandler(handler)

    return logger

In [None]:
# set a logger file
# os.mkdir("logs")

path = "/content/logs"
logger = log(path, file="train.logs")


In [None]:
# logger.info("Start Training")

for epoch in range(EPOCHS):
    train(epoch)
    outputs, targets = validation()
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

    # logger.info(f"{accuracy}, {f1_score_micro}, {f1_score_macro}")

    # logger.info(f"{f1_score_micro},")
    # logger.info(f"{f1_score_macro},")
    # logger.info(f"Accuracy Score = {accuracy} ,")
    # logger.info(f"F1 Score (Micro) = {f1_score_micro} ,")
    # logger.info(f"F1 Score (Macro) = {f1_score_macro} ,")

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.7058376669883728
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 1, Loss:  0.5376745462417603
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 2, Loss:  0.5997713804244995
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 3, Loss:  0.5152737498283386
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 4, Loss:  0.5746962428092957
Accuracy Score = 0.014705882352941176
F1 Score (Micro) = 0.015037593984962407
F1 Score (Macro) = 0.009523809523809523

Epoch: 5, Loss:  0.495268851518631
Accuracy Score = 0.11764705882352941
F1 Score (Micro) = 0.40963855421686746
F1 Score (Macro) = 0.261352843024002

Epoch: 6, Loss:  0.45978689193725586
Accuracy Score = 0.16176470588235295
F1 Score (Micro) = 0.4606741573033708
F1 Score (Macro) = 0.3263853263853264

Epoch: 7, Loss:  0.3597797155380249
Accuracy Score = 0.14705882352941177
F1 Score (Micro) = 0.4457142857142857
F1 Score (Macro) =

In [None]:
import pandas as pd
import pylab as plt

# Create dataframe
file_name = "/content/logs/train.logs"
dflog = pd.read_csv(file_name)
# dflog.plot()
# plt.show()
dflog.tail()

Unnamed: 0,0.3235294117647059,0.6602870813397129,0.6159376532275693
44,0.323529,0.666667,0.622765
45,0.338235,0.666667,0.62788
46,0.382353,0.681416,0.646638
47,0.338235,0.669725,0.634705
48,0.308824,0.657277,0.619133


#Validating the Model

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")