<a href="https://colab.research.google.com/github/Deawsp/CodiEsp/blob/main/icd10_multi_label_classification_codiesp50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing python libraries and preparing the environment

In [1]:
# Installing the transformer library
!pip install -q transformers

[K     |████████████████████████████████| 1.8MB 10.7MB/s 
[K     |████████████████████████████████| 890kB 37.4MB/s 
[K     |████████████████████████████████| 3.2MB 37.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
# Importing ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [3]:
# setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Importing and Pre-Processing the domain data

In [4]:
# # # mount colab to google drive
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import zipfile
my_zipfolder = '/content/gdrive/MyDrive/icd10_multi_label_classification/train.csv.zip'
with zipfile.ZipFile(my_zipfolder, 'r') as zip_ref:
  zip_ref.extractall('working_directory')


In [6]:
# # # Copy multiple file in google drive folder to another folder
import os
import shutil
src ='/content/gdrive/MyDrive/icd_codiesp'
src_files = os.listdir(src)
if not os.path.exists('/content/data'):
    os.mkdir('/content/data')
for file_name in src_files:
    full_file_name = os.path.join(src, file_name)
    if os.path.isfile(full_file_name):
        shutil.copy(full_file_name, '/content/data')

In [7]:
df = pd.read_csv('/content/data/codiEsp50.csv')
df['list'] = df[df.columns[2:]].values.tolist() #Create new column call list 
new_df = df[['text', 'list']].copy()
new_df.head()

Unnamed: 0,text,list
0,['A 49-year-old male smoker of 12 cigarettes a...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,['We describe the case of a 47-year-old female...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,['This is a one-month-old male of Moroccan ori...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,['A 39-year-old woman diagnosed with complex r...,"[1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"['A 3-year-old male patient, with no relevant ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#Preparing the dataset and dataloader

In [31]:
# Sections of config
# Defining some key variables that will be used later on in the triaining
MAX_LEN =  512 # change to 512 instead of 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 100
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',return_dict=False)

In [32]:
import pandas as pd
import os
import torch
from pathlib import Path
import pickle

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
                                  XLNetTokenizer)

MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer)
}

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        if isinstance(label, list):
            self.label = label
        elif label:
            self.label = str(label)
        else:
            self.label = None


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        try:
            tokens_a = tokenizer.tokenize(example.text_a)
        except:
            print("Cannot tokenise item {}, Text:{}".format(
                ex_index, example.text_a))

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if isinstance(example.label, list):
            label_id = []
            for label in example.label:
                label_id.append(float(label))
        else:
            if example.label != None:
                label_id = label_map[example.label]
            else:
                label_id = ''

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, filename, size=-1):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, filename, size=-1):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, filename, size=-1):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()


class NERTextProcessor(DataProcessor):

    def __init__(self, data_dir, label_dir):
        self.data_dir = data_dir
        self.label_dir = label_dir
        self.labels = None

    def get_train_examples(self, filename='train.txt'):
        """Gets a collection of `InputExample`s for the dev set."""
        return self._create_examples(self.read_col_file(os.path.join(self.data_dir, filename)), "train")

    def get_dev_examples(self, filename='val.txt', size=-1):
        """Gets a collection of `InputExample`s for the dev set."""
        return self._create_examples(self.read_col_file(os.path.join(self.data_dir, filename)), "val")

    def get_test_examples(self, filename='test.txt', size=-1):
        """Gets a collection of `InputExample`s for the test set."""
        return self._create_examples(self.read_col_file(os.path.join(self.data_dir, filename)), "test")

    def get_labels(self, filename='labels.csv'):
        """See base class."""
        if self.labels == None:
            self.labels = list(pd.read_csv(os.path.join(
                self.label_dir, filename), header=None)[0].astype('str').values)
        return self.labels

    def _create_examples(self, lines, set_type):
        examples = []
        for i, (sentence, label) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = ' '.join(sentence)
            text_b = None
            label = label
            examples.append(InputExample(
                guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def read_col_file(self, filename):
        '''
        read file
        return format :
        [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], 
        ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
        '''
        f = open(filename)
        data = []
        sentence = []
        label = []
        for line in f:
            if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
                if len(sentence) > 0:
                    data.append((sentence, label))
                    sentence = []
                    label = []
                continue
            splits = line.split(' ')
            sentence.append(splits[0])
            label.append(splits[-1][:-1])

        if len(sentence) > 0:
            data.append((sentence, label))
            sentence = []
            label = []
        return data


class TextProcessor(DataProcessor):

    def __init__(self, data_dir, label_dir):
        self.data_dir = data_dir
        self.label_dir = label_dir
        self.labels = None

    def get_train_examples(self, filename='train.csv', text_col='text', label_col='label', size=-1):

        if size == -1:
            data_df = pd.read_csv(os.path.join(self.data_dir, filename))

            return self._create_examples(data_df, "train", text_col=text_col, label_col=label_col)
        else:
            data_df = pd.read_csv(os.path.join(self.data_dir, filename))
#             data_df['comment_text'] = data_df['comment_text'].apply(cleanHtml)
            return self._create_examples(data_df.sample(size), "train", text_col=text_col, label_col=label_col)

    def get_dev_examples(self, filename='val.csv', text_col='text', label_col='label', size=-1):

        if size == -1:
            data_df = pd.read_csv(os.path.join(self.data_dir, filename))
            return self._create_examples(data_df, "dev", text_col=text_col, label_col=label_col)
        else:
            data_df = pd.read_csv(os.path.join(self.data_dir, filename))
            return self._create_examples(data_df.sample(size), "dev", text_col=text_col, label_col=label_col)

    def get_test_examples(self, filename='val.csv', text_col='text', label_col='label', size=-1):
        data_df = pd.read_csv(os.path.join(self.data_dir, filename))
#         data_df['comment_text'] = data_df['comment_text'].apply(cleanHtml)
        if size == -1:
            return self._create_examples(data_df, "test",  text_col=text_col, label_col=None)
        else:
            return self._create_examples(data_df.sample(size), "test", text_col=text_col, label_col=None)

    def get_labels(self, filename='labels.csv'):
        """See base class."""
        if self.labels == None:
            self.labels = list(pd.read_csv(os.path.join(
                self.label_dir, filename), header=None)[0].astype('str').values)
        return self.labels

    def _create_examples(self, df, set_type, text_col, label_col):
        """Creates examples for the training and dev sets."""
        if label_col == None:
            return list(df.apply(lambda row: InputExample(guid=row.index, text_a=row[text_col], label=None), axis=1))
        else:
            return list(df.apply(lambda row: InputExample(guid=row.index, text_a=row[text_col], label=str(row[label_col])), axis=1))


class MultiLabelTextProcessor(TextProcessor):

    def _create_examples(self, df, set_type, text_col, label_col):
        def _get_labels(row, label_col):
            if isinstance(label_col, list):
                return list(row[label_col])
            else:
                # create one hot vector of labels
                label_list = self.get_labels()
                labels = [0] * len(label_list)
                labels[label_list.index(row[label_col])] = 1
                return labels

        """Creates examples for the training and dev sets."""
        if label_col == None:
            return list(df.apply(lambda row: InputExample(guid=row.index, text_a=row[text_col], label=[]), axis=1))
        else:
            return list(df.apply(lambda row: InputExample(guid=row.index, text_a=row[text_col],
                                                          label=_get_labels(row, label_col)), axis=1))


class BertDataBunch(object):

    def get_dl_from_texts(self, texts):

        test_examples = []
        input_data = []

        for index, text in enumerate(texts):
            test_examples.append(InputExample(index, text, label=None))
            input_data.append({
                'id': index,
                'text': text
            })
        test_features = convert_examples_to_features(test_examples, label_list=self.labels,
                                                     tokenizer=self.tokenizer, max_seq_length=self.maxlen)

        all_input_ids = torch.tensor(
            [f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor(
            [f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor(
            [f.segment_ids for f in test_features], dtype=torch.long)

        test_data = TensorDataset(
            all_input_ids, all_input_mask, all_segment_ids)

        test_sampler = SequentialSampler(test_data)
        return DataLoader(test_data, sampler=test_sampler, batch_size=self.bs)

    def save(self, filename="databunch.pkl"):
        tmp_path = self.data_dir/'tmp'
        tmp_path.mkdir(exist_ok=True)
        with open(str(tmp_path/filename), "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(data_dir, backend='nccl', filename="databunch.pkl"):

        try:
            torch.distributed.init_process_group(backend=backend,
                                                 init_method="tcp://localhost:23459",
                                                 rank=0, world_size=1)
        except:
            pass

        tmp_path = data_dir/'tmp'
        with open(str(tmp_path/filename), "rb") as f:
            databunch = pickle.load(f)

        return databunch

    def __init__(self, data_dir, label_dir, tokenizer, train_file='train.csv', val_file='val.csv', test_data=None,
                 label_file='labels.csv', text_col='text', label_col='label', bs=32, maxlen=512,
                 multi_gpu=True, multi_label=False, backend="nccl", model_type='bert', custom_sampler=None):
        
        if isinstance(tokenizer, str):
            _,_,tokenizer_class = MODEL_CLASSES[model_type]
            # instantiate the new tokeniser object using the tokeniser name
            tokenizer = tokenizer_class.from_pretrained(tokenizer, do_lower_case=('uncased' in tokenizer))

        self.tokenizer = tokenizer  
        self.data_dir = data_dir
        self.maxlen = maxlen
        self.bs = bs
        self.train_dl = None
        self.val_dl = None
        self.test_dl = None
        self.multi_label = multi_label
        self.n_gpu = 0
        self.custom_sampler = custom_sampler
        if multi_gpu:
            self.n_gpu = torch.cuda.device_count()

        if multi_label:
            processor = MultiLabelTextProcessor(data_dir, label_dir)
        else:
            processor = TextProcessor(data_dir, label_dir)

        self.labels = processor.get_labels(label_file)

        if train_file:
            # Train DataLoader
            train_examples = processor.get_train_examples(
                train_file, text_col=text_col, label_col=label_col)
            train_features = convert_examples_to_features(train_examples, label_list=self.labels,
                                                          tokenizer=tokenizer, max_seq_length=maxlen)

            all_input_ids = torch.tensor(
                [f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if multi_label:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)

            train_data = TensorDataset(
                all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

            train_batch_size = bs * max(1, self.n_gpu)

            if multi_gpu:
                if self.custom_sampler is not None:
                    train_sampler = self.custom_sampler
                else:
                    train_sampler = RandomSampler(train_data)
            else:
                try:
#                    torch.distributed.init_process_group(backend='nccl')
                    torch.distributed.init_process_group(backend=backend,
                                                         init_method="tcp://localhost:23459",
                                                         rank=0, world_size=1)
                except:
                    pass
                # torch.distributed.init_process_group(backend='nccl')
                train_sampler = DistributedSampler(train_data)
            self.train_dl = DataLoader(
                train_data, sampler=train_sampler, batch_size=train_batch_size)

        if val_file:
            # Validation DataLoader
            val_examples = processor.get_dev_examples(
                val_file, text_col=text_col, label_col=label_col)
            val_features = convert_examples_to_features(val_examples, label_list=self.labels,
                                                        tokenizer=tokenizer, max_seq_length=maxlen)

            all_input_ids = torch.tensor(
                [f.input_ids for f in val_features], dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in val_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in val_features], dtype=torch.long)
            if multi_label:
                all_label_ids = torch.tensor(
                    [f.label_id for f in val_features], dtype=torch.float)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in val_features], dtype=torch.long)

            val_data = TensorDataset(
                all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

            val_batch_size = bs * max(1, self.n_gpu)
            if multi_gpu:
                val_sampler = RandomSampler(val_data)
            else:
                try:
#                    torch.distributed.init_process_group(backend=backend)
                    torch.distributed.init_process_group(backend=backend,
                                                         init_method="tcp://localhost:23459",
                                                         rank=0, world_size=1)
                    
                except:
                    pass

                val_sampler = DistributedSampler(val_data)

            self.val_dl = DataLoader(
                val_data, sampler=val_sampler, batch_size=val_batch_size)

        if test_data:
            test_examples = []
            input_data = []

            for index, text in enumerate(test_data):
                test_examples.append(InputExample(index, text))
                input_data.append({
                    'id': index,
                    'text': text
                })

            test_features = convert_examples_to_features(test_examples, label_list=self.labels,
                                                         tokenizer=tokenizer, max_seq_length=maxlen)
            all_input_ids = torch.tensor(
                [f.input_ids for f in test_features], dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in test_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in test_features], dtype=torch.long)

            test_data = TensorDataset(
                all_input_ids, all_input_mask, all_segment_ids)

            test_sampler = SequentialSampler(test_data)
            self.test_dl = DataLoader(
                test_data, sampler=test_sampler, batch_size=bs)



In [33]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

    

In [34]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (440, 2)
TRAIN Dataset: (352, 2)
TEST Dataset: (88, 2)


In [35]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

#Creating the Neural Network for Fine Tuning

In [36]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 50) #change here 6 to 7 
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output


model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [37]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [38]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

#Fine Tuning the Model

In [39]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids, )

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
         
            # logger.info(f'Epoch:{epoch}, Loss: {loss.item()}'
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [40]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [41]:
import logging
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")

# a function  to create and save logs in the log files
def log(path, file):

    """[Create a log file to record the experiment's logs]
    
    Arguments:
        path {string} -- path to the directory
        file {string} -- file name
    
    Returns:
        [obj] -- [logger that record logs]
    """

    # check if the file exist
    log_file = os.path.join(path, file)

    if not os.path.isfile(log_file):
        open(log_file, "w+").close()

    console_logging_format = "%(message)s"
    file_logging_format = "%(message)s"
    
    
    

    # in case we want loglevel and time 
    # console_logging_format = "%(levelname)s %(message)s"
    # file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    # configure logger
    logging.basicConfig(level=logging.INFO, format=console_logging_format)
    logger = logging.getLogger()
    
    # create a file handler for output file
    handler = logging.FileHandler(log_file)

    # set the logging level for log file
    handler.setLevel(logging.INFO)
    
    # create a logging format
    formatter = logging.Formatter(file_logging_format)
    handler.setFormatter(formatter)
    

    # add the handlers to the logger
    logger.addHandler(handler)

    return logger

In [42]:
# set a logger file
os.mkdir("logs")
path = "/content/logs"
logger = log(path, file="train.logs")


In [43]:
# logger.info("Start Training")

for epoch in range(EPOCHS):
    train(epoch)
    outputs, targets = validation()
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

    # logger.info(f"{accuracy}, {f1_score_micro}, {f1_score_macro}")

    # logger.info(f"{f1_score_micro},")
    # logger.info(f"{f1_score_macro},")
    # logger.info(f"Accuracy Score = {accuracy} ,")
    # logger.info(f"F1 Score (Micro) = {f1_score_micro} ,")
    # logger.info(f"F1 Score (Macro) = {f1_score_macro} ,")

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.74190753698349
Accuracy Score = 0.0
F1 Score (Micro) = 0.05714285714285714
F1 Score (Macro) = 0.0090137398833051

Epoch: 1, Loss:  0.5235307812690735
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 2, Loss:  0.3997751772403717
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 3, Loss:  0.3998764753341675
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 4, Loss:  0.32689300179481506
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 5, Loss:  0.3157314360141754
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 6, Loss:  0.315310537815094
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 7, Loss:  0.2531642019748688
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 8, Loss:  0.26182281970977783
Accuracy Score = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0

Epoch: 9, Loss:  0.240232139825820

#Plot

In [None]:
import pandas as pd
import pylab as plt

# Create dataframe
file_name = "/content/logs/train.logs"
dflog = pd.read_csv(file_name)
# dflog.plot()
# plt.show()
dflog.tail()

Unnamed: 0,0.3235294117647059,0.6602870813397129,0.6159376532275693
44,0.323529,0.666667,0.622765
45,0.338235,0.666667,0.62788
46,0.382353,0.681416,0.646638
47,0.338235,0.669725,0.634705
48,0.308824,0.657277,0.619133


In [None]:
def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']
    
destination_folder = '/content/logs'
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + '/train.logs')
plt.plot(global_steps_list, train_loss_list, label='Train')
plt.plot(global_steps_list, valid_loss_list, label='Valid')
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show() 

#Validating the Model

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

#Saving the Trained Model for inference

In [None]:
import shutil
 
if os.path.exists('/content/data/models'): 
  shutil.rmtree('/content/data/models')
else:
  os.mkdir('/content/data/models')
  OUTPUT_DIR = '/content/data/models' 

In [None]:
# # #Saving the files for inference

# output_model_file = './models/pytorch_distilbert_news.bin'
# output_vocab_file = './models/vocab_distilbert_news.bin'

if not os.path.exists('/content/data/models'):
    os.mkdir('/content/data/models')


output_model_file = '/content/data/models/pytorch_bert_icd.pt'
output_vocab_file = '/content/data/models/vocab_bert_icd.pt'



torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

Saved


In [None]:
PATH = '/content/data/models/pytorch_bert_icd.pt'
model = torch.load(PATH)
model.eval()

In [None]:
texts = ['I really love the Netflix original movies',
		 'this movie is not worth watching']
predictions = model.predict_batch(texts)

ModuleAttributeError: ignored

In [None]:
texts = ['I really love the Netflix original movies',
		 'this movie is not worth watching']
predictions = model.predict_batch(texts)

ModuleAttributeError: ignored