In [None]:
!pip install -r requirements.txt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
import numpy as np
import pandas as pd
import transformers
import os
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from seqeval.metrics import classification_report

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
# Paths and training parameters
MAX_LEN = 200
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 2e-05
SEP_TOKEN = 2
PADDING_TOKEN = 0

TRAIN_PATH = 'train_data.txt'
TEST_PATH = 'test_data.txt'
FOLDER_PATH = '/content/drive/MyDrive'
MODEL_WEIGHTS_PATH = '/content/drive/MyDrive/model_weights.pt'

label_list = ['O', '[SEP]', '[CLS]', 'I-PROD', 'B-PROD']
num_labels = len(label_list) + 1

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Build features class
class InputExample(object):
  def __init__(self, text, label):
    self.text = text
    self.label = label

class InputFeatures(object):
  def __init__(self, input_ids, input_mask, segment_ids, label_id, valid_ids, label_mask):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_id = label_id
    self.valid_ids = valid_ids
    self.label_mask = label_mask

In [None]:
def read_file(filename):
  f = open(filename)
  data = []
  sentence = []
  label = []
  for line in f:
      if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
          if len(sentence) > 0:
              data.append((sentence, label))
              sentence = []
              label = []
          continue
      splits = line.split(' ')
      sentence.append(splits[0])
      label.append(splits[-1][:-1])
      
  if len(sentence) > 0:
      data.append((sentence, label))
      sentence = []
      label = []

  return data

In [None]:
# Create samples from dataset
def create_examples(lines):
  examples = []
  for sentence, label in lines:
    text = ' '.join(sentence)
    label = label
    examples.append(InputExample(text=text, label=label))
  return examples

In [None]:
# Read data
split_train = read_file(os.path.join(FOLDER_PATH, TRAIN_PATH))
split_test = read_file(os.path.join(FOLDER_PATH, TEST_PATH))

train_examples = create_examples(split_train)
test_examples = create_examples(split_test)

In [None]:
# Extract features for every sample
def convert_examples_to_features(examples, label_list, tokenizer, max_len, padding_token):
  label_map = {label : i for i, label in enumerate(label_list,1)}

  features = []
  for example in examples:
      textlist = example.text.split(' ')
      labellist = example.label
      tokens = []
      labels = []
      valid = []
      label_mask = []
      for i, word in enumerate(textlist):
          token = tokenizer.tokenize(word)
          tokens.extend(token)
          label_1 = labellist[i]
          for m in range(len(token)):
              if m == 0:
                  labels.append(label_1)
                  valid.append(1)
                  label_mask.append(1)
              else:
                  valid.append(0)
      if len(tokens) >= max_len - 1:
          tokens = tokens[0:(max_len - 2)]
          labels = labels[0:(max_len - 2)]
          valid = valid[0:(max_len - 2)]
          label_mask = label_mask[0:(max_len - 2)]
      ntokens = []
      segment_ids = []
      label_ids = []
      ntokens.append("[CLS]")
      segment_ids.append(0)
      valid.insert(0,1)
      label_mask.insert(0,1)
      label_ids.append(label_map["[CLS]"])
      for i, token in enumerate(tokens):
          ntokens.append(token)
          segment_ids.append(0)
          if len(labels) > i:
              label_ids.append(label_map[labels[i]])
      ntokens.append("[SEP]")
      segment_ids.append(0)
      valid.append(1)
      label_mask.append(1)
      label_ids.append(label_map["[SEP]"])
      input_ids = tokenizer.convert_tokens_to_ids(ntokens)
      input_mask = [1] * len(input_ids)
      label_mask = [1] * len(label_ids)
      while len(input_ids) < max_len:
          input_ids.append(padding_token)
          input_mask.append(padding_token)
          segment_ids.append(padding_token)
          label_ids.append(padding_token)
          valid.append(1)
          label_mask.append(padding_token)
      while len(label_ids) < max_len:
          label_ids.append(padding_token)
          label_mask.append(padding_token)
      assert len(input_ids) == max_len
      assert len(input_mask) == max_len
      assert len(segment_ids) == max_len
      assert len(label_ids) == max_len
      assert len(valid) == max_len
      assert len(label_mask) == max_len
      features.append(
              InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_ids,
                            valid_ids=valid,
                            label_mask=label_mask))
  return features

In [None]:
train_features = convert_examples_to_features(train_examples, label_list, tokenizer, MAX_LEN, PADDING_TOKEN)
test_features  = convert_examples_to_features(test_examples, label_list, tokenizer, MAX_LEN, PADDING_TOKEN)

In [None]:
def create_custom_dataset(features):
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
  all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
  all_valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long)
  all_lmask_ids = torch.tensor([f.label_mask for f in features], dtype=torch.long)

  return TensorDataset(
      all_input_ids,
      all_input_mask,
      all_segment_ids,
      all_label_ids,
      all_valid_ids,
      all_lmask_ids
  )

In [None]:
train_dataset = create_custom_dataset(train_features)
test_dataset = create_custom_dataset(test_features)

In [None]:
class CustomBert(BertForTokenClassification):
   
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, 
                labels=None,valid_ids=None,attention_mask_label=None):
        
        sequence_output = self.bert(input_ids, token_type_ids, attention_mask)

        batch_size,max_len,feat_dim = sequence_output.last_hidden_state.shape
        valid_output = torch.zeros(batch_size, max_len, feat_dim, dtype=torch.float32, device=device)
        for i in range(batch_size):
          jj = -1
          for j in range(max_len):
            if valid_ids[i][j].item() == 1:
              jj += 1
              valid_output[i][jj] = sequence_output.last_hidden_state[i][j]
        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=0)
            attention_mask_label = None
            if attention_mask_label is not None:
                active_loss = attention_mask_label.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits


In [None]:
model = CustomBert.from_pretrained('bert-base-cased', num_labels=num_labels)
model.to(device)

In [None]:
def train(train_dataset, model, tokenizer):
  train_sampler = RandomSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

  optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

  for epoch in range(EPOCHS):
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':         batch[3]}
        outputs = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
        loss = outputs 

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
      
        loss.backward()
        optimizer.step()
        model.zero_grad()    

In [None]:
train(train_dataset, model, tokenizer)

In [None]:
# torch.save(model.state_dict(), MODEL_WEIGHTS_PATH)

In [None]:
model.load_state_dict(torch.load(MODEL_WEIGHTS_PATH))
model.eval()

In [None]:
def evaluate(eval_dataset, model, tokenizer, label_map, dataset_name):
  eval_sampler = SequentialSampler(eval_dataset)
  eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=BATCH_SIZE)

  # print("Num examples = ", len(eval_dataset))
  # print("Batch size = ", BATCH_SIZE)
  y_true = []
  y_pred = []
  for batch in eval_dataloader:
    model.eval()
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
        outputs = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, 
                        attention_mask_label=l_mask)
        logits = outputs

    logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()

    for i, label in enumerate(label_ids):
        temp_1 = []
        temp_2 = []
        for j, m in enumerate(label):
          if j == 0:
              continue
          elif label_ids[i][j] == SEP_TOKEN:
              y_true.append(temp_1)
              y_pred.append(temp_2)
              break
          else:
              temp_1.append(label_map[label_ids[i][j]])
              temp_2.append(label_map[logits[i][j]])

  report = classification_report(y_true, y_pred, digits=4)
  print(f'Results for {dataset_name}:\n {report}')

In [None]:
label_map = {i : label for i, label in enumerate(label_list, 1)}
evaluate(train_dataset, model, tokenizer, label_map, "TRAIN")
evaluate(test_dataset, model, tokenizer, label_map, "TEST")

Results for TRAIN:
               precision    recall  f1-score   support

        PROD     0.9397    0.9702    0.9547      1510

   micro avg     0.9397    0.9702    0.9547      1510
   macro avg     0.9397    0.9702    0.9547      1510
weighted avg     0.9397    0.9702    0.9547      1510

Results for TEST:
               precision    recall  f1-score   support

        PROD     0.9167    0.9167    0.9167        12

   micro avg     0.9167    0.9167    0.9167        12
   macro avg     0.9167    0.9167    0.9167        12
weighted avg     0.9167    0.9167    0.9167        12

