## Installing all packages for colab

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 5.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 18.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 35.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=07311ff8acd

In [2]:
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/9e/10/0637bb46d2f9eaf8c475fcf4ea4d8dcdbb184ab726b1c4bf5be0547211be/allennlp-2.0.1-py3-none-any.whl (580kB)
[K     |████████████████████████████████| 583kB 5.1MB/s 
Collecting jsonpickle
  Downloading https://files.pythonhosted.org/packages/bb/1a/f2db026d4d682303793559f1c2bb425ba3ec0d6fd7ac63397790443f2461/jsonpickle-2.0.0-py2.py3-none-any.whl
Collecting overrides==3.1.0
  Downloading https://files.pythonhosted.org/packages/ff/b1/10f69c00947518e6676bbd43e739733048de64b8dd998e9c2d5a71f44c5d/overrides-3.1.0.tar.gz
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 24.4MB/s 
Collecting transformers<4.3,>=4.1
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba28159

In [3]:
!pip install seqeval

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 15.8MB/s eta 0:00:01[K     |███████████████                 | 20kB 11.8MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 9.5MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 8.9MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.8MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=fa2bbacff45733c8710c2dc27db828c7a4599fda10de4551fd15968f4b66f7ca
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


Connect to google drive

In [5]:
# Mount Google Drive into the Colab runtime; the dataset files and the saved
# models used below are read from / written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing packages

In [6]:
import numpy as np
import torch
from torch import nn
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel
from transformers import BertForTokenClassification, AdamW
from allennlp.modules.elmo import Elmo, batch_to_ids

from torchcrf import CRF

from sklearn.model_selection import KFold, ParameterGrid

from transformers import get_linear_schedule_with_warmup

import matplotlib
from matplotlib import pyplot as plt

import tqdm
from seqeval.metrics import f1_score, accuracy_score
%matplotlib inline

import json

Connect to device

In [7]:
# Number of visible CUDA devices (0 on a CPU-only runtime).
n_gpu = torch.cuda.device_count()
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# Show which accelerator is attached. Querying device 0 fails when no GPU is
# available, so guard the call to keep the cell runnable on a CPU runtime.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only'

'Tesla T4'

## Preprocessing

Let's prepare the data for the input of BERT tokenizer.

In [7]:
def split_text_label(filename):
    """
    Read a CoNLL-style file and return its sentences and per-token labels.

    Every non-blank line is split on whitespace; the first column is taken as
    the token and the last column as its label. Blank (or whitespace-only)
    lines and lines starting with ``-DOCSTART`` end the current sentence.

    Parameters
    ----------
    filename: str
        Path of the file to read.

    Returns
    -------
    sentences: list of list of str
        One list of tokens per sentence.
    labels: list of list of str
        ``labels[i][j]`` is the label of ``sentences[i][j]``.
    """
    sentences, labels = [], []
    tokens, tags = [], []

    # `with` closes the handle even if parsing raises part-way through.
    with open(filename) as f:
        for line in f:
            columns = line.split()
            if not columns or line.startswith('-DOCSTART'):
                # Sentence boundary: store what has been collected so far.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            tokens.append(columns[0])
            tags.append(columns[-1])

    # The file may not end with a blank line; keep the trailing sentence.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    return sentences, labels

In [8]:
# Parse the three dataset splits into (sentences, labels) pairs.
# NOTE(review): these paths are relative ("drive/MyDrive/..."), while the model
# paths further down are absolute ("/content/drive/My Drive/...") — confirm
# that the working directory makes both resolve to the same mount.
train_data, train_labels = split_text_label("drive/MyDrive/coNLL/train.txt")
valid_data, valid_labels = split_text_label("drive/MyDrive/coNLL/valid.txt")
test_data, test_labels = split_text_label("drive/MyDrive/coNLL/test.txt")

In [9]:
# The first training sentence, as a list of words
train_data[0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [10]:
# The labels of the first training sentence (one per word)
train_labels[0]

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

Let's create the tag dictionary (tag name → integer id).

In [11]:
try:
  # Reuse the mapping stored by an earlier run so that tag ids stay consistent
  # with checkpoints trained against it.
  with open('/content/drive/My Drive/models/tag2idx.json', 'r') as f:
    tag2idx = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
  # No usable mapping on disk yet: build it from the training labels.
  tag_values = set()
  for l in train_labels:
      tag_values.update(l)
  tag_values.discard("PAD")
  # Iterating a set of strings is not reproducible across kernels, so fix the
  # order explicitly: "PAD" first (id 0), then the remaining tags sorted.
  tag2idx = {t: i for i, t in enumerate(["PAD"] + sorted(tag_values))}
  with open('/content/drive/My Drive/models/tag2idx.json', 'w') as f:
    json.dump(tag2idx,f)

In [12]:
# Inspect the tag name -> id mapping
tag2idx

{'B-LOC': 8,
 'B-MISC': 5,
 'B-ORG': 3,
 'B-PER': 4,
 'I-LOC': 7,
 'I-MISC': 9,
 'I-ORG': 1,
 'I-PER': 6,
 'O': 2,
 'PAD': 0}

In [13]:
# Reverse lookup: integer id -> tag name.
idx2tag = dict((idx, tag) for tag, idx in tag2idx.items())

### Tokenization with BertTokenizer

BERT (Bidirectional Encoder Representations from Transformers) is a method of pretraining language representations. These vectors (representations) are used as high-quality feature inputs to downstream models. BERT offers an advantage over models like Word2Vec, because while each word has a fixed representation under Word2Vec regardless of the context within which the word appears, BERT produces word representations that are dynamically informed by the words around them.

The BERT implementation comes with a pretrained tokenizer and a defined vocabulary. We load the one belonging to the smallest pre-trained model, `bert-base-cased`. We use the cased variant since it is well suited for NER.

In [14]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Split every word into sub-word tokens and repeat its label for each piece.

    The result is wrapped in the special '[CLS]' / '[SEP]' tokens, both of
    which receive the label 'O'.

    Parameters
    ----------
    sentence: iterable of str
        Words of one sentence.
    text_labels: iterable of str
        One label per word.
    tokenizer:
        Object with a `tokenize(word)` method returning a list of sub-words.

    Returns
    -------
    (tokens, labels): two lists of equal length.
    """
    pieces = ['[CLS]']
    piece_labels = ['O']

    for word, label in zip(sentence, text_labels):
        subwords = tokenizer.tokenize(word)
        pieces += subwords
        # Every sub-word inherits the label of the word it came from.
        piece_labels += [label for _ in subwords]

    pieces.append('[SEP]')
    piece_labels.append('O')
    return pieces, piece_labels

In [15]:
def create_dataloader(tokenizer, data, labels, datatype='train', desired_pad='max', batch_size=128):
  """
  Tokenize the sentences, pad everything to one length and wrap it in a DataLoader.

  Parameters
  ----------
  tokenizer:
      Object providing `tokenize` and `convert_tokens_to_ids`.
  data: list of list of str
      Sentences as lists of words.
  labels: list of list of str
      Tag names aligned with `data`; mapped to ids through the global `tag2idx`.
  datatype: str, default='train'
      'train' draws batches with a RandomSampler, any other value uses a
      SequentialSampler.
  desired_pad: 'max', 'mean' or int, default='max'
      Target sequence length: the longest / the mean tokenized sentence length,
      or an explicit number. Longer sequences are cut at the end, shorter ones
      are padded at the end.
  batch_size: int, default=128

  returns: TensorDataset, RandomSampler (for valid and test SequentialSampler), DataLoader
      Every dataset item is the tuple (elmo_ids, bert_ids, mask, tags).

  Raises
  ------
  ValueError
      If `desired_pad` is none of the accepted values.
  """
  data_tokenized = [tokenize_and_preserve_labels(s, l, tokenizer) for s, l in zip(data, labels)]
  data_tokens = [x[0] for x in data_tokenized]
  data_labels = [x[1] for x in data_tokenized]

  if desired_pad == 'max':
    desired_length = int(np.max([len(sen) for sen in data_tokens]))
  elif desired_pad == 'mean':
    desired_length = int(np.mean([len(sen) for sen in data_tokens]))
  elif isinstance(desired_pad, (int, np.integer)):
    # NumPy integers are accepted as well as plain Python ints.
    desired_length = int(desired_pad)
  else:
    raise ValueError("How it should be padded?")

  data_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in data_tokens],
                           maxlen=desired_length, dtype="long", value=0.0,
                           truncating="post", padding="post")

  data_tags = pad_sequences([[tag2idx.get(l) for l in seq_labels] for seq_labels in data_labels],
                            maxlen=desired_length, value=tag2idx["PAD"], padding="post",
                            dtype="long", truncating="post")

  # Creating tensors
  data_elmo_ids = batch_to_ids(data_tokens)
  data_bert_ids = torch.tensor(data_ids)
  data_tags = torch.tensor(data_tags)
  # 1.0 for real tokens, 0.0 for padding (the ids were padded with 0 above).
  data_masks = (data_bert_ids != 0).float()

  # ELMo ids come out at the length of the longest sentence; bring them to the
  # same sequence length as the BERT ids, padding or cutting as needed.
  bert_len = data_bert_ids.shape[1]
  elmo_len = data_elmo_ids.shape[1]
  if elmo_len < bert_len:
    filler = torch.zeros((data_elmo_ids.shape[0],
                          bert_len - elmo_len,
                          data_elmo_ids.shape[2]), dtype=data_elmo_ids.dtype)
    data_elmo_ids = torch.cat((data_elmo_ids, filler), dim=1)
  elif elmo_len > bert_len:
    # Happens when `desired_pad` is shorter than the longest sentence; without
    # this cut the two embeddings could not be concatenated per token later.
    data_elmo_ids = data_elmo_ids[:, :bert_len]
  data_elmo_ids = data_elmo_ids.long()

  data_dataset = TensorDataset(data_elmo_ids, data_bert_ids, data_masks, data_tags)
  if datatype == 'train':
    data_sampler = RandomSampler(data_dataset)
  else:
    data_sampler = SequentialSampler(data_dataset)
  data_dataloader = DataLoader(data_dataset, sampler=data_sampler, batch_size=batch_size)

  return data_dataset, data_sampler, data_dataloader

If this is not the first run of this notebook, we load the tokenizer from Google Drive.

In [16]:
# Load the tokenizer of the cased BERT model; lower-casing is switched off so
# that capitalisation is kept.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




___


In [17]:
# Training split with the defaults: padded to the longest tokenized sentence,
# random sampling, batch size 128.
train_dataset, train_sampler, train_dataloader = create_dataloader(tokenizer, train_data, train_labels)

In [18]:
# Padded sequence length of the training set, read from the BERT ids
# (element 1) of its first item.
first_train_item = train_dataset[0]
max_seq_len = first_train_item[1].shape[0]

In [19]:
# Validation split, padded/cut to the training sequence length and iterated
# in order (any datatype other than 'train' uses a SequentialSampler).
valid_dataset, valid_sampler, valid_dataloader = create_dataloader(tokenizer, valid_data, valid_labels,
                                                                   datatype='valid',
                                                                   desired_pad=max_seq_len)

In [20]:
# Sanity check: shapes of the ELMo ids and the BERT ids of the first
# validation item.
first_valid_item = valid_dataset[0]
for ids in first_valid_item[:2]:
  print(ids.shape)

torch.Size([173, 50])
torch.Size([173])


### BERT & ELMo setup

The `transformers` package provides a `BertForTokenClassification` class for token-level predictions. `BertForTokenClassification` is a fine-tuning model that wraps `BertModel` and adds a token-level classifier on top of it.

In [21]:
# Name of the pre-trained BERT model to use. Options include:
# bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased,
# bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-cased'

# Name of the training task/run; it is only used to build the directory names below.
TASK_NAME = 'first'

# The output directory where the fine-tuned model and checkpoints will be written.
# NOTE(review): not referenced by any of the cells visible here.
OUTPUT_DIR = f'outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
# NOTE(review): not referenced by any of the cells visible here.
REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'


# Intended location for cached pre-trained BERT parameters.
# NOTE(review): not passed to any `from_pretrained` call visible here.
CACHE_DIR = 'cache/'

# Stand-alone BERT model (disabled: the model classes create their own copy)
#bert = BertForTokenClassification.from_pretrained(
#                        BERT_MODEL,
#                        output_hidden_states=True)
#
#for pars in bert.parameters():
#    pars.requires_grad = False


In [22]:
# Weights of the medium ELMo model (2x2048_256_2048cnn_1xhighway)
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'

# Options file matching the medium ELMo weights above
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'

# Stand-alone ELMo model (disabled: the model classes create their own copy).
# Earlier experiments found that using 2 ELMo layers is the best choice.
#elmo = Elmo(options_file, weight_file, num_output_representations=2,
#                  dropout=0, requires_grad=False)

#elmo(train_elmo_ids[:2])['elmo_representations'][0].shape

## Define the model

In [23]:
class BEboC(nn.Module):
    """
    BERT+Elmo+biLSTM+one CRF

    Per token, hidden states of a frozen BERT and the representations of an
    ELMo module created with `requires_grad=False` are concatenated, projected
    to 1024 features, run through a bidirectional LSTM and mapped to one score
    per tag. These scores are the emissions for the single CRF in `self.crf`;
    `forward` returns the emissions, the CRF itself is applied by the caller.
    """
    def __init__(self, hidden_size=128, num_labels=len(tag2idx), elmo_layers=2,
                 bert_layers=1, concat_bert=True, bilstm_layers=1):
        """
        Creates model

        Parameters
        ----------
        hidden_size: int, default=128
            Hidden size of the LSTM (per direction).
        num_labels: int, default=len(tag2idx)
            Number of tags scored per token, and number of CRF tags.
        elmo_layers: int, default=2
            Num of ELMo layers to be considered
        bert_layers: int, default=1
            Num of final BERT hidden layers to be used as embedding vector.
        concat_bert: bool, default=True
            Whether to concat (True) or sum (False) last BERT hidden layers.
        bilstm_layers: int, default=1
            Number of stacked LSTM layers.
        """
        super(BEboC, self).__init__()

        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.elmo_layers = elmo_layers
        self.bert_layers = bert_layers
        self.concat_bert = concat_bert
        self.bilstm_layers = bilstm_layers

        self.bert = BertForTokenClassification.from_pretrained(
                        BERT_MODEL,
                        output_hidden_states=True)

        # BERT is used as a fixed feature extractor.
        for pars in self.bert.parameters():
            pars.requires_grad = False

        bert_embedding_dim = self.bert.config.to_dict()['hidden_size']

        self.elmo = Elmo(options_file, weight_file, self.elmo_layers, dropout=0, requires_grad=False)

        # Output size of one ELMo representation for the medium model
        # configured in `options_file`.
        elmo_embedding_dim = 512

        # Concatenating BERT layers multiplies the BERT part of the input
        # width; summing them keeps it at a single layer's width.
        bert_width = bert_embedding_dim*self.bert_layers if self.concat_bert else bert_embedding_dim
        self.linear1 = nn.Linear(bert_width + elmo_embedding_dim*self.elmo_layers, 1024)

        # All tensors in this model are (batch, seq_len, features). nn.LSTM
        # expects (seq_len, batch, features) unless batch_first=True, so without
        # the flag the recurrence would run across the sentences of a batch
        # instead of across the tokens of each sentence.
        # NOTE(review): checkpoints trained before this fix load fine (the
        # parameters are unchanged) but should be retrained.
        self.bilstm = nn.LSTM(1024, self.hidden_size, self.bilstm_layers,
                              bidirectional=True, batch_first=True)

        self.linear2 = nn.Linear(self.hidden_size*2, self.num_labels)
        self.crf = CRF(num_tags=self.num_labels, batch_first=True)

    def forward(self, elmo_ids, bert_ids, attention_mask):
        """
        Compute the per-token tag scores (CRF emissions) for a batch.

        Parameters
        ----------
        elmo_ids:
            ELMo character ids, (batch, seq_len, 50) for the datasets built
            in this notebook.
        bert_ids:
            BERT token ids, (batch, seq_len).
        attention_mask:
            (batch, seq_len) mask passed on to BERT; non-zero marks real tokens.

        Returns
        -------
        Tensor of shape (batch, seq_len, num_labels) with non-negative scores
        (a ReLU is applied last).
        """
        # With output_hidden_states=True and no labels passed, element 1 of the
        # BERT output is expected to be the tuple of per-layer hidden states.
        bert_hiddens = self.bert(bert_ids, attention_mask=attention_mask)[1]
        elmo_hiddens = self.elmo(elmo_ids)

        last_layers = bert_hiddens[-self.bert_layers:]
        if self.concat_bert:
            bert_embedding = torch.cat(last_layers, dim=2)
        else:
            bert_embedding = sum(last_layers)

        # Append every ELMo representation along the feature axis.
        elmo_bert_embeddings = torch.cat(
            [bert_embedding] + list(elmo_hiddens['elmo_representations']), dim=-1)

        linear1_output = nn.functional.relu(self.linear1(elmo_bert_embeddings))

        bilstm_output, (h_n, c_n) = self.bilstm(linear1_output)
        linear2_output = nn.functional.relu(self.linear2(bilstm_output))
        return linear2_output

In [42]:
def train(model, train_dataloader, optimizer, scheduler=None, n_epoch=5,
          max_grad_norm=None, validate=True, valid_dataloader=None,
          show_info=True, save_model=True):
    """
    Train `model` on the CRF negative log-likelihood, optionally validating
    after every epoch.

    Parameters
    ----------
    model:
        Module whose call returns emissions and which exposes a `crf` attribute.
    train_dataloader:
        Yields (elmo_ids, bert_ids, mask, labels) batches.
    optimizer:
        Optimizer stepped once per batch.
    scheduler: optional
        Learning-rate scheduler stepped once per batch.
    n_epoch: int, default=5
    max_grad_norm: float or None, default=None
        If given, gradients are clipped to this norm before each step.
    validate: bool, default=True
        Validation runs only if this is True and `valid_dataloader` is given.
    valid_dataloader: optional
        Same batch layout as `train_dataloader`.
    show_info: bool, default=True
        Print progress, losses and metrics.
    save_model: bool, default=True
        Save the tokenizer and a checkpoint to Google Drive every 10th epoch.

    Returns
    -------
    loss_values, validation_loss_values, valid_accuracies, valid_f1_scores:
        Lists with one entry per epoch; the three validation lists are empty
        when validation is disabled.
    """
    do_validate = validate and valid_dataloader is not None

    loss_values = []
    # Defined unconditionally so that the return below also works when no
    # validation is run.
    validation_loss_values = []
    valid_accuracies = []
    valid_f1_scores = []

    for e in range(n_epoch):
        if show_info:
          print(f"\nEpoch #{e}")

        # Training
        model.train()
        total_loss = 0

        if show_info:
            enumerator = enumerate(tqdm.tqdm(train_dataloader, position=0, leave=True))
        else:
            enumerator = enumerate(train_dataloader)

        for step, batch in enumerator:
            if device.type != 'cpu':
                batch = tuple(t.to(device) for t in batch)
            b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
            b_mask = b_input_mask.byte()
            model.zero_grad()

            logits = model(b_elmo_ids, b_bert_ids, b_mask)

            # The CRF returns the log-likelihood; minimise its negative.
            loss = -1*model.crf(logits, b_labels, mask=b_mask)

            loss.backward()

            total_loss += loss.item()

            if show_info and (step+1) % 10 == 0:
                # `step` is 0-based, so step+1 batches have been accumulated.
                print(f"\n{step}: avg loss per batch: {total_loss/(step+1)}\n")

            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                               max_norm=max_grad_norm)

            optimizer.step()

            if scheduler is not None:
                scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        if show_info:
            print(f"Average train loss: {avg_train_loss}")

        loss_values.append(avg_train_loss)

        if do_validate:
            # Validation
            model.eval()

            eval_loss = 0
            predictions, true_labels = [], []

            for batch in valid_dataloader:
                if device.type != 'cpu':
                    batch = tuple(t.to(device) for t in batch)
                b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
                b_mask = b_input_mask.byte()

                with torch.no_grad():
                    logits = model(b_elmo_ids, b_bert_ids, b_mask)
                    loss = -1*model.crf(logits, b_labels, mask=b_mask)
                    # Decoded tag ids, one list per sentence, computed under the same mask.
                    tags = model.crf.decode(logits, mask=b_mask)

                eval_loss += loss.item()
                predictions.extend(tags)
                true_labels.extend(b_labels.detach().cpu().numpy())

            eval_loss = eval_loss / len(valid_dataloader)
            validation_loss_values.append(eval_loss)
            if show_info:
                print(f"Validation loss: {eval_loss}")

            all_predicted_tags = [[idx2tag[i] for i in s] for s in predictions]
            # Padding positions are dropped from the gold labels.
            all_true_tags = [[idx2tag[i] for i in s if idx2tag[i] != 'PAD']
                             for s in true_labels]

            # seqeval expects (y_true, y_pred).
            valid_acc = accuracy_score(all_true_tags, all_predicted_tags)
            valid_f1 = f1_score(all_true_tags, all_predicted_tags)
            valid_accuracies.append(valid_acc)
            valid_f1_scores.append(valid_f1)

            if show_info:
                print(f"Validation accuracy: {valid_acc}")
                print(f"Validation F1-score: {valid_f1}\n")

        if save_model and (e+1)%10 == 0:
            tokenizer.save_pretrained(f'/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_{e}_tokenizer.pth')
            # NOTE(review): the stored 'model' is a freshly built BEboC with
            # hard-coded hyper-parameters, not `model` itself; it only matches
            # when `model` was created with the same arguments.
            checkpoint = {'model': BEboC(hidden_size=512, bert_layers=2),
                          'state_dict': model.state_dict(),
                          'optimizer' : optimizer.state_dict()}

            torch.save(checkpoint,
                       f'/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_{e}_state_dict.pth')

    return loss_values, validation_loss_values, valid_accuracies, valid_f1_scores

In [32]:
# Smoke test: run the first two training items through the model.
# NOTE(review): `model` is only created further down in the notebook, so this
# cell depends on out-of-order execution.
logits = model.forward(train_dataset[0:2][0], train_dataset[0:2][1], train_dataset[0:2][2].byte())

In [38]:
# Emissions carry one extra (per-tag) axis compared with the label ids.
print(logits.shape)
print(train_dataset[0:2][3].shape)

torch.Size([2, 173, 10])
torch.Size([2, 173])


In [36]:
# CRF score of the gold tags for the two items above; train() negates this
# value to obtain its loss.
model.crf.forward(logits, train_dataset[0:2][3], train_dataset[0:2][2].byte())

tensor(-36.3391, grad_fn=<SumBackward0>)

### Cross-validation

Fix some train parameters


In [43]:
# Seed for the shuffled K-fold split
RANDOM_SEED = 42

# Number of cross-validation folds
N_FOLDS = 3

# Epochs trained per fold during the parameter search
N_EPOCHS = 5

In [44]:
# Shuffled K-fold splitter; the fixed seed makes the folds reproducible.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [45]:
# model = BEboc(batch_size=BATCH_SIZE, hidden_size=128, num_labels=len(tag2idx),
#                     bert_layers=2, concat=False)
# model.to(device)

In [46]:
# opt = AdamW(params=model.parameters(),lr=1e-3)
# train(model, train_dataloader, opt)

In [47]:
%time
param_grid = {
    'opt': ['AdamW'],
    'lr': [3e-4, 7e-4, 1e-3],
    'bert_layers': [2,3],
    'concat': [True, False],
    'max_grad_norm': [None]#[1., None]
}

param_grid = {
    'opt': ['AdamW'],
    'lr': [1e-3],
    'bert_layers': [2],
    'concat': [False],
    'max_grad_norm': [None]
}

grid = ParameterGrid(param_grid)

params_results = {}

for m, ps in enumerate(grid):
  print(f"Model #{m} of {len(grid)}")
  _p_r = {'params': ps}
  
  mean_train_losses = 0
  mean_valid_losses = 0
  mean_valid_accs = 0
  mean_valid_f1s = 0

  for i, (train_index, valid_index) in enumerate(kf.split(train_data)):
    train_fold = torch.utils.data.Subset(train_data, train_index)
    valid_fold = torch.utils.data.Subset(train_data, valid_index)
    train_dataloader = DataLoader(train_fold, batch_size=BATCH_SIZE)
    valid_dataloader = DataLoader(valid_fold, batch_size=BATCH_SIZE)

    model = BEboc(batch_size=BATCH_SIZE, hidden_size=128, num_labels=len(tag2idx),
                    bert_layers=ps['bert_layers'], concat=ps['concat'])
    model.to(device)

    if ps['opt'] == 'Adam':
      optimizer = torch.optim.Adam(params=model.parameters(),lr=ps['lr'])
    else:
      optimizer = AdamW(params=model.parameters(),lr=ps['lr'])

    train_losses, valid_losses, valid_accs, valid_f1s = train(model,
                                            train_dataloader,
                                            optimizer,
                                            n_epoch=N_EPOCHS,
                                            max_grad_norm=ps['max_grad_norm'],
                                            valid_dataloader=valid_dataloader,
                                            show_info=False)
    
    mean_train_losses += np.array(train_losses)
    mean_valid_losses += np.array(valid_losses)
    mean_valid_accs += np.array(valid_accs)
    mean_valid_f1s += np.array(valid_f1s)
  
  mean_train_losses /= N_FOLDS
  mean_valid_losses /= N_FOLDS
  mean_valid_accs /= N_FOLDS
  mean_valid_f1s /= N_FOLDS
  _p_r['mean_train_losses'] = list(mean_train_losses)
  _p_r['mean_valid_losses'] = list(mean_valid_losses)
  _p_r['mean_valid_accs'] = list(mean_valid_accs)
  _p_r['mean_valid_f1s'] = list(mean_valid_f1s)
  params_results[m] = _p_r 

with open("/content/drive/My Drive/params_results.json", "w") as w:
  json.dump(params_results, w)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 4.29 µs
Model #0 of 1


NameError: ignored

In [None]:
# Write the search results to Google Drive (repeats the dump at the end of
# the grid-search cell, e.g. to save again after an interrupted run).
with open("/content/drive/My Drive/params_results.json", "w") as w:
  json.dump(params_results, w)

After training for 1 epoch on a small dataset, the best result was given by the model that concatenates the last two BERT layers and uses a learning rate of 3e-4, so let's train that model on all the training data. Also, let's increase the LSTM hidden size to 512 and use a linear learning-rate scheduler.

### Final model train

In [None]:
# Number of epochs for the final training run (overrides the search value)
N_EPOCHS = 20

In [None]:
# Total number of optimizer steps: batches per epoch times epochs.
# Used as the horizon of the learning-rate schedule.
total_steps = len(train_dataloader) *  N_EPOCHS

In [26]:
# Final model: LSTM hidden size 512, last two BERT layers concatenated
# (concat_bert keeps its default, True).
model = BEboC(hidden_size=512, bert_layers=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
# Move the model to the selected device before the optimizer is created.
model.to(device)

In [None]:
# AdamW over all model parameters (the frozen BERT/ELMo weights receive no
# gradients), with the learning rate selected by the parameter search.
optimizer = AdamW(params=model.parameters(),lr=3e-4)

# Linear learning-rate schedule without warm-up, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

if device.type != 'cpu':
    model.to(device)

# results = (train_losses, valid_losses, valid_accs, valid_f1s)
results = train(model, train_dataloader, optimizer, scheduler, n_epoch=N_EPOCHS,
     validate=True, valid_dataloader=valid_dataloader)

If we want to train the model for more epochs

In [None]:
# Continue training the current model for a few more epochs with a fresh
# optimizer, a lower learning rate and a new schedule.
N_EPOCHS = 5
total_steps = len(train_dataloader) *  N_EPOCHS

optimizer = AdamW(params=model.parameters(),lr=1e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

if device.type != 'cpu':
    model.to(device)
  
# train() switches to training mode itself at the start of every epoch.
model.train()

# results_1 = (train_losses, valid_losses, valid_accs, valid_f1s)
results_1 = train(model, train_dataloader, optimizer, scheduler, n_epoch=N_EPOCHS,
     validate=True, valid_dataloader=valid_dataloader)

If we want to plot the results of training

In [None]:
# Train vs. validation loss per epoch for the final training run.
n_epochs_trained = len(results[0])

plt.figure(figsize=(8,6))
plt.plot(results[0], label='train')
plt.plot(results[1], label='valid', c='g')
plt.ylabel("loss")
plt.xlabel("#epoch")
# Ticks follow the number of recorded epochs instead of a fixed 20.
plt.xticks(np.arange(0, n_epochs_trained, 2))
plt.grid()
plt.legend()
plt.show()

If we want to evaluate model on the test set

In [None]:
# Test split, padded/cut to the training sequence length and iterated in order.
test_dataset, test_sampler, test_dataloader = create_dataloader(tokenizer, test_data, test_labels,
                                                                   datatype='test',
                                                                   desired_pad=max_seq_len)

In [None]:
model.eval()
test_losses = []
true_labels = []
pred_labels = []
for step, batch in enumerate(test_dataloader):
    # Move the batch to the device the model lives on.
    batch = tuple(t.to(device) for t in batch)
    b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
    b_mask = b_input_mask.byte()

    # Gold tags with the padding positions removed.
    for bl in b_labels.detach().cpu().tolist():
      true_labels.append([idx2tag[i] for i in bl if idx2tag[i] != 'PAD'])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model.forward(b_elmo_ids, b_bert_ids, b_mask)
        # The CRF returns the log-likelihood; negate it so the test loss has
        # the same sign convention as the losses reported by train().
        loss = -1*model.crf.forward(logits, b_labels, b_mask)
        test_losses.append(loss.item())
        # Decoded tag ids, one list per sentence, computed under the same mask.
        tags = model.crf.decode(logits, b_mask)
    for t in tags:
      pred_labels.append([idx2tag[i] for i in t])


In [None]:
# F1 score on the test set (gold labels first, predictions second)
f1_score(true_labels, pred_labels)

___

In [None]:
# Save the whole model object to Google Drive.
# NOTE(review): the load cells below read differently named files
# ("..._oneCRF.pth", without "_final") — confirm which file is intended.
torch.save(model, '/content/drive/My Drive/models/Elmo_BERT_biLSTM_oneCRF_final.pth')

In [None]:
# Save only the parameters (state dict) of the final model.
# NOTE(review): this file name starts with "ELmo_", the one above with
# "Elmo_" — confirm whether the different casing is intended.
torch.save(model.state_dict(), '/content/drive/My Drive/models/ELmo_BERT_biLSTM_oneCRF_final_state_dict.pth')

##Load the model

In [None]:
# Load a previously saved model object onto the CPU.
# NOTE(review): presumably an earlier model without ELMo, going by the file
# name; running the next cell replaces `model` again.
model = torch.load('/content/drive/My Drive/models/BERT_biLSTM_oneCRF.pth',
                   map_location=torch.device('cpu'))

In [None]:
# Load a saved ELMo+BERT model object onto the CPU.
# NOTE(review): the save cell above writes "Elmo_BERT_biLSTM_oneCRF_final.pth";
# this reads "Elmo_BERT_biLSTM_oneCRF.pth" — confirm the file exists.
model = torch.load('/content/drive/My Drive/models/Elmo_BERT_biLSTM_oneCRF.pth',
                   map_location=torch.device('cpu'))

In [24]:
def load_checkpoint(tokenizer_path, checkpoint_path, map_location='cpu'):
    """
    Loads both tokenizer and our pretrained model

    Parameters
    ----------
    tokenizer_path: str
        Location the tokenizer was saved to with `save_pretrained`.
    checkpoint_path: str
        File written by `train`: a dict with the keys 'model' (a model
        instance) and 'state_dict' (its trained parameters).
    map_location: default='cpu'
        Passed to `torch.load`. Loading onto the CPU lets a checkpoint saved
        on a GPU be read on a machine without one; move the returned model
        to another device afterwards if needed.

    Returns
    -------
    tokenizer, model: the model is returned in eval mode.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    # NOTE(review): the checkpoint holds a pickled model object, so only load
    # files from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    return tokenizer, model

In [34]:
# Restore the tokenizer and model saved by train() after the 20th epoch
# (epoch index 19).
tokenizer, model = load_checkpoint('/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_19_tokenizer.pth',
                                     '/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_19_state_dict.pth')



In [35]:
# Rebuild the training dataset with the restored tokenizer; it is needed
# below to recover the padded sequence length.
train_dataset, train_sampler, train_dataloader = create_dataloader(tokenizer, train_data, train_labels)

In [36]:
# Padded sequence length of the training set, read from the BERT ids
# (element 1) of its first item.
first_train_item = train_dataset[0]
max_seq_len = first_train_item[1].shape[0]

In [37]:
# Test split, padded/cut to the training sequence length and iterated in order.
test_dataset, test_sampler, test_dataloader = create_dataloader(tokenizer, test_data, test_labels,
                                                                   datatype='test',
                                                                   desired_pad=max_seq_len)

In [38]:
model.to(device)
model.eval()
test_losses = []
true_labels = []
pred_labels = []
for step, batch in enumerate(test_dataloader):
    # Move the batch to the device the model lives on.
    batch = tuple(t.to(device) for t in batch)
    b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
    b_mask = b_input_mask.byte()

    # Gold tags with the padding positions removed.
    for bl in b_labels.detach().cpu().tolist():
      true_labels.append([idx2tag[i] for i in bl if idx2tag[i] != 'PAD'])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model.forward(b_elmo_ids, b_bert_ids, b_mask)
        # The CRF returns the log-likelihood; negate it so the test loss has
        # the same sign convention as the losses reported by train().
        loss = -1*model.crf.forward(logits, b_labels, b_mask)
        test_losses.append(loss.item())
        # Decoded tag ids, one list per sentence, computed under the same mask.
        tags = model.crf.decode(logits, b_mask)
    for t in tags:
      pred_labels.append([idx2tag[i] for i in t])



In [39]:
# F1 score of the restored model on the test set (gold labels first,
# predictions second)
f1_score(true_labels, pred_labels)

0.8405783838198957

## FINETUNE

In [50]:
class BEbiC(nn.Module):
    """
    BERT+Elmo+biLSTM+CRFs

    Per-token features from a frozen BERT and a frozen ELMo are concatenated,
    projected to 1024 dimensions, passed through a bidirectional LSTM and then
    through ``num_heads`` independent linear layers. Each head has its own CRF
    in ``self.crfs``; the CRFs are not applied in ``forward`` (callers use
    them on the returned logits).
    """
    def __init__(self, hidden_size=128, num_labels=len(tag2idx), num_heads=1,
                 elmo_layers=2, bert_layers=1, concat_bert=True, bilstm_layers=1):
        """
        Creates model

        Parameters
        ----------
        hidden_size:
          LSTM hidden size (per direction)
        num_labels:
          The number of CRF labels
        num_heads:
          The number of linear + CRF heads
        elmo_layers: int, default=2
          Num of ELMo layers to be considered
        bert_layers: int, default=1
          Num of final BERT hidden layers to be used as embedding vector.
        concat_bert: bool, default=True
          Whether to concat (True) or sum (False) last BERT hidden layers.
        bilstm_layers: int, default=1
          Num of stacked LSTM layers
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.num_heads = num_heads
        self.elmo_layers = elmo_layers
        self.bert_layers = bert_layers
        self.concat_bert = concat_bert
        self.bilstm_layers = bilstm_layers

        self.bert = BertForTokenClassification.from_pretrained(
                        BERT_MODEL,
                        output_hidden_states=True)

        # BERT is used as a frozen feature extractor.
        for pars in self.bert.parameters():
            pars.requires_grad = False

        bert_embedding_dim = self.bert.config.hidden_size

        self.elmo = Elmo(options_file, weight_file, self.elmo_layers, dropout=0, requires_grad=False)

        # NOTE(review): 512 has to match the output width of the ELMo model
        # described by `options_file` - confirm if the ELMo weights are swapped.
        elmo_embedding_dim = 512

        # Concatenating k BERT layers multiplies the BERT width by k; summing keeps it.
        bert_width = bert_embedding_dim * (self.bert_layers if self.concat_bert else 1)
        self.linear1 = nn.Linear(bert_width + elmo_embedding_dim*self.elmo_layers, 1024)

        # The tensors fed to the LSTM are batch-first (the CRFs below are
        # configured the same way), so the LSTM must be told; otherwise it
        # would run its recurrence across the batch axis instead of the tokens.
        self.bilstm = nn.LSTM(1024, self.hidden_size, self.bilstm_layers,
                              bidirectional=True, batch_first=True)

        # multiple heads
        # ModuleList (not a plain list) so the heads are registered: they show
        # up in parameters()/state_dict() and follow the model in .to(device).
        self.linears2 = nn.ModuleList()
        self.crfs = nn.ModuleList()

        for _ in range(self.num_heads):
            self.linears2.append(nn.Linear(self.hidden_size*2, self.num_labels))
            self.crfs.append(CRF(num_tags=self.num_labels, batch_first=True))

    def forward(self, elmo_ids, bert_ids, attention_mask):
        """
        Forward propagation of the model.

        Parameters
        ----------
        elmo_ids:
          Input passed to the ELMo module
        bert_ids:
          Input ids passed to BERT
        attention_mask:
          Attention mask passed to BERT

        Returns
        -------
        Logits or list of logits if number of heads > 1

        """

        # Index 1 of the BERT output holds the hidden states
        # (requested through output_hidden_states=True).
        bert_hiddens = self.bert(bert_ids, attention_mask=attention_mask)[1]
        elmo_hiddens = self.elmo(elmo_ids)

        last_layers = bert_hiddens[-self.bert_layers:]
        if self.concat_bert:
            bert_embedding = torch.cat(last_layers, dim=2)
        else:
            bert_embedding = torch.stack(tuple(last_layers)).sum(dim=0)

        # Append every ELMo representation along the feature axis.
        elmo_bert_embeddings = torch.cat(
            (bert_embedding, *elmo_hiddens['elmo_representations']), dim=-1)

        linear1_output = nn.functional.relu(self.linear1(elmo_bert_embeddings))

        bilstm_output, _ = self.bilstm(linear1_output)

        linears2_outputs = [nn.functional.relu(head(bilstm_output))
                            for head in self.linears2]

        # returning logits
        return linears2_outputs[0] if self.num_heads==1 else linears2_outputs

In [None]:
def _as_single_dataloader(dataloaders, role):
    """Return the one dataloader in `dataloaders`, unwrapping a one-element list/tuple."""
    if isinstance(dataloaders, (list, tuple)):
        if len(dataloaders) != 1:
            raise NotImplementedError(
                f"new_train expects exactly one {role} dataloader, got {len(dataloaders)}: "
                "the loop drives a single CRF head (model.crf)")
        return dataloaders[0]
    return dataloaders


def new_train(model, train_dataloaders, optimizer, scheduler=None, n_epoch=5,
          max_grad_norm=None, validate=True, valid_dataloaders=None,
          show_info=True, save_model=True):
    """
    Train `model` and optionally validate it after every epoch.

    Parameters
    ----------
    model:
      Module called as ``model(elmo_ids, bert_ids, mask)`` that exposes a
      ``crf`` attribute used for the loss and for decoding.
    train_dataloaders:
      A dataloader, or a list/tuple holding exactly one, yielding
      ``(elmo_ids, bert_ids, input_mask, labels)`` batches.
    optimizer:
      Optimizer stepped once per batch.
    scheduler: optional
      Learning-rate scheduler stepped once per batch.
    n_epoch: int, default=5
      Number of epochs.
    max_grad_norm: optional
      If given, gradients are clipped to this norm before each step.
    validate: bool, default=True
      Run validation after each epoch (needs `valid_dataloaders`).
    valid_dataloaders: optional
      Same form as `train_dataloaders`, for the validation split.
    show_info: bool, default=True
      Print progress and metrics.
    save_model: bool, default=True
      Save a checkpoint after every 10th epoch (so never with n_epoch < 10).
      Uses the notebook-level `tokenizer`.

    Returns
    -------
    loss_values, validation_loss_values, valid_accuracies, valid_f1_scores
      One entry per epoch; the three validation lists are empty when
      validation does not run.

    Raises
    ------
    NotImplementedError
      If a list/tuple with a number of dataloaders other than one is passed.
    """
    train_dataloader = _as_single_dataloader(train_dataloaders, 'train')
    run_validation = validate and valid_dataloaders is not None
    if run_validation:
        valid_dataloader = _as_single_dataloader(valid_dataloaders, 'validation')

    loss_values = []
    # Always defined so the final return works when validation is off.
    validation_loss_values = []
    valid_accuracies = []
    valid_f1_scores = []

    for e in range(n_epoch):
        if show_info:
            print(f"\nEpoch #{e}")

        # Training
        model.train()
        total_loss = 0

        batches = train_dataloader
        if show_info:
            batches = tqdm.tqdm(train_dataloader, position=0, leave=True)

        for step, batch in enumerate(batches):
            if device.type != 'cpu':
                batch = tuple(t.to(device) for t in batch)
            b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
            mask = b_input_mask.byte()
            model.zero_grad()

            logits = model(b_elmo_ids, b_bert_ids, mask)

            # because we need negative log likelihood
            loss = -1*model.crf(logits, b_labels, mask=mask)

            loss.backward()

            total_loss += loss.item()

            if show_info and (step+1) % 10 == 0:
                # step is zero-based, so step+1 batches have been accumulated
                print(f"\n{step}: avg loss per batch: {total_loss/(step+1)}\n")

            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                               max_norm=max_grad_norm)

            optimizer.step()

            if scheduler is not None:
                scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        if show_info:
            print(f"Average train loss: {avg_train_loss}")

        loss_values.append(avg_train_loss)

        if run_validation:
            # Validation
            model.eval()

            eval_loss = 0
            predictions, true_labels = [], []

            for batch in valid_dataloader:
                if device.type != 'cpu':
                    batch = tuple(t.to(device) for t in batch)
                b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
                mask = b_input_mask.byte()

                with torch.no_grad():
                    logits = model(b_elmo_ids, b_bert_ids, mask)
                    loss = -1*model.crf(logits, b_labels, mask=mask)
                    tags = model.crf.decode(logits, mask=mask)

                eval_loss += loss.item()
                predictions.extend(tags)
                true_labels.extend(b_labels.detach().cpu().tolist())

            eval_loss = eval_loss / len(valid_dataloader)
            validation_loss_values.append(eval_loss)
            if show_info:
                print(f"Validation loss: {eval_loss}")

            all_predicted_tags = [[idx2tag[i] for i in s] for s in predictions]
            # Padding positions are dropped from the gold side.
            all_true_tags = [[idx2tag[i] for i in s if idx2tag[i] != 'PAD']
                             for s in true_labels]

            # Gold first, predictions second, as in the test evaluation cell.
            valid_acc = accuracy_score(all_true_tags, all_predicted_tags)
            valid_f1 = f1_score(all_true_tags, all_predicted_tags)
            valid_accuracies.append(valid_acc)
            valid_f1_scores.append(valid_f1)

            if show_info:
                print(f"Validation accuracy: {valid_acc}")
                print(f"Validation F1-score: {valid_f1}\n")

        if save_model and (e+1)%10 == 0:
            tokenizer.save_pretrained(f'/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_{e}_tokenizer.pth')
            # NOTE(review): a fresh BEboC with hard-coded hyper-parameters is
            # stored next to the weights - confirm it matches the `model`
            # being trained here.
            checkpoint = {'model': BEboC(hidden_size=512, bert_layers=2),
                          'state_dict': model.state_dict(),
                          'optimizer' : optimizer.state_dict()}

            torch.save(checkpoint,
                        f'/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_{e}_state_dict.pth')

    return loss_values, validation_loss_values, valid_accuracies, valid_f1_scores