In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
PATH_TO_PROJECT = '/content/drive/My Drive/Serious/'
# path to conll class as well as to conll data
PATH_TO_CONLL = PATH_TO_PROJECT + 'coNLL/'
PATH_TO_TAG2IDX = PATH_TO_CONLL + 'tag2idx.json'
PATH_TO_ONE_TAG2IDX = PATH_TO_CONLL + 'one_tag2idx.json'
PATH_TO_CHECKPOINT = '/content/drive/My Drive/models/'

### Installing all packages for colab

In [3]:
!pip install -r '/content/drive/My Drive/Serious/requirements.txt'

Collecting transformers~=4.3
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 7.5MB/s 
[?25hCollecting allennlp~=2.0
[?25l  Downloading https://files.pythonhosted.org/packages/e7/bd/c75fa01e3deb9322b637fe0be45164b40d43747661aca9195b5fb334947c/allennlp-2.1.0-py3-none-any.whl (585kB)
[K     |████████████████████████████████| 593kB 38.2MB/s 
[?25hCollecting seqeval~=1.2
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 8.7MB/s 
[?25hCollecting pytorch-crf~=0.7
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloadi

### Importing packages

In [4]:
import json

import numpy as np
import torch
from torch import nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
from transformers import BertForTokenClassification
from allennlp.modules.elmo import Elmo, batch_to_ids

from torchcrf import CRF

from sklearn.model_selection import KFold, ParameterGrid

from transformers import get_linear_schedule_with_warmup

import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline

Connect to device

In [None]:
# Pick the compute device once; later cells move the model and every batch to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Asking for the name of GPU 0 raises on a CPU-only runtime, so only query it when
# CUDA was actually selected above.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available, running on CPU")

In [7]:
torch.cuda.empty_cache()

In [8]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0) 
a = torch.cuda.memory_allocated(0)

### Preprocessing

In [8]:
import sys
sys.path.append(PATH_TO_PROJECT)
sys.path.append(PATH_TO_CONLL)

from importlib import reload
import conll as co

In [9]:
#conll_old = co.CoNLL_old(PATH_TO_CONLL)
#for typ in conll_old.types:
#  conll_old.split_text_label(typ)
#conll_old.create_tag2idx(PATH_TO_TAG2IDX)
#conll_old.create_idx2tag()
# Build the CoNLL helper from the data directory; everything below populates
# attributes on this object that later cells read (sentences, labels, tag2idx, ...).
conll = co.CoNLL(PATH_TO_CONLL)
# splitting raw data to sentences and labels, once per split listed in conll.types
for typ in conll.types:
    conll.split_text_label(typ)

# define set of all labels
conll.create_set_of_labels()

# for multihead model: tag <-> index mappings; the tag2idx path is passed to the helper
# (presumably it is persisted as JSON there - confirm in conll.py)
conll.create_tag2idx(PATH_TO_TAG2IDX)
conll.create_idx2tag()

for typ in conll.types:
    # for multiple heads of CRF layer
    conll.create_one_labeled_data(typ)

    # creating one_tag2idx dictionary
    # NOTE(review): these two calls sit inside the loop, so they run once per split.
    # If the mapping does not depend on the split they could be hoisted below the
    # loop - confirm against conll.py before moving them.
    conll.create_one_tag2idx(PATH_TO_ONE_TAG2IDX)
    conll.create_idx2one_tag()

In [10]:
conll.idx2tag

{0: 'PAD',
 1: 'I-ORG',
 2: 'O',
 3: 'B-ORG',
 4: 'B-PER',
 5: 'B-MISC',
 6: 'I-PER',
 7: 'I-LOC',
 8: 'B-LOC',
 9: 'I-MISC'}

### Tokenization with BertTokenizer

BERT (Bidirectional Encoder Representations from Transformers) is a method of pretraining language representations. These vectors (representations) are used as high-quality feature inputs to downstream models. BERT offers an advantage over models like Word2Vec, because while each word has a fixed representation under Word2Vec regardless of the context within which the word appears, BERT produces word representations that are dynamically informed by the words around them.

The BERT implementation comes with a pretrained tokenizer and a defined vocabulary. We load the one that belongs to the smallest pre-trained model, bert-base-cased. We use the cased variant since it is well suited for NER.

In [11]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




### Creating dataloaders

In [12]:
import data_loaders as dalo

In [13]:
train_dataset, train_sampler, train_dataloader = dalo.create_dataloader_old(conll.sentences['train'],
                                                                            conll.labels['train'], conll.tag2idx,
                                                                            bert_tokenizer)

In [21]:
train_dataset[0][0].shape

torch.Size([173])

In [22]:
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader_old(conll.sentences['valid'],
                                                            conll.labels['valid'], conll.tag2idx,
                                                            bert_tokenizer, 'valid', desired_pad=train_dataset[0][0].shape[0])

### BERT & ELMo setup

The transformers package provides a BertForTokenClassification class for token-level predictions. BertForTokenClassification is a fine-tuning model that wraps BertModel and adds a token-level classifier on top of it.

In [14]:
from bert_config import *
from elmo_config import *

### Define the model

In [15]:
# importing BEboC model
from models import *

import model_utils as mu

In [16]:
class BEboC(nn.Module):
    """
    BERT+Elmo+biLSTM+one CRF

    Frozen BERT hidden states and frozen ELMo representations are concatenated
    per token, projected to 1024 features, run through a bidirectional LSTM and
    projected to ``num_labels`` scores per token.  ``forward`` returns these
    scores only; the ``crf`` layer is meant to be applied by the caller
    (``model.crf(...)`` for the log-likelihood, ``model.crf.decode(...)`` for tags).

    NOTE(review): this notebook also runs ``from models import *`` just above;
    if that module exports a ``BEboC`` too, this definition shadows it.
    """
    def __init__(self, hidden_size=128, num_labels=4, elmo_layers=2,
                 bert_layers=1, concat_bert=True, bilstm_layers=1):
        """
        Creates model

        Parameters
        ----------
        hidden_size: int, default=128
            Hidden size of the LSTM per direction (the biLSTM output is twice as wide).
        num_labels: int, default=4
            Number of tags scored per token; also the number of CRF tags.
        elmo_layers: int, default=2
            Num of ELMo layers to be considered
        bert_layers: int, default=1
            Num of final BERT hidden layers to be used as embedding vector.
        concat_bert: bool, default=True
            Whether to concat (True) or sum (False) last BERT hidden layers.
        bilstm_layers: int, default=1
            Number of stacked LSTM layers.
        """
        super(BEboC, self).__init__()

        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.elmo_layers = elmo_layers
        self.bert_layers = bert_layers
        self.concat_bert = concat_bert
        self.bilstm_layers = bilstm_layers

        self.bert = BertForTokenClassification.from_pretrained(
                        BERT_MODEL,
                        output_hidden_states=True)

        # BERT is used as a fixed feature extractor: none of its weights are trained.
        for pars in self.bert.parameters():
            pars.requires_grad = False

        bert_embedding_dim = self.bert.config.to_dict()['hidden_size']

        self.elmo = Elmo(options_file, weight_file, self.elmo_layers, dropout=0, requires_grad=False)

        # Width of one ELMo representation.
        # NOTE(review): hard-coded; must match the model described by `options_file`.
        elmo_embedding_dim = 512

        # Concatenating BERT layers multiplies the BERT width, summing them keeps it.
        if self.concat_bert:
            bert_width = bert_embedding_dim * self.bert_layers
        else:
            bert_width = bert_embedding_dim
        self.linear1 = nn.Linear(bert_width + elmo_embedding_dim * self.elmo_layers, 1024)

        # batch_first=True: the tensors built in `forward` are (batch, seq, features)
        # (BERT states are concatenated on dim 2 and the CRF below is batch_first too).
        # Without it the LSTM would treat the batch axis as the time axis and run its
        # recurrence across unrelated sentences.
        # NOTE(review): parameter shapes are unchanged, so old state dicts still load,
        # but weights trained with the previous axis order should be re-trained.
        self.bilstm = nn.LSTM(1024, self.hidden_size, self.bilstm_layers,
                              bidirectional=True, batch_first=True)

        self.linear2 = nn.Linear(self.hidden_size*2, self.num_labels)
        self.crf = CRF(num_tags=self.num_labels, batch_first=True)

    def get_model_pars_dict(self):
        """
        Returns dict with described model's parameters.

        The keys are the constructor argument names; ``concat_bert`` is stored
        as an int (0/1).
        """
        return {
            'hidden_size': self.hidden_size,
            'num_labels': self.num_labels,
            'elmo_layers': self.elmo_layers,
            'bert_layers': self.bert_layers,
            'concat_bert': int(self.concat_bert),
            'bilstm_layers': self.bilstm_layers,
        }

    def forward(self, bert_ids, elmo_ids, attention_mask):
        """
        Forward propagation of the model.

        Parameters
        ----------
        bert_ids:
            Token ids passed to BERT as its first positional input.
        elmo_ids:
            Input passed to the ELMo module (presumably character ids as
            produced by ``batch_to_ids`` - confirm in data_loaders).
        attention_mask:
            Mask forwarded to BERT as ``attention_mask``.

        Returns
        -------
        Logits
            ReLU-activated output of the last linear layer, ``num_labels``
            values per token; callers feed it to ``self.crf`` as emissions.

        """
        # Index 1 of the output is used as the tuple of per-layer hidden states
        # (the model is created with output_hidden_states=True and gets no labels here).
        bert_hiddens = self.bert(bert_ids, attention_mask=attention_mask)[1]
        elmo_hiddens = self.elmo(elmo_ids)

        last_bert_layers = bert_hiddens[-self.bert_layers:]
        if self.concat_bert:
            bert_embedding = torch.cat(last_bert_layers, dim=2)
        else:
            bert_embedding = torch.stack(last_bert_layers, dim=0).sum(dim=0)

        # Append every ELMo representation to the BERT features along the feature axis.
        elmo_bert_embeddings = torch.cat(
            (bert_embedding, *elmo_hiddens['elmo_representations']), dim=-1)

        linear1_output = nn.functional.relu(self.linear1(elmo_bert_embeddings))

        bilstm_output, _ = self.bilstm(linear1_output)
        # NOTE(review): the ReLU clamps the CRF emissions to be non-negative; kept
        # as in the original, but check whether raw linear scores were intended.
        linear2_output = nn.functional.relu(self.linear2(bilstm_output))
        return linear2_output

### Cross-validation with the best hyperparams on 5 folds

Fix some train parameters


In [17]:
N_FOLDS = 5
RANDOM_SEED = 42
N_EPOCHS = 15
BATCH_SIZE = 128

In [18]:
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [19]:
TAG_NAMES = ['ORG', 'LOC', 'PER', 'MISC']

In [None]:
# 5-fold cross-validation of BEboC with the selected hyper-parameters.
# For every fold we store the overall accuracy/F1 and the metrics on each single-tag dataset.
all_tag_results = []
model_results = []

# Length of one encoded training sample; passed as `desired_pad` for the per-tag datasets.
pad_len = train_dataset[0][0].shape[0]

# The single-tag datasets do not depend on the fold (only the index subset does),
# so they are built once here instead of once per fold and tag.
tag_datasets = {}
for tag in TAG_NAMES:
    # old version of create dataloader function is very suitable here
    tag_datasets[tag], _, _ = dalo.create_dataloader_old(
        conll.sentences['train'], conll.one_tag_dict['train'][tag],
        conll.tag2idx, bert_tokenizer, desired_pad=pad_len)

for i, (train_index, valid_index) in enumerate(kf.split(train_dataset)):
    print(f"FOLD #{i}\n")
    # train_dataset based on conll and defined above specially for multiple-head model input
    train_fold = torch.utils.data.Subset(train_dataset, train_index)
    valid_fold = torch.utils.data.Subset(train_dataset, valid_index)

    # Underscore-prefixed so the notebook-level train/valid dataloaders stay untouched.
    _train_dataloader = torch.utils.data.DataLoader(train_fold, batch_size=BATCH_SIZE)
    _valid_dataloader = torch.utils.data.DataLoader(valid_fold, batch_size=BATCH_SIZE)

    # Scheduler length in optimisation steps (batches), matching the
    # `len(train_dataloader) * N_EPOCHS` convention of the final-training cells.
    # Counting samples instead would stretch the linear decay ~BATCH_SIZE times.
    # NOTE(review): assumes mu.train_old steps the scheduler once per batch.
    total_steps = len(_train_dataloader) * N_EPOCHS

    model = BEboC(hidden_size=512, num_labels=len(conll.tag2idx), bert_layers=2)
    model.to(device)

    optimizer = AdamW(params=model.parameters(), lr=5e-4)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    loss_values, validation_loss_values, valid_accuracies, valid_f1_scores = mu.train_old(
        model, _train_dataloader, optimizer, conll.idx2tag, device, scheduler,
        n_epoch=N_EPOCHS, valid_dataloader=_valid_dataloader, save_model=False)

    # evaluating on all validation data
    eval_loss, acc, f1 = mu.eval_old(model, _valid_dataloader, device, conll.idx2tag)
    model_results.append({'acc': acc, 'f1': f1})

    # evaluating on separate tags, using the same validation indices as above
    tag_results = {}
    for tag in TAG_NAMES:
        tag_valid_fold = torch.utils.data.Subset(tag_datasets[tag], valid_index)
        tag_valid_dataloader = torch.utils.data.DataLoader(tag_valid_fold, batch_size=BATCH_SIZE)
        eval_loss, acc, f1 = mu.eval_old(model, tag_valid_dataloader, device, conll.idx2tag)

        tag_results[tag] = {'loss': eval_loss, 'acc': acc, 'f1': f1}

    print(f"tag_results:{tag_results}")
    all_tag_results.append(tag_results)

    torch.cuda.empty_cache()

# Persist the per-fold metrics next to the model checkpoints.
with open(PATH_TO_CHECKPOINT+"BEboC-5fold_tag_results.json", "w") as f:
    json.dump(all_tag_results, f)

with open(PATH_TO_CHECKPOINT+"BEboC-5fold_model_results.json", "w") as f:
    json.dump(model_results, f)


### Old model selection using grid

In [None]:
%time
param_grid = {
    'opt': ['AdamW'],
    'lr': [3e-4, 7e-4, 1e-3],
    'bert_layers': [2,3],
    'concat': [True, False],
    'max_grad_norm': [None]#[1., None]
}

param_grid = {
    'opt': ['AdamW'],
    'lr': [1e-3],
    'bert_layers': [2],
    'concat': [False],
    'max_grad_norm': [None]
}

grid = ParameterGrid(param_grid)

params_results = {}

for m, ps in enumerate(grid):
  print(f"Model #{m} of {len(grid)}")
  _p_r = {'params': ps}
  
  mean_train_losses = 0
  mean_valid_losses = 0
  mean_valid_accs = 0
  mean_valid_f1s = 0

  for i, (train_index, valid_index) in enumerate(kf.split(train_data)):
    train_fold = torch.utils.data.Subset(train_data, train_index)
    valid_fold = torch.utils.data.Subset(train_data, valid_index)
    train_dataloader = DataLoader(train_fold, batch_size=BATCH_SIZE)
    valid_dataloader = DataLoader(valid_fold, batch_size=BATCH_SIZE)

    model = BEboc(batch_size=BATCH_SIZE, hidden_size=128, num_labels=len(tag2idx),
                    bert_layers=ps['bert_layers'], concat=ps['concat'])
    model.to(device)

    if ps['opt'] == 'Adam':
      optimizer = torch.optim.Adam(params=model.parameters(),lr=ps['lr'])
    else:
      optimizer = AdamW(params=model.parameters(),lr=ps['lr'])

    train_losses, valid_losses, valid_accs, valid_f1s = train(model,
                                            train_dataloader,
                                            optimizer,
                                            n_epoch=N_EPOCHS,
                                            max_grad_norm=ps['max_grad_norm'],
                                            valid_dataloader=valid_dataloader,
                                            show_info=False)
    
    mean_train_losses += np.array(train_losses)
    mean_valid_losses += np.array(valid_losses)
    mean_valid_accs += np.array(valid_accs)
    mean_valid_f1s += np.array(valid_f1s)
  
  mean_train_losses /= N_FOLDS
  mean_valid_losses /= N_FOLDS
  mean_valid_accs /= N_FOLDS
  mean_valid_f1s /= N_FOLDS
  _p_r['mean_train_losses'] = list(mean_train_losses)
  _p_r['mean_valid_losses'] = list(mean_valid_losses)
  _p_r['mean_valid_accs'] = list(mean_valid_accs)
  _p_r['mean_valid_f1s'] = list(mean_valid_f1s)
  params_results[m] = _p_r 

with open("/content/drive/My Drive/params_results.json", "w") as w:
  json.dump(params_results, w)

In [None]:
with open("/content/drive/My Drive/params_results.json", "w") as w:
  json.dump(params_results, w)

After training for 1 epoch on a small dataset, the best result came from the model that concatenates the last two BERT layers and uses a learning rate of 5e-4, so let's train that model on all of the training data. We also increase the LSTM hidden size to 512 and now use a linear learning-rate scheduler.

### Final model train

In [None]:
N_EPOCHS = 20

In [None]:
total_steps = len(train_dataloader) *  N_EPOCHS

In [None]:
model = BEboC(hidden_size=512, bert_layers=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
model.to(device)

In [None]:
optimizer = AdamW(params=model.parameters(),lr=3e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

if device.type != 'cpu':
    model.to(device)

#train_losses, valid_losses, valid_accs, valid_f1s
results = train(model, train_dataloader, optimizer, scheduler, n_epoch=N_EPOCHS,
     validate=True, valid_dataloader=valid_dataloader)

If we want to train the model for more epochs

In [None]:
N_EPOCHS = 5
total_steps = len(train_dataloader) *  N_EPOCHS

optimizer = AdamW(params=model.parameters(),lr=1e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

if device.type != 'cpu':
    model.to(device)
  
model.train()

#train_losses, valid_losses, valid_accs, valid_f1s
results_1 = train(model, train_dataloader, optimizer, scheduler, n_epoch=N_EPOCHS,
     validate=True, valid_dataloader=valid_dataloader)

If we want to plot the results of training

In [None]:
# Train vs. validation loss per epoch for the final training run.
# `results` is indexed as (train_losses, valid_losses, ...), see the training cell.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(results[0], label='train')
ax.plot(results[1], label='valid', c='g')
ax.set_ylabel("loss")
ax.set_xlabel("#epoch")
# Derive the ticks from the recorded curve instead of a hard-coded 20 epochs:
# N_EPOCHS is reassigned by the "train for more epochs" cell before this one runs.
ax.set_xticks(np.arange(0, len(results[0]), 2))
ax.set_title("Loss per epoch")
ax.grid()
ax.legend()
plt.show()

If we want to evaluate model on the test set

In [None]:
test_dataset, test_sampler, test_dataloader = create_dataloader(tokenizer, test_data, test_labels,
                                                                   datatype='test',
                                                                   desired_pad=max_seq_len)

In [None]:
# Evaluate the model on the test set: collect gold and predicted tag names per sentence
# (consumed by f1_score in the next cell) and the CRF loss of every batch.
model.eval()
test_losses = []
true_labels = []
pred_labels = []
for step, batch in enumerate(test_dataloader):
    # move every tensor of the batch to the selected device
    batch = tuple(tensor.to(device) for tensor in batch)
    # NOTE(review): the BEboC class defined in this notebook declares
    # forward(bert_ids, elmo_ids, attention_mask), while the call below passes the
    # elmo ids first. Confirm that either the unpacking names or the call order
    # match what create_dataloader actually yields.
    b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
    crf_mask = b_input_mask.byte()

    # gold tags: drop the padding positions and map ids to tag names
    for label_ids in b_labels.detach().cpu().tolist():
        true_labels.append([idx2tag[i] for i in label_ids if idx2tag[i] != 'PAD'])

    # inference only: no gradients are needed
    with torch.no_grad():
        # call the modules themselves rather than .forward() so hooks are honoured
        logits = model(b_elmo_ids, b_bert_ids, crf_mask)
        # torchcrf's CRF returns the log-likelihood of the tag sequence; the loss is
        # its negative.
        loss = -model.crf(logits, b_labels, crf_mask)
        test_losses.append(loss.item())
        # best tag sequence per sentence, restricted to the unmasked positions
        tags = model.crf.decode(logits, crf_mask)

    for tag_ids in tags:
        pred_labels.append([idx2tag[i] for i in tag_ids])


In [None]:
f1_score(true_labels, pred_labels)

___

In [None]:
torch.save(model, '/content/drive/My Drive/models/Elmo_BERT_biLSTM_oneCRF_final.pth')

In [None]:
torch.save(model.state_dict(), '/content/drive/My Drive/models/ELmo_BERT_biLSTM_oneCRF_final_state_dict.pth')

## Load the model

In [None]:
model = torch.load('/content/drive/My Drive/models/BERT_biLSTM_oneCRF.pth',
                   map_location=torch.device('cpu'))

In [None]:
model = torch.load('/content/drive/My Drive/models/Elmo_BERT_biLSTM_oneCRF.pth',
                   map_location=torch.device('cpu'))

In [21]:
def load_checkpoint(tokenizer_path, checkpoint_path, map_location=None):
    """
    Loads both tokenizer and our pretrained model.

    Parameters
    ----------
    tokenizer_path:
        Location passed to ``BertTokenizer.from_pretrained``.
    checkpoint_path:
        File written with ``torch.save``; it must contain the keys ``'model'``
        (a model object) and ``'state_dict'`` (its weights).
    map_location: optional, default=None
        Forwarded to ``torch.load``.  Pass e.g. ``torch.device('cpu')`` to open a
        checkpoint saved on a GPU machine on a CPU-only one.  ``None`` keeps the
        previous behaviour (tensors are restored to the device they were saved from).

    Returns
    -------
    (tokenizer, model), with the model switched to evaluation mode.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    # SECURITY: torch.load unpickles arbitrary Python objects (the checkpoint stores a
    # whole model object), so only open checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])

    # inference mode: disables dropout etc.
    model.eval()
    return tokenizer, model

In [23]:
tokenizer, model = load_checkpoint('/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_19_tokenizer.pth',
                                     '/content/drive/My Drive/models/ElMo_BERT_biLSTM_oneCRF_19_state_dict.pth')

In [32]:
# Save the tokenizer and a checkpoint in the layout that load_checkpoint() expects:
# 'model' is an architecture skeleton, 'state_dict' holds the trained weights.
checkpoint_prefix = PATH_TO_CHECKPOINT + 'ElMo_BERT_biLSTM_oneCRF_19'

tokenizer.save_pretrained(checkpoint_prefix + '_tokenizer.pth')

# Build the skeleton from the trained model's own hyper-parameters instead of repeating
# them by hand, so its layer shapes always match the saved state dict.
# NOTE(review): assumes `model` is a BEboC instance exposing get_model_pars_dict().
checkpoint = {'model': BEboC(**model.get_model_pars_dict()),
              'state_dict': model.state_dict(),
              'optimizer': optimizer.state_dict()}

torch.save(checkpoint, checkpoint_prefix + '_state_dict.pth')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [33]:
train_dataset, train_sampler, train_dataloader = create_dataloader(tokenizer, train_data, train_labels)

In [34]:
# Padded sequence length, read from the second tensor of the first training sample
# (the same direct-indexing idiom as `train_dataset[0][0].shape` earlier in the notebook).
# Indexing fails loudly on an empty dataset, whereas the previous loop-and-break
# silently left `max_seq_len` undefined (or stale from an earlier run).
max_seq_len = train_dataset[0][1].shape[0]

In [35]:
test_dataset, test_sampler, test_dataloader = create_dataloader(tokenizer, test_data, test_labels,
                                                                   datatype='test',
                                                                   desired_pad=max_seq_len)

In [None]:
# Evaluate the reloaded model on the test set: collect gold and predicted tag names per
# sentence (consumed by f1_score in the next cell) and the CRF loss of every batch.
model.to(device)
model.eval()
test_losses = []
true_labels = []
pred_labels = []
for step, batch in enumerate(test_dataloader):
    # move every tensor of the batch to the selected device
    batch = tuple(tensor.to(device) for tensor in batch)
    # NOTE(review): the BEboC class defined in this notebook declares
    # forward(bert_ids, elmo_ids, attention_mask), while the call below passes the
    # elmo ids first. Confirm that either the unpacking names or the call order
    # match what create_dataloader actually yields.
    b_elmo_ids, b_bert_ids, b_input_mask, b_labels = batch
    crf_mask = b_input_mask.byte()

    # gold tags: drop the padding positions and map ids to tag names
    for label_ids in b_labels.detach().cpu().tolist():
        true_labels.append([idx2tag[i] for i in label_ids if idx2tag[i] != 'PAD'])

    # inference only: no gradients are needed
    with torch.no_grad():
        # call the modules themselves rather than .forward() so hooks are honoured
        logits = model(b_elmo_ids, b_bert_ids, crf_mask)
        # torchcrf's CRF returns the log-likelihood of the tag sequence; the loss is
        # its negative.
        loss = -model.crf(logits, b_labels, crf_mask)
        test_losses.append(loss.item())
        # best tag sequence per sentence, restricted to the unmasked positions
        tags = model.crf.decode(logits, crf_mask)

    for tag_ids in tags:
        pred_labels.append([idx2tag[i] for i in tag_ids])



In [None]:
f1_score(true_labels, pred_labels)

0.8405783838198957