In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the paths
# configured below (all located under /content/drive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Root folder of the project on the mounted Google Drive.
PATH_TO_PROJECT = '/content/drive/My Drive/Serious/'
# Folder that holds both the conll module and the CoNLL data files.
PATH_TO_CONLL = f'{PATH_TO_PROJECT}coNLL/'
# JSON files with the tag -> index mappings (single tagset / one mapping per head).
PATH_TO_TAG2IDX = f'{PATH_TO_CONLL}tag2idx.json'
PATH_TO_ONE_TAG2IDX = f'{PATH_TO_CONLL}one_tag2idx.json'
# Folder used for saving and loading model checkpoints.
PATH_TO_CHECKPOINT = '/content/drive/My Drive/models/'

### Install requirements

In [5]:
!pip install -r '/content/drive/My Drive/Serious/requirements.txt'

Collecting transformers~=4.3
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 16.9MB/s 
[?25hCollecting allennlp~=2.0
[?25l  Downloading https://files.pythonhosted.org/packages/e7/bd/c75fa01e3deb9322b637fe0be45164b40d43747661aca9195b5fb334947c/allennlp-2.1.0-py3-none-any.whl (585kB)
[K     |████████████████████████████████| 593kB 49.7MB/s 
[?25hCollecting seqeval~=1.2
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 8.8MB/s 
[?25hCollecting pytorch-crf~=0.7
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://fi

### Loading coNLL

In [6]:
import sys
# Make the project modules and the conll module importable from Google Drive.
sys.path.extend((PATH_TO_PROJECT, PATH_TO_CONLL))

from importlib import reload
import conll as co

In [7]:
# Create the CoNLL data wrapper from the data folder; the following cells call
# its methods to prepare sentences, labels and the tag -> index mappings.
conll = co.CoNLL(PATH_TO_CONLL)

In [62]:
# splitting raw data to sentences and labels
# (one call per entry of conll.types; later cells index the results with 'train' / 'valid')
for typ in conll.types:
    conll.split_text_label(typ)

# define set of all labels
conll.create_set_of_labels()

for typ in conll.types:
    # for multiple heads of CRF layer
    conll.create_one_labeled_data(typ)

    # creating one_tag2idx dictionary
    # NOTE(review): the two calls below do not take `typ`, yet they are executed
    # once per loop iteration — presumably they could be hoisted out of the loop;
    # confirm against conll.py before moving them.
    conll.create_one_tag2idx(PATH_TO_ONE_TAG2IDX)
    conll.create_idx2one_tag()

In [63]:
# Tag -> index mapping kept separately for every CRF head: the outer key is the
# entity type a head is responsible for ('LOC', 'ORG', ...), the inner dict maps
# that head's tags to integer ids.
conll.one_tag2idx

{'LOC': {'B-LOC': 0, 'I-LOC': 3, 'O': 2, 'PAD': 1},
 'MISC': {'B-MISC': 0, 'I-MISC': 3, 'O': 2, 'PAD': 1},
 'ORG': {'B-ORG': 2, 'I-ORG': 3, 'O': 1, 'PAD': 0},
 'PER': {'B-PER': 2, 'I-PER': 0, 'O': 3, 'PAD': 1}}

In [64]:
# Show the first training sentence, its full tag sequence, the same sequence
# restricted to the 'ORG' tag, and the entity types available for the heads.
print(f"sen example: {conll.sentences['train'][0]}")
print(f"tags example: {conll.labels['train'][0]}")
print(f"tags example with only 'ORG' tag: {conll.one_tag_dict['train']['ORG'][0]}")
print(f"tags for CRF tags has labels: {conll.one_tag_dict['train'].keys()}")

sen example: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
tags example: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
tags example with only 'ORG' tag: ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
tags for CRF tags has labels: dict_keys(['LOC', 'PER', 'ORG', 'MISC'])


### Importing packages

In [8]:
import numpy as np
import torch
from torch import nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
from transformers import BertForTokenClassification
from allennlp.modules.elmo import Elmo, batch_to_ids

from torchcrf import CRF

from sklearn.model_selection import KFold, ParameterGrid

from transformers import get_linear_schedule_with_warmup

import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline

### Creating dataloaders

In [152]:
import data_loaders as dalo

In [153]:
# Re-import data_loaders so that edits made to the module on disk are picked up
# without restarting the kernel.
reload(dalo)

<module 'data_loaders' from '/content/drive/My Drive/Serious/data_loaders.py'>

In [13]:
# Tokenizer of the cased BERT model; do_lower_case=False keeps the original
# casing of the input tokens.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [91]:
# Entity types to train on; the list is passed to the data loader and is the
# default list of head names of the model (one head per entry).
TAG_NAMES = ['ORG', 'LOC','PER']
NUM_OF_HEADS = len(TAG_NAMES)

In [92]:
# Build the training loader; TAG_NAMES lists the tag name of every model head.
train_dataset, train_sampler, train_dataloader = dalo.create_dataloader(
    conll, TAG_NAMES, bert_tokenizer
)

# Sanity-check the sizes of the first sample: [0] bert ids, [1] elmo ids,
# [2] labels, [3] mask — all must share the same padded length.
first_sample = train_dataset[0]
padded_len = first_sample[0].shape[0]
multi_head = NUM_OF_HEADS > 1

assert padded_len == first_sample[1].shape[0]
if multi_head:
    # labels are stacked per head: (heads, tokens)
    assert first_sample[2].shape[0] == NUM_OF_HEADS
    assert first_sample[2].shape[1] == padded_len
else:
    # single head: labels are a flat vector of tokens
    assert len(first_sample[2].shape) == NUM_OF_HEADS  # == 1
    assert first_sample[2].shape[0] == padded_len
assert first_sample[3].shape[0] == padded_len

head_count = first_sample[2].shape[0] if multi_head else 1
token_count = first_sample[2].shape[1] if multi_head else first_sample[2].shape[0]

print(f"bert sentence shape: {first_sample[0].shape}")
print(f"elmo sentence shape: {first_sample[1].shape}")
print(f"number of heads: {head_count}")
print(f"tokens len: {token_count}")
print(f"mask shape: {first_sample[3].shape}")

bert sentence shape: torch.Size([173])
elmo sentence shape: torch.Size([173, 50])
number of heads: 3
tokens len: 173
mask shape: torch.Size([173])


In [99]:
# Build the validation loader, padded to the same length as the training data.
pad_len = train_dataset[0][0].shape[0]
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader(
    conll, TAG_NAMES, bert_tokenizer, 'valid', desired_pad=pad_len
)

# Sanity check for output sizes: every tensor of the first validation sample
# ([0] bert ids, [1] elmo ids, [2] labels, [3] mask) must use the training pad
# length. The ELMo tensor of the *validation* sample is checked here too; the
# previous check only looked at the training ELMo tensor.
valid_sample = valid_dataset[0]
assert valid_sample[0].shape[0] == pad_len
assert valid_sample[1].shape[0] == pad_len
if NUM_OF_HEADS > 1:
    assert valid_sample[2].shape[0] == NUM_OF_HEADS
    assert valid_sample[2].shape[1] == pad_len
else:
    assert len(valid_sample[2].shape) == NUM_OF_HEADS  # == 1
    assert valid_sample[2].shape[0] == pad_len
assert valid_sample[3].shape[0] == pad_len

print(f"bert sentence shape: {valid_sample[0].shape}")
print(f"elmo sentence shape: {valid_sample[1].shape}")

bert sentence shape: torch.Size([173])
elmo sentence shape: torch.Size([173, 50])


### Creating model

In [203]:
from bert_config import *
from elmo_config import *

In [204]:
class BEbiC(nn.Module):
    """
    BERT+Elmo+biLSTM+CRFs

    Frozen BERT and ELMo embeddings are concatenated, projected by a linear
    layer, passed through a shared biLSTM and then fed to one independent
    (linear + CRF) head per tag name.
    """
    def __init__(self, hidden_size=128, num_labels=4, tag_names=TAG_NAMES,
                 elmo_layers=2, bert_layers=1, concat_bert=True,
                 bilstm_layers=1, bilstm_dropout=0, bilstm_batch_first=True):
        """
        Creates model

        Parameters
        ----------
        hidden_size: int, default=128
          LSTM parameter (hidden size per direction)
        num_labels: int, default=4
          The number of each CRF labels (ex: B-LABEL, I-LABEL, O, PAD for multiple heads case)
        tag_names: list of str
          List of tag names for models heads. The list is copied, so adding
          heads later never modifies the object passed in.
        elmo_layers: int, default=2
          Num of ELMo layers to be considered
        bert_layers: int, default=1
          Num of final BERT hidden layers to be used as embedding vector.
        concat_bert: bool, default=True
          Whether to concat (True) or sum (False) last BERT hidden layers.
        bilstm_layers: int, default=1
          Number of stacked biLSTM layers.
        bilstm_dropout: float, default=0
          Dropout between biLSTM layers (passed to nn.LSTM).
        bilstm_batch_first: bool, default=True
          Layout the biLSTM assumes for its input. All other parts of the model
          (BERT output, batch-first CRFs, label slicing in `forward`) work on
          (batch, seq_len, ...) tensors, so True is the consistent setting.
          Pass False only to reproduce models trained with the previous
          behaviour, where the recurrence ran over the batch axis.

        """

        super().__init__()

        self.hidden_size = hidden_size
        self.num_labels = num_labels
        # Copy: the default is the module-level TAG_NAMES list and add_head()
        # appends to self.tag_names — without the copy it would grow the global.
        self.tag_names = list(tag_names)
        self.num_heads = len(self.tag_names)
        self.elmo_layers = elmo_layers
        self.bert_layers = bert_layers
        self.concat_bert = concat_bert
        self.bilstm_layers = bilstm_layers
        self.bilstm_dropout = bilstm_dropout
        self.bilstm_batch_first = bilstm_batch_first

        # BERT is used as a frozen feature extractor; only its hidden states are read.
        self.bert = BertForTokenClassification.from_pretrained(
                        BERT_MODEL,
                        output_hidden_states=True)

        for pars in self.bert.parameters():
            pars.requires_grad = False

        bert_embedding_dim = self.bert.config.to_dict()['hidden_size']

        self.elmo = Elmo(options_file, weight_file, self.elmo_layers, dropout=0, requires_grad=False)

        # NOTE(review): hard-coded; must match the output size of the ELMo
        # weights configured in elmo_config — confirm if the weights change.
        elmo_embedding_dim = 512

        # Concatenation multiplies the BERT width by the number of layers used,
        # summation keeps a single layer's width.
        bert_width = bert_embedding_dim * self.bert_layers if self.concat_bert else bert_embedding_dim
        self.linear1 = nn.Linear(bert_width + elmo_embedding_dim * self.elmo_layers, 1024)

        self.bilstm = nn.LSTM(1024, self.hidden_size, self.bilstm_layers,
                              bidirectional=True, dropout=self.bilstm_dropout,
                              batch_first=self.bilstm_batch_first)

        self.heads = nn.ModuleDict({tag: self._make_head() for tag in self.tag_names})
        self.active_heads = {head: True for head in self.heads.keys()}

    def _make_head(self):
        """Builds one head: a linear projection to tag scores followed by a CRF."""
        return nn.ModuleDict({'linear': nn.Linear(self.hidden_size*2, self.num_labels),
                              'crf': CRF(num_tags=self.num_labels, batch_first=True)})

    def _head_emissions(self, bilstm_logits, head_tag):
        """Returns the CRF emission scores of one head for the shared biLSTM output."""
        return nn.functional.relu(self.heads[head_tag]['linear'](bilstm_logits))

    def _set_head_trainable(self, head_tag, trainable):
        """Sets requires_grad of every parameter of a head and records its state."""
        if head_tag not in self.heads.keys():
            raise ValueError(f"Unknown head tag. Please, give one of {self.heads.keys()}")

        for parameter in self.heads[head_tag].parameters():
            parameter.requires_grad = trainable

        self.active_heads[head_tag] = trainable

    def add_head(self, tag_name):
        """
        Adds new head to the model

        The head is appended after the existing ones, so in `forward` its
        labels are expected at the last index of the head axis. It is created
        on the same device as the rest of the model and marked as active.

        Parameters
        ----------
        tag_name: str
          Name of the new head.

        Raises
        ------
        ValueError
          If a head with this name already exists.

        """
        if tag_name in self.heads:
            raise ValueError(f"Head '{tag_name}' already exists.")

        new_head = self._make_head()
        # A freshly created module lives on the CPU; follow the model's device.
        new_head.to(next(self.parameters()).device)

        self.heads[tag_name] = new_head
        self.tag_names.append(tag_name)
        self.num_heads = len(self.tag_names)
        self.active_heads[tag_name] = True

    def shared_forward(self, bert_ids, elmo_ids, attention_mask):
        """
        Forward propagation of model shared layers.

        Parameters
        ----------
        bert_ids:
          BERT token ids — assumed to be (batch, seq_len); TODO confirm against the data loader.
        elmo_ids:
          ELMo character ids — assumed to be (batch, seq_len, chars); TODO confirm.
        attention_mask:
          Mask with the same leading shape as bert_ids; converted to byte here.

        Returns
        -------
        biLSTM output; with bilstm_batch_first=True its shape is
        (batch, seq_len, 2*self.hidden_size)

        """

        mask = attention_mask.byte()
        # Index 1 of the BERT output holds the hidden states of all layers
        # (available because output_hidden_states=True and no labels are passed).
        bert_hiddens = self.bert(bert_ids, attention_mask=mask)[1]
        elmo_hiddens = self.elmo(elmo_ids)

        last_hiddens = bert_hiddens[-self.bert_layers:]
        if self.concat_bert:
            bert_embedding = torch.cat(last_hiddens, dim=2)
        else:
            bert_embedding = sum(last_hiddens)

        # BERT features followed by every requested ELMo layer along the feature axis.
        elmo_bert_embeddings = torch.cat(
            (bert_embedding, *elmo_hiddens['elmo_representations']), dim=-1)

        linear1_output = nn.functional.relu(self.linear1(elmo_bert_embeddings))

        bilstm_output, _ = self.bilstm(linear1_output)

        return bilstm_output

    def get_one_head_loss(self, bilstm_logits, head_labels, attention_mask, head_tag):
        """
        Returns negative log-likelihood for one head.
        You should run it after shared forward.

        Parameters
        ----------
        bilstm_logits:
          Output of `shared_forward`.
        head_labels:
          Gold tag ids for this head, aligned with the mask.
        attention_mask:
          Mask of valid positions; converted to byte for the CRF.
        head_tag: str
          Key of self.heads dictionary.

        Returns
        -------
        Loss

        """
        emissions = self._head_emissions(bilstm_logits, head_tag)
        # The CRF returns the log-likelihood; negate it to obtain a loss.
        return -self.heads[head_tag]['crf'](emissions, head_labels, mask=attention_mask.byte())

    def get_one_head_seq(self, bilstm_logits, attention_mask, head_tag):
        """
        Returns the most likely sequence of labels for the given head.
        You should run it after shared forward.

        Parameters
        ----------
        bilstm_logits:
          Output of `shared_forward`.
        attention_mask:
          Mask of valid positions; converted to byte for the CRF.
        head_tag: str
          Key of self.heads dictionary.

        Returns
        -------
        List
        """
        emissions = self._head_emissions(bilstm_logits, head_tag)
        return self.heads[head_tag]['crf'].decode(emissions, mask=attention_mask.byte())

    def forward(self, bert_ids, elmo_ids, head_labels, attention_mask):
        """
        Forward model pass.

        Parameters
        ----------
        bert_ids:
          Passed to `shared_forward`.
        elmo_ids:
          Passed to `shared_forward`.
        head_labels:
          Gold tag ids. With more than one head the labels of head i are read
          from head_labels[:, i, :] (heads in the order of self.heads); with a
          single head the tensor is used as is.
        attention_mask:
          Mask of valid positions.

        Returns
        -------
        Total loss for all heads.

        """

        mask = attention_mask.byte()
        bilstm_logits = self.shared_forward(bert_ids, elmo_ids, mask)
        multi_head = len(self.heads) > 1
        head_loss = 0
        for i, tag in enumerate(self.heads.keys()):
            one_head_labels = head_labels[:, i, :] if multi_head else head_labels
            head_loss += self.get_one_head_loss(bilstm_logits, one_head_labels, mask, tag)
        return head_loss

    def freeze_head(self, head_tag):
        """
        Freezes model's head parameters.

        Raises
        ------
        ValueError
          If head_tag is not a key of self.heads.

        """
        self._set_head_trainable(head_tag, False)

    def unfreeze_head(self, head_tag):
        """
        Unfreezes model's head parameters.

        Raises
        ------
        ValueError
          If head_tag is not a key of self.heads.

        """
        self._set_head_trainable(head_tag, True)

In [None]:
# Multi-head model with non-default sizes: LSTM hidden size 512, the last 2 BERT
# hidden layers as embedding, and a 2-layer biLSTM with dropout 0.3.
model = BEbiC(hidden_size=512, bert_layers=2, bilstm_layers=2, bilstm_dropout=0.3)

In [141]:
import model_utils as mu

In [142]:
# Re-import model_utils so that edits made to the module on disk are picked up
# without restarting the kernel.
reload(mu)

<module 'model_utils' from '/content/drive/My Drive/Serious/model_utils.py'>

In [20]:
# Number of training epochs and the resulting number of optimizer steps
# (batches per epoch * epochs), which the LR scheduler below needs.
N_EPOCHS = 10
total_steps = len(train_dataloader) *  N_EPOCHS

In [12]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): n_gpu is not used anywhere in the visible part of the notebook.
n_gpu = torch.cuda.device_count()

# Move the model to the selected device.
model.to(device)

Two head experiment

In [None]:
# Tag names the model currently has heads for.
model.tag_names

In [None]:
# AdamW over all model parameters, with a learning rate that decays linearly
# over total_steps and has no warm-up phase.
optimizer = AdamW(params=model.parameters(),lr=5e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Make sure the model is on the training device.
if device.type != 'cpu':
    model.to(device)

# Run training; the validation loader and the checkpoint folder are handed to
# mu.train. NOTE(review): what exactly is returned/saved is defined in
# model_utils.train, which is not visible here.
loss_value, head_results = mu.train(model, train_dataloader, optimizer, device, conll, scheduler, n_epoch=N_EPOCHS,
                                valid_dataloader=valid_dataloader, path_to_save=PATH_TO_CHECKPOINT)

In [None]:
# Evaluate the trained model on the validation loader.
mu.eval_model(model, valid_dataloader, device, conll)

### Loading pretrained model

In [205]:
# Restore tokenizer, model and optimizer state from the saved checkpoint files.
# Note: this rebinds `bert_tokenizer` and `model` created earlier in the notebook.
bert_tokenizer, model, opt_state = mu.load_checkpoint(PATH_TO_CHECKPOINT+'BEbic_9_state_dict.pth',
                                                      PATH_TO_CHECKPOINT+'BEbic_9_tokenizer.pth')

In [206]:
# Inspect the heads of the loaded model.
model.heads

ModuleDict(
  (ORG): ModuleDict(
    (linear): Linear(in_features=1024, out_features=4, bias=True)
    (crf): CRF(num_tags=4)
  )
  (LOC): ModuleDict(
    (linear): Linear(in_features=1024, out_features=4, bias=True)
    (crf): CRF(num_tags=4)
  )
  (PER): ModuleDict(
    (linear): Linear(in_features=1024, out_features=4, bias=True)
    (crf): CRF(num_tags=4)
  )
)

In [30]:
# Recreate the optimizer for the loaded model and restore its saved state.
# NOTE(review): load_state_dict also restores the saved parameter-group
# settings, so the lr given here is presumably replaced by the checkpoint's
# value — confirm if the learning rate matters.
optimizer = AdamW(params=model.parameters(),lr=3e-4)
optimizer.load_state_dict(opt_state)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): n_gpu is not used anywhere in the visible part of the notebook.
n_gpu = torch.cuda.device_count()

# Move the loaded model to the selected device.
model.to(device)

In [208]:
# Rebuild the validation loader with the tokenizer restored from the checkpoint.
# NOTE(review): 173 is hard-coded; it equals the padded length printed for the
# training data earlier in the notebook.
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader(conll, TAG_NAMES, bert_tokenizer, 'valid', desired_pad=173)

In [209]:
# Evaluate the loaded model on the validation data: per-head results plus the
# mean loss, accuracy and f1 (as named by the unpacked variables).
head_result, mean_loss, mean_acc, mean_f1 = mu.eval_model(model, valid_dataloader, device, conll)

In [210]:
# Per-head accuracy and f1 of the multi-head model.
head_result

{'LOC': {'acc': 0.9913228024930153, 'f1': 0.9035911602209944},
 'ORG': {'acc': 0.9772324306898775, 'f1': 0.6343975283213182},
 'PER': {'acc': 0.9769100580270793, 'f1': 0.3787528868360277}}

#### Continue to train pretrained model

In [None]:
# Heads to continue training on.
TAG_NAMES = ['ORG', 'LOC', 'PER']
NUM_OF_HEADS = len(TAG_NAMES)

# in the second argument we pass list of tag names for every head of the model
# (the loader is rebuilt here with whatever `bert_tokenizer` is currently bound)
train_dataset, train_sampler, train_dataloader = dalo.create_dataloader(conll, TAG_NAMES, bert_tokenizer)

In [None]:
# Number of additional epochs and the resulting number of optimizer steps
# (batches per epoch * epochs) for the LR scheduler.
N_EPOCHS = 5
total_steps = len(train_dataloader) *  N_EPOCHS

In [None]:
# Fresh AdamW optimizer with a smaller learning rate for the continued
# training; the learning rate decays linearly over total_steps, no warm-up.
optimizer = AdamW(params=model.parameters(),lr=1e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Make sure the model is on the training device.
if device.type != 'cpu':
    model.to(device)

# Continue training; the validation loader and the checkpoint folder are handed
# to mu.train (its behaviour is defined in model_utils, not visible here).
loss_value, head_results = mu.train(model, train_dataloader, optimizer, device, conll, scheduler, n_epoch=N_EPOCHS,
                                valid_dataloader=valid_dataloader, path_to_save=PATH_TO_CHECKPOINT)

### Load one-head model to compare with mul-head one

In [193]:
# Prepare the data wrapper used with the one-head model: split every data
# type into sentences/labels and build its tag <-> index mappings.
conll_old = co.CoNLL_old(PATH_TO_CONLL)
for typ in conll_old.types:
  conll_old.split_text_label(typ)
conll_old.create_tag2idx(PATH_TO_TAG2IDX)
conll_old.create_idx2tag()

In [109]:
# importing BEboC model class
from models import *

In [110]:
# Load the one-CRF model checkpoint for the comparison.
# Note: this rebinds `bert_tokenizer` and `opt_state` from the earlier checkpoint.
bert_tokenizer, old_model, opt_state = mu.load_checkpoint(PATH_TO_CHECKPOINT+'ElMo_BERT_biLSTM_oneCRF_19_state_dict.pth',
                                                          PATH_TO_CHECKPOINT+'ElMo_BERT_biLSTM_oneCRF_19_tokenizer.pth')



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): n_gpu is not used anywhere in the visible part of the notebook.
n_gpu = torch.cuda.device_count()

# Move the one-head model to the selected device.
old_model.to(device)

PER

We need an old dataloader here

In [195]:
# Validation loader for the one-head model: validation sentences with labels
# restricted to the 'PER' tag, encoded with the old tag2idx mapping.
# NOTE(review): desired_pad=173 is hard-coded (the padded length printed earlier).
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader_old(conll.sentences['valid'],
                                                                            conll.one_tag_dict['valid']['PER'], conll_old.tag2idx,
                                                                            bert_tokenizer, datatype='valid', desired_pad=173)

In [197]:
# Evaluate the one-head model on the 'PER'-only validation data.
mu.eval_old(old_model, valid_dataloader, device, conll_old.idx2tag)



(2879.9386127178486, 0.8668600902643456, 0.4219006007646095)

ORG

In [198]:
# Validation loader for the one-head model: validation sentences with labels
# restricted to the 'ORG' tag, encoded with the old tag2idx mapping.
# NOTE(review): desired_pad=173 is hard-coded (the padded length printed earlier).
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader_old(conll.sentences['valid'],
                                                                            conll.one_tag_dict['valid']['ORG'], conll_old.tag2idx,
                                                                            bert_tokenizer, datatype='valid', desired_pad=173)

In [199]:
# Evaluate the one-head model on the 'ORG'-only validation data.
mu.eval_old(old_model, valid_dataloader, device, conll_old.idx2tag)



(4071.6029616135816, 0.8300827423167849, 0.3436274160188289)

LOC

In [200]:
# Validation loader for the one-head model: validation sentences with labels
# restricted to the 'LOC' tag, encoded with the old tag2idx mapping.
# NOTE(review): desired_pad=173 is hard-coded (the padded length printed earlier).
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader_old(conll.sentences['valid'],
                                                                            conll.one_tag_dict['valid']['LOC'], conll_old.tag2idx,
                                                                            bert_tokenizer, datatype='valid', desired_pad=173)

In [201]:
# Evaluate the one-head model on the 'LOC'-only validation data.
mu.eval_old(old_model, valid_dataloader, device, conll_old.idx2tag)



(4093.646503155048, 0.8329706640876854, 0.44121974053764873)

The last values above are f1-scores.

When these 3 heads are fitted at the same time (multiple-head model), we got the following f1-scores, with the one-head model's score in parentheses:

'PER' - 0.3787528868360277 (vs 0.4219006007646095)

'ORG' - 0.6343975283213182 (vs 0.3436274160188289)

'LOC' - 0.9035911602209944 (vs 0.44121974053764873)

So the multiple-head model looks clearly better on 'ORG' and 'LOC', while it is slightly worse on 'PER'.