In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
PATH_TO_PROJECT = '/content/drive/My Drive/Serious/'
# path to conll class as well as to conll data
PATH_TO_CONLL = PATH_TO_PROJECT + 'coNLL/'
PATH_TO_TAG2IDX = PATH_TO_CONLL + 'tag2idx.json'
PATH_TO_ONE_TAG2IDX = PATH_TO_CONLL + 'one_tag2idx.json'
PATH_TO_CHECKPOINT = '/content/drive/My Drive/models/'

### Intall requirements

In [3]:
!pip install -r '/content/drive/My Drive/Serious/requirements.txt'

Collecting transformers~=4.3
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.2MB/s 
[?25hCollecting allennlp~=2.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/10/0637bb46d2f9eaf8c475fcf4ea4d8dcdbb184ab726b1c4bf5be0547211be/allennlp-2.0.1-py3-none-any.whl (580kB)
[K     |████████████████████████████████| 583kB 52.5MB/s 
[?25hCollecting seqeval~=1.2
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 7.8MB/s 
[?25hCollecting pytorch-crf~=0.7
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloadi

### Loading coNLL

In [3]:
import sys
sys.path.append(PATH_TO_PROJECT)
sys.path.append(PATH_TO_CONLL)

from importlib import reload
import conll as co

In [4]:
conll = co.CoNLL(PATH_TO_CONLL)

In [5]:
# splitting raw data to sentences and labels
for typ in conll.types:
    conll.split_text_label(typ)

# define set of all labels
conll.create_set_of_labels()

for typ in conll.types:
    # for multiple heads of CRF layer
    conll.create_one_labeled_data(typ)

    # creating one_tag2idx dictionary
    conll.create_one_tag2idx(PATH_TO_ONE_TAG2IDX)
    conll.create_idx2one_tag()

In [6]:
# dict of tag2idx mapping for each CRF-head (one head responsible for 'LOC' etc.)
conll.one_tag2idx

{'LOC': {'B-LOC': 0, 'I-LOC': 3, 'O': 2, 'PAD': 1},
 'MISC': {'B-MISC': 0, 'I-MISC': 3, 'O': 2, 'PAD': 1},
 'ORG': {'B-ORG': 2, 'I-ORG': 3, 'O': 1, 'PAD': 0},
 'PER': {'B-PER': 2, 'I-PER': 0, 'O': 3, 'PAD': 1}}

In [7]:
print(f"sen example: {conll.sentences['train'][0]}")
print(f"tags example: {conll.labels['train'][0]}")
print(f"tags example with only 'ORG' tag: {conll.one_tag_dict['train']['ORG'][0]}")
print(f"tags for CRF tags has labels: {conll.one_tag_dict['train'].keys()}")

sen example: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
tags example: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
tags example with only 'ORG' tag: ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
tags for CRF tags has labels: dict_keys(['MISC', 'ORG', 'PER', 'LOC'])


### Importing packages

In [8]:
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from transformers import BertForTokenClassification, AdamW
from allennlp.modules.elmo import Elmo, batch_to_ids

from torchcrf import CRF

from sklearn.model_selection import KFold, ParameterGrid

from transformers import get_linear_schedule_with_warmup

import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline

### Creating dataloaders

In [9]:
import data_loaders as dalo

In [10]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [11]:
TAG_NAMES = ['ORG', 'LOC', 'PER']
NUM_OF_HEADS = len(TAG_NAMES)

In [12]:
# in the second argument we pass list of tag names for every head of the model
train_dataset, train_sampler, train_dataloader = delo.create_dataloader(conll, TAG_NAMES, bert_tokenizer)

# sanity check for output sizes
assert train_dataset[0][0].shape[0] == train_dataset[0][1].shape[0]
if NUM_OF_HEADS > 1:
  assert train_dataset[0][2].shape[0] == NUM_OF_HEADS
  assert train_dataset[0][2].shape[1] == train_dataset[0][0].shape[0]
else:
  assert len(train_dataset[0][2].shape) == NUM_OF_HEADS # == 1
  assert train_dataset[0][2].shape[0] == train_dataset[0][0].shape[0]
assert train_dataset[0][3].shape[0] == train_dataset[0][0].shape[0]

print(f"bert sentence shape: {train_dataset[0][0].shape}")
print(f"elmo sentence shape: {train_dataset[0][1].shape}")
print(f"number of heads: {train_dataset[0][2].shape[0] if NUM_OF_HEADS > 1 else 1}")
print(f"tokens len: {train_dataset[0][2].shape[1] if NUM_OF_HEADS > 1 else train_dataset[0][2].shape[0]}")
print(f"mask shape: {train_dataset[0][3].shape}")

bert sentence shape: torch.Size([173])
elmo sentence shape: torch.Size([173, 50])
number of heads: 3
tokens len: 173
mask shape: torch.Size([173])


In [13]:
valid_dataset, valid_sampler, valid_dataloader = delo.create_dataloader(conll, TAG_NAMES, bert_tokenizer, 'valid')

### Creating model

In [15]:
from bert_config import *
from elmo_config import *

In [50]:
class BEbiC(nn.Module):
    """
    BERT+Elmo+biLSTM+CRFs
    """
    def __init__(self, hidden_size=128, num_labels=4, tag_names=TAG_NAMES,
                 elmo_layers=2, bert_layers=1, concat_bert=True, bilstm_layers=1):
        """
        Creates model
        
        Parameters
        ----------
        hidden_size: int, default=128
          LSTM parameter
        num_labels: int, defualt=4
          The number of each CRF labels (ex: B-LABEL, I-LABEL, O, PAD for multiple heads case)
        tag_names: list of str
          List of tag names for models heads
        elmo_layers: int, default=2
          Num of ELMo layers to be considered
        bert_layers: int, default=1
          Num of final BERT hidden layers to be used as embedding vector.
        concat_bert: bool, default=True
          Whether to concat (True) or sum (False) last BERT hidden layers.
        bilstm_layers: int, default=1

        """

        super(BEbiC, self).__init__()

        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.tag_names = tag_names
        self.num_heads = len(self.tag_names)
        self.elmo_layers = elmo_layers
        self.bert_layers = bert_layers
        self.concat_bert = concat_bert
        self.bilstm_layers = bilstm_layers
        
        self.bert = BertForTokenClassification.from_pretrained(
                        BERT_MODEL,
                        output_hidden_states=True)
        
        for pars in self.bert.parameters():
            pars.requires_grad = False
        
        bert_embedding_dim = self.bert.config.to_dict()['hidden_size']

        self.elmo = Elmo(options_file, weight_file, self.elmo_layers, dropout=0, requires_grad=False)
        
        elmo_embedding_dim = 512 # it's always fixed

        if self.concat_bert:
          self.linear1 = nn.Linear(bert_embedding_dim*self.bert_layers+elmo_embedding_dim*self.elmo_layers, 1024)
        else:
          self.linear1 = nn.Linear(bert_embedding_dim+elmo_embedding_dim*self.elmo_layers, 1024)
        
        self.bilstm = nn.LSTM(1024, self.hidden_size, self.bilstm_layers, bidirectional=True)

        self.heads = {}
        for i, tag in enumerate(self.tag_names):
            lin_crf = nn.ModuleDict({'linear': nn.Linear(self.hidden_size*2, self.num_labels),
                                     'crf': CRF(num_tags=self.num_labels, batch_first=True)})
            self.heads[tag] = lin_crf

        self.heads = nn.ModuleDict(self.heads)
        self.active_heads = {head: True for head in self.heads.keys()}

    def add_head(self, tag_name):
        """
        Adds new head to the model

        """
        self.tag_names.append(tag_name)
        self.num_heads += 1
        lin_crf = nn.ModuleDict({'linear': nn.Linear(self.hidden_size*2, self.num_labels),
                                     'crf': CRF(num_tags=self.num_labels, batch_first=True)})
        self.heads.update({tag_name: lin_crf})


    def shared_forward(self, bert_ids, elmo_ids, attention_mask):
        """
        Forward propogate of model shared layers.
        
        Parameters
        ----------
        bert_ids:
        elmo_ids:
        attention_mask:
        
        Returns
        -------
        Bilstm logits with shape (seq_len, batch, 2*self.hidden_size)
        
        """

        mask = attention_mask.byte()
        bert_hiddens = self.bert(bert_ids, attention_mask=mask)[1]
        elmo_hiddens = self.elmo(elmo_ids)

        if self.concat_bert:
            bert_embedding = torch.cat(bert_hiddens[-self.bert_layers:], dim=2)
        else:
            emb_sum = 0
            for h in bert_hiddens[-self.bert_layers:]:
                emb_sum += h
            bert_embedding = emb_sum

        elmo_bert_embeddings = torch.clone(bert_embedding)

        for el_hi in elmo_hiddens['elmo_representations']:
            elmo_bert_embeddings = torch.cat((elmo_bert_embeddings, el_hi), dim=-1)

        linear1_output = nn.functional.relu(self.linear1(elmo_bert_embeddings))

        bilstm_output, (h_n, c_n) = self.bilstm(linear1_output)

        return bilstm_output
    
    def get_one_head_loss(self, bilstm_logits, head_labels, attention_mask, head_tag):
        """
        Returns negative log-likelihood for one head.
        You should run it after shared forward.

        Parameters
        ----------
        bilstm_logits:
        head_labels:
        attention_mask:
        head_tag: str
          Key of self.heads dictionary.
        
        Returns
        -------
        Loss

        """
        lin_out = nn.functional.relu(self.heads[head_tag]['linear'](bilstm_logits))
        loss = -1*self.heads[head_tag]['crf'].forward(lin_out, head_labels, mask=attention_mask.byte())
        return loss
    
    def get_one_head_seq(self, bilstm_logits, attention_mask, head_tag):
        """
        Returns the most likely sequence of labels for the given head.
        You should run it after shared forward.

        Parameters
        ----------
        bilstm_logits:
        attention_mask:
        head_tag: str
          Key of self.heads dictionary.
        
        Returns
        -------
        List
        """
      
        lin_out = nn.functional.relu(self.heads[head_tag]['linear'](bilstm_logits))
        seq = self.heads[head_tag]['crf'].decode(lin_out, mask=attention_mask.byte())
        return seq
    
    def forward(self, bert_ids, elmo_ids, head_labels, attention_mask):
        """
        Forward model pass.
        
        Parameters
        ----------
        bert_ids:
        elmo_ids:
        head_labels:
        attention_mask:
        
        Returns
        -------
        Total loss for all heads.

        """

        mask = attention_mask.byte()
        bilstm_logits = self.shared_forward(bert_ids, elmo_ids, mask)
        head_loss = 0
        for i, tag in enumerate(self.heads.keys()):
          head_loss += self.get_one_head_loss(bilstm_logits, head_labels[:,i,:], mask, tag)
        return head_loss
    
    def freeze_head(self, head_tag):
        """
        Freezes model's head parameters.

        """
        if head_tag not in self.heads.keys():
            raise ValueError(f"Unknown head tag. Please, give one of {self.heads.keys()}")
        
        for parameter in self.heads[head_tag].parameters():
            parameter.requires_grad = False
        
        self.active_heads[head_tag] = False
    
    def unfreeze_head(self, head_tag):
        """
        Unfreezes model's head parameters.

        """
        if head_tag not in self.heads.keys():
            raise ValueError(f"Unknown head tag. Please, give one of {self.heads.keys()}")
        
        for parameter in self.heads[head_tag].parameters():
            parameter.requires_grad = True
        
        self.active_heads[head_tag] = True

In [17]:
model = BEbiC(hidden_size=512, bert_layers=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [37]:
import model_utils as mu

In [19]:
N_EPOCHS = 10
total_steps = len(train_dataloader) *  N_EPOCHS

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

model.to(device)

In [40]:
optimizer = AdamW(params=model.parameters(),lr=3e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

if device.type != 'cpu':
    model.to(device)

loss_value, head_results = mu.train(model, train_dataloader, optimizer, device, conll, scheduler, n_epoch=N_EPOCHS,
                                valid_dataloader=valid_dataloader, path_to_save=PATH_TO_CHECKPOINT)

  0%|          | 0/110 [00:00<?, ?it/s]


Epoch #0


  9%|▉         | 10/110 [00:19<03:15,  1.95s/it]


9: avg loss per batch: 271.7017664026331



 18%|█▊        | 20/110 [00:39<02:56,  1.97s/it]


19: avg loss per batch: 254.34123470908716



 27%|██▋       | 30/110 [00:59<02:37,  1.97s/it]


29: avg loss per batch: 247.08477642892421



 36%|███▋      | 40/110 [01:18<02:17,  1.96s/it]


39: avg loss per batch: 242.25025353064905



 45%|████▌     | 50/110 [01:38<01:57,  1.96s/it]


49: avg loss per batch: 241.1801637403008



 55%|█████▍    | 60/110 [01:58<01:37,  1.96s/it]


59: avg loss per batch: 241.12439999337923



 64%|██████▎   | 70/110 [02:17<01:17,  1.95s/it]


69: avg loss per batch: 239.86491416157156



 73%|███████▎  | 80/110 [02:37<00:58,  1.96s/it]


79: avg loss per batch: 238.20666375140098



 82%|████████▏ | 90/110 [02:56<00:39,  1.97s/it]


89: avg loss per batch: 237.9859621426586



 91%|█████████ | 100/110 [03:16<00:19,  1.96s/it]


99: avg loss per batch: 236.5918443467882



100%|██████████| 110/110 [03:35<00:00,  1.96s/it]


109: avg loss per batch: 234.36654560427417

Average train loss: 232.23594064423534





Mean validation loss: 241.23662504592457
Mean validation accuracy: 0.32518596126752153
Mean validation F1-score: 0.20561712888078007



### Loading pretrained model

In [None]:
model = BEbiC(hidden_size=512, bert_layers=2)

In [19]:
optimizer = AdamW(params=model.parameters(),lr=3e-4)

In [42]:
bert_tokenizer, model, optimizer = mu.load_checkpoint(model, optimizer,
                                                   PATH_TO_CHECKPOINT+'BEbic_19_tokenizer_v1.pth',
                                                   PATH_TO_CHECKPOINT+'BEbic_19_state_dict_v1.pth')



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

model.to(device)

In [44]:
valid_dataset, valid_sampler, valid_dataloader = dalo.create_dataloader(conll, TAG_NAMES, bert_tokenizer, 'valid')

  0%|          | 0/110 [22:01<?, ?it/s]


In [47]:
head_result, mean_loss, mean_acc, mean_f1 = mu.eval_model(model, valid_dataloader, device, conll)



In [48]:
head_result

{'LOC': {'acc': 0.9873871695680206, 'f1': 0.8200941046221977},
 'ORG': {'acc': 0.9786562432839029, 'f1': 0.6420724708968684},
 'PER': {'acc': 0.9772995916612938, 'f1': 0.3731228340392761}}

In [55]:
print(np.sum([1 if 'B-LOC' in x else 0 for x in conll.one_tag_dict['train']['LOC']])/len(conll.one_tag_dict['train']['LOC']))

0.36514493269710135


#### Continue to train pretrained model

In [33]:
TAG_NAMES = ['ORG', 'LOC', 'PER']
NUM_OF_HEADS = len(TAG_NAMES)

# in the second argument we pass list of tag names for every head of the model
train_dataset, train_sampler, train_dataloader = dalo.create_dataloader(conll, TAG_NAMES, bert_tokenizer)

In [30]:
N_EPOCHS = 5
total_steps = len(train_dataloader) *  N_EPOCHS

In [None]:
optimizer = AdamW(params=model.parameters(),lr=1e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

if device.type != 'cpu':
    model.to(device)

loss_value, head_results = mu.train(model, train_dataloader, optimizer, device, conll, scheduler, n_epoch=N_EPOCHS,
                                valid_dataloader=valid_dataloader, path_to_save=PATH_TO_CHECKPOINT)