In [1]:
!pip install datasets transformers pytorch-crf -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset
# Load the dataset registered under the name "conll2003"; the returned object is
# indexed by split name below (e.g. dataset['train']).
dataset = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [3]:
# Peek at the field names of the first training example.
dataset['train'][0].keys()

dict_keys(['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'])

In [61]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchcrf import CRF
from torch.utils.data import Dataset , DataLoader
from transformers import BertTokenizer, BertModel
class MyModel(nn.Module):
    """Joint POS / NER tagger built on a shared BERT encoder.

    The POS head is a linear layer over the encoder states. The NER head is a
    linear layer over the encoder states concatenated with the POS class
    distribution, and its scores are used as emissions of a CRF.

    Args:
        in_embed_dim: Stored on the instance; not used by the network itself.
        out_dim: Width of the encoder's hidden states (input width of the heads).
        num_class_pos: Number of POS classes.
        num_class_ner: Number of NER classes.
    """

    def __init__(self , in_embed_dim , out_dim , num_class_pos , num_class_ner ):
        super(MyModel , self).__init__()
        self.in_embed_dim = in_embed_dim
        self.out_dim = out_dim
        self.bertmodel = BertModel.from_pretrained('bert-base-uncased')
        self.pos_ffn = nn.Linear(out_dim , num_class_pos)
        self.ner_ffn = nn.Linear(out_dim + num_class_pos , num_class_ner)
        # Emissions produced below are (batch, seq, tags), so the CRF must be
        # told the batch dimension comes first (the library default is seq-first).
        self.crf = CRF(num_class_ner , batch_first=True)

    def _encode(self , token_ids):
        """Run the encoder on a (batch, seq) id tensor, hiding padding positions."""
        token_ids = token_ids.long()
        pad_id = getattr(self.bertmodel.config , 'pad_token_id' , None)
        if pad_id is None:
            pad_id = 0
        # Without an attention mask the encoder would attend to padding ids.
        attention_mask = (token_ids != pad_id).long()
        encoded = self.bertmodel(input_ids=token_ids , attention_mask=attention_mask)
        return encoded.last_hidden_state

    def _crf_loss(self , emissions , tags):
        """Negative CRF log-likelihood of `tags`, ignoring positions tagged -1.

        Position 0 is dropped before scoring: the CRF requires the first
        timestep of every sequence to be unmasked, and position 0 carries the
        ignore label (-1) for the leading special token.
        """
        tags = tags.to(device=emissions.device , dtype=torch.long)
        emissions = emissions[: , 1:]
        tags = tags[: , 1:]
        if emissions.size(1) == 0:
            return emissions.sum() * 0.0
        mask = tags != -1
        # Rows with no labelled token at all cannot be scored by the CRF.
        keep = mask[: , 0]
        if not bool(keep.any()):
            return emissions.sum() * 0.0
        emissions , tags , mask = emissions[keep] , tags[keep] , mask[keep]
        # Masked positions are ignored by the CRF but are still used as indices,
        # so replace the -1 sentinel with a valid class id.
        tags = tags.masked_fill(~mask , 0)
        # The CRF returns a log-likelihood (higher is better); negate it so the
        # value can be minimised as a loss.
        return -self.crf(emissions , tags , mask=mask)

    def pos_forward(self , pos_data):
        """Return `(encoder_states, pos_logits)` for a batch of token ids.

        The POS scores are unnormalised logits of shape (batch, seq, num_class_pos),
        i.e. the form expected by `nn.CrossEntropyLoss`.
        """
        embed_vector = self._encode(pos_data)
        out_pos = self.pos_ffn(embed_vector)
        return embed_vector , out_pos

    def ner_forward(self , ner_data):
        """Return NER emission scores of shape (batch, seq, num_class_ner)."""
        embed_vector , pos_logits = self.pos_forward(ner_data)
        # The NER head consumes the POS class distribution, not raw logits.
        pos_probs = F.softmax(pos_logits , dim = 2)
        out = torch.cat((embed_vector , pos_probs) , dim = 2)
        return self.ner_ffn(out)

    def forward(self, pos_data , ner_data , ner_tag):
        """Run both tasks.

        Args:
            pos_data: (batch, seq) token ids of the POS batch.
            ner_data: (batch, seq) token ids of the NER batch.
            ner_tag: (batch, seq) NER labels aligned with `ner_data`; -1 marks
                positions to ignore.

        Returns:
            `(out_pos, out_ner, loss_ner)`: POS logits for `pos_data`, NER
            emissions for `ner_data`, and the scalar CRF loss (negative
            log-likelihood) of `ner_tag`.
        """
        _ , out_pos = self.pos_forward(pos_data)
        out_ner = self.ner_forward(ner_data)
        loss_ner = self._crf_loss(out_ner , ner_tag)
        return out_pos , out_ner , loss_ner





class PosDataset(Dataset):
    """Fixed-length (token ids, POS tags) examples.

    Each item is a dict with two int64 tensors of length `max_len_sequence`:
    `'tokens'` (ids padded with `pad_id`) and `'pos_tags'` (labels with -1 on
    the two special-token positions and on all padding positions).

    NOTE(review): the word list is passed straight to `tokenizer.encode`; the
    tag alignment below assumes that yields exactly one id per word plus two
    special tokens — confirm for the tokenizer in use.

    Args:
        posdata: Indexable collection of examples with 'tokens' and 'pos_tags'.
        max_len_sequence: Length every returned tensor is truncated/padded to.
    """

    def __init__(self, posdata , max_len_sequence):
        self.posdata = posdata
        self.max_len_sequence = max_len_sequence
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pad_id = 0

    def __len__(self):
        return len(self.posdata)

    def _fit(self, seq , fill):
        """Truncate or right-pad a 1-D tensor to `max_len_sequence`, keeping its dtype."""
        seq = seq[:self.max_len_sequence]
        missing = self.max_len_sequence - seq.size(0)
        if missing > 0:
            # Build the padding with an explicit dtype: an untyped empty/short
            # tensor would silently promote the whole sequence to float.
            seq = torch.cat((seq , torch.full((missing,) , fill , dtype=seq.dtype)))
        return seq

    def __getitem__(self, idx):
        example = self.posdata[idx]
        token_ids = self.tokenizer.encode(example['tokens'], add_special_tokens=True, return_tensors='pt')[0]
        token_ids = token_ids.to(torch.long)
        # -1 marks the leading/trailing special tokens as "no label".
        ignore = torch.tensor([-1] , dtype=torch.long)
        tags = torch.cat((ignore , torch.tensor(example['pos_tags'] , dtype=torch.long) , ignore))
        return {
            'tokens': self._fit(token_ids , self.pad_id),
            'pos_tags': self._fit(tags , -1),
        }
class NERDataset(Dataset):
    """Fixed-length (token ids, NER tags) examples.

    Each item is a dict with two int64 tensors of length `max_len_sequence`:
    `'tokens'` (ids padded with `pad_id`) and `'ner_tags'` (labels with -1 on
    the two special-token positions and on all padding positions).

    NOTE(review): the word list is passed straight to `tokenizer.encode`; the
    tag alignment below assumes that yields exactly one id per word plus two
    special tokens — confirm for the tokenizer in use.

    Args:
        posdata: Indexable collection of examples with 'tokens' and 'ner_tags'.
        max_len_sequence: Length every returned tensor is truncated/padded to.
    """

    def __init__(self, posdata , max_len_sequence):
        self.posdata = posdata
        self.max_len_sequence = max_len_sequence
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pad_id = 0

    def __len__(self):
        return len(self.posdata)

    def _fit(self, seq , fill):
        """Truncate or right-pad a 1-D tensor to `max_len_sequence`, keeping its dtype."""
        seq = seq[:self.max_len_sequence]
        missing = self.max_len_sequence - seq.size(0)
        if missing > 0:
            # Build the padding with an explicit dtype: an untyped empty/short
            # tensor would silently promote the whole sequence to float.
            seq = torch.cat((seq , torch.full((missing,) , fill , dtype=seq.dtype)))
        return seq

    def __getitem__(self, idx):
        example = self.posdata[idx]
        token_ids = self.tokenizer.encode(example['tokens'], add_special_tokens=True, return_tensors='pt')[0]
        token_ids = token_ids.to(torch.long)
        # -1 marks the leading/trailing special tokens as "no label".
        ignore = torch.tensor([-1] , dtype=torch.long)
        tags = torch.cat((ignore , torch.tensor(example['ner_tags'] , dtype=torch.long) , ignore))
        return {
            'tokens': self._fit(token_ids , self.pad_id),
            'ner_tags': self._fit(tags , -1),
        }

In [59]:
# Sequence length every example is truncated/padded to, and the mini-batch size.
max_sequence_len , batch_size = 30 , 16

# POS pipeline: dataset wrapper around the training split plus a shuffling loader.
Posdata = PosDataset(dataset['train'] , max_sequence_len)
pos_dataloader = DataLoader(Posdata , shuffle = True , batch_size = batch_size)

# NER pipeline over the same training split, shuffled independently.
Nerdata = NERDataset(dataset['train'] , max_sequence_len)
ner_dataloader = DataLoader(Nerdata , shuffle = True , batch_size = batch_size)

# Sanity check: report the number of examples in each dataset (NER first, then POS).
print(len(Nerdata) , len(Posdata) , sep = "\n")

14041
14041


In [53]:
from tqdm import tqdm
from itertools import cycle
# Sizes of the two label sets (POS head / NER head).
num_class_pos , num_class_ner = 47 , 9
# Embedding width handed to the model constructor, and the encoder hidden width.
in_embed_dim , out_dim = 300 , 768
# Weighting coefficient used when combining the two task losses.
lamb = 0.5
# Run on the GPU whenever one is visible to PyTorch, otherwise on the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"
def train_model(num_epochs, loss_function, optimizer, model, pos_dataloader ,ner_dataloader,  lamb):
    """Jointly train `model` on paired POS and NER batches.

    Each step draws one batch from each loader, calls
    `model(pos_tokens, ner_tokens, ner_tags)` and minimises
    `pos_loss + (1 - lamb) * ner_loss`, where `pos_loss` is
    `loss_function['pos']` over the positions whose POS tag is not -1 and
    `ner_loss` is the third value returned by the model.

    Args:
        num_epochs: Number of passes over the loaders.
        loss_function: Mapping with a 'pos' entry taking `(scores, targets)`.
        optimizer: Optimizer over the model's parameters.
        model: Module returning `(out_pos, out_ner, loss_ner)`.
        pos_dataloader: Yields dicts with 'tokens' and 'pos_tags' tensors.
        ner_dataloader: Yields dicts with 'tokens' and 'ner_tags' tensors.
        lamb: Weighting coefficient; the NER loss is scaled by `1 - lamb`.

    Returns:
        List with the mean combined loss of every epoch.
    """
    # Feed inputs to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # zip stops at the shorter loader, so exactly min(len, len) steps run.
        num_batches = min(len(pos_dataloader), len(ner_dataloader))
        pbar = tqdm(zip(pos_dataloader, ner_dataloader), total=num_batches)
        for i, (pos_data, ner_data) in enumerate(pbar):
            optimizer.zero_grad()

            out_pos , out_ner , loss_ner = model(
                pos_data['tokens'].to(run_device),
                ner_data['tokens'].to(run_device),
                ner_data['ner_tags'],
            )

            # Flatten to (batch * seq, classes) using the model's own class count.
            flat_out_pos = out_pos.reshape(-1, out_pos.size(-1))
            pos_tags = pos_data['pos_tags'].flatten().to(flat_out_pos.device)
            # -1 marks special/padding positions that carry no POS label.
            mask = (pos_tags != -1)
            loss1 = loss_function['pos'](flat_out_pos[mask].to(torch.float), pos_tags[mask].to(torch.long))

            loss2 = loss_ner.to(loss1.device)
            # NOTE(review): only the NER term is weighted; confirm whether
            # `lamb * loss1 + (1 - lamb) * loss2` was intended.
            loss = loss1 + (1-lamb) * loss2
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pbar.set_description("Epoch: {}, Loss: {:.4f}".format(epoch + 1, total_loss/(i+1)))
        epoch_losses.append(total_loss / max(num_batches, 1))
    return epoch_losses



In [None]:
import torch.nn as nn

# Per-task criteria handed to the training loop (looked up by task name).
loss_funtion = {"pos": nn.CrossEntropyLoss(), "ner": nn.CrossEntropyLoss()}

model = MyModel(in_embed_dim , out_dim , num_class_pos , num_class_ner).to(device)

# The pretrained encoder is fine-tuned with a small learning rate, while the
# freshly initialised task heads keep the larger one; a single lr of 1e-3 on
# the whole network is far above the usual BERT fine-tuning range.
encoder_params = list(model.bertmodel.parameters())
encoder_param_ids = {id(p) for p in encoder_params}
head_params = [p for p in model.parameters() if id(p) not in encoder_param_ids]
opt = torch.optim.AdamW(
    [
        {"params": encoder_params , "lr": 2e-5},
        {"params": head_params , "lr": 1e-3},
    ]
)

train_model(num_epochs = 10 , loss_function=loss_funtion , optimizer=opt , model = model , pos_dataloader = pos_dataloader , ner_dataloader = ner_dataloader , lamb = lamb)

In [None]:
!