In [1]:
import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import torch
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.nn as nn
from sklearn.metrics import accuracy_score
import pickle as pkl
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import string

In [2]:
class Config:
    CLS = [101]
    SEP = [102]
    VALUE_TOKEN = [0]
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 8
    EPOCHS = 3
    TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)

In [3]:
class Dataset:
  
  def __init__(self, texts, tags):
    
    #Texts: [['Diana', 'is', 'a', 'girl], ['she', 'plays'. 'football']]
    #tags: [[0, 1, 2, 5], [1, 3. 5]]
    
    self.texts = texts
    self.tags = tags
  
  def __len__(self):
    return len(self.texts)

  def __getitem__(self, index):
    texts = self.texts[index]
    tags = self.tags[index]
  
    #Токинезатор
    ids = []
    target_tag = []

    for i, s in enumerate(texts):
        inputs = Config.TOKENIZER.encode(s, add_special_tokens=False)
     
        input_len = len(inputs)
        ids.extend(inputs)
        target_tag.extend(input_len * [tags[i]])
    
    #To Add Special Tokens, subtract 2 from MAX_LEN
    ids = ids[:Config.MAX_LEN - 2]
    target_tag = target_tag[:Config.MAX_LEN - 2]

    #Add Sepcial Tokens
    ids = Config.CLS + ids + Config.SEP
    target_tags = Config.VALUE_TOKEN + target_tag + Config.VALUE_TOKEN

    mask = [1] * len(ids)
    token_type_ids = [0] * len(ids)

    #Add Padding if the input_len is small

    padding_len = Config.MAX_LEN - len(ids)
    ids = ids + ([0] * padding_len)
    target_tags = target_tags + ([0] * padding_len)
    mask = mask + ([0] * padding_len)
    token_type_ids = token_type_ids + ([0] * padding_len)

    return {
        "ids" : torch.tensor(ids, dtype=torch.long),
        "mask" : torch.tensor(mask, dtype=torch.long),
        "token_type_ids" : torch.tensor(token_type_ids, dtype=torch.long),
        "target_tags" : torch.tensor(target_tags, dtype=torch.long)
      }

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
class NERBertModel(nn.Module):
    
    def __init__(self, num_tag):
        super(NERBertModel, self).__init__()
        self.num_tag = num_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)
        
    def forward(self, ids, mask, token_type_ids, target_tags):
        output, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_out = self.bert_drop(output) 
        tag = self.out_tag(bert_out)
    
        #Calculate the loss
        Critirion_Loss = nn.CrossEntropyLoss()
        active_loss = mask.view(-1) == 1
        active_logits = tag.view(-1, self.num_tag)
        active_labels = torch.where(active_loss, target_tags.view(-1), torch.tensor(Critirion_Loss.ignore_index).type_as(target_tags))
        loss = Critirion_Loss(active_logits, active_labels)
        return tag, loss

In [6]:
data = pd.read_csv("NER dataset.csv", encoding="latin-1")

In [7]:
#Filling Missing Values and Label Encoding
data["Sentence #"] = data["Sentence #"].fillna(method='ffill')
le = LabelEncoder().fit(data['Tag'])
data['Tag'] = le.transform(data['Tag'])
pkl.dump(le, open('labelenc.pkl', 'wb'))
data.head()


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,16
1,Sentence: 1,of,IN,16
2,Sentence: 1,demonstrators,NNS,16
3,Sentence: 1,have,VBP,16
4,Sentence: 1,marched,VBN,16


In [8]:
data_gr = data.groupby("Sentence #").agg({'Word': list, 'POS':list, 'Tag':list})
data_gr.head()

Unnamed: 0_level_0,Word,POS,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[16, 16, 7, 16, 16, 16, 16, 16, 2, 16, 16, 16,..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[2, 16, 16, 6, 14, 16, 7, 16, 2, 16, 3, 16, 3,..."


In [9]:
train_sent, val_sent, train_tag, val_tag = train_test_split(data_gr['Word'], data_gr['Tag'], test_size=0.01, random_state=10)

In [10]:
train_dataset = Dataset(texts = train_sent, tags = train_tag)
val_dataset = Dataset(texts = val_sent, tags = val_tag)
train_data_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE)
val_data_loader = DataLoader(val_dataset, batch_size=Config.VAL_BATCH_SIZE)

In [11]:
num_tag = len(data.Tag.value_counts())
model = NERBertModel(num_tag=num_tag)
model = model.to(device)

In [12]:
def get_hyperparameters(model, ff):

    # ff: full_finetuning
    if ff:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    return optimizer_grouped_parameters

In [13]:
FULL_FINETUNING = True
optimizer_grouped_parameters = get_hyperparameters(model, FULL_FINETUNING)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5)
num_train_steps = int(len(train_sent) / Config.TRAIN_BATCH_SIZE * Config.EPOCHS)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_train_steps
)

In [14]:
def prediction(test_sentence, model, le):
    for i in list(string.punctuation):
        test_sentence = test_sentence.replace(i, ' ' + i)
    test_sentence = test_sentence.split()
    print(test_sentence)
    Token_inputs = Config.TOKENIZER.encode(test_sentence, add_special_tokens=False)
    print(Token_inputs)
    test_dataset =  Dataset(test_sentence, tags= [[1] * len(test_sentence)])
    num_tag = len(le.classes_)
   
    with torch.no_grad():
        data = test_dataset[0]
        for i, j in data.items():
            data[i] = j.to(device).unsqueeze(0)
        tag, _ = model(**data)
        print(le.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))[1:len(Token_inputs)+1])

In [15]:
phases = ['T', 'V']
num_epochs = 3
for epoch in range(num_epochs):
    for phase in phases:
        loss_ = 0
        if phase == 'T':
            model.train()  # Установить модель в режим обучения
            dataloader = train_data_loader
        else:
            model.eval()   #Установить модель в режим оценки
            dataloader = val_data_loader
        for data in tqdm(dataloader, total = len(dataloader)):
            for i, j in data.items():
                data[i] = j.to(device)
            optimizer.zero_grad()
            with torch.set_grad_enabled(phase == 'T'):
                _, loss = model(**data)
                loss_ += loss.item()
                if phase == 'T':
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

        print(f"Epoch: {epoch + 1}, phase: {phase}, loss: {loss_ / len(dataloader)}")

100%|██████████| 1484/1484 [11:47<00:00,  2.10it/s]


Epoch: 1, phase: T, loss: 0.21848472558725074


100%|██████████| 60/60 [00:03<00:00, 18.61it/s]


Epoch: 1, phase: V, loss: 0.1496036701525251


100%|██████████| 1484/1484 [11:48<00:00,  2.09it/s]


Epoch: 2, phase: T, loss: 0.14673179116933494


100%|██████████| 60/60 [00:03<00:00, 18.40it/s]


Epoch: 2, phase: V, loss: 0.14289187093575795


100%|██████████| 1484/1484 [11:47<00:00,  2.10it/s]


Epoch: 3, phase: T, loss: 0.12597861566211818


100%|██████████| 60/60 [00:03<00:00, 18.15it/s]

Epoch: 3, phase: V, loss: 0.14368095487977067





In [16]:
test_sentence = "Charles I was born in Fife on 19 November 1600."
prediction(test_sentence, model, le)
test_sentence = "She is playing football."
prediction(test_sentence, model, le)

['Charles', 'I', 'was', 'born', 'in', 'Fife', 'on', '19', 'November', '1600', '.']
[100, 100, 2001, 2141, 1999, 100, 2006, 2539, 100, 14883, 1012]
['B-per' 'O' 'O' 'O' 'O' 'O' 'O' 'B-art' 'O' 'O' 'O']
['She', 'is', 'playing', 'football', '.']
[100, 2003, 2652, 2374, 1012]
['B-per' 'I-per' 'I-org' 'B-art' 'B-per']
