# This notebook is intended to run on Google Colab only

In [None]:
import os 
from google.colab import output
#os.chdir("drive/MyDrive/NLP_A3")


In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive/ so the dataset and checkpoints
# referenced later in this notebook are reachable from the Colab runtime.
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Install the extra packages needed by this notebook, then wipe the pip log
# from the cell output (`output` is imported from google.colab in the first cell).
!pip install transformers seqeval[gpu]
output.clear()

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
from tqdm import tqdm

In [None]:
from torch import cuda

# Pick the compute device once: use the GPU when the runtime exposes one,
# otherwise fall back to the CPU. Stored as a plain string.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


## Converting tokenization files into a more suitable format

In [None]:
# Load the annotated NER corpus from the mounted Google Drive.
dataframe = pd.read_csv("/content/gdrive/MyDrive/ner.csv")
# `head` has to be *called*: the bare attribute only displays the bound-method
# repr (which dumps the whole frame) instead of the first rows.
dataframe.head()

<bound method NDFrame.head of             Sentence #                                           Sentence  \
0          Sentence: 1  Thousands of demonstrators have marched throug...   
1          Sentence: 2  Families of soldiers killed in the conflict jo...   
2          Sentence: 3  They marched from the Houses of Parliament to ...   
3          Sentence: 4  Police put the number of marchers at 10,000 wh...   
4          Sentence: 5  The protest comes on the eve of the annual con...   
...                ...                                                ...   
47954  Sentence: 47955  Indian border security forces are accusing the...   
47955  Sentence: 47956  Indian officials said no one was injured in Sa...   
47956  Sentence: 47957  Two more landed in fields belonging to a nearb...   
47957  Sentence: 47958  They say not all of the rockets exploded upon ...   
47958  Sentence: 47959    Indian forces said they responded to the attack   

                                             

## Unique labels

In [None]:
from ast import literal_eval
from collections import Counter

# Every "Tag" cell is parsed into a Python sequence of tag strings.
# `literal_eval` only accepts Python literals, so unlike `eval` it cannot
# execute arbitrary code embedded in the CSV.
label_counter = Counter()
for list_of_tags in dataframe["Tag"]:
    label_counter.update(literal_eval(list_of_tags))

# The distinct tags are exactly the keys that were counted.
unique_labels = set(label_counter)

In [None]:
from pprint import pprint
# Show how many times each tag occurs in the corpus.
pprint(label_counter)

Counter({'O': 887908,
         'B-geo': 37644,
         'B-tim': 20333,
         'B-org': 20143,
         'I-per': 17251,
         'B-per': 16990,
         'I-org': 16784,
         'B-gpe': 15870,
         'I-geo': 7414,
         'I-tim': 6528,
         'B-art': 402,
         'B-eve': 308,
         'I-art': 297,
         'I-eve': 253,
         'B-nat': 201,
         'I-gpe': 198,
         'I-nat': 51})


In [None]:
# Build the two lookup tables between tag strings and integer class ids.
# Ids follow the alphabetical order of the tags, so the mapping is deterministic.
# https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a
sorted_labels = sorted(unique_labels)
labels_to_ids = {label: idx for idx, label in enumerate(sorted_labels)}
ids_to_labels = dict(enumerate(sorted_labels))
print(labels_to_ids)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


## Transforming word tokens into tokens for BERT

In [None]:
import transformers
from transformers import BertTokenizerFast

# Tokenizer belonging to the same 'dslim/bert-base-NER' checkpoint that the
# model class further down loads its weights from.
tokenizer = BertTokenizerFast.from_pretrained('dslim/bert-base-NER')

def ids_to_tokens(input): 
    """Convert token ids back into token strings with the module-level `tokenizer`.

    NOTE(review): the parameter name shadows the built-in `input`; it is kept
    because renaming it would change the call interface.
    """
    return tokenizer.convert_ids_to_tokens(input)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

In [None]:
def align_label(texts, labels, label_all_tokens=True):
    """Expand word-level tags to one label id per tokenizer position.

    Args:
        texts: the sentence to tokenize (passed straight to the module-level
            `tokenizer`, padded/truncated to 512 positions).
        labels: sequence of tag strings, indexed by the word index reported by
            the tokenizer.
        label_all_tokens: when True (the previous hard-coded behaviour) every
            sub-token of a word receives the word's label; when False only the
            first sub-token is labelled and the remaining ones get -100.

    Returns:
        A list of label ids, one per tokenizer position. Positions without a
        word (special tokens, padding) and positions whose tag cannot be
        resolved are set to -100.

    NOTE(review): `texts` is tokenized as a raw string, so `word_ids()` follows
    the tokenizer's own word splitting. If the tag list is per
    whitespace-separated word, a word such as "10,000" may be split into
    several tokenizer words and shift the alignment - TODO confirm, and
    consider tokenizing `texts.split()` with `is_split_into_words=True`.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            # Special tokens and padding carry no label.
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except (KeyError, IndexError):
                # Unknown tag, or more tokenizer words than supplied tags:
                # mark the position as "ignore" instead of failing.
                label_ids.append(-100)
        else:
            # Continuation sub-token that should not be labelled.
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [None]:
# Smoke-test the alignment on one sentence. `df_train` is only created in a
# later cell (using it here raises NameError on a fresh kernel), so take the
# first row of the full `dataframe` instead.
from ast import literal_eval

align_label(dataframe["Sentence"].iloc[0], literal_eval(dataframe["Tag"].iloc[0]))

NameError: ignored

The word-level labels are now aligned with BERT's sub-word tokens.



## Model Training

https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a

In [None]:
import torch

class DataSequence(torch.utils.data.Dataset):
    """Dataset pairing each tokenized sentence with its aligned label ids.

    Both the tokenization and the label alignment are done eagerly in
    `__init__`, using the module-level `tokenizer` and `align_label`.
    """

    def __init__(self, df):
        """Tokenize every row of `df` and align its tags.

        Args:
            df: frame with a 'Sentence' column (text) and a 'Tag' column whose
                cells are parsed as Python list literals of tag strings.
        """
        # Local import keeps this cell self-contained; literal_eval parses the
        # stored list literal without executing arbitrary code (unlike eval).
        from ast import literal_eval

        sentences = df['Sentence'].values.tolist()
        tag_lists = [literal_eval(tags) for tags in df['Tag']]

        self.texts = [
            tokenizer(str(sentence),
                      padding='max_length', max_length=512, truncation=True, return_tensors="pt")
            for sentence in sentences
        ]
        self.labels = [align_label(sentence, tags) for sentence, tags in zip(sentences, tag_lists)]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the stored tokenizer output for sentence `idx`."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the label ids of sentence `idx` as a LongTensor."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return the `(tokenizer_output, label_tensor)` pair for sentence `idx`."""
        return self.get_batch_data(idx), self.get_batch_labels(idx)

In [None]:
import numpy as np

# Shuffle once with a fixed seed, then cut the rows into 80% train,
# 10% validation and 10% test by position.
shuffled = dataframe[:].sample(frac=1, random_state=42)
train_end = int(.8 * len(dataframe[:]))
val_end = int(.9 * len(dataframe[:]))

df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

In [None]:
# Sanity check: (rows, columns) of the held-out test split.
df_test.shape

(4796, 4)

In [None]:
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):
    """Thin wrapper around `BertForTokenClassification` for this tag set.

    The 'dslim/bert-base-NER' checkpoint is loaded with a classification head
    resized to `len(unique_labels)`; `ignore_mismatched_sizes=True` lets the
    head be re-initialised when its shape differs from the checkpoint.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained('dslim/bert-base-NER', num_labels=len(unique_labels), ignore_mismatched_sizes=True)

    def forward(self, input_ids, label=None, attention_mask=None):
        """Run the wrapped model.

        Args:
            input_ids: token ids passed to the wrapped model as `input_ids`.
            label: optional label ids, forwarded as `labels`.
            attention_mask: optional mask forwarded unchanged; the default
                `None` reproduces the previous call, so existing callers are
                unaffected, while new callers can mask out padding.

        Returns:
            Whatever the wrapped model returns with `return_dict=False`
            (the callers in this notebook unpack it as `loss, logits`).
        """
        return self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label,
            return_dict=False,
        )

In [None]:
train_dataset = DataSequence(df_train)
# `lb` is only a local variable inside DataSequence.__init__ and is not stored
# on the instance; the aligned label ids are kept in the `labels` attribute.
type(train_dataset.labels)

In [None]:
from torch.optim import SGD

def _sum_sentence_accuracy(logits, labels):
    """Sum, over the sentences of a batch, the token accuracy on positions whose label is not -100."""
    total = 0.0
    for sample_logits, sample_labels in zip(logits, labels):
        keep = sample_labels != -100
        predictions = sample_logits[keep].argmax(dim=1)
        total += (predictions == sample_labels[keep]).float().mean().item()
    return total


def train_loop(model, df_train, df_val):
    """Fine-tune `model` on `df_train` and report metrics on `df_val` after every epoch.

    Reads the module-level `BATCH_SIZE`, `LEARNING_RATE` and `EPOCHS`.
    The model is moved to the GPU when one is available and is left in eval
    mode when the function returns. Nothing is returned; one summary line is
    printed per epoch.

    Args:
        model: module called as `model(input_ids, labels)` and returning
            `(loss, logits)`.
        df_train: frame used to build the training `DataSequence`.
        df_val: frame used to build the validation `DataSequence`.
    """
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model first so the optimizer is built on the final parameters.
    model = model.to(device)

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    print("All is ok")
    for epoch_num in range(EPOCHS):
        total_acc_train = 0.0
        total_loss_train = 0.0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # Each stored encoding has a leading dimension of 1; drop it so the
            # batch is (batch, sequence).
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, train_label)

            total_acc_train += _sum_sentence_accuracy(logits, train_label)
            # The batch loss is counted once per sentence so that dividing by
            # len(df_train) below yields a per-sentence average.
            total_loss_train += loss.item() * logits.shape[0]

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0.0
        total_loss_val = 0.0

        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                val_label = val_label.to(device)
                input_id = val_data['input_ids'].squeeze(1).to(device)

                loss, logits = model(input_id, val_label)

                total_acc_val += _sum_sentence_accuracy(logits, val_label)
                total_loss_val += loss.item() * logits.shape[0]

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

# Hyper-parameters read by train_loop.
LEARNING_RATE = 0.001
EPOCHS = 2
BATCH_SIZE = 2

model = BertModel()
try:
    train_loop(model, df_train, df_val)
except KeyboardInterrupt:
    # A manual interrupt is the one expected way to stop early; report it
    # instead of silently hiding every possible training error.
    print("Training interrupted; saving the weights obtained so far.")
finally:
    # Always persist the current weights, even when training stopped early.
    # Any other exception is re-raised after the save so failures stay visible.
    torch.save(model.state_dict(), "/content/gdrive/MyDrive/NLP_A3/bert_trainedNEREnglish")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All is ok


100%|██████████| 19184/19184 [1:06:03<00:00,  4.84it/s]


Epochs: 1 | Loss:  0.345 | Accuracy:  0.905 | Val_Loss:  0.246 | Accuracy:  0.928


100%|██████████| 19184/19184 [1:05:59<00:00,  4.85it/s]


Epochs: 2 | Loss:  0.228 | Accuracy:  0.933 | Val_Loss:  0.207 | Accuracy:  0.937


In [None]:
from transformers import pipeline 
# Re-create the architecture, then restore the weights written by the training cell.
model = BertModel()

# map_location remaps the stored tensors onto the CPU while loading.
model.load_state_dict(torch.load('/content/gdrive/MyDrive/NLP_A3/bert_trainedNEREnglish', map_location=torch.device('cpu')))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
test_dataset = DataSequence(df_test)
test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

# The checkpoint was loaded onto the CPU while the batches below are sent to
# `device`; move the model as well so both live on the same device.
model = model.to(device)
model.eval()

total_acc_test = 0.0
total_loss_test = 0.0

# Pure evaluation: no gradients are needed.
with torch.no_grad():
    for test_data, test_label in test_dataloader:
        test_label = test_label.to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)

        loss, logits = model(input_id, test_label)

        for sample_idx in range(logits.shape[0]):
            # Score only positions that carry a real label (-100 marks "ignore").
            keep = test_label[sample_idx] != -100
            predictions = logits[sample_idx][keep].argmax(dim=1)
            total_acc_test += (predictions == test_label[sample_idx][keep]).float().mean().item()
            total_loss_test += loss.item()

# Compute the averages once, after every test sentence has been seen.
test_accuracy = total_acc_test / len(df_test)
test_loss = total_loss_test / len(df_test)

print(f'Test_Loss: {test_loss: .3f} | Accuracy: {test_accuracy: .3f}')



[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
0
Test_Loss:  0.093 | Accuracy:  0.449
0
Test_Loss:  0.093 | Accuracy:  0.449
0
Test_Loss:  0.094 | Accuracy:  0.450
0
Test_Loss:  0.094 | Accuracy:  0.450
0
Test_Loss:  0.094 | Accuracy:  0.450
0
Test_Loss:  0.094 | Accuracy:  0.450
0
Test_Loss:  0.094 | Accuracy:  0.450
0
Test_Loss:  0.094 | Accuracy:  0.450
0
Test_Loss:  0.094 | Accuracy:  0.451
0
Test_Loss:  0.094 | Accuracy:  0.451
0
Test_Loss:  0.094 | Accuracy:  0.451
0
Test_Loss:  0.094 | Accuracy:  0.451
0
Test_Loss:  0.094 | Accuracy:  0.451
0
Test_Loss:  0.094 | Accuracy:  0.452
0
Test_Loss:  0.094 | Accuracy:  0.452
0
Test_Loss:  0.094 | Accuracy:  0.452
0
Test_Loss:  0.094 | Accuracy:  0.452
0
Test_Loss:  0.094 | Accuracy:  0.452
0
Test_Loss:  0.094 | Accuracy:  0.453
0
Test_Loss:  0.094 | Accuracy:  0.453
0
Test_Loss:  0.094 | Accuracy:  0.453
0
Test_Loss:  0.094 | Accuracy:  0.453
0
Test_Loss:  0.095 | Accuracy:  0.453
0
Test_Loss:  0.095 |