In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
data = pd.read_csv("ner_datasetreference.csv", encoding='unicode_escape')

In [6]:
data.count()

Sentence #      47959
Word          1048565
POS           1048575
Tag           1048575
dtype: int64

In [7]:
print(f"Number tof tags: {format(len(data.Tag.unique()))}")
freq = data.Tag.value_counts()
freq

Number tof tags: 17


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [8]:
tags = {}
for tag, count in zip(freq.index, freq):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue
    
print(sorted(tags.items(), key=lambda x: x[1], reverse = True))


[('geo', 45058), ('org', 36927), ('per', 34241), ('tim', 26861), ('gpe', 16068), ('art', 699), ('eve', 561), ('nat', 252)]


In [9]:
# removing low freq entities

In [10]:
entities_to_remove = ["B-nat", "I-nat"]

In [11]:
data = data[~data.Tag.isin(entities_to_remove)]
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [12]:
data = data.fillna(method='ffill')
data

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [14]:
data['sentence'] = data[['Sentence #', 'Word', 'Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))

In [15]:
data['target'] = data[['Sentence #', 'Word', 'Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))

In [16]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,target
0,Sentence: 1,Thousands,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Sentence: 1,of,IN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
2,Sentence: 1,demonstrators,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
3,Sentence: 1,have,VBP,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
4,Sentence: 1,marched,VBN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."


In [17]:
tag2id = {k:v for v, k in enumerate(data.Tag.unique())}
id2tag = {v:k for v,k in enumerate(data.Tag.unique())}

In [18]:
tag2id

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-eve': 13,
 'I-eve': 14}

In [19]:
data = data[['sentence', 'target']].drop_duplicates().reset_index(drop=True)

In [20]:
len(data)

47599

In [132]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
lr = 1e-5
MAX_GRAD_NORM = 10

In [75]:
def tokenize_and_preserve_labels(sentence, target, tokenizer):
    
    tokenized_sentence = []
    labels = []
    
    sentence = sentence.strip()
    
    for word, label in zip(sentence.split(), target.split(',')):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        
        tokenized_sentence.extend(tokenized_word)
        
        labels.extend([label]*n_subwords)
    return tokenized_sentence, labels

In [76]:
class NERData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __getitem__(self, index):
        sentence = self.data.sentence[index]
        labels = self.data.target[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, labels, self.tokenizer)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]
        
        maxlen = self.max_len
        
        if len(tokenized_sentence) > maxlen:
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        
        else:
            tokenized_sentence = tokenized_sentence + ["[PAD]" for _ in range(maxlen-len(tokenized_sentence))]
            labels = labels + ["O" for _ in range(maxlen- len(labels))]
        
        attention_mask = [1 if tok!= "[PAD]" else 0 for tok in tokenized_sentence]
        
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        
        label_ids = [tag2id[label] for label in labels]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attention_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }
        
    def __len__(self):
        return self.len
        

In [77]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

In [78]:
print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (47599, 2)
TRAIN Dataset: (38079, 2)
TEST Dataset: (9520, 2)


In [79]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [80]:
training_set = NERData(train_dataset, tokenizer, MAX_LEN)

In [81]:
training_set[0]

{'ids': tensor([  101,  2062,  2084,  3263,  2111,  3230,  1996,  3098,  1997,  1996,
          3979,  1999,  2002, 19205,  2874,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [82]:
test_set = NERData(test_dataset, tokenizer, MAX_LEN)

In [83]:
test_set[0]

{'ids': tensor([  101,  2027,  9847,  2013,  1996,  3506,  1997,  3323,  2000,  1037,
          8320,  1999, 11804,  2380,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [84]:
train_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

In [85]:
test_loader = DataLoader(test_set, batch_size=VALID_BATCH_SIZE, shuffle=True)

# MODEL AND TRAINING

In [133]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(id2tag), id2label=id2tag,
                                                   label2id=tag2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [134]:
model = model.to(device)

In [135]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [136]:
len(train_loader)

9520

In [137]:
def train_model(epoch):
    tr_loss, tr_accuracy = 0, 0
    no_tr_examples, no_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    
    model.train()
    
    for idx, batch in enumerate(train_loader):
        inputs = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        target = batch['targets'].to(device, dtype=torch.long)
        
        output = model(input_ids = inputs, attention_mask = mask, labels= target)
        
        loss, tr_logits = output.loss, output.logits
        
        tr_loss += loss.item()
        
        no_tr_steps+=1
        no_tr_examples+= target.size(0)
        
        flattened_targets = target.view(-1)  # for accuracy calculations
        
        active_logits = tr_logits.view(-1, model.num_labels)
        
        flattened_preds = torch.argmax(active_logits, axis=1)
        
        active_mask = mask.view(-1) == 1
        
        targets = torch.masked_select(flattened_targets, active_mask)
        prediction = torch.masked_select(flattened_preds, active_mask)
        
        tr_preds.extend(prediction)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), prediction.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
        
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx + 1) % 128 == 0:
                print(
                    f"Epoch [{epoch + 1}/{EPOCHS}], "
                    f"Step [{idx + 1}/{len(train_loader)}], "
                    f"Loss: {loss.item():.4f}"
                )
        elif (idx + 1) == len(train_loader):
                print(
                    f"Epoch [{epoch + 1}/{EPOCHS}], "
                    f"Step [{idx + 1}/{len(train_loader)}], "
                    f"Loss: {loss.item():.4f}"
                )
        
        
    epoch_loss = tr_loss / no_tr_steps
    tr_accuracy = tr_accuracy / no_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
        

In [138]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train_model(epoch)

Training epoch: 1
Epoch [1/1], Step [128/9520], Loss: 0.2417
Epoch [1/1], Step [256/9520], Loss: 0.1265
Epoch [1/1], Step [384/9520], Loss: 0.0854
Epoch [1/1], Step [512/9520], Loss: 0.0686
Epoch [1/1], Step [640/9520], Loss: 0.0267
Epoch [1/1], Step [768/9520], Loss: 0.0128
Epoch [1/1], Step [896/9520], Loss: 0.0080
Epoch [1/1], Step [1024/9520], Loss: 0.0874
Epoch [1/1], Step [1152/9520], Loss: 0.0872
Epoch [1/1], Step [1280/9520], Loss: 0.0569
Epoch [1/1], Step [1408/9520], Loss: 0.0103
Epoch [1/1], Step [1536/9520], Loss: 0.0911
Epoch [1/1], Step [1664/9520], Loss: 0.0764
Epoch [1/1], Step [1792/9520], Loss: 0.0177
Epoch [1/1], Step [1920/9520], Loss: 0.0106
Epoch [1/1], Step [2048/9520], Loss: 0.0203
Epoch [1/1], Step [2176/9520], Loss: 0.0345
Epoch [1/1], Step [2304/9520], Loss: 0.0309
Epoch [1/1], Step [2432/9520], Loss: 0.0220
Epoch [1/1], Step [2560/9520], Loss: 0.0156
Epoch [1/1], Step [2688/9520], Loss: 0.0139
Epoch [1/1], Step [2816/9520], Loss: 0.0349
Epoch [1/1], Step [29

NameError: name 'nb_tr_steps' is not defined

In [143]:
def evaluate(model, test_loader):
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
    
    #print(eval_labels)
    #print(eval_preds)

    labels = [id2tag[id.item()] for id in eval_labels]
    predictions = [id2tag[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [144]:
labels, predictions = evaluate(model, test_loader)

Validation loss per 100 evaluation steps: 0.00827204529196024
Validation loss per 100 evaluation steps: 0.029712026720373236
Validation loss per 100 evaluation steps: 0.027599494499721985
Validation loss per 100 evaluation steps: 0.02745406838180149
Validation loss per 100 evaluation steps: 0.025739501265274328
Validation loss per 100 evaluation steps: 0.026640120697178818
Validation loss per 100 evaluation steps: 0.026957039131098016
Validation loss per 100 evaluation steps: 0.026841955683046297
Validation loss per 100 evaluation steps: 0.026646109997898647
Validation loss per 100 evaluation steps: 0.02638547526447566
Validation loss per 100 evaluation steps: 0.02698067665458531
Validation loss per 100 evaluation steps: 0.026874133325091497
Validation loss per 100 evaluation steps: 0.027066146774406698
Validation loss per 100 evaluation steps: 0.02717061439807751
Validation loss per 100 evaluation steps: 0.027227676632647465
Validation loss per 100 evaluation steps: 0.0271809019264561

In [147]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         art       0.83      0.03      0.07       144
         eve       0.68      0.28      0.39        69
         geo       0.82      0.89      0.85     11268
         gpe       0.96      0.91      0.93      3447
         org       0.71      0.61      0.66      6831
         per       0.76      0.80      0.78      5456
         tim       0.86      0.82      0.84      4368

   micro avg       0.81      0.80      0.80     31583
   macro avg       0.80      0.62      0.65     31583
weighted avg       0.80      0.80      0.80     31583



In [148]:
torch.save(model, 'model.pt')

In [150]:
print(model.state_dict())

OrderedDict([('bert.embeddings.position_ids', tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
          28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
          42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
          56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
          70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
          84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
          98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
         112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
         126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
         140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
         154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
      

In [151]:
model.bert.embeddings.word_embeddings.weight.shape

torch.Size([30522, 768])

In [152]:
model.bert.embeddings.position_embeddings.weight.shape

torch.Size([512, 768])

In [159]:
# 12 enocder layers

In [160]:
model.classifier.weight.shape

torch.Size([15, 768])

In [161]:
model.classifier.bias.shape

torch.Size([15])