In [23]:
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification
import torch.utils.data as data
import torch
from tqdm import tqdm

In [15]:
# Read the NER corpus and show its first sentence as a quick sanity check.
df = pd.read_csv("./ner.csv")
sent = df["text"].iloc[0]
print(sent)

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .


In [7]:
# Encode the sample sentence exactly the way the dataset class does later on:
# truncated/padded to a fixed 512 positions and returned as PyTorch tensors.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
text_tokenized = tokenizer(
    sent,
    return_tensors="pt",
    truncation=True,
    max_length=512,
    padding="max_length",
)
print(text_tokenized)

{'input_ids': tensor([[  101, 26159,  1104,  8568,  4487,  5067,  1138,  9639,  1194,  1498,
          1106,  5641,  1103,  1594,  1107,  5008,  1105,  4555,  1103, 10602,
          1104,  1418,  2830,  1121,  1115,  1583,   119,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [9]:
# Decode the ids back to text to check the round trip. The [CLS]/[SEP]/[PAD]
# markers show up because the encoding above was padded out to max_length.
tokenizer.decode(text_tokenized["input_ids"][0])

'[CLS] Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [13]:
# Inspect how the sentence was split into sub-word pieces and how each piece
# maps back to a word of the original sentence.
tokens = tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0])
word_ids = text_tokenized.word_ids()
print(tokens)
print(word_ids)
### first list printed: sub-word tokens + special tokens
### second list printed: corresponding word index in the original sentence

['[CLS]', 'Thousands', 'of', 'demons', '##tra', '##tors', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [30]:
def get_data_label_details(df):
    """Collect the NER tag vocabulary found in a dataframe.

    Args:
        df: DataFrame with a ``"labels"`` column. Each cell is a string of
            whitespace-separated tags (the code calls ``str.split()`` on it).

    Returns:
        Tuple ``(num_unique_labels, label_to_idx, idx_to_label)``. The two
        dicts are inverse mappings between tag strings and integer ids, with
        ids assigned in sorted tag order.

    Side effects:
        Prints the number of unique labels.
    """
    unique_labels = set()
    for sent_labels in df["labels"].values.tolist():
        # set.update adds every tag of the sentence at once, without building
        # a throwaway list just for its side effects.
        unique_labels.update(sent_labels.split())

    num_unique_labels = len(unique_labels)
    print(f"Number of Unique Labels: {num_unique_labels}")

    # Sort once so both mappings are derived from the very same ordering.
    ordered_labels = sorted(unique_labels)
    label_to_idx = {label: idx for idx, label in enumerate(ordered_labels)}
    idx_to_label = dict(enumerate(ordered_labels))
    return num_unique_labels, label_to_idx, idx_to_label

def align_label(tokenized_sent, labels, label_to_idx):
    """Expand word-level tags to one label id per token position.

    Only the first token of each word receives that word's label id. Every
    other position is filled with ``-100``, the sentinel that the training
    loop filters out before computing accuracy: special tokens (word id
    ``None``), continuation pieces of the same word, words that have no entry
    in ``labels``, and tags missing from ``label_to_idx``.

    Args:
        tokenized_sent: Object exposing ``word_ids()``, which returns one entry
            per token: the index of the source word, or ``None``.
        labels: Sequence of tag strings, indexed by word position.
        label_to_idx: Mapping from tag string to integer id.

    Returns:
        List of ints with the same length as ``tokenized_sent.word_ids()``.
    """
    ignore_id = -100
    label_ids = []
    previous_word_idx = None

    for word_idx in tokenized_sent.word_ids():
        label_id = ignore_id
        if word_idx is not None and word_idx != previous_word_idx:
            # First token of a new word: look its tag up.
            try:
                label_id = label_to_idx[labels[word_idx]]
            except (KeyError, IndexError):
                # Unknown tag, or fewer tags than words. Only these two lookup
                # failures are tolerated; anything else is a real bug and must
                # surface instead of being silently masked.
                label_id = ignore_id
        label_ids.append(label_id)
        previous_word_idx = word_idx

    return label_ids
            
    

In [31]:
class NERDataset(data.Dataset):
    """Map-style dataset of tokenized sentences with token-aligned NER labels.

    The CSV at ``filepath`` is expected to provide a ``"text"`` column
    (sentences) and a ``"labels"`` column (whitespace-separated tags).
    """

    def __init__(self, filepath, tokenizer, start=None, end=None):
        """Read the CSV, tokenize every sentence and align its labels.

        Args:
            filepath: Path of the CSV file to read.
            tokenizer: Callable tokenizer; invoked per sentence with
                ``padding="max_length"``, ``max_length=512``,
                ``truncation=True`` and ``return_tensors="pt"``.
            start: Optional first row (inclusive) of the slice to keep.
            end: Optional last row (exclusive) of the slice to keep.
        """
        df = pd.read_csv(filepath)
        # The label vocabulary is built from the whole file, before slicing,
        # so every split of the same file shares identical label ids.
        self.num_unique_labels, self.label_to_idx, self.idx_to_label = get_data_label_details(df)
        # Slicing handles a missing bound on its own (None means "open end"),
        # so a lone `start` or a lone `end` is honoured instead of being
        # silently ignored.
        df = df[start:end]
        # One list of tags per sentence.
        labels = [label.split() for label in df['labels'].values.tolist()]
        sentences = df["text"].values.tolist()
        self.txt = [
            tokenizer(sent, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
            for sent in sentences
        ]
        self.labels = [
            align_label(sent, label, label_to_idx=self.label_to_idx)
            for sent, label in zip(self.txt, labels)
        ]
        self.len = len(self.labels)

    def __len__(self):
        """Return the number of sentences kept after slicing."""
        return self.len

    def __getitem__(self, index):
        """Return ``(encoding, labels)`` for one sentence.

        ``encoding`` is whatever the tokenizer returned for that sentence;
        ``labels`` is a ``torch.LongTensor`` built from the aligned label ids.
        """
        return self.txt[index], torch.LongTensor(self.labels[index])

In [24]:
class BertModel(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper around ``BertForTokenClassification``."""

    def __init__(self, num_unique_labels):
        """Load ``bert-base-cased`` with ``num_labels=num_unique_labels``."""
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained(
            'bert-base-cased',
            num_labels=num_unique_labels,
        )

    def forward(self, input_id, mask, label):
        """Run the wrapped model and hand back its raw (non-dict) output.

        Args:
            input_id: Forwarded as ``input_ids``.
            mask: Forwarded as ``attention_mask``.
            label: Forwarded as ``labels``.

        Returns:
            The value returned by the wrapped model with ``return_dict=False``;
            the training loop in this notebook unpacks it as ``(loss, logits)``.
        """
        return self.bert(
            input_ids=input_id,
            attention_mask=mask,
            labels=label,
            return_dict=False,
        )

In [40]:
def _sum_sample_accuracy(logits, labels):
    """Sum the per-sentence token accuracies of one batch.

    Args:
        logits: Tensor indexed as ``[sample][position][label]``.
        labels: Tensor indexed as ``[sample][position]``; positions equal to
            ``-100`` are excluded from the accuracy.

    Returns:
        Python float: the sum over samples of the fraction of kept positions
        whose arg-max prediction equals the label. A sample with no kept
        position contributes nothing (its mean would otherwise be NaN and
        poison the running total).
    """
    total = 0.0
    for sample_logits, sample_labels in zip(logits, labels):
        keep = sample_labels != -100
        if not keep.any():
            continue
        predictions = sample_logits[keep].argmax(dim=1)
        # .item() turns the 0-dim tensor into a plain float so the running
        # total does not keep device tensors alive.
        total += (predictions == sample_labels[keep]).float().mean().item()
    return total


def trainer(model):
    """Train ``model`` for one epoch on ./ner.csv and report accuracies.

    Rows 0-89 of the file are used for training and rows 90-99 for
    validation. Per-batch losses and a final accuracy summary are printed;
    nothing is returned.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels)``
            and returning ``(loss, logits)``.
    """
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
    trainDataset = NERDataset(filepath="./ner.csv", tokenizer=tokenizer, start=0, end=90)
    validationDataset = NERDataset(filepath="./ner.csv", tokenizer=tokenizer, start=90, end=100)
    trainDataLoader = data.DataLoader(trainDataset, batch_size=4, shuffle=True)
    valDataLoader = data.DataLoader(validationDataset, batch_size=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model first so the optimizer is built over the parameters that
    # are actually trained.
    model.to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=3e-4)

    for epoch in range(1):

        train_acc, val_acc = 0.0, 0.0

        model.train()
        for train_data, train_labels in tqdm(trainDataLoader):
            train_labels = train_labels.to(device)
            # Each encoding was produced for a single sentence and carries an
            # extra axis of size 1 after batching; drop it.
            input_ids = train_data["input_ids"].squeeze(1).to(device)
            attn_masks = train_data["attention_mask"].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_ids, attn_masks, train_labels)
            ### LOGITS => BATCH_SIZE * SEQ_LEN * NUM_LABELS
            ## LABELS => BATCH_SIZE * SEQ_LEN
            train_acc += _sum_sample_accuracy(logits, train_labels)

            loss.backward()
            optimizer.step()
            print(f"Train Loss: {loss.item()}", end="\r")

        ### EVALUATION ON VALIDATION SET
        model.eval()
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for val_data, val_labels in tqdm(valDataLoader):
                val_labels = val_labels.to(device)
                input_ids = val_data["input_ids"].squeeze(1).to(device)
                attn_masks = val_data["attention_mask"].squeeze(1).to(device)

                loss, logits = model(input_ids, attn_masks, val_labels)
                val_acc += _sum_sample_accuracy(logits, val_labels)
                print(f"Val Loss: {loss.item()}", end="\r")

        print(f"Epoch: {epoch+1} | Train Acc: {train_acc/len(trainDataset)} | Val Acc: {val_acc/len(validationDataset)}")
            

            
    
    

In [41]:
# 17 is the tag-vocabulary size that get_data_label_details reports for
# ./ner.csv ("Number of Unique Labels: 17"); keep it in sync with the data.
trainer(BertModel(17))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Number of Unique Labels: 17
Number of Unique Labels: 17


  4%|▍         | 1/23 [00:10<03:49, 10.44s/it]

Train Loss: 2.592233419418335

  9%|▊         | 2/23 [00:19<03:19,  9.50s/it]

Train Loss: 2.6166350841522217

 13%|█▎        | 3/23 [00:28<03:05,  9.25s/it]

Train Loss: 2.638348340988159

 17%|█▋        | 4/23 [00:37<02:58,  9.37s/it]

Train Loss: 2.5442395210266113

 22%|██▏       | 5/23 [00:47<02:48,  9.35s/it]

Train Loss: 2.423124313354492

 26%|██▌       | 6/23 [00:56<02:40,  9.45s/it]

Train Loss: 2.464458703994751

 30%|███       | 7/23 [01:06<02:33,  9.57s/it]

Train Loss: 2.4632091522216797

 35%|███▍      | 8/23 [01:19<02:38, 10.58s/it]

Train Loss: 2.3159444332122803

 39%|███▉      | 9/23 [01:31<02:34, 11.04s/it]

Train Loss: 2.42718768119812

 43%|████▎     | 10/23 [01:41<02:21, 10.88s/it]

Train Loss: 2.333347797393799

 48%|████▊     | 11/23 [01:52<02:11, 10.92s/it]

Train Loss: 2.422255516052246

 52%|█████▏    | 12/23 [02:03<01:58, 10.77s/it]

Train Loss: 2.331909418106079

 57%|█████▋    | 13/23 [02:14<01:48, 10.80s/it]

Train Loss: 2.1902542114257812

 61%|██████    | 14/23 [02:24<01:35, 10.64s/it]

Train Loss: 2.291637897491455

 65%|██████▌   | 15/23 [02:36<01:27, 10.91s/it]

Train Loss: 2.210230588912964

 70%|██████▉   | 16/23 [02:46<01:16, 10.91s/it]

Train Loss: 2.296802520751953

 74%|███████▍  | 17/23 [02:57<01:04, 10.76s/it]

Train Loss: 2.3105380535125732

 78%|███████▊  | 18/23 [03:07<00:53, 10.68s/it]

Train Loss: 2.040647029876709

 83%|████████▎ | 19/23 [03:18<00:42, 10.58s/it]

Train Loss: 2.105609655380249

 87%|████████▋ | 20/23 [03:28<00:31, 10.36s/it]

Train Loss: 2.0822019577026367

 91%|█████████▏| 21/23 [03:37<00:20, 10.24s/it]

Train Loss: 2.0999724864959717

 96%|█████████▌| 22/23 [03:48<00:10, 10.29s/it]

Train Loss: 2.0365583896636963

100%|██████████| 23/23 [03:55<00:00, 10.23s/it]


Train Loss: 2.301121473312378

 33%|███▎      | 1/3 [00:04<00:08,  4.01s/it]

Val Loss: 1.9178657531738281

 67%|██████▋   | 2/3 [00:08<00:04,  4.08s/it]

Val Loss: 2.0228183269500732

100%|██████████| 3/3 [00:10<00:00,  3.57s/it]

Epoch: 1 | Train Acc: 0.5831993818283081 | Val Acc: 0.8510143160820007



