In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
import pprint  # for pretty printing our device stats
from google.colab import files
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#install transformers to google colab
!pip install transformers
from transformers import BertTokenizer, BertConfig
import math
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
 
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline


import seaborn as sns
# Mount Google Drive inside the Colab VM at /content/gdrive.
drive.mount('/content/gdrive')
# Make the dataset folder the working directory so the relative CSV path used later resolves against it.
%cd "/content/gdrive/My Drive/DataSets/Annotated Corpus 2"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/DataSets/Annotated Corpus 2


A class that simplifies Named Entity Recognition with BERT: it handles tokenizing, attention-mask creation, device selection, classifier set-up, training and testing.  

In [None]:
"""
Input: pandas DataFrame: column 0: text split into words (["i","am","him"]), column 1: one label per word (["v","b","c"])

Usage:
NERModel = Bert4NER()
NERModel.workOn(df)
NERModel.initCLSF()
NERModel.setFullInput()
NERModel.train(epochs=3)
"""

class Bert4NER():
    """Convenience wrapper around `BertForTokenClassification` for NER.

    The methods are meant to be called in this order::

        NERModel = Bert4NER()
        NERModel.workOn(df)       # map labels to indices
        NERModel.initCLSF()       # build the classifier
        NERModel.setFullInput()   # tokenize, pad, split, build DataLoaders
        NERModel.train(epochs=3)  # fine-tune and validate after every epoch
        NERModel.setPredictInput("Some raw sentence .")

    `df` is a pandas DataFrame whose column 0 holds each sentence as a list
    of words and whose column 1 holds the parallel list of labels.
    """

    def __init__(self):
        """Load the cased BERT tokenizer and pick the compute device (GPU if available)."""
        self.DataMapped = False
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Per-epoch averages recorded by train(), kept so they can be plotted afterwards.
        self.loss_values = []
        self.validation_loss_values = []

    def workOn(self, data):
        """Register the corpus and build the label <-> index mappings.

        Args:
            data: DataFrame; column 0 = list of words per sentence,
                column 1 = list of labels per sentence.
        """
        self.sentences = data.iloc[:, 0]
        self.labels = data.iloc[:, 1]
        # Sorted so the label -> index mapping is identical on every run
        # (plain set iteration order changes with string hash randomisation).
        self.tag_values = sorted({tag for sentence_tags in self.labels for tag in sentence_tags})
        # Extra label used for padded positions so they can be told apart from real tokens.
        self.tag_values.append("PAD")
        self.tag2idx = {t: i for i, t in enumerate(self.tag_values)}
        self.DataMapped = True
        print("Labels Mapped")

    def initCLSF(self):
        """Create the token-classification model with one output per label.

        Raises:
            AssertionError: if workOn() has not been called yet.
        """
        assert self.DataMapped, "call workOn(data) before initCLSF()"
        self.model = BertForTokenClassification.from_pretrained(
            "bert-base-cased",
            # The size of the output layer follows the number of labels (including "PAD").
            num_labels=len(self.tag2idx),
            output_attentions=False,
            output_hidden_states=False
        )
        # Same device as the batches; a no-op move when running on CPU.
        self.model.to(self.device)

    def tokenize_and_preserve_labels(self, sentence, text_labels):
        """Word-piece tokenize a sentence, repeating each word's label for every sub-token.

        Args:
            sentence: iterable of words.
            text_labels: iterable of labels, parallel to `sentence`.

        Returns:
            (tokens, labels): two lists of equal length.
        """
        tokenized_sentence = []
        labels = []

        for word, label in zip(sentence, text_labels):
            # A single word may be broken into several sub-word tokens.
            tokenized_word = self.tokenizer.tokenize(word)
            tokenized_sentence.extend(tokenized_word)
            # Every sub-token inherits the label of the word it came from.
            labels.extend([label] * len(tokenized_word))

        return tokenized_sentence, labels

    def setPredictInput(self, test_sentence):
        """Predict a label for every token of a raw sentence and print the result.

        Args:
            test_sentence: the sentence as a single string.

        Returns:
            list of (token, label) pairs, with "##" continuation pieces merged
            back into the preceding token (which keeps the label of its first piece).
        """
        encoded = self.tokenizer.encode(test_sentence)
        # Use the configured device instead of a hard-coded .cuda(), so this also works on CPU.
        input_ids = torch.tensor([encoded]).to(self.device)
        self.model.eval()
        with torch.no_grad():
            output = self.model(input_ids)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

        # Re-join word pieces that the tokenizer split apart.
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            if token.startswith("##") and new_tokens:
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(self.tag_values[label_idx])
                new_tokens.append(token)
        for token, label in zip(new_tokens, new_labels):
            print("{}\t{}".format(label, token))

        # A list (not a one-shot zip iterator) so the result can be displayed and re-used.
        return list(zip(new_tokens, new_labels))

    def setFullInput(self, testSize=0.1, MAX_LEN=75, batch_size=32):
        """Tokenize, pad, split into train/validation and build the DataLoaders.

        Args:
            testSize: fraction of sentences held out for validation.
            MAX_LEN: sequences are padded / truncated (at the end) to this many tokens.
            batch_size: batch size of both DataLoaders.

        Sets `self.train_dataloader` (shuffled) and `self.valid_dataloader` (sequential).
        """
        # (tokens, labels) pair for every sentence.
        tokenized_texts_and_labels = [
            self.tokenize_and_preserve_labels(sent, labs)
            for sent, labs in zip(self.sentences, self.labels)
        ]
        tokenized_texts = [pair[0] for pair in tokenized_texts_and_labels]
        labels = [pair[1] for pair in tokenized_texts_and_labels]

        print("Tokenization Done")

        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=MAX_LEN, dtype="long", value=0.0,
                                  truncating="post", padding="post")
        tags = pad_sequences([[self.tag2idx[l] for l in lab] for lab in labels],
                             maxlen=MAX_LEN, value=self.tag2idx["PAD"], padding="post",
                             dtype="long", truncating="post")

        print("Padding is Done")

        # 1.0 for real tokens, 0.0 for padding (padding was written as id 0 above).
        attention_masks = (input_ids != 0).astype("float32")

        # One call splits all three arrays with the same shuffle, so inputs,
        # tags and masks stay aligned without relying on a repeated seed.
        (tr_inputs, val_inputs,
         tr_tags, val_tags,
         tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                                 random_state=2018, test_size=testSize)

        # Tensors can later be moved to the GPU or CPU batch by batch.
        tr_inputs = torch.tensor(tr_inputs)
        val_inputs = torch.tensor(val_inputs)
        tr_tags = torch.tensor(tr_tags)
        val_tags = torch.tensor(val_tags)
        tr_masks = torch.tensor(tr_masks)
        val_masks = torch.tensor(val_masks)

        print("Tensoring is Done")

        # Training batches are drawn in random order, validation batches in a fixed order.
        train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
        self.train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                           batch_size=batch_size)

        valid_data = TensorDataset(val_inputs, val_masks, val_tags)
        self.valid_dataloader = DataLoader(valid_data, sampler=SequentialSampler(valid_data),
                                           batch_size=batch_size)
        print("DataLoaders are Ready")

    def initOptimizer(self, FULL_FINETUNING=False, epochs=1):
        """Create the AdamW optimizer and a linearly decaying learning-rate schedule.

        Args:
            FULL_FINETUNING: if True, optimize every model parameter (weight decay
                0.01 except for biases / LayerNorm parameters); if False, optimize
                only the parameters of `model.classifier`.
            epochs: number of epochs the schedule should span.

        Requires setFullInput() to have been called (the schedule length depends
        on the number of training batches).
        """
        if FULL_FINETUNING:
            named_params = list(self.model.named_parameters())
            # Parameters whose name contains one of these fragments get no weight decay.
            # 'gamma'/'beta' are kept for checkpoints that still use those names.
            no_decay = ('bias', 'gamma', 'beta', 'LayerNorm.weight')
            optimizer_grouped_parameters = [
                # The optimizer reads the key 'weight_decay'; the previous
                # 'weight_decay_rate' key was ignored, so no decay was applied.
                {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0},
            ]
        else:
            # Without full fine-tuning only the classification head is updated.
            optimizer_grouped_parameters = [{"params": list(self.model.classifier.parameters())}]

        # AdamW: Adam with the weight-decay fix, see https://arxiv.org/abs/1711.05101
        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=3e-5,
            eps=1e-8
        )

        # Total number of training steps is number of batches * number of epochs.
        total_steps = len(self.train_dataloader) * epochs

        # Learning rate decays linearly to zero over the whole run (no warm-up).
        self.lrDecay = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        print("Optimizer Ready\n")

    def _evaluate(self):
        """Run the model over the validation DataLoader, print loss / accuracy / F1.

        Returns:
            The validation loss averaged over batches.
        """
        self.model.eval()
        eval_loss = 0
        predictions, true_labels = [], []
        for batch in self.valid_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

            # No gradients are needed for validation; this saves memory and time.
            with torch.no_grad():
                # Labels are passed, so outputs[0] is the loss and outputs[1] the logits.
                outputs = self.model(b_input_ids, token_type_ids=None,
                                     attention_mask=b_input_mask, labels=b_labels)
            logits = outputs[1].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            eval_loss += outputs[0].mean().item()
            predictions.extend(np.argmax(logits, axis=2))
            true_labels.extend(label_ids)

        eval_loss = eval_loss / len(self.valid_dataloader)
        print("Validation loss: {}".format(eval_loss))

        # Score only real tokens: positions whose true label is "PAD" are skipped.
        pred_tags, valid_tags = [], []
        for pred_row, true_row in zip(predictions, true_labels):
            for p_i, l_i in zip(pred_row, true_row):
                if self.tag_values[l_i] != "PAD":
                    pred_tags.append(self.tag_values[p_i])
                    valid_tags.append(self.tag_values[l_i])

        # scikit-learn metrics take (y_true, y_pred); the order matters for weighted F1.
        print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
        print("Validation F1 Score: {}".format(f1_score(valid_tags, pred_tags, average="weighted")))
        return eval_loss

    def test(self):
        """Evaluate the current model on the validation set and print the metrics."""
        self._evaluate()

    def train(self, epochs=1, max_grad_norm=1.0, initOpt=True):
        """Fine-tune the model, validating after every epoch.

        Args:
            epochs: number of passes over the training DataLoader.
            max_grad_norm: gradients are clipped to this norm before each update.
            initOpt: if True, (re)create the optimizer and schedule for `epochs`
                epochs first; pass False to keep an optimizer created earlier
                with initOptimizer().

        The per-epoch average training and validation losses are stored in
        `self.loss_values` and `self.validation_loss_values`.
        """
        if initOpt:
            self.initOptimizer(epochs=epochs)
        # Average loss after each epoch, kept on the instance so it can be plotted.
        self.loss_values, self.validation_loss_values = [], []

        for _ in trange(epochs, desc="Epoch"):
            self.model.train()
            total_loss = 0

            for batch in self.train_dataloader:
                # Move the batch to the same device as the model.
                b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)
                # Always clear previously accumulated gradients before a backward pass.
                self.model.zero_grad()
                # Labels are provided, so the first output is the loss.
                outputs = self.model(b_input_ids, token_type_ids=None,
                                     attention_mask=b_input_mask, labels=b_labels)
                loss = outputs[0]
                loss.backward()
                total_loss += loss.item()
                # Clip the gradient norm to guard against exploding gradients.
                torch.nn.utils.clip_grad_norm_(parameters=self.model.parameters(), max_norm=max_grad_norm)
                self.optimizer.step()
                # Advance the learning-rate schedule once per optimizer step.
                self.lrDecay.step()

            avg_train_loss = total_loss / len(self.train_dataloader)
            print("Average train loss: {}".format(avg_train_loss))
            self.loss_values.append(avg_train_loss)

            self.validation_loss_values.append(self._evaluate())
        

In [None]:
# Load the token-level corpus. Forward-fill so every blank cell inherits the last value seen above it
# (presumably "Sentence #" is only populated on the first row of each sentence; confirm against the CSV).
# `.ffill()` replaces `fillna(method="ffill")`, which is deprecated in pandas 2.x.
data = pd.read_csv("ner_datasetreference.csv", encoding="latin1").ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


Preprocessing the data to match the input format expected by the class (one row per sentence: a list of words and a list of tags)

In [None]:
# Collapse the token-level frame into one row per sentence:
# "Word" holds the sentence's words as a list, "Tag" the parallel list of NER tags.
group = data.groupby("Sentence #")
preprocessed = pd.DataFrame({
    "Word": group["Word"].apply(list),
    "Tag": group["Tag"].apply(list),
}).reset_index(drop=True)

# Sentences tagged "O" on every word contain no entity; drop them.
has_entity = preprocessed["Tag"].apply(lambda tags: any(tag != "O" for tag in tags))
allSentencesIdx = preprocessed.index[~has_entity].tolist()
preprocessed = preprocessed.drop(allSentencesIdx)

# Post-condition of the filter above: no all-"O" sentence is left.
assert not preprocessed["Tag"].apply(lambda tags: set(tags) == {"O"}).any(), \
    "all-'O' sentences should have been removed"

# Show a sample rather than dumping the whole frame.
preprocessed.head(10)

Unnamed: 0,Word,Tag
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
4,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."
5,"[Mr., Egeland, said, the, latest, figures, sho...","[B-per, I-per, O, O, O, O, O, O, O, O, O, O, O..."
...,...,...
47953,"[Opposition, activists, have, called, for, pro...","[O, O, O, O, O, O, O, O, O, B-tim, I-tim, O, O..."
47954,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O..."
47955,"[On, Thursday, ,, Iranian, state, media, publi...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B..."
47956,"[Following, Iran, 's, disputed, June, 12, elec...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ..."


Create the model, register the preprocessed data, initialize the classifier and build the training/validation inputs

In [None]:

# Drop the reference to any model left over from an earlier run of this cell before
# building a new one. Unlike `del NERModel`, this does not raise NameError on a fresh kernel.
NERModel = None
NERModel = Bert4NER()
NERModel.workOn(preprocessed)
NERModel.initCLSF()
NERModel.setFullInput()

Labels Mapped


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Tokenization Done
Padding is Done
Tensoring is Done
DataLoaders are Ready


Train Model

In [None]:
# Fine-tune for 3 epochs. With the default initOpt=True, train() first calls initOptimizer(epochs=3),
# and it prints training loss plus validation loss / accuracy / F1 after every epoch.
NERModel.train(epochs=3)






















Epoch:   0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Optimizer Ready

Average train loss: 1.4441184082515959
Validation loss: 0.8749889815226197
Validation Accuracy: 0.77870480731029























Epoch:  33%|███▎      | 1/3 [14:50<29:40, 890.20s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Validation F1 Score: 0.8748786745922239
Average train loss: 0.7883829250737545
Validation loss: 0.7012383001856506
Validation Accuracy: 0.7909933383226617























Epoch:  67%|██████▋   | 2/3 [29:39<14:50, 890.07s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Validation F1 Score: 0.8707480268284599
Average train loss: 0.7011766467893983
Validation loss: 0.6648929389193654
Validation Accuracy: 0.798865389768181























Epoch: 100%|██████████| 3/3 [44:31<00:00, 890.51s/it]

Validation F1 Score: 0.8710316494640086





Testing Model

In [None]:
# Try the trained model on one raw sentence: setPredictInput prints a "label<TAB>token" line
# for each token and returns an iterable of (token, label) pairs.
NERModel.setPredictInput( "A statement from the U.S. Consumer Product Safety Commission said Tuesday the recall involves a play kitchen learning toy made in Mexico and imported by Fisher-Price , a division of Mattel .")

O	[CLS]
O	A
O	statement
O	from
O	the
B-geo	U
B-geo	.
B-geo	S
B-geo	.
I-org	Consumer
O	Product
I-org	Safety
I-org	Commission
O	said
O	Tuesday
O	the
O	recall
O	involves
O	a
O	play
O	kitchen
O	learning
O	toy
O	made
O	in
O	Mexico
O	and
O	imported
O	by
B-per	Fisher
O	-
O	Price
O	,
O	a
O	division
O	of
B-geo	Mattel
O	.
O	[SEP]


<zip at 0x7f7c71c98f48>