In [1]:
#importing required modules
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
#function to convert the input CoNLL data into features: 'tokens' and their 'ner_tags'
#it returns a dictionary containing the training, validation and test splits of the input data

#to rebuild the features from the raw files, pass train_from_scratch=True (the default False loads the cached pickle)
def _read_conll(path, label_map):
    """Parse one CoNLL file into a list of ``[tokens, tag_ids]`` pairs.

    A line contributes a token when it is non-blank and its first
    whitespace-separated field is not ``'#'``; the token is the first field
    and the tag is the last field. Any other line (blank, or starting with a
    lone ``#``) closes the sentence currently being collected.

    Raises:
        KeyError: if a tag is not a key of ``label_map``.
    """
    sentences = []
    tokens, tag_ids = [], []
    with open(path, 'r', encoding="utf8") as fp:
        for line in fp:
            fields = line.split()
            if fields and fields[0] != '#':
                tokens.append(fields[0])
                tag_ids.append(label_map[fields[-1]])
            elif tokens:
                sentences.append([tokens, tag_ids])
                tokens, tag_ids = [], []
    #a file without a trailing blank line still ends with a complete sentence
    if tokens:
        sentences.append([tokens, tag_ids])
    return sentences


def load_data(train_from_scratch=False):
    """Return the dataset as a dict of DataFrames keyed 'train', 'test', 'validation'.

    Each DataFrame has two columns: ``tokens`` (list of words of a sentence)
    and ``ner_tags`` (list of integer label ids, one per word).

    Args:
        train_from_scratch: when False (default) the dict is unpickled from
            the file 'processed_dataset'. When True it is rebuilt from
            'hi_train.conll' (first 80% of sentences -> train, rest ->
            validation) and 'hi_dev.conll' (-> test), and then written back
            to 'processed_dataset'.

    Raises:
        FileNotFoundError: if a required input file is missing.
        KeyError: if a tag outside the 13 known labels is encountered.
    """
    #cached path: simply load the already transformed dataset
    if not train_from_scratch:
        #NOTE: pickle can execute arbitrary code while loading; only use a
        #'processed_dataset' file that was produced by this function
        with open('processed_dataset', "rb") as ds_file:
            return pickle.load(ds_file)

    #making a label map (tag string -> integer id)
    labels = ['B-CORP','B-CW','B-GRP','B-LOC','B-PER','B-PROD','I-CORP','I-CW','I-GRP','I-LOC','I-PER','I-PROD','O']
    label_map = {label:i for i, label in enumerate(labels)}

    #train file is split 80/20 into train and validation, preserving file order
    sent = _read_conll('hi_train.conll', label_map)
    train_end = int(0.8*len(sent))

    #passing columns= keeps the two-column schema even when a split is empty
    columns = ["tokens", "ner_tags"]
    dataset = {
        'train': pd.DataFrame(sent[:train_end], columns=columns),
        'test': pd.DataFrame(_read_conll('hi_dev.conll', label_map), columns=columns),
        'validation': pd.DataFrame(sent[train_end:], columns=columns),
    }

    #storing the dataset in the pickle file used by the cached path
    with open('processed_dataset', "wb") as ds_file:
        pickle.dump(dataset, ds_file)
    return dataset

In [3]:
#importing req modules

import pytorch_lightning as pl
from transformers import AutoModel, AutoTokenizer, AutoConfig ,AutoModelForTokenClassification,AdamW,get_linear_schedule_with_warmup
from datasets import load_dataset
from typing import Optional, List, Any,Union
import random
import numpy as np
import torch
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, TensorDataset
from seqeval.metrics import f1_score, precision_score, recall_score,accuracy_score
from dataclasses import dataclass
import os

In [4]:
#function for setting seed
def set_seed(s):
    """Seed Python's `random`, NumPy and PyTorch with the same value `s`."""
    #same order as before: python RNG, then numpy, then torch
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(s)

In [5]:
#dataclass for inputfeatures to the indic-bert model

@dataclass
class InputFeatures:
    """Plain record holding the encoded form of one example.

    In this notebook the first four fields are filled by
    ``ner.convert_examples_to_features`` with equal-length Python lists of
    ints and are read back by ``ner.make_loader``; ``candidates`` and
    ``example_id`` are never set by the visible code.
    """
    # token ids of the example (special-token and padding ids included)
    input_ids: Any
    # 1 for real positions, 0 for padding positions
    attention_mask: Any
    # segment ids; may be None (the default)
    token_type_ids: Any = None
    # per-position label ids; -100 marks positions that carry no label
    label: Any = None
    # unused in the visible code -- NOTE(review): purpose not determinable from here
    candidates: Any = None
    # unused in the visible code; optional identifier of the example
    example_id: Optional[str] = None

In [6]:
#function to create trainer 
def create_trainer(model,gradient_accumulation_steps,num_train_epochs,max_grad_norm,
                   accelerator="cpu",seed=2):
    """Seed the RNGs and build a ``pl.Trainer`` for fine-tuning.

    Args:
        model: accepted for call-site compatibility; it is not used here
            (the module is handed to the trainer later via ``fit``/``test``).
        gradient_accumulation_steps: forwarded as ``accumulate_grad_batches``.
        num_train_epochs: forwarded as ``max_epochs``.
        max_grad_norm: forwarded as ``gradient_clip_val``.
        accelerator: Lightning accelerator name; defaults to "cpu" as before,
            pass "gpu" if you have gpus available.
        seed: value given to ``set_seed`` before the trainer is created;
            defaults to the previously hard-coded 2.

    Returns:
        The configured ``pl.Trainer`` (checkpointing enabled).
    """
    #setting seed so that runs are repeatable
    set_seed(seed)

    #creating and returning the trainer
    return pl.Trainer(
        accumulate_grad_batches=gradient_accumulation_steps,
        accelerator=accelerator,
        max_epochs=num_train_epochs,
        gradient_clip_val=max_grad_norm,
        enable_checkpointing=True,
    )

In [7]:
#defining named entity recognition class
class ner(pl.LightningModule):
    """Token-classification (NER) fine-tuning module around 'ai4bharat/indic-bert'.

    Pass ``train_from_scratch=True`` to rebuild the dataset from the raw
    files and fine-tune the model; with the default ``False`` the cached
    dataset is loaded and ``run_module`` resumes from 'tuned.ckpt'.
    """

    def __init__(self,train_from_scratch=False):
        #calling base modules init
        super().__init__()

        #initializing hyper-parameters
        self.scratch=train_from_scratch
        self.max_seq_length=128
        self.train_batch_size=64
        self.gradient_accumulation_steps=1
        self.num_train_epochs=5
        self.warmup_steps=0
        self.eval_batch_size=64
        self.weight_decay=0.0
        self.learning_rate=2e-5
        self.adam_epsilon=1e-8
        self.max_grad_norm=1.0
        #label id that CrossEntropyLoss ignores; used for special/padding/sub-word positions
        self.pad_token_label_id = CrossEntropyLoss().ignore_index

        #created lazily: optimizer, lr scheduler and the open score-report file
        self.opt=None
        self.lr_scheduler=None
        self.d=None

        #loading dataset
        self.dataset=load_data(train_from_scratch=self.scratch)
        self.labels =['B-CORP','B-CW','B-GRP','B-LOC','B-PER','B-PROD','I-CORP','I-CW','I-GRP','I-LOC','I-PER','I-PROD','O']
        #the classification head gets one output per label (13)
        args = {'num_labels':len(self.labels)}

        #getting the indic-bert config, tokenizer and token-classification model
        self.config= AutoConfig.from_pretrained('ai4bharat/indic-bert',**args)
        self.tokenizer= AutoTokenizer.from_pretrained('ai4bharat/indic-bert',config=self.config)
        self.model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert',config=self.config)

    def forward(self, **inputs):
        """Forward the keyword inputs unchanged to the wrapped model."""
        return self.model(**inputs)

    def load_features(self,mode):
        """Return the list of InputFeatures for split `mode` ('train', 'validation' or 'test')."""
        return self.convert_examples_to_features(mode)

    def convert_examples_to_features(self,mode):
        """Encode every sentence of ``self.dataset[mode]`` as an InputFeatures.

        Each word is tokenized on its own; the word's label goes on its first
        sub-word and ``self.pad_token_label_id`` on the remaining ones and on
        the special/padding positions. Every kept feature has exactly
        ``self.max_seq_length`` positions.

        Side effect: the split's DataFrame columns are renamed in place to
        ['sentence', 'word_labels'].

        Truncation: words are added until the next word would bring the
        sequence to ``max_seq_length - 1`` ids or more; that word and all
        later words of the sentence are left out.

        Sentences containing a word for which the tokenizer returns no ids
        are dropped, because their labels could no longer be aligned.
        """
        #hard-coded ids -- presumably indic-bert's [CLS], [SEP] and pad ids; TODO confirm against the tokenizer
        cls_id, sep_id, pad_id = 2, 3, 0
        ignore_id = self.pad_token_label_id
        max_len = self.max_seq_length

        features=[]
        data=self.dataset[mode]
        #renaming columns (kept from the original implementation)
        data.columns=['sentence','word_labels']

        for sentence, labels in zip(data['sentence'], data['word_labels']):
            #every sequence starts with the leading special token
            input_ids=[cls_id]
            token_type_ids=[0]
            attention_mask=[1]
            encoded_labels=[ignore_id]

            for word, tag in zip(sentence, labels):
                encoding=self.tokenizer(word,add_special_tokens=False)
                n_pieces=len(encoding['input_ids'])
                #stop before the sequence (plus closing token) could exceed max_len
                if len(input_ids)+n_pieces>=max_len-1:
                    break
                input_ids.extend(encoding['input_ids'])
                token_type_ids.extend(encoding['token_type_ids'])
                attention_mask.extend(encoding['attention_mask'])
                #label on the first sub-word only, ignore id on the others
                encoded_labels.append(tag)
                encoded_labels.extend([ignore_id]*(n_pieces-1))

            #closing special token
            input_ids.append(sep_id)
            token_type_ids.append(0)
            attention_mask.append(1)
            encoded_labels.append(ignore_id)

            #a word that produced no ids leaves one label too many: skip the sentence
            if len(encoded_labels)!=len(input_ids):
                continue

            #padding every list up to max_len
            n_pad=max_len-len(input_ids)
            input_ids.extend([pad_id]*n_pad)
            token_type_ids.extend([0]*n_pad)
            attention_mask.extend([0]*n_pad)
            encoded_labels.extend([ignore_id]*n_pad)

            features.append(InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=encoded_labels
            ))
        return features

    def make_loader(self, features, batch_size):
        """Stack `features` into tensors and wrap them in an unshuffled DataLoader.

        Each batch is the tuple (input_ids, attention_mask, token_type_ids,
        labels), all of dtype long.
        """
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        #a missing token_type_ids becomes an all-zero row of matching length
        all_token_type_ids = torch.tensor(
            [f.token_type_ids if f.token_type_ids is not None else [0]*len(f.input_ids) for f in features],
            dtype=torch.long)
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        return DataLoader(
            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
            batch_size=batch_size,)

    def train_dataloader(self):
        """Build the train DataLoader and (re)create the linear LR scheduler.

        The scheduler is stored in ``self.lr_scheduler`` and is attached to
        the optimizer returned by ``configure_optimizers``.
        """
        train_features = self.load_features('train')
        dataloader = self.make_loader(train_features, self.train_batch_size)

        #one optimizer step per accumulation group, counting the final partial one
        steps_per_epoch = -(-len(dataloader) // self.gradient_accumulation_steps)
        t_total = steps_per_epoch * self.num_train_epochs
        optimizer = self.configure_optimizers()[0]
        self.lr_scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=t_total)
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader (batch size ``eval_batch_size``)."""
        dev_features = self.load_features('validation')
        return self.make_loader(dev_features, self.eval_batch_size)

    def test_dataloader(self):
        """Build the test DataLoader (batch size ``eval_batch_size``)."""
        test_features = self.load_features('test')
        return self.make_loader(test_features, self.eval_batch_size)

    def training_step(self, batch, batch_idx):
        """Run one training batch and return ``{'loss': loss}``."""
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1],'token_type_ids':batch[2], 'labels': batch[3]}
        outputs = self(**inputs)
        #with labels supplied, the first output element is the loss
        loss = outputs[0]
        return {'loss': loss}

    def validation_step(self, batch, batch_nb):
        """Run one evaluation batch.

        Returns a dict with 'val_loss' (CPU tensor), 'pred' (the model's
        logits as a numpy array) and 'target' (label ids as a numpy array).
        """
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1],'token_type_ids':batch[2], 'labels': batch[3]}
        outputs = self(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs['labels'].detach().cpu().numpy()
        return {'val_loss': tmp_eval_loss.detach().cpu(),'pred': preds,'target': out_label_ids}

    def test_step(self, batch, batch_nb):
        """Test batches are handled exactly like validation batches."""
        return self.validation_step(batch, batch_nb)

    def configure_optimizers(self):
        """Return ``[optimizer]``, creating the AdamW optimizer on first use.

        The optimizer is cached in ``self.opt`` so that the instance handed
        to the trainer is the same one the LR scheduler was built on; set
        ``self.opt = None`` to force a fresh optimizer.
        """
        if self.opt is not None:
            return [self.opt]

        model = self.model
        #bias and LayerNorm weights are excluded from weight decay
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': self.weight_decay,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        self.opt = AdamW(optimizer_grouped_parameters,
                         lr=self.learning_rate,
                         eps=self.adam_epsilon)
        return [self.opt]

    def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None,optimizer_closure=None, 
                       on_tpu=None, using_native_amp=None, using_lbfgs=None):
        """Step the optimizer, clear gradients, then advance the LR scheduler (if one exists)."""
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

    def _eval_end(self, outputs):
        """Aggregate step outputs and return ``(macro_fscore, micro_fscore)``.

        ``outputs`` is the list of dicts produced by ``validation_step``.
        Positions whose target equals ``self.pad_token_label_id`` are skipped.
        The first value is seqeval's macro-averaged F1. The second value is
        seqeval's ``accuracy_score``; it keeps its historical name
        'micro_fscore' -- NOTE(review): confirm this is the intended metric.
        """
        #predicted label id = argmax over the label axis of the logits
        preds = np.concatenate([x['pred'] for x in outputs], axis=0)
        preds = np.argmax(preds, axis=2)
        out_label_ids = np.concatenate([x['target'] for x in outputs], axis=0)
        label_map = {i: label for i, label in enumerate(self.labels)}

        out_label_list = []
        preds_list = []
        #mapping label ids back to tag strings, sentence by sentence
        for target_row, pred_row in zip(out_label_ids, preds):
            kept_targets = []
            kept_preds = []
            for target_id, pred_id in zip(target_row, pred_row):
                if target_id != self.pad_token_label_id:
                    kept_targets.append(label_map[target_id])
                    kept_preds.append(label_map[pred_id])
            out_label_list.append(kept_targets)
            preds_list.append(kept_preds)

        #seqeval's f1_score defaults to average='micro'; ask for macro explicitly
        macro_fscore=f1_score(out_label_list, preds_list, average='macro')
        micro_fscore=accuracy_score(out_label_list, preds_list)
        return macro_fscore,micro_fscore

    def validation_epoch_end(self, outputs):
        """Compute the scores over all validation batches (currently neither logged nor returned)."""
        f1,f2 = self._eval_end(outputs)

    def test_epoch_end(self, outputs):
        """Compute the scores over all test batches and append them to the open report file."""
        f1,f2 = self._eval_end(outputs)
        #self.d is only set while run_module is executing
        if self.d is not None:
            self.d.write('micro_fscore:'+str(f2))
            #separator so the two numbers do not run together in the report
            self.d.write(', ')
            self.d.write('macro_fscore:'+str(f1))

    def run_module(self):
        """Fit (or resume) the model and write train/validation/test scores to 'f_score'.

        With ``self.scratch`` True the model is trained and the result saved
        to 'tuned.ckpt'; otherwise ``fit`` is called with
        ``ckpt_path='tuned.ckpt'`` to restore the previously tuned state.
        In both cases the three dataloaders are then passed to
        ``trainer.test`` and their scores written, one line per split.
        """
        trainer = create_trainer(self, self.gradient_accumulation_steps,self.num_train_epochs,self.max_grad_norm)
        #fresh optimizer for this run; it must exist before train_dataloader builds the scheduler
        self.opt=None
        self.configure_optimizers()

        with open('f_score','w') as report:
            self.d=report
            try:
                td=self.train_dataloader()
                vd=self.val_dataloader()
                te=self.test_dataloader()

                if self.scratch:
                    #training the model and storing the tuned model
                    trainer.fit(self,train_dataloaders=td,val_dataloaders=vd)
                    trainer.save_checkpoint("tuned.ckpt")
                else:
                    #loading from the already tuned model
                    trainer.fit(self,train_dataloaders=td,val_dataloaders=vd,ckpt_path='tuned.ckpt')

                reports=[('training f-score:',td),('validation f-score:',vd),('test f-score:',te)]
                for position,(title,loader) in enumerate(reports):
                    if position:
                        report.write('\n')
                    report.write(title)
                    #test_epoch_end appends the scores right after the title
                    trainer.test(self,dataloaders=loader)
            finally:
                #the file is closed by the with-block; drop the stale handle
                self.d=None

In [8]:
#building the module with the default train_from_scratch=False, so the cached dataset is loaded;
#the constructor also fetches the 'ai4bharat/indic-bert' config, tokenizer and model
var=ner()

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForTokenClassification: ['predictions.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.dense.bias', 'sop_classifier.classifier.bias', 'sop_classifier.classifier.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and a

In [9]:
#because var was built with train_from_scratch=False this resumes from 'tuned.ckpt',
#then tests on the train, validation and test loaders and writes the scores to the file 'f_score'
var.run_module()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Restoring states from the checkpoint path at tuned.ckpt
Restored all states from the checkpoint file at tuned.ckpt

  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | AlbertForTokenClassification | 32.9 M
-------------------------------------------------------
32.9 M    Trainable params
0         Non-trainable params
32.9 M    Total params
131.452   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
