# indicBERT

In [1]:
import torch

In [2]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("INDICBERTTokenizer/working/tokenizer_indicBERT")
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("INDICBERT/working/my_indicBERT")

In [2]:
# Let's download the Naampadam (Indic NER) dataset

from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang='hi'

raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

In [3]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    # print(tokenized_inputs)
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        # print('=====')
        # print('{} {}'.format(i,label)) #ak
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    # print(tokenized_inputs)
    return tokenized_inputs

In [5]:
train_dataset_test  = raw_datasets["test"]

In [6]:
label_mapping = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC'
}

In [7]:
sample = train_dataset_test

test_data=[]

for i in sample['tokens']:
    sentence = ' '.join(i)
    test_data.append(sentence)

ground_truth = []

for i in sample['ner_tags']:
    output_list = [label_mapping[num] for num in i]
    ground_truth.append(output_list)

In [8]:
def get_ner(sentence):
    tok_sentence = tokenizer(sentence, return_tensors='pt')

    with torch.no_grad():
        logits = model_fine_tuned(**tok_sentence).logits.argmax(-1)
        predicted_tokens_classes = [
            model_fine_tuned.config.id2label[t.item()] for t in logits[0]]

        predicted_labels = []

        previous_token_id = 0
        word_ids = tok_sentence.word_ids()
        for word_index in range(len(word_ids)):
            if word_ids[word_index] == None:
                previous_token_id = word_ids[word_index]
            elif word_ids[word_index] == previous_token_id:
                previous_token_id = word_ids[word_index]
            else:
                predicted_labels.append(predicted_tokens_classes[word_index])
                previous_token_id = word_ids[word_index]

        ner_output = []
        l = min (len(sentence.split(' ')) , len(predicted_labels) )
        for index in range(l):
            ner_output.append(
                (sentence.split(' ')[index], predicted_labels[index]))
        return ner_output

In [10]:
import torch

labeled_output = []
output=[]
for i in test_data:
    l = get_ner(i)
    labeled_output.append(l)
    temp=[]
    for j in l:
        temp.append(j[1])
    output.append(temp)

In [13]:
b=[]
for i in output:
    tags_string = ' '.join(i)
    temp = tags_string.strip().split()
    b.append(temp)

g = []
labels_list=[]

for i in range(len(ground_truth)):
    tags_string = ' '.join(ground_truth[i])
    labels_list = tags_string.strip().split()
    g.append(labels_list)

In [14]:
for i in range(867):
    x = len(b[i])
    y = len(g[i])
    while(x<y):
        b[i].append('NA')
        x = len(b[i])
        y = len(g[i])
        
    while(x>y):
        b[i].pop()
        x = len(b[i])
        y = len(g[i])      

In [16]:

from sklearn.metrics import classification_report

ground_truth_flat_bert = [label for sublist in g for label in sublist]
predicted_labels_flat_bert = [label for sublist in b for label in sublist]

print(classification_report(ground_truth_flat_bert, predicted_labels_flat_bert, labels=[label for label in set(ground_truth_flat_bert) if label != 'NA']))

              precision    recall  f1-score   support

           O       0.98      0.98      0.98     16513
       I-ORG       0.81      0.77      0.79       512
       B-LOC       0.82      0.81      0.82       613
       I-PER       0.91      0.91      0.91       747
       B-PER       0.85      0.89      0.87       788
       I-LOC       0.80      0.59      0.68       199
       B-ORG       0.79      0.81      0.80       521

    accuracy                           0.95     19893
   macro avg       0.85      0.82      0.83     19893
weighted avg       0.95      0.95      0.95     19893



In [17]:
from sklearn.metrics import f1_score

f1_scores_bert = f1_score(ground_truth_flat_bert, predicted_labels_flat_bert, labels=['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'], average=None)

# Calculate macro F1 score
macro_f1_bert = sum(f1_scores_bert) / len(f1_scores_bert)

print("Macro F1 Score:", macro_f1_bert)

Macro F1 Score: 0.6490502408696407


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


# indic_NER 

In [18]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

tokenizer_ner = AutoTokenizer.from_pretrained("INDICNERTokenizer/working/tokenizer_indicNER")
model_fine_tuned_ner = AutoModelForTokenClassification.from_pretrained("INDICNER/working/my_indicNER")

In [19]:
def get_ner(sentence):
    tok_sentence = tokenizer_ner(sentence, return_tensors='pt')

    with torch.no_grad():
        logits = model_fine_tuned_ner(**tok_sentence).logits.argmax(-1)
        predicted_tokens_classes = [
            model_fine_tuned_ner.config.id2label[t.item()] for t in logits[0]]

        predicted_labels = []

        previous_token_id = 0
        word_ids = tok_sentence.word_ids()
        for word_index in range(len(word_ids)):
            if word_ids[word_index] == None:
                previous_token_id = word_ids[word_index]
            elif word_ids[word_index] == previous_token_id:
                previous_token_id = word_ids[word_index]
            else:
                predicted_labels.append(predicted_tokens_classes[word_index])
                previous_token_id = word_ids[word_index]

        ner_output = []
        l = min (len(sentence.split(' ')) , len(predicted_labels) )
        for index in range(l):
            ner_output.append(
                (sentence.split(' ')[index], predicted_labels[index]))
        return ner_output



In [20]:
# str = "दरअसल , जनवरी से चीन और नेपाल के सीमावर्ती क्षेत्रों को संचार सुविधा के लिए अब बैलून नेटवर्क सिस्टम की शुरूआत की जा रही है, जिसके साथ ही उत्तराखंड बैलून से नेटवर्क सुविधा देने वाला पहला राज्य बनेगा।  "
labeled_output_ner = []
output_ner=[]
for i in test_data:
    l = get_ner(i)
    labeled_output_ner.append(l)
    temp=[]
    for j in l:
        temp.append(j[1])
    output_ner.append(temp)

In [22]:
n=[]
for i in output_ner:
    tags = ' '.join(i)
    temp = tags.strip().split()
    n.append(temp)

In [23]:
for i in range(867):
    x = len(n[i])
    y = len(g[i])
    while(x<y):
        b[i].append('NA')
        x = len(n[i])
        y = len(g[i])
        
    while(x>y):
        b[i].pop()
        x = len(n[i])
        y = len(g[i])

In [24]:
from sklearn.metrics import classification_report

ground_truth_flat_ner = [label for sublist in g for label in sublist]
predicted_labels_flat_ner = [label for sublist in n for label in sublist]

print(classification_report(ground_truth_flat_ner, predicted_labels_flat_ner, labels=[label for label in set(ground_truth_flat_ner) if label != 'NA']))

              precision    recall  f1-score   support

           O       0.98      0.97      0.98     16513
       I-ORG       0.68      0.78      0.73       512
       B-LOC       0.85      0.87      0.86       613
       I-PER       0.90      0.94      0.92       747
       B-PER       0.88      0.92      0.90       788
       I-LOC       0.83      0.66      0.74       199
       B-ORG       0.74      0.83      0.79       521

    accuracy                           0.95     19893
   macro avg       0.84      0.85      0.84     19893
weighted avg       0.96      0.95      0.95     19893



In [25]:
from sklearn.metrics import f1_score

f1_scores_ner = f1_score(ground_truth_flat_ner, predicted_labels_flat_ner, labels=['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'], average=None)

# Calculate macro F1 score
macro_f1_ner = sum(f1_scores_ner) / len(f1_scores_ner)

print("Macro F1 Score:", macro_f1_ner)

Macro F1 Score: 0.6563827168901009


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
