Fine-tune the MuRIL (Multilingual Representations for Indian Languages) model for Kannada named-entity recognition using the WikiAnn dataset

### Load libraries and Dataset

In [1]:
!pip3 install datasets
!pip3 install tokenizers
!pip3 install transformers[sentencepiece]
#!pip3 install wandb
!pip3 install seqeval
from torch.utils.data import DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 7.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 67.5 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 62.0 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 75.2 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3

In [2]:
from datasets import load_dataset

# Download (or reuse from the local Hugging Face cache) the Kannada ("kn")
# configuration of the WikiAnn dataset; the result holds train/validation/test splits.
dataset = load_dataset("wikiann", "kn")

Downloading builder script:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset wikiann/kn (download: 223.17 MiB, generated: 101.05 KiB, post-processed: Unknown size, total: 223.27 MiB) to /root/.cache/huggingface/datasets/wikiann/kn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset wikiann downloaded and prepared to /root/.cache/huggingface/datasets/wikiann/kn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Tag names declared by the dataset's `ner_tags` feature. Later cells index this
# list with class ids to turn predictions back into tag strings.
labels = dataset["train"].features["ner_tags"].feature.names
labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [4]:
# Inspect which columns each split provides.
dataset.column_names

{'test': ['tokens', 'ner_tags', 'langs', 'spans'],
 'train': ['tokens', 'ner_tags', 'langs', 'spans'],
 'validation': ['tokens', 'ner_tags', 'langs', 'spans']}

In [5]:
# Inspect the (rows, columns) size of each split.
dataset.shape

{'test': (100, 4), 'train': (100, 4), 'validation': (100, 4)}

In [6]:
from transformers import AutoTokenizer, AutoConfig

# Start from MuRIL's published configuration and attach the NER label space to
# it, so the fine-tuned checkpoint stores readable tag names.
config = AutoConfig.from_pretrained("google/muril-base-cased")

# Derive both mappings from the dataset's own tag list (`labels`) instead of
# re-typing it by hand; the class ids can then never drift from the data.
label2id = {tag: idx for idx, tag in enumerate(labels)}
id2label = {idx: tag for tag, idx in label2id.items()}
config.id2label = id2label
config.label2id = label2id
config.num_labels = len(id2label)

tokenizer = AutoTokenizer.from_pretrained('google/muril-base-cased')
# NOTE(review): nothing visible in this notebook reads `tokenizer.config`;
# the assignment is kept only for backward compatibility — confirm and drop.
tokenizer.config = config

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/206 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113 [00:00<?, ?B/s]

### Data Preprocessing

In [8]:
def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize a batch of pre-split sentences and align the NER tags to sub-tokens.

  Uses the notebook-level `tokenizer`.

  Args:
    all_samples_per_split: a batch with a "tokens" entry (one list of words
      per sentence) and a "ner_tags" entry (one list of per-word tag ids per
      sentence).

  Returns:
    The tokenizer output (input_ids, token_type_ids, attention_mask) with an
    added "labels" entry holding one label id per produced token. Only the
    first sub-token of each word carries the word's tag; special tokens and
    the remaining sub-tokens of a word are set to -100.
  """
  tokenized_samples = tokenizer.batch_encode_plus(
      all_samples_per_split["tokens"], is_split_into_words=True, truncation=True)
  total_adjusted_labels = []

  for k, label in enumerate(all_samples_per_split["ner_tags"]):
    prev_wid = None
    label_ids = []

    # word_ids maps every produced token back to the index of the word it
    # came from (None for special tokens).
    for wid in tokenized_samples.word_ids(batch_index=k):
      if wid is None or wid == prev_wid:
        # Special token, or a continuation piece of the word seen just
        # before: mark with -100 so it is skipped when metrics are computed.
        label_ids.append(-100)
      else:
        # First sub-token of a new word: it carries that word's tag.
        label_ids.append(label[wid])
      prev_wid = wid
    total_adjusted_labels.append(label_ids)

  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

# Tokenize and align every split in batches. The raw columns are dropped so only
# model inputs and labels remain; their names are read from the dataset rather
# than hard-coded, so the call keeps working if the schema changes.
tokenized_dataset = dataset.map(
    tokenize_adjust_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification. Per the repr shown below it pads
# dynamically (padding=True) and pads the label sequences with -100.
data_collator = DataCollatorForTokenClassification(tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='google/muril-base-cased', vocab_size=197285, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

### Finetuning

In [10]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [11]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
# seqeval metric object; `compute_metrics` below calls `metric.compute` on tag-name sequences.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of the
# separate `evaluate` package — confirm before upgrading `datasets`.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Uses the notebook-level `labels` (tag names) and `metric` (seqeval).

    Args:
        p: a pair `(predictions, label_ids)`. The predictions are per-token
            class scores (class axis last), and the label ids use -100 for
            positions to ignore.

    Returns:
        dict with the keys "overall_precision", "overall_recall",
        "overall_f1" and "overall_accuracy" taken from the seqeval result.
        Per-entity-type scores are intentionally left out.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=-1)  # most likely class per token

    # Drop every position whose gold label is the ignored index (-100) and map
    # the remaining ids to tag names, because seqeval works on tag strings.
    true_predictions = [
        [labels[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]
    true_labels = [
        [labels[gold_id] for gold_id in gold_row if gold_id != -100]
        for gold_row in gold_ids
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    return {key: results[key] for key in overall_keys}

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [13]:
# Directory the fine-tuned checkpoint is written to and reloaded from.
fine_tuned_model = './outputs_muril/'
# Build the token-classification model directly from the prepared `config`.
# The classifier head size and the id<->label names then come from one place,
# instead of loading with default label names and replacing `model.config`
# after construction.
model = AutoModelForTokenClassification.from_pretrained("google/muril-base-cased", config=config)
model.to(device)

Downloading:   0%|          | 0.00/909M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized fr

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(197285, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
#reuse saved model
# Only swap in the saved checkpoint when one actually exists. On a fresh
# "Restart & Run All" nothing has been written to `fine_tuned_model` yet, and
# an unconditional load would abort the notebook.
import os

if os.path.isfile(os.path.join(fine_tuned_model, "config.json")):
    model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model, num_labels=len(labels))
    model.to(device)

In [14]:
batch_size = 16
training_args = TrainingArguments(
    # Reuse the checkpoint directory defined with the model rather than
    # repeating the path literal.
    output_dir=fine_tuned_model,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",  # score the validation split after every epoch
    num_train_epochs=100,
    save_strategy="no",  # no intermediate checkpoints; the model is saved explicitly after training
    report_to="none",  # no experiment-tracking integration
)
# learning_rate is left at the library default; 2e-5 and 3e-5 were noted as
# candidate values in an earlier, commented-out setting.

In [15]:
# Wire the model, tokenized splits, collator and metric function into a Trainer.
# The validation split is the one scored during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

# Run the fine-tuning; this updates `model` in place.
trainer.train()
# wandb.finish() stays disabled: reporting is turned off via report_to="none".

***** Running training *****
  Num examples = 100
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 700


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,1.90598,0.0,0.0,0.0,0.728834
2,No log,1.899888,0.008333,0.008929,0.008621,0.306748
3,No log,1.808303,0.0,0.0,0.0,0.728834
4,No log,1.792226,0.0,0.0,0.0,0.728834
5,No log,1.776832,0.0,0.0,0.0,0.728834
6,No log,1.763822,0.0,0.0,0.0,0.728834
7,No log,1.751353,0.0,0.0,0.0,0.728834
8,No log,1.738429,0.0,0.0,0.0,0.728834
9,No log,1.725591,0.0,0.0,0.0,0.728834
10,No log,1.712289,0.0,0.0,0.0,0.728834


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10

TrainOutput(global_step=700, training_loss=1.1768408639090402, metrics={'train_runtime': 149.2278, 'train_samples_per_second': 67.012, 'train_steps_per_second': 4.691, 'total_flos': 170460970058808.0, 'train_loss': 1.1768408639090402, 'epoch': 100.0})

In [16]:
# Write the tokenizer and the fine-tuned model into the same directory so both
# can be reloaded from it with `from_pretrained`.
fine_tuned_model = './outputs_muril/'
for artefact in (tokenizer, model):
    artefact.save_pretrained(fine_tuned_model)

tokenizer config file saved in ./outputs_muril/tokenizer_config.json
Special tokens file saved in ./outputs_muril/special_tokens_map.json
Configuration saved in ./outputs_muril/config.json
Model weights saved in ./outputs_muril/pytorch_model.bin


In [17]:
# Score the trained model once more on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


{'epoch': 100.0,
 'eval_loss': 0.8438961505889893,
 'eval_overall_accuracy': 0.905521472392638,
 'eval_overall_f1': 0.6234817813765182,
 'eval_overall_precision': 0.5703703703703704,
 'eval_overall_recall': 0.6875,
 'eval_runtime': 0.3321,
 'eval_samples_per_second': 301.109,
 'eval_steps_per_second': 21.078}

In [18]:
# Run the fine-tuned model on the held-out test split and score it with seqeval,
# this time keeping the full result (per entity type plus the overall figures).
prediction, label, _ = trainer.predict(tokenized_dataset["test"])
prediction = np.argmax(prediction, axis=-1)

# Walk the sentences once: keep only the positions whose gold id is not the
# ignored index (-100) and map the surviving ids to tag names.
true_predictions = []
true_labels = []
for pred_row, gold_row in zip(prediction, label):
    kept_positions = [pos for pos, gold_id in enumerate(gold_row) if gold_id != -100]
    true_predictions.append([labels[pred_row[pos]] for pos in kept_positions])
    true_labels.append([labels[gold_row[pos]] for pos in kept_positions])

results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'LOC': {'f1': 0.6601941747572816,
  'number': 47,
  'precision': 0.6071428571428571,
  'recall': 0.723404255319149},
 'ORG': {'f1': 0.5833333333333334,
  'number': 32,
  'precision': 0.525,
  'recall': 0.65625},
 'PER': {'f1': 0.704225352112676,
  'number': 33,
  'precision': 0.6578947368421053,
  'recall': 0.7575757575757576},
 'overall_accuracy': 0.8918918918918919,
 'overall_f1': 0.6504065040650405,
 'overall_precision': 0.5970149253731343,
 'overall_recall': 0.7142857142857143}

### Using HuggingFace pipeline

In [20]:
from transformers import pipeline
# Reload tokenizer and model from the directory written above, replacing the
# in-memory training objects with the saved checkpoint.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model)
model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model)

# Token-classification ("ner") pipeline used for the ad-hoc examples below.
# NOTE(review): the recorded outputs list individual sub-tokens (e.g. pieces
# starting with "##"); consider the pipeline's aggregation option if whole-word
# entities are wanted — confirm against the installed transformers version.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Didn't find file ./outputs_muril/added_tokens.json. We won't load it.
loading file ./outputs_muril/vocab.txt
loading file ./outputs_muril/tokenizer.json
loading file None
loading file ./outputs_muril/special_tokens_map.json
loading file ./outputs_muril/tokenizer_config.json
loading configuration file ./outputs_muril/config.json
Model config BertConfig {
  "_name_or_path": "./outputs_muril/",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embed

In [21]:
# Smoke test on a short Kannada sentence: print every tagged token the pipeline returns.
example = "ಭಾರತ ನನ್ನ ದೇಶ"
for entity in nlp(example):
  print(entity)

{'entity': 'B-LOC', 'score': 0.20336346, 'index': 1, 'word': 'ಭಾರತ', 'start': 0, 'end': 4}
{'entity': 'B-ORG', 'score': 0.18066743, 'index': 2, 'word': 'ನನ್ನ', 'start': 5, 'end': 9}
{'entity': 'I-LOC', 'score': 0.17406245, 'index': 3, 'word': 'ದೇಶ', 'start': 10, 'end': 13}


In [22]:
# Longer free-form Kannada passage. The trailing backslash continues the string
# literal on the next line without inserting a newline character.
example = " ಎಲ್ಲರಿಗು ನಮಸ್ತೆ ಇದು ನಮ್ಮ ಹಳ್ಳಿ ರೇಡಿಯೋ ನಿಮ್ಮ ಜೊತೆ ನಾನು ಅನು ಕೊರೋನಾ ವಿರುದ್ಧ ಹೋರಾಡುತ್ತಿರುವ ಸೇನೆಯಲ್ಲಿ ಸೇವೆ ಸಲ್ಲಿಸುತ್ತಿರುವ  ಕೊರೋನಾ ಸೈನಿಕಲು\
 ಇತ್ತೀಚಿಗೆ ಎಲ್ಲಿ ನೋಡಿದ್ರು ಕೋರೋಣ ಬಗ್ಗೆಯೇ ಗುಣಗಾನ ನಾಮೇಲರಿಗೂ ಅನಿಸಿರಬೊಹುದು ಏನಪ್ಪಾ ಇದು ಎಲ್ಲಿ ನೋಡಿದರೇನು ಕೊರೋನಾ ಕೊರೋನಾ ಅಂತ  ನೇ ಮಾತಾಡ್ತಾರೆ   "
# Print each tagged token on its own line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-ORG', 'score': 0.2108201, 'index': 9, 'word': 'ಹಳ್ಳಿ', 'start': 26, 'end': 31}
{'entity': 'B-ORG', 'score': 0.19058412, 'index': 10, 'word': 'ರೇಡಿಯೋ', 'start': 32, 'end': 38}


In [23]:
# Another free-form Kannada passage. The trailing backslash continues the string
# literal on the next line without inserting a newline character.
example = "ಎಲ್ಲರಿಗೂ ನಮಸ್ಕಾರ ನಮ್ಮ ಹಳ್ಳಿ ರೇಡಿಯೋ ವಾಹಿನಿ ಮುಖಾಂತರ ನಾನುಡಿ ನಾನು ಡಾಕ್ಟರ್ ಜೆ ಅಚ್ಚುತರಾ ಜನಾಭಿವೃದ್ಧಿ ಮಾಲಿಕೆಯಲ್ಲಿ ಕರೋನವೈರಸ್ ಕುರಿತು\
 ಮೂಲ ಮಾಹಿತಿಗಳನ್ನು ಪ್ರಸ್ತುತಪಡಿಸಲು ನಾವು ಪ್ರಯತ್ನ ಮಾಡ್ತಾಯಿದೀವಿ ನಂಜೊತೆಗೆ ಡಾಕ್ಟರ್ ಶೀಲ ಕರೆಯದಿದ್ದರೆ ಹಾಗೆನೇ "
# Print each tagged token on its own line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-ORG', 'score': 0.20505813, 'index': 3, 'word': 'ನಮ್ಮ', 'start': 17, 'end': 21}
{'entity': 'B-ORG', 'score': 0.20062014, 'index': 4, 'word': 'ಹಳ್ಳಿ', 'start': 22, 'end': 27}
{'entity': 'I-ORG', 'score': 0.22275184, 'index': 5, 'word': 'ರೇಡಿಯೋ', 'start': 28, 'end': 34}


In [24]:
# Three-line Kannada passage. Each trailing backslash continues the string
# literal on the next line without inserting a newline character.
example = "ನನ್ನ ಹೆಸರು ದೇವರಾಜ್ ಅಂತ ತುರುವೇಕೆರೆ ತಾಲೂಕು ಸಂಘದ ಅಧ್ಯಕ್ಷರು ನಾನು ಈ ಕೊರೋನಾ ಬಗ್ಗೆ ಜನಗಳಿಗೆ ಮಾಹಿತಿ ನೆಡುವುದು ಏನು ಅಂತ ಅಂದರೆ \
 ಈಗ ಮಾಸ್ಕ ಎಲ್ಲಾ ಹಳ್ಳಿ ಕಡೆ ಹೋದರೆ ರಾಜಕಾರಣಿಗಳನ್ನ ಕೇಳುತ್ತಾರೆ ನಮಗೇನು ಸೌಲತ್ತು ಬರಲಿಲ್ಲ ನಮಗಿಂದ ಅಧಿಕಾರಿಗಳು ಎಲ್ಲಾ ಅಧಿಕಾರಿಗಳು ಏನು ಕೊಡುತ್ತ ಇಲ್ಲ \
 ಆಮೇಲೆ ರಾಜಕಾರಣಿಗಳು ಏನು ಕೊಡುತ್ತಿಲ್ಲ ಅಂತ"
# Print each tagged token on its own line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.20590404, 'index': 3, 'word': 'ದೇವರ', 'start': 11, 'end': 15}
{'entity': 'I-PER', 'score': 0.2007467, 'index': 4, 'word': '##ಾಜ್', 'start': 15, 'end': 18}
{'entity': 'I-ORG', 'score': 0.17790702, 'index': 12, 'word': 'ಸಂಘದ', 'start': 41, 'end': 45}


In [25]:
# Another free-form Kannada passage. The trailing backslash continues the string
# literal on the next line without inserting a newline character.
example = "ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲೂಕಿನ ಹಾಗೂ ತುಮಕೂರು ಜಿಲ್ಲೆಯೆಲ್ಲಾ ಸಾರ್ವಜನಿಕ ಬಂಧುಗಳೇ ಇತ್ತೀಚಿನ ದಿನಗಳಲ್ಲಿ ಕೋವಿಡ್ 19 ಕರೋನ ವೈರಸ್ \
ಹರಡಿಕ್ಕೆ ಆರಂಭವಾಗಿದ್ದು ಈ ಬಗ್ಗೆ ಬಹಳ ರೀತಿಯ ತೊಂದರೆಗಳು ಮಾತುಕತೆಗಳು ತರದಿಕ್ಕೆ ಅನಿಸುತ್ತ ಇದ್ದೀರಿ ವೈರಸ್ಸು ಹೊರಡೋದು "
# Print each tagged token on its own line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-LOC', 'score': 0.21412751, 'index': 1, 'word': 'ಚಿಕ್ಕ', 'start': 0, 'end': 5}
{'entity': 'B-LOC', 'score': 0.19389635, 'index': 2, 'word': '##ನಾಯಕ', 'start': 5, 'end': 9}
{'entity': 'B-LOC', 'score': 0.1873924, 'index': 3, 'word': '##ನಹಳ್ಳಿ', 'start': 9, 'end': 15}
{'entity': 'B-LOC', 'score': 0.21823885, 'index': 6, 'word': 'ತುಮಕೂರು', 'start': 29, 'end': 36}


In [26]:
# Another free-form Kannada passage. The trailing backslash continues the string
# literal on the next line without inserting a newline character.
example = "ಎಲ್ಲರಿಗೂ ನಮಸ್ಕಾರ ನನ್ ಹೆಸರು ಸಾಗರ್ ನಲ್ಲಿ ಪ್ರಭು ಅಂತ ಹೇಳಿ ಭಾರತೀಯ ರೆಡ್ ಕ್ರಾಸ್ ಸಂಸ್ಥೆ ತುಮಕೂರು ಶಾಖೆ ಸ್ಕಿಲ್ ದೆವಲಪ್ಮೆಂಟ್ ಚೇರ್ಮನ್ \
ಆಗಿ ಕೆಲಸ ಮಾಡ್ತಾ ಇದ್ದೀನಿ ಈಗ ಎಲ್ಲರೂ ಕರೋನ ಬಗ್ಗೆ ಮಾತಾಡ್ತಾರೆ ಇಡೀ ವಿಶ್ವ ಬಿಡಿ ಪ್ರಪಂಚ ಕರೋನ ಬಗ್ಗೆ ಮಾತಾಡುತ್ತಿದೆ "
# Print each tagged token on its own line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-ORG', 'score': 0.21775743, 'index': 12, 'word': 'ಭಾರತೀಯ', 'start': 54, 'end': 60}
{'entity': 'I-ORG', 'score': 0.22502445, 'index': 13, 'word': 'ರೆಡ್', 'start': 61, 'end': 65}
{'entity': 'I-ORG', 'score': 0.22513127, 'index': 14, 'word': 'ಕ್ರಾಸ್', 'start': 66, 'end': 72}
{'entity': 'I-ORG', 'score': 0.2248015, 'index': 15, 'word': 'ಸಂಸ್ಥೆ', 'start': 73, 'end': 79}
{'entity': 'B-ORG', 'score': 0.18103227, 'index': 16, 'word': 'ತುಮಕೂರು', 'start': 80, 'end': 87}
{'entity': 'B-LOC', 'score': 0.18921334, 'index': 48, 'word': 'ವಿಶ್ವ', 'start': 180, 'end': 185}
