Fine-tune the XLM-RoBERTa (base) model on the WikiAnn dataset (Kannada, `kn`)

### Load dataset and libraries

[Transformer-based Named Entity Recognition](https://github.com/asahi417/tner)

In [1]:
# Install the third-party packages this notebook uses.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones that produced the recorded outputs — consider pinning.
!pip3 install datasets
!pip3 install tokenizers
!pip3 install transformers[sentencepiece]
# wandb is left disabled; training below is configured with report_to="none".
#!pip3 install wandb
# seqeval backs the metric loaded later with load_metric("seqeval").
!pip3 install seqeval
from torch.utils.data import DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 5.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 66.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 73.0 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 15.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |█████████████████████

In [2]:
from datasets import load_dataset

# Load the "kn" configuration of WikiAnn; the result is indexed by split name below
# ("train", "validation", "test").
dataset = load_dataset("wikiann", "kn")

Downloading builder script:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset wikiann/kn (download: 223.17 MiB, generated: 101.05 KiB, post-processed: Unknown size, total: 223.27 MiB) to /root/.cache/huggingface/datasets/wikiann/kn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset wikiann downloaded and prepared to /root/.cache/huggingface/datasets/wikiann/kn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Tag names for the integer NER tag ids, read from the train split's feature schema.
# The position in this list is the tag id used everywhere else in the notebook.
labels = dataset["train"].features["ner_tags"].feature.names
labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [4]:
from transformers import AutoTokenizer, AutoConfig

# Start from the stock XLM-R base configuration and attach the NER tag vocabulary to it.
config = AutoConfig.from_pretrained("xlm-roberta-base")

# Tag name -> id; ids follow the position of each tag in this list.
label2id = {
    tag: idx
    for idx, tag in enumerate(
        ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
    )
}
# Inverse lookup (id -> tag name), in the same order.
id2label = dict(zip(label2id.values(), label2id.keys()))

config.id2label = id2label
config.label2id = label2id
config.num_labels = len(id2label)

# Matching tokenizer for the same checkpoint; the config is also hung on it as a
# plain attribute, exactly as before.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokenizer.config = config

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

### Preprocessing

In [5]:
#Get the values for input_ids, token_type_ids, attention_mask
def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize a batch of pre-split sentences and align word-level tags to tokens.

  Reads ``all_samples_per_split["tokens"]`` (one word list per example) and
  ``all_samples_per_split["ner_tags"]`` (one tag id per word), and uses the
  module-level ``tokenizer``.

  For every produced token position the label is:
    * -100 when the position maps to no word (``word_ids`` yields ``None``),
    * the word's tag when the position is the first piece of that word,
    * -100 for every further piece of the same word.

  Returns the tokenizer's batch encoding with an added ``"labels"`` entry.
  """
  encoded = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True, truncation=True)

  aligned_labels = []
  for row, word_tags in enumerate(all_samples_per_split["ner_tags"]):
    word_ids = list(encoded.word_ids(batch_index=row))
    # Shift by one so each position can be compared with the word id just before it.
    preceding_ids = [None] + word_ids[:-1]
    aligned_labels.append([
      word_tags[current] if current is not None and current != previous else -100
      for current, previous in zip(word_ids, preceding_ids)
    ])

  encoded["labels"] = aligned_labels
  return encoded

# Run the alignment over the dataset in batches and drop the raw 'tokens', 'ner_tags',
# 'langs' and 'spans' columns from the result.
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True, remove_columns=['tokens', 'ner_tags', 'langs', 'spans'])



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification built around the tokenizer; only the tokenizer
# is passed, so every other setting is the collator's default (shown in the repr output).
data_collator = DataCollatorForTokenClassification(tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [7]:
import numpy as np
from datasets import load_metric
# Metric object used by compute_metrics and by the test-set scoring cell.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed version before re-running.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Turn raw model outputs into the overall seqeval scores.

    ``p`` unpacks into ``(scores, label_ids)``. The class with the highest score
    along the last axis is taken as the prediction for each position. Positions
    whose label id is -100 are dropped from both predictions and references,
    and the remaining ids are mapped to tag names through the module-level
    ``labels`` list before being scored by the module-level ``metric``.

    Returns a dict with ``overall_precision``, ``overall_recall``,
    ``overall_f1`` and ``overall_accuracy``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=-1)

    # Predicted tag names, kept only where the reference label is not the ignore id.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        true_predictions.append(
            [labels[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        )

    # Reference tag names with the ignore id filtered out.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([labels[gold_id] for gold_id in gold_row if gold_id != -100])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Report only the aggregate scores from the metric's result.
    summary_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    return {key: results[key] for key in summary_keys}

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

### Fine-tune

In [8]:
import torch
from transformers import AutoModelForTokenClassification, AdamW

In [9]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
#sample run
# Directory the fine-tuned tokenizer and model are written to and later reloaded from.
fine_tuned_model = './outputs_xlmr/'

# The tag map in `config` is written by hand; fail early if it ever drifts from the
# tag list read from the dataset, instead of silently training a mis-sized head.
assert config.num_labels == len(labels), "config.num_labels does not match the dataset tag list"

# Build the model directly from the prepared config so the classifier size and the
# id2label/label2id maps come from one source, rather than overwriting `model.config`
# after construction (which leaves sub-modules holding the original config object).
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base", config=config)
model.to(device)

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [None]:
#reuse saved model
#model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model, num_labels=len(labels))
#model.to(device)

In [11]:
from transformers import TrainingArguments, Trainer
# Hyper-parameters for the Trainer run.
# NOTE(review): the recorded run trains 100 epochs on a 100-example train split; the logged
# validation loss is about 0.55 at epoch 10 but about 1.22 after epoch 100, which suggests
# over-fitting — consider fewer epochs or early stopping.
training_args = TrainingArguments(
    output_dir='./outputs_xlmr/',  # same path as `fine_tuned_model`
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",  # the training log shows one evaluation row per epoch
    num_train_epochs=100,
    save_strategy="no",  # the model is saved explicitly in a later cell instead
    report_to="none"  # no external experiment tracker (wandb is commented out)
    )

In [12]:
# Wire the model, the train/validation splits, the collator, the tokenizer and the
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],  # used for the per-epoch evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

# Run fine-tuning.
trainer.train()
#wandb.finish()

***** Running training *****
  Num examples = 100
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 700


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,1.096465,0.333333,0.008929,0.017391,0.72638
2,No log,0.797009,0.0,0.0,0.0,0.728834
3,No log,0.688599,0.214286,0.053571,0.085714,0.742331
4,No log,0.656546,0.358974,0.375,0.366812,0.807362
5,No log,0.534205,0.325758,0.383929,0.352459,0.82454
6,No log,0.569827,0.402778,0.517857,0.453125,0.839264
7,No log,0.604457,0.385965,0.392857,0.389381,0.855215
8,No log,0.537889,0.387879,0.571429,0.462094,0.845399
9,No log,0.634028,0.423729,0.446429,0.434783,0.853988
10,No log,0.551514,0.471014,0.580357,0.52,0.872393


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10

TrainOutput(global_step=700, training_loss=0.055430670593466076, metrics={'train_runtime': 188.401, 'train_samples_per_second': 53.078, 'train_steps_per_second': 3.715, 'total_flos': 246842689537488.0, 'train_loss': 0.055430670593466076, 'epoch': 100.0})

In [13]:
#fine_tuned_model = './outputs_xlmr/'
# Write the tokenizer and the fine-tuned model into the same directory; the pipeline
# cell reloads both from this path.
tokenizer.save_pretrained(fine_tuned_model)
model.save_pretrained(fine_tuned_model)

tokenizer config file saved in ./outputs_xlmr/tokenizer_config.json
Special tokens file saved in ./outputs_xlmr/special_tokens_map.json
Configuration saved in ./outputs_xlmr/config.json
Model weights saved in ./outputs_xlmr/pytorch_model.bin


In [21]:
# Score the trained model on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


{'epoch': 100.0,
 'eval_loss': 1.2168930768966675,
 'eval_overall_accuracy': 0.8588957055214724,
 'eval_overall_f1': 0.5172413793103449,
 'eval_overall_precision': 0.5,
 'eval_overall_recall': 0.5357142857142857,
 'eval_runtime': 0.3964,
 'eval_samples_per_second': 252.285,
 'eval_steps_per_second': 17.66}

In [22]:
# Run the model on the test split; the third returned value is not used here.
prediction, label, _ = trainer.predict(tokenized_dataset["test"])
# Pick the highest-scoring class per position (axis 2 is the class axis here).
prediction = np.argmax(prediction, axis=2)
# Drop every position whose reference label is -100 — the ignore value assigned during
# label alignment — and map the remaining ids to tag names.
true_predictions = [
        [labels[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(prediction, label)
    ]
true_labels = [
                   [labels[l] for l in lab if l!=-100]
                   for lab in label
    ]
# Unlike compute_metrics, the full metric result is shown, including per-entity-type scores.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'LOC': {'f1': 0.6078431372549019,
  'number': 47,
  'precision': 0.5636363636363636,
  'recall': 0.6595744680851063},
 'ORG': {'f1': 0.4827586206896552,
  'number': 32,
  'precision': 0.5384615384615384,
  'recall': 0.4375},
 'PER': {'f1': 0.6315789473684211,
  'number': 33,
  'precision': 0.5581395348837209,
  'recall': 0.7272727272727273},
 'overall_accuracy': 0.8608108108108108,
 'overall_f1': 0.5847457627118644,
 'overall_precision': 0.5564516129032258,
 'overall_recall': 0.6160714285714286}

### Hugging Face pipeline

In [14]:
from transformers import pipeline
# Reload the tokenizer and model from the saved directory (this rebinds the `tokenizer`
# and `model` names used during training) and wrap them in a "ner" pipeline.
# NOTE(review): the recorded outputs list one entry per sub-word piece; the pipeline's
# `aggregation_strategy` option may be what merges pieces into whole entities — confirm.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model)
model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Didn't find file ./outputs_xlmr/added_tokens.json. We won't load it.
loading file ./outputs_xlmr/sentencepiece.bpe.model
loading file ./outputs_xlmr/tokenizer.json
loading file None
loading file ./outputs_xlmr/special_tokens_map.json
loading file ./outputs_xlmr/tokenizer_config.json
loading configuration file ./outputs_xlmr/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "./outputs_xlmr/",
  "architectures": [
    "XLMRobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_nor

In [15]:
# Smoke test on a short sentence: print each entry the pipeline returns.
example = "ಭಾರತ ನನ್ನ ದೇಶ"
for entity in nlp(example):
  print(entity)

{'entity': 'B-ORG', 'score': 0.5998021, 'index': 1, 'word': '▁ಭಾರತ', 'start': 0, 'end': 4}
{'entity': 'I-ORG', 'score': 0.9984975, 'index': 2, 'word': '▁ನನ್ನ', 'start': 5, 'end': 9}
{'entity': 'I-ORG', 'score': 0.9988293, 'index': 3, 'word': '▁ದೇಶ', 'start': 10, 'end': 13}


In [16]:
example = " ಎಲ್ಲರಿಗು ನಮಸ್ತೆ ಇದು ನಮ್ಮ ಹಳ್ಳಿ ರೇಡಿಯೋ ನಿಮ್ಮ ಜೊತೆ ನಾನು ಅನು ಕೊರೋನಾ ವಿರುದ್ಧ ಹೋರಾಡುತ್ತಿರುವ ಸೇನೆಯಲ್ಲಿ ಸೇವೆ ಸಲ್ಲಿಸುತ್ತಿರುವ  ಕೊರೋನಾ ಸೈನಿಕಲು\
 ಇತ್ತೀಚಿಗೆ ಎಲ್ಲಿ ನೋಡಿದ್ರು ಕೋರೋಣ ಬಗ್ಗೆಯೇ ಗುಣಗಾನ ನಾಮೇಲರಿಗೂ ಅನಿಸಿರಬೊಹುದು ಏನಪ್ಪಾ ಇದು ಎಲ್ಲಿ ನೋಡಿದರೇನು ಕೊರೋನಾ ಕೊರೋನಾ ಅಂತ  ನೇ ಮಾತಾಡ್ತಾರೆ   "
# Print each entry the pipeline returns for the passage above, one dict per line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.99915755, 'index': 19, 'word': '▁ಅನು', 'start': 55, 'end': 58}
{'entity': 'I-PER', 'score': 0.9988041, 'index': 20, 'word': '▁ಕೊ', 'start': 59, 'end': 61}
{'entity': 'I-PER', 'score': 0.9981179, 'index': 21, 'word': 'ರೋ', 'start': 61, 'end': 63}
{'entity': 'B-PER', 'score': 0.5403711, 'index': 76, 'word': '▁ಕೊ', 'start': 230, 'end': 232}


In [17]:
example = "ಎಲ್ಲರಿಗೂ ನಮಸ್ಕಾರ ನಮ್ಮ ಹಳ್ಳಿ ರೇಡಿಯೋ ವಾಹಿನಿ ಮುಖಾಂತರ ನಾನುಡಿ ನಾನು ಡಾಕ್ಟರ್ ಜೆ ಅಚ್ಚುತರಾ ಜನಾಭಿವೃದ್ಧಿ ಮಾಲಿಕೆಯಲ್ಲಿ ಕರೋನವೈರಸ್ ಕುರಿತು\
 ಮೂಲ ಮಾಹಿತಿಗಳನ್ನು ಪ್ರಸ್ತುತಪಡಿಸಲು ನಾವು ಪ್ರಯತ್ನ ಮಾಡ್ತಾಯಿದೀವಿ ನಂಜೊತೆಗೆ ಡಾಕ್ಟರ್ ಶೀಲ ಕರೆಯದಿದ್ದರೆ ಹಾಗೆನೇ "
# Print each entry the pipeline returns for the passage above, one dict per line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.77974296, 'index': 23, 'word': '▁ಜೆ', 'start': 70, 'end': 72}
{'entity': 'B-PER', 'score': 0.49406755, 'index': 68, 'word': 'ಶೀಲ', 'start': 198, 'end': 201}


In [18]:
example = "ನನ್ನ ಹೆಸರು ದೇವರಾಜ್ ಅಂತ ತುರುವೇಕೆರೆ ತಾಲೂಕು ಸಂಘದ ಅಧ್ಯಕ್ಷರು ನಾನು ಈ ಕೊರೋನಾ ಬಗ್ಗೆ ಜನಗಳಿಗೆ ಮಾಹಿತಿ ನೆಡುವುದು ಏನು ಅಂತ ಅಂದರೆ \
 ಈಗ ಮಾಸ್ಕ ಎಲ್ಲಾ ಹಳ್ಳಿ ಕಡೆ ಹೋದರೆ ರಾಜಕಾರಣಿಗಳನ್ನ ಕೇಳುತ್ತಾರೆ ನಮಗೇನು ಸೌಲತ್ತು ಬರಲಿಲ್ಲ ನಮಗಿಂದ ಅಧಿಕಾರಿಗಳು ಎಲ್ಲಾ ಅಧಿಕಾರಿಗಳು ಏನು ಕೊಡುತ್ತ ಇಲ್ಲ \
 ಆಮೇಲೆ ರಾಜಕಾರಣಿಗಳು ಏನು ಕೊಡುತ್ತಿಲ್ಲ ಅಂತ"
# Print each entry the pipeline returns for the passage above, one dict per line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.9997569, 'index': 3, 'word': '▁ದೇವ', 'start': 11, 'end': 14}
{'entity': 'B-PER', 'score': 0.9201721, 'index': 4, 'word': 'ರಾಜ್', 'start': 14, 'end': 18}
{'entity': 'I-ORG', 'score': 0.5630753, 'index': 11, 'word': '▁ತಾ', 'start': 34, 'end': 36}
{'entity': 'I-ORG', 'score': 0.99622947, 'index': 12, 'word': 'ಲೂ', 'start': 36, 'end': 38}
{'entity': 'I-ORG', 'score': 0.99770975, 'index': 13, 'word': 'ಕು', 'start': 38, 'end': 40}
{'entity': 'I-ORG', 'score': 0.9965933, 'index': 14, 'word': '▁ಸಂಘದ', 'start': 41, 'end': 45}


In [19]:
example = "ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲೂಕಿನ ಹಾಗೂ ತುಮಕೂರು ಜಿಲ್ಲೆಯೆಲ್ಲಾ ಸಾರ್ವಜನಿಕ ಬಂಧುಗಳೇ ಇತ್ತೀಚಿನ ದಿನಗಳಲ್ಲಿ ಕೋವಿಡ್ 19 ಕರೋನ ವೈರಸ್ \
ಹರಡಿಕ್ಕೆ ಆರಂಭವಾಗಿದ್ದು ಈ ಬಗ್ಗೆ ಬಹಳ ರೀತಿಯ ತೊಂದರೆಗಳು ಮಾತುಕತೆಗಳು ತರದಿಕ್ಕೆ ಅನಿಸುತ್ತ ಇದ್ದೀರಿ ವೈರಸ್ಸು ಹೊರಡೋದು "
# Print each entry the pipeline returns for the passage above, one dict per line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-LOC', 'score': 0.9980665, 'index': 1, 'word': '▁ಚಿಕ್ಕ', 'start': 0, 'end': 5}
{'entity': 'B-LOC', 'score': 0.94261, 'index': 2, 'word': 'ನಾಯಕ', 'start': 5, 'end': 9}
{'entity': 'I-LOC', 'score': 0.81590694, 'index': 3, 'word': 'ನಹಳ್ಳಿ', 'start': 9, 'end': 15}
{'entity': 'B-LOC', 'score': 0.99853146, 'index': 6, 'word': '▁ತು', 'start': 29, 'end': 31}
{'entity': 'B-LOC', 'score': 0.99276733, 'index': 7, 'word': 'ಮ', 'start': 31, 'end': 32}
{'entity': 'B-LOC', 'score': 0.8765668, 'index': 8, 'word': 'ಕ', 'start': 32, 'end': 33}
{'entity': 'B-LOC', 'score': 0.87055266, 'index': 9, 'word': 'ೂರು', 'start': 33, 'end': 36}
{'entity': 'B-LOC', 'score': 0.9797524, 'index': 19, 'word': '▁ಕೋ', 'start': 87, 'end': 89}


In [20]:
example = "ಎಲ್ಲರಿಗೂ ನಮಸ್ಕಾರ ನನ್ ಹೆಸರು ಸಾಗರ್ ನಲ್ಲಿ ಪ್ರಭು ಅಂತ ಹೇಳಿ ಭಾರತೀಯ ರೆಡ್ ಕ್ರಾಸ್ ಸಂಸ್ಥೆ ತುಮಕೂರು ಶಾಖೆ ಸ್ಕಿಲ್ ದೆವಲಪ್ಮೆಂಟ್ ಚೇರ್ಮನ್ \
ಆಗಿ ಕೆಲಸ ಮಾಡ್ತಾ ಇದ್ದೀನಿ ಈಗ ಎಲ್ಲರೂ ಕರೋನ ಬಗ್ಗೆ ಮಾತಾಡ್ತಾರೆ ಇಡೀ ವಿಶ್ವ ಬಿಡಿ ಪ್ರಪಂಚ ಕರೋನ ಬಗ್ಗೆ ಮಾತಾಡುತ್ತಿದೆ "
# Print each entry the pipeline returns for the passage above, one dict per line.
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.71421295, 'index': 9, 'word': '▁ಸಾಗ', 'start': 27, 'end': 30}
{'entity': 'B-ORG', 'score': 0.9997974, 'index': 17, 'word': '▁ಭಾರತೀಯ', 'start': 54, 'end': 60}
{'entity': 'I-ORG', 'score': 0.99571675, 'index': 18, 'word': '▁', 'start': 61, 'end': 62}
{'entity': 'I-ORG', 'score': 0.97864896, 'index': 19, 'word': 'ರೆ', 'start': 61, 'end': 63}
{'entity': 'I-ORG', 'score': 0.9980171, 'index': 20, 'word': 'ಡ್', 'start': 63, 'end': 65}
{'entity': 'I-ORG', 'score': 0.99900144, 'index': 21, 'word': '▁ಕ್ರ', 'start': 66, 'end': 69}
{'entity': 'I-ORG', 'score': 0.9992052, 'index': 22, 'word': 'ಾಸ್', 'start': 69, 'end': 72}
{'entity': 'B-LOC', 'score': 0.9938018, 'index': 24, 'word': '▁ತು', 'start': 80, 'end': 82}
{'entity': 'B-LOC', 'score': 0.97714895, 'index': 25, 'word': 'ಮ', 'start': 82, 'end': 83}
{'entity': 'B-LOC', 'score': 0.9446005, 'index': 26, 'word': 'ಕ', 'start': 83, 'end': 84}
