Finetune Multilingual bert model using WikiAnn dataset for Kannada

### Import libraries and dataset

In [1]:
!pip install transformers[sentencepiece]
!pip install datasets
!pip install tokenizers
!pip3 install seqeval
from torch.utils.data import DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 64.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 15.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.9 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |█

In [2]:
from datasets import load_dataset

dataset = load_dataset("wikiann","kn")

Downloading builder script:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset wikiann/kn (download: 223.17 MiB, generated: 101.05 KiB, post-processed: Unknown size, total: 223.27 MiB) to /root/.cache/huggingface/datasets/wikiann/kn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset wikiann downloaded and prepared to /root/.cache/huggingface/datasets/wikiann/kn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
label_names = dataset["train"].features["ner_tags"].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [4]:
dataset.column_names

{'test': ['tokens', 'ner_tags', 'langs', 'spans'],
 'train': ['tokens', 'ner_tags', 'langs', 'spans'],
 'validation': ['tokens', 'ner_tags', 'langs', 'spans']}

In [5]:
from transformers import AutoTokenizer, AutoConfig

config = AutoConfig.from_pretrained("bert-base-multilingual-cased")
label2id = {"O": 0,
"B-PER": 1,
"I-PER": 2,
"B-ORG": 3,
"I-ORG": 4,
"B-LOC": 5,
"I-LOC": 6
}
id2label = {y:x for x,y in label2id.items()}
config.id2label = id2label
config.label2id = label2id
config.num_labels = len(id2label.keys())
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer.config = config

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

### Difference in tokenizers

In [None]:
#using mBERT
res = tokenizer.tokenize("ಸಂಬಂಧಿ")
res

['ಸ', '##ಂ', '##ಬ', '##ಂ', '##ಧಿ']

In [None]:
#using IndicBERT
from transformers import AutoTokenizer
tokeniser = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
res = tokeniser.tokenize("ಸಂಬಂಧಿ")
res

Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

['▁ಸಂಬಂಧಿ']

In [None]:
#using MuRIL
tokniser = AutoTokenizer.from_pretrained('google/muril-base-cased')
res = tokniser.tokenize("ಸಂಬಂಧಿ")
res

loading configuration file https://huggingface.co/google/muril-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d8ca6ce642f067ecf3d1163f4d2903b471287613933f2857ca8307e500bc7645.aff1657f5771205f5a0c6cb4816f125ee5f2f2d62dbf27e6b9fee30b0ebbf0f5
Model config BertConfig {
  "_name_or_path": "google/muril-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

loading file https://huggingface.co/google/muril-base-cased/r

['ಸಂಬಂಧಿ']

### Data preprocessing

In [6]:
#Get the values for input_ids, token_type_ids, attention_mask
def tokenize_adjust_labels(all_samples_per_split):
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True, truncation=True)
  #tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
  #so the new keys [input_ids, labels (after adjustment)] can be added to the datasets dict for each train test validation split
  total_adjusted_labels = []
  
  for k, label in enumerate(all_samples_per_split[f"ner_tags"]):
    prev_wid = None
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    label_ids = []

    for wid in word_ids_list:
      if(wid is None): #special tokens
        label_ids.append(-100)
      elif(wid!=prev_wid): #single word in a single token
        label_ids.append(label[wid])
      else: #word split in different tokens
        label_ids.append(-100)
      prev_wid = wid
    total_adjusted_labels.append(label_ids)
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True, remove_columns=['tokens', 'ner_tags', 'langs', 'spans'])




  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
#tokenized_dataset['train'][5]

In [7]:
from transformers import DataCollatorForTokenClassification
#pad inputs as well as labels
data_collator = DataCollatorForTokenClassification(tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

#### weights and biases

In [10]:
#!pip install wandb

In [11]:
# import os
# import wandb
# os.environ["WANDB_API_KEY"]="0bf5675a5ef79ddff646f578401c8d29906c1ab2"
# os.environ["WANDB_ENTITY"]="aparna-m"
# os.environ["WANDB_PROJECT"]="mbert_finetune_ner"

### using Trainer API

In [8]:
import torch
from transformers import AutoModelForTokenClassification, AdamW

In [9]:
#check for gpu
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [10]:
import numpy as np
from datasets import load_metric
metric = load_metric("seqeval")
def compute_metrics(p):
    prediction, label = p
    prediction = np.argmax(prediction, axis=-1) #axis=2

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(prediction, label)
    ]
    true_labels = [
                   [label_names[l] for l in lab if l!=-100]
                   for lab in label
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],}

    return flattened_results

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [11]:
# loading the model
from transformers import TrainingArguments, Trainer
fine_tuned_model = './outputs_mbert/'
model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_names))
model.config=config
model.to(device)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
#use saved model
# model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model, num_labels=len(label_names))
# model.to(device)

In [12]:
# training_args = TrainingArguments(#     output_dir="./fine_tune_bert_output",overwrite_output_dir=True,evaluation_strategy="epoch",learning_rate=2e-5,
#     per_device_train_batch_size=32,per_device_eval_batch_size=32,num_train_epochs=10,weight_decay=0.01,run_name = "ep_2",save_strategy='no',
#     report_to="wandb",label_names=label_names) 

training_args = TrainingArguments(
    output_dir='./outputs_mbert/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16, 
    evaluation_strategy="epoch",
    num_train_epochs=100,
    report_to="none",
    save_strategy="no"
    )

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
#wandb.finish()

***** Running training *****
  Num examples = 100
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 700


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.796343,0.0,0.0,0.0,0.728834
2,No log,0.540394,0.416667,0.491071,0.45082,0.830675
3,No log,0.591712,0.32,0.214286,0.256684,0.813497
4,No log,0.500523,0.404624,0.625,0.491228,0.847853
5,No log,0.528603,0.421053,0.428571,0.424779,0.866258
6,No log,0.564473,0.418301,0.571429,0.483019,0.866258
7,No log,0.588489,0.454545,0.535714,0.491803,0.87362
8,No log,0.614899,0.409722,0.526786,0.460938,0.865031
9,No log,0.709553,0.428571,0.535714,0.47619,0.86135
10,No log,0.699627,0.492537,0.589286,0.536585,0.872393


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Nu

TrainOutput(global_step=700, training_loss=0.02962389734174524, metrics={'train_runtime': 192.6965, 'train_samples_per_second': 51.895, 'train_steps_per_second': 3.633, 'total_flos': 334528046080272.0, 'train_loss': 0.02962389734174524, 'epoch': 100.0})

In [14]:
#fine_tuned_model = './outputs_mbert/'
tokenizer.save_pretrained(fine_tuned_model)
model.save_pretrained(fine_tuned_model)

tokenizer config file saved in ./outputs_mbert/tokenizer_config.json
Special tokens file saved in ./outputs_mbert/special_tokens_map.json
Configuration saved in ./outputs_mbert/config.json
Model weights saved in ./outputs_mbert/pytorch_model.bin


In [15]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


{'epoch': 100.0,
 'eval_loss': 1.0754543542861938,
 'eval_overall_accuracy': 0.8588957055214724,
 'eval_overall_f1': 0.4838709677419355,
 'eval_overall_precision': 0.4411764705882353,
 'eval_overall_recall': 0.5357142857142857,
 'eval_runtime': 0.4782,
 'eval_samples_per_second': 209.116,
 'eval_steps_per_second': 14.638}

In [16]:
prediction, label, _ = trainer.predict(tokenized_dataset["test"])
prediction = np.argmax(prediction, axis=-1)

true_predictions = [
        [label_names[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(prediction, label)
    ]
true_labels = [
                   [label_names[l] for l in lab if l!=-100]
                   for lab in label
    ]
results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'LOC': {'f1': 0.6476190476190475,
  'number': 47,
  'precision': 0.5862068965517241,
  'recall': 0.723404255319149},
 'ORG': {'f1': 0.4482758620689655,
  'number': 32,
  'precision': 0.5,
  'recall': 0.40625},
 'PER': {'f1': 0.5176470588235295,
  'number': 33,
  'precision': 0.4230769230769231,
  'recall': 0.6666666666666666},
 'overall_accuracy': 0.85,
 'overall_f1': 0.5564516129032259,
 'overall_precision': 0.5073529411764706,
 'overall_recall': 0.6160714285714286}

### huggingface pipeline

In [17]:
from transformers import pipeline
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model)
model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Didn't find file ./outputs_mbert/added_tokens.json. We won't load it.
loading file ./outputs_mbert/vocab.txt
loading file ./outputs_mbert/tokenizer.json
loading file None
loading file ./outputs_mbert/special_tokens_map.json
loading file ./outputs_mbert/tokenizer_config.json
loading configuration file ./outputs_mbert/config.json
Model config BertConfig {
  "_name_or_path": "./outputs_mbert/",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_em

In [18]:
example = "ಭಾರತ ನನ್ನ ದೇಶ"
for entity in nlp(example):
  print(entity)

{'entity': 'B-ORG', 'score': 0.6804464, 'index': 1, 'word': 'ಭಾರತ', 'start': 0, 'end': 4}
{'entity': 'I-LOC', 'score': 0.66719013, 'index': 2, 'word': 'ನ', 'start': 5, 'end': 6}
{'entity': 'I-ORG', 'score': 0.7057469, 'index': 4, 'word': 'ದ', 'start': 10, 'end': 11}


In [19]:
#run1
example = "ಪುಣೆ ಮಹಾರಾಷ್ಟ್ರದ ಎರಡನೆಯ ಹಾಗೂ ಭಾರತದ ಏಳನೆಯ ಅತಿದೊಡ್ಡ ನಗರ."
for entity in nlp(example):
  print(entity)

{'entity': 'B-LOC', 'score': 0.9997372, 'index': 1, 'word': 'ಪ', 'start': 0, 'end': 1}
{'entity': 'B-LOC', 'score': 0.9997199, 'index': 2, 'word': '##ು', 'start': 1, 'end': 2}
{'entity': 'B-LOC', 'score': 0.9929148, 'index': 3, 'word': '##ಣೆ', 'start': 2, 'end': 4}
{'entity': 'B-LOC', 'score': 0.9996828, 'index': 4, 'word': 'ಮ', 'start': 5, 'end': 6}
{'entity': 'B-LOC', 'score': 0.9970203, 'index': 5, 'word': '##ಹಾರ', 'start': 6, 'end': 9}
{'entity': 'I-LOC', 'score': 0.6901989, 'index': 6, 'word': '##ಾ', 'start': 9, 'end': 10}
{'entity': 'I-LOC', 'score': 0.8931275, 'index': 7, 'word': '##ಷ್ಟ', 'start': 10, 'end': 13}


In [20]:
example = " ಎಲ್ಲರಿಗು ನಮಸ್ತೆ ಇದು ನಮ್ಮ ಹಳ್ಳಿ ರೇಡಿಯೋ ನಿಮ್ಮ ಜೊತೆ ನಾನು ಅನು ಕೊರೋನಾ ವಿರುದ್ಧ ಹೋರಾಡುತ್ತಿರುವ ಸೇನೆಯಲ್ಲಿ ಸೇವೆ ಸಲ್ಲಿಸುತ್ತಿರುವ  ಕೊರೋನಾ ಸೈನಿಕಲು\
 ಇತ್ತೀಚಿಗೆ ಎಲ್ಲಿ ನೋಡಿದ್ರು ಕೋರೋಣ ಬಗ್ಗೆಯೇ ಗುಣಗಾನ ನಾಮೇಲರಿಗೂ ಅನಿಸಿರಬೊಹುದು ಏನಪ್ಪಾ ಇದು ಎಲ್ಲಿ ನೋಡಿದರೇನು ಕೊರೋನಾ ಕೊರೋನಾ ಅಂತ  ನೇ ಮಾತಾಡ್ತಾರೆ   "
for entity in nlp(example):
  print(entity)

{'entity': 'B-ORG', 'score': 0.5926037, 'index': 33, 'word': 'ಕ', 'start': 59, 'end': 60}
{'entity': 'B-ORG', 'score': 0.44211027, 'index': 34, 'word': '##ೊ', 'start': 60, 'end': 61}
{'entity': 'B-PER', 'score': 0.3346141, 'index': 35, 'word': '##ರ', 'start': 61, 'end': 62}
{'entity': 'B-ORG', 'score': 0.31158957, 'index': 36, 'word': '##ೋ', 'start': 62, 'end': 63}
{'entity': 'B-ORG', 'score': 0.50876087, 'index': 58, 'word': 'ಕ', 'start': 119, 'end': 120}
{'entity': 'B-ORG', 'score': 0.7394536, 'index': 59, 'word': '##ೊ', 'start': 120, 'end': 121}
{'entity': 'B-ORG', 'score': 0.5534794, 'index': 60, 'word': '##ರ', 'start': 121, 'end': 122}
{'entity': 'B-ORG', 'score': 0.4803355, 'index': 61, 'word': '##ೋ', 'start': 122, 'end': 123}
{'entity': 'B-ORG', 'score': 0.9616829, 'index': 79, 'word': 'ಕ', 'start': 159, 'end': 160}
{'entity': 'B-ORG', 'score': 0.9206267, 'index': 80, 'word': '##ೋ', 'start': 160, 'end': 161}
{'entity': 'B-PER', 'score': 0.86929655, 'index': 121, 'word': 'ಕ', 'st

In [21]:
example = "ಎಲ್ಲರಿಗೂ ನಮಸ್ಕಾರ ನಮ್ಮ ಹಳ್ಳಿ ರೇಡಿಯೋ ವಾಹಿನಿ ಮುಖಾಂತರ ನಾನುಡಿ ನಾನು ಡಾಕ್ಟರ್ ಜೆ ಅಚ್ಚುತರಾ ಜನಾಭಿವೃದ್ಧಿ ಮಾಲಿಕೆಯಲ್ಲಿ ಕರೋನವೈರಸ್ ಕುರಿತು\
 ಮೂಲ ಮಾಹಿತಿಗಳನ್ನು ಪ್ರಸ್ತುತಪಡಿಸಲು ನಾವು ಪ್ರಯತ್ನ ಮಾಡ್ತಾಯಿದೀವಿ ನಂಜೊತೆಗೆ ಡಾಕ್ಟರ್ ಶೀಲ ಕರೆಯದಿದ್ದರೆ ಹಾಗೆನೇ "
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.94306684, 'index': 35, 'word': 'ಡಾ', 'start': 62, 'end': 64}
{'entity': 'B-PER', 'score': 0.7313524, 'index': 36, 'word': '##ಕ್', 'start': 64, 'end': 66}


In [22]:
example = "ನನ್ನ ಹೆಸರು ದೇವರಾಜ್ ಅಂತ ತುರುವೇಕೆರೆ ತಾಲೂಕು ಸಂಘದ ಅಧ್ಯಕ್ಷರು ನಾನು ಈ ಕೊರೋನಾ ಬಗ್ಗೆ ಜನಗಳಿಗೆ ಮಾಹಿತಿ ನೆಡುವುದು ಏನು ಅಂತ ಅಂದರೆ \
 ಈಗ ಮಾಸ್ಕ ಎಲ್ಲಾ ಹಳ್ಳಿ ಕಡೆ ಹೋದರೆ ರಾಜಕಾರಣಿಗಳನ್ನ ಕೇಳುತ್ತಾರೆ ನಮಗೇನು ಸೌಲತ್ತು ಬರಲಿಲ್ಲ ನಮಗಿಂದ ಅಧಿಕಾರಿಗಳು ಎಲ್ಲಾ ಅಧಿಕಾರಿಗಳು ಏನು ಕೊಡುತ್ತ ಇಲ್ಲ \
 ಆಮೇಲೆ ರಾಜಕಾರಣಿಗಳು ಏನು ಕೊಡುತ್ತಿಲ್ಲ ಅಂತ"
for entity in nlp(example):
  print(entity)

{'entity': 'B-PER', 'score': 0.99985313, 'index': 4, 'word': 'ದ', 'start': 11, 'end': 12}
{'entity': 'B-PER', 'score': 0.9998111, 'index': 5, 'word': '##ೇ', 'start': 12, 'end': 13}
{'entity': 'B-PER', 'score': 0.9204486, 'index': 6, 'word': '##ವರ', 'start': 13, 'end': 15}
{'entity': 'I-PER', 'score': 0.9273081, 'index': 7, 'word': '##ಾಜ', 'start': 15, 'end': 17}
{'entity': 'I-PER', 'score': 0.9765568, 'index': 8, 'word': '##್', 'start': 17, 'end': 18}
{'entity': 'B-ORG', 'score': 0.821067, 'index': 18, 'word': '##ಾಲ', 'start': 35, 'end': 37}
{'entity': 'I-ORG', 'score': 0.6917166, 'index': 19, 'word': '##ೂ', 'start': 37, 'end': 38}
{'entity': 'I-ORG', 'score': 0.9869551, 'index': 20, 'word': '##ಕು', 'start': 38, 'end': 40}
{'entity': 'I-ORG', 'score': 0.99964345, 'index': 21, 'word': 'ಸ', 'start': 41, 'end': 42}
{'entity': 'I-ORG', 'score': 0.9995876, 'index': 22, 'word': '##ಂ', 'start': 42, 'end': 43}
{'entity': 'I-ORG', 'score': 0.9992306, 'index': 23, 'word': '##ಘ', 'start': 43, 'en

In [23]:
example = "ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲೂಕಿನ ಹಾಗೂ ತುಮಕೂರು ಜಿಲ್ಲೆಯೆಲ್ಲಾ ಸಾರ್ವಜನಿಕ ಬಂಧುಗಳೇ ಇತ್ತೀಚಿನ ದಿನಗಳಲ್ಲಿ ಕೋವಿಡ್ 19 ಕರೋನ ವೈರಸ್ \
ಹರಡಿಕ್ಕೆ ಆರಂಭವಾಗಿದ್ದು ಈ ಬಗ್ಗೆ ಬಹಳ ರೀತಿಯ ತೊಂದರೆಗಳು ಮಾತುಕತೆಗಳು ತರದಿಕ್ಕೆ ಅನಿಸುತ್ತ ಇದ್ದೀರಿ ವೈರಸ್ಸು ಹೊರಡೋದು "
for entity in nlp(example):
  print(entity)

{'entity': 'B-LOC', 'score': 0.9609116, 'index': 1, 'word': 'ಚ', 'start': 0, 'end': 1}
{'entity': 'B-LOC', 'score': 0.9863252, 'index': 2, 'word': '##ಿಕ್', 'start': 1, 'end': 4}
{'entity': 'B-LOC', 'score': 0.59903246, 'index': 3, 'word': '##ಕ', 'start': 4, 'end': 5}
{'entity': 'B-LOC', 'score': 0.9961486, 'index': 17, 'word': 'ತ', 'start': 29, 'end': 30}
{'entity': 'B-LOC', 'score': 0.99659985, 'index': 18, 'word': '##ು', 'start': 30, 'end': 31}
{'entity': 'B-LOC', 'score': 0.9396283, 'index': 19, 'word': '##ಮಕ', 'start': 31, 'end': 33}
{'entity': 'B-LOC', 'score': 0.6719969, 'index': 20, 'word': '##ೂರು', 'start': 33, 'end': 36}
{'entity': 'B-PER', 'score': 0.99750024, 'index': 42, 'word': 'ಕ', 'start': 87, 'end': 88}
{'entity': 'B-PER', 'score': 0.9985207, 'index': 43, 'word': '##ೋ', 'start': 88, 'end': 89}
{'entity': 'B-PER', 'score': 0.9891195, 'index': 44, 'word': '##ವಿ', 'start': 89, 'end': 91}
{'entity': 'I-PER', 'score': 0.7292953, 'index': 47, 'word': 'ಕ', 'start': 97, 'end': 

In [24]:
example = "ಎಲ್ಲರಿಗೂ ನಮಸ್ಕಾರ ನನ್ ಹೆಸರು ಸಾಗರ್ ನಲ್ಲಿ ಪ್ರಭು ಅಂತ ಹೇಳಿ ಭಾರತೀಯ ರೆಡ್ ಕ್ರಾಸ್ ಸಂಸ್ಥೆ ತುಮಕೂರು ಶಾಖೆ ಸ್ಕಿಲ್ ದೆವಲಪ್ಮೆಂಟ್ ಚೇರ್ಮನ್ \
ಆಗಿ ಕೆಲಸ ಮಾಡ್ತಾ ಇದ್ದೀನಿ ಈಗ ಎಲ್ಲರೂ ಕರೋನ ಬಗ್ಗೆ ಮಾತಾಡ್ತಾರೆ ಇಡೀ ವಿಶ್ವ ಬಿಡಿ ಪ್ರಪಂಚ ಕರೋನ ಬಗ್ಗೆ ಮಾತಾಡುತ್ತಿದೆ "
for entity in nlp(example):
  print(entity)

{'entity': 'B-LOC', 'score': 0.99709976, 'index': 11, 'word': 'ಸ', 'start': 27, 'end': 28}
{'entity': 'B-LOC', 'score': 0.95289934, 'index': 12, 'word': '##ಾಗ', 'start': 28, 'end': 30}
{'entity': 'B-LOC', 'score': 0.59674996, 'index': 13, 'word': '##ರ್', 'start': 30, 'end': 32}
{'entity': 'B-ORG', 'score': 0.999388, 'index': 24, 'word': 'ಭಾರತೀಯ', 'start': 54, 'end': 60}
{'entity': 'I-ORG', 'score': 0.9956701, 'index': 25, 'word': 'ರ', 'start': 61, 'end': 62}
{'entity': 'I-ORG', 'score': 0.99850416, 'index': 26, 'word': '##ೆ', 'start': 62, 'end': 63}
{'entity': 'I-ORG', 'score': 0.99930704, 'index': 27, 'word': '##ಡ್', 'start': 63, 'end': 65}
{'entity': 'I-ORG', 'score': 0.99943084, 'index': 28, 'word': 'ಕ', 'start': 66, 'end': 67}
{'entity': 'I-ORG', 'score': 0.999329, 'index': 29, 'word': '##್ರ', 'start': 67, 'end': 69}
{'entity': 'I-ORG', 'score': 0.99932885, 'index': 30, 'word': '##ಾಸ್', 'start': 69, 'end': 72}
{'entity': 'I-ORG', 'score': 0.9994911, 'index': 31, 'word': 'ಸ', 'start

In [25]:
#test_ds = tokenized_dataset['test']

In [26]:
#trainer.predict(test_ds)