<a href="https://colab.research.google.com/github/C22-PS008/machine-learning/blob/main/train/named-entity-recognition/named-entity-recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependency Installation

In [None]:
!pip install transformers
!pip install tokenizers
!pip install datasets
!pip install sentencepiece

In [None]:
!pip install accelerate

# Log in and load datasets

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset

# "id" configuration of the WikiANN dataset.
wikiann_dataset = load_dataset(path="wikiann", name="id")

In [None]:
from datasets import load_dataset

# "nergrit" configuration of IndoNLU; this is the dataset tokenized and
# trained on in the cells below.
indonlu_dataset = load_dataset(path="indonlu", name="nergrit")

In [None]:
from datasets import load_dataset

# The dataset id is "conll2003"; "conll2033" does not exist on the Hub.
conll2003_dataset = load_dataset("conll2003")
# Keep the previously used (misspelled) variable name available.
conll2033_dataset = conll2003_dataset

In [None]:
# Label names exposed by the `ner_tags` feature of the WikiANN train split.
wikiann_train_features = wikiann_dataset["train"].features
wikiann_label_names = wikiann_train_features["ner_tags"].feature.names

In [None]:
# Display the WikiANN label names as the cell output.
wikiann_label_names

In [None]:
# Label names exposed by the `ner_tags` feature of the IndoNLU train split.
indonlu_train_features = indonlu_dataset["train"].features
indonlu_label_names = indonlu_train_features["ner_tags"].feature.names

In [None]:
# Display the IndoNLU label names as the cell output.
indonlu_label_names

In [None]:
# Keep the feature schema of the IndoNLU train split and display it.
indonlu_feature=indonlu_dataset["train"].features
indonlu_feature

# Defining Model and Tokenizer

In [None]:
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

# Tokenizer of the fine-tuned checkpoint. `from_tf` is a model-loading option
# and has no meaning for a tokenizer, so it is not passed here. The model
# itself is loaded in the next cell.
tokenizer = AutoTokenizer.from_pretrained("chanifrusydi/bert-finetuned-ner")

In [None]:
# Auto classes cannot be constructed directly; weights and config are loaded
# through `from_pretrained`.
model = TFAutoModelForTokenClassification.from_pretrained("chanifrusydi/bert-finetuned-ner")

In [None]:
# NOTE(review): presumably a reminder of an alternative base checkpoint id;
# kept as a comment because the bare text is evaluated as a Python expression
# and raises NameError.
# indobenchmark/indobert-base-p1

# Tokenize Dataset 





In [None]:
# Demo: tokenize a sentence that is already split into words.
tokenizer(
    "Hello , this is one sentence split into words .".split(" "),
    is_split_into_words=True,
)

In [None]:
def aligning_labels_with_token(labels, word_id, label_names=None):
  """Expand word-level NER label ids to one label id per token.

  Args:
    labels: Word-level label ids, indexed by word position.
    word_id: One entry per token giving the index of the word the token
      belongs to, or None for tokens that belong to no word.
    label_names: Optional list of label strings indexed by label id. When
      given, a continuation token of a word tagged ``B-XXX`` receives the id
      of ``I-XXX`` (if that label exists). When omitted, the original parity
      rule applies: an odd id is bumped by one.

  Returns:
    A list with one label id per token; tokens without a word get -100.

  NOTE(review): the parity rule is only correct when every ``B-XXX`` id is odd
  and directly followed by its ``I-XXX``. Verify that against the label list
  of the dataset in use, or pass ``label_names``.
  """
  # Map each B-XXX id to its I-XXX id by name (only when names are supplied).
  inside_id_for = {}
  if label_names is not None:
    for idx, name in enumerate(label_names):
      if name.startswith("B-"):
        inside_name = "I-" + name[2:]
        if inside_name in label_names:
          inside_id_for[idx] = label_names.index(inside_name)

  new_labels=[]
  current_word=None
  for word in word_id:
    if word is None:
      # Token belongs to no word: ignored by the loss.
      current_word=None
      new_labels.append(-100)
      continue
    label=labels[word]
    if word == current_word:
      # Further sub-token of the same word: turn B-XXX into I-XXX.
      if label_names is not None:
        label=inside_id_for.get(label, label)
      elif label %2==1:
        label+=1
    current_word=word
    new_labels.append(label)
  return new_labels

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Spread word-level label ids over the tokens of each word.

    Tokens whose word id is None get -100. The first token of a word keeps the
    word's label; each further token of the same word gets the word's label,
    bumped by one when that label id is odd.
    """
    aligned = []
    previous = None
    for wid in word_ids:
        continues_word = wid == previous
        previous = wid
        if wid is None:
            # Token without a word
            aligned.append(-100)
            continue
        tag = labels[wid]
        if continues_word and tag % 2 == 1:
            # Odd id on a continuation token -> next id
            tag += 1
        aligned.append(tag)

    return aligned

In [None]:
# Tokenize the first IndoNLU training example for manual inspection.
first_train_example = indonlu_dataset["train"][0]
inputs = tokenizer(first_train_example["tokens"], is_split_into_words=True)
labels = first_train_example["ner_tags"]
# Despite its name, `word_ids` holds the encoding's `input_ids`;
# the word indices are stored in `word_id_test`.
word_ids = inputs["input_ids"]
word_id_test = inputs.word_ids(0)


In [None]:
def tokenizing_and_labeling(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Reads `examples["tokens"]` and `examples["ner_tags"]`, and stores the
  aligned label lists under the "labels" key of the tokenizer output.
  """
  tokenized_input = tokenizer(
    examples["tokens"], truncation=True, is_split_into_words=True
  )
  tokenized_input["labels"] = [
    aligning_labels_with_token(word_labels, tokenized_input.word_ids(row))
    for row, word_labels in enumerate(examples["ner_tags"])
  ]
  return tokenized_input

In [None]:
# Tokenize every split in batches; the original columns are dropped so only
# the tokenizer outputs and "labels" remain.
original_columns = indonlu_dataset["train"].column_names
tokenized_indonlu_dataset = indonlu_dataset.map(
    tokenizing_and_labeling,
    batched=True,
    remove_columns=original_columns,
)

In [None]:
# Alias: later cells refer to the tokenized IndoNLU data as `tokenized_dataset`.
tokenized_dataset=tokenized_indonlu_dataset

In [None]:
# Sanity check: report the position of any label that is still a string.
# The "labels" column is read once instead of re-fetching the row per label.
for i, label_ids in enumerate(tokenized_dataset["train"]["labels"]):
  for j, label in enumerate(label_ids):
    if isinstance(label, str):
      print(f"found on {i} {j}")

In [None]:
# Sanity check: report the position of any label that is still a string.
# The "labels" column is read once instead of re-fetching the row per label.
for i, label_ids in enumerate(tokenized_dataset["validation"]["labels"]):
  for j, label in enumerate(label_ids):
    if isinstance(label, str):
      print(f"found on {i} {j}")

In [None]:
# Sanity check: report the position of any label that is still a string.
# The "labels" column is read once instead of re-fetching the row per label.
for i, label_ids in enumerate(tokenized_dataset["test"]["labels"]):
  for j, label in enumerate(label_ids):
    if isinstance(label, str):
      print(f"found on {i} {j}")

# PyTorch dataset preprocessing and loading into the model

In [None]:
# Hugging Face Hub login prompt; the same call already appears in the
# log-in section near the top of the notebook.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

# Collator that pads inputs and labels per batch and returns PyTorch tensors.
# It was referenced below without being defined anywhere in the notebook.
pytorch_data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="pt"
)

# The tokenized splits live in `tokenized_indonlu_dataset`
# (the previous `tokenize_indonlu_dataset` was an undefined name).
torch_train_dataset = DataLoader(
    tokenized_indonlu_dataset["train"],
    shuffle=True,
    collate_fn=pytorch_data_collator,
    batch_size=16,
)
torch_validation_dataset = DataLoader(
    tokenized_indonlu_dataset["validation"],
    collate_fn=pytorch_data_collator,
    batch_size=16,
)

In [None]:
# Keras training with a callback that pushes to the Hub.
# NOTE(review): in this notebook `tf_train_dataset` and `num_train_epochs` are
# only assigned in later cells, and `model.compile` is also called later, so
# this cell fails on a fresh top-to-bottom run. It probably belongs after the
# TensorFlow sections.
# NOTE(review): output_dir is "bert-finetuned-squad" although the notebook
# trains an NER model; confirm the intended repository name.
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(output_dir="bert-finetuned-squad", tokenizer=tokenizer)

# We're going to do validation afterwards, so no validation mid-training
model.fit(tf_train_dataset, callbacks=[callback], epochs=num_train_epochs)

# PyTorch with Accelerate: initialization

In [None]:
# AdamW optimizer over the model parameters with learning rate 2e-5.
# NOTE(review): `model.parameters()` is the torch.nn.Module API, while the only
# `model` assignment visible in this notebook uses
# TFAutoModelForTokenClassification. Confirm that a PyTorch model is loaded
# before this cell runs.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Hand the model, optimizer and both data loaders to Accelerate and rebind
# the names to the objects returned by `prepare`.
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, torch_train_dataset, torch_validation_dataset = accelerator.prepare(
    model, optimizer, torch_train_dataset, torch_validation_dataset
)

In [None]:
from transformers import get_scheduler

# One optimizer update per batch, so the total number of steps is
# epochs x batches per epoch (computed from the prepared train loader).
num_train_epochs = 3
num_update_steps_per_epoch = len(torch_train_dataset)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule without warmup over the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Resolve the full Hub repository name for "bert-finetuned-ner" and display it.
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-finetuned-ner"
repo_name = get_full_repo_name(model_name)
repo_name

In [None]:
# Local folder that the training loop saves the model and tokenizer into.
# It is created as a `Repository` cloned from `repo_name`.
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
def postprocess(predictions, labels, label_names=None):
    """Convert batched prediction/label id tensors into lists of label strings.

    Positions whose reference label is -100 are dropped from both outputs.

    Args:
        predictions: Tensor of predicted label ids, one row per sequence.
        labels: Tensor of reference label ids with the same layout; -100 marks
            positions to ignore.
        label_names: Optional list mapping label id -> label string. Defaults
            to `indonlu_label_names`, the label list of the dataset the data
            loaders in this notebook are built from.

    Returns:
        A tuple `(true_labels, true_predictions)`, references first, each a
        list of lists of label strings.
    """
    if label_names is None:
        # The notebook never defines a bare `label_names`; use the IndoNLU list.
        label_names = indonlu_label_names

    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

# NOTE(review): `metric` is used below but is not assigned anywhere in the
# visible notebook; it must be created before this cell runs. The
# `overall_*` keys read from its result suggest a seqeval-style metric,
# to be confirmed.
for epoch in range(num_train_epochs):
    # Training: one optimizer and scheduler step per batch
    model.train()
    for batch in torch_train_dataset:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in torch_validation_dataset:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (true_labels, true_predictions); unpacking in the
        # opposite order would feed references as predictions to the metric.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

# Prepare dataset for Tensorflow

In [None]:
# Token-classification collator configured to return TensorFlow tensors;
# it is passed as `collate_fn` to `to_tf_dataset` in the next cell.
from transformers import DataCollatorForTokenClassification

data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer,return_tensors="tf")

In [None]:
# Columns forwarded to TensorFlow. The raw 'tokens' column was dropped by
# `remove_columns` when the dataset was tokenized, so it must not be requested
# here. `token_type_ids` is included only if the tokenizer produced it.
tf_columns = [
  name
  for name in ("attention_mask", "input_ids", "labels", "token_type_ids")
  if name in tokenized_dataset["train"].column_names
]

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
  columns=tf_columns,
  shuffle=True,
  batch_size=16,
  collate_fn=data_collator,
)
tf_validation_dataset=tokenized_dataset["validation"].to_tf_dataset(
  columns=tf_columns,
  shuffle=False,
  batch_size=16,
  collate_fn=data_collator,
)

# Training Using Tensorflow

In [None]:
# `label_names` was never defined in the notebook; the model is trained on the
# IndoNLU data, so its label list is used here.
label_names = indonlu_label_names

# Mappings between label ids (as strings) and label names.
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# NOTE(review): the global policy is set after `model` was instantiated in an
# earlier cell; confirm that it has the intended effect on the existing model.
tf.keras.mixed_precision.set_global_policy("mixed_float16")
num_train_epochs=5
# Total optimizer steps = batches per epoch x epochs.
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# The former `model.bert.load_state_dict(model.bert.state_dict())` line was
# removed: it is a PyTorch API that the TensorFlow model does not provide, and
# loading a module's own state dict would not change anything anyway.
model.compile(optimizer=optimizer)

In [None]:
# Builds the optimizer and compiles the model for the Keras training run.
# NOTE(review): this repeats the previous cell with 3 epochs instead of 5.
# When both cells run, the values and the compiled optimizer from this cell
# replace the earlier ones.
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16

#tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Push checkpoints and the tokenizer to the Hub during training.
callback = PushToHubCallback(output_dir="bert-indonesia-finetuned-ner", tokenizer=tokenizer)

# Keras expects the validation set under `validation_data`;
# `validation_dataset` is not an accepted argument of `fit`.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    callbacks=[callback],
    epochs=num_train_epochs,
)