# Install required libraries

In [1]:
!pip install transformers datasets huggingface_hub



In [6]:
from huggingface_hub import login

login(token='hf_xxx')

# Explore the dataset

## Load dataset

For this task, we'll be using the `jnlpba` [dataset](https://huggingface.co/datasets/jnlpba).

In [7]:
from datasets import load_dataset

dataset = load_dataset("nanyang-technological-university-singapore/hkcancor", trust_remote_code=True)

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'speaker', 'turn_number', 'tokens', 'transcriptions', 'pos_tags_prf', 'pos_tags_ud'],
        num_rows: 10801
    })
})

In [9]:
import random

# Set seed for reproducibility
random.seed(42)

# Separate 100 random rows for a new validation split
train_dataset = dataset["train"]
validation_indices = random.sample(range(len(train_dataset)), 100)
validation_dataset = train_dataset.select(validation_indices)
train_dataset = train_dataset.select([i for i in range(len(train_dataset)) if i not in validation_indices])

# Update the dataset dictionary
dataset["train"] = train_dataset
dataset["validation"] = validation_dataset

# Display the new dataset splits
dataset


DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'speaker', 'turn_number', 'tokens', 'transcriptions', 'pos_tags_prf', 'pos_tags_ud'],
        num_rows: 10701
    })
    validation: Dataset({
        features: ['conversation_id', 'speaker', 'turn_number', 'tokens', 'transcriptions', 'pos_tags_prf', 'pos_tags_ud'],
        num_rows: 100
    })
})

In [10]:
label_names = dataset["train"].features["pos_tags_ud"].feature.names

label_names

['V',
 'NOUN',
 'ADP',
 'AUX',
 'VERB',
 'X',
 'PRON',
 'PROPN',
 'PART',
 'PUNCT',
 'CCONJ',
 'ADV',
 'NUM',
 'INTJ',
 'DET',
 'ADJ']

In [11]:
example = dataset["train"][0]

example

{'conversation_id': 'TN001-DR300497-WAI3C',
 'speaker': 'A',
 'turn_number': 0,
 'tokens': ['喂',
  '遲',
  '啲',
  '去',
  '唔',
  '去',
  '旅行',
  '啊',
  '？',
  '你',
  '老公',
  '有冇',
  '平',
  '機票',
  '啊',
  '？'],
 'transcriptions': ['wai3',
  'ci4',
  'di1',
  'heoi3',
  'm4',
  'heoi3',
  'leoi5hang4',
  'aa3',
  'VQ6',
  'nei5',
  'lou5gung1',
  'jau5mou5',
  'peng4',
  'gei1piu3',
  'aa3',
  'VQ6'],
 'pos_tags_prf': [24,
  9,
  72,
  75,
  21,
  75,
  80,
  116,
  83,
  64,
  50,
  76,
  9,
  50,
  116,
  83],
 'pos_tags_ud': [13, 15, 8, 4, 11, 4, 1, 8, 9, 6, 1, 4, 15, 1, 8, 9]}

The `tokens` are the words in the sentence, and the `ner_tags` are the corresponding labels.

NER stands for Named Entity Recognition.

In [12]:
print("Token => Label Name\n")

pre_length = 15
for token, tag in zip(example["tokens"], example["pos_tags_ud"]):
  tag_label = label_names[tag]
  string = token
  while len(string) != pre_length:
    string += " "
  print(f"{string} => {tag_label}")

## Tokenization

The Tokenizer is used to convert sentences into [sub-words](https://towardsdatascience.com/a-comprehensive-guide-to-subword-tokenisers-4bbd3bad9a7c).

In [13]:
from transformers import AutoTokenizer

checkpoint = "indiejoseph/bert-base-cantonese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Let's tokenize our example.

In [14]:
example

{'conversation_id': 'TN001-DR300497-WAI3C',
 'speaker': 'A',
 'turn_number': 0,
 'tokens': ['喂',
  '遲',
  '啲',
  '去',
  '唔',
  '去',
  '旅行',
  '啊',
  '？',
  '你',
  '老公',
  '有冇',
  '平',
  '機票',
  '啊',
  '？'],
 'transcriptions': ['wai3',
  'ci4',
  'di1',
  'heoi3',
  'm4',
  'heoi3',
  'leoi5hang4',
  'aa3',
  'VQ6',
  'nei5',
  'lou5gung1',
  'jau5mou5',
  'peng4',
  'gei1piu3',
  'aa3',
  'VQ6'],
 'pos_tags_prf': [24,
  9,
  72,
  75,
  21,
  75,
  80,
  116,
  83,
  64,
  50,
  76,
  9,
  50,
  116,
  83],
 'pos_tags_ud': [13, 15, 8, 4, 11, 4, 1, 8, 9, 6, 1, 4, 15, 1, 8, 9]}

In [15]:
inputs = tokenizer(
    example["tokens"],
    is_split_into_words=True,
)

inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [16]:
print(inputs["input_ids"])

Since the tokenizer converts the words into sub-words, the number of tokens will be greater than the number of labels, because each word has been split into one or more sub-words.

In [17]:
len(inputs["input_ids"])

22

In [18]:
len(example["pos_tags_ud"])

16

Each token needs a corresponding label, so we need to "align" the labels with the tokens.

In [19]:
def align_labels_with_input_ids(word_ids, old_labels):
  """
  Returns new labels which are of the same length as the word ids.

  Example inputs:

  # word_ids = [None, 0, 0, 0, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14, 14, 14, 15, None]
  # labels = [1, 2, 0, 0, 9, 10, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0]

  Output:

  """

  new_labels = []
  prev_word_id = None

  for word_id in word_ids:
    if word_id is None:
      # if the word_id is None, i.e. the token is [CLS] or [SEP]
      new_labels.append(-100)
    else:
      label = old_labels[word_id]
      if prev_word_id == word_id and label % 2 == 1:
        # label is intermediate i.e. I-XXX
          label += 1
      new_labels.append(label)
    prev_word_id = word_id

  return new_labels

In [20]:
new_labels = align_labels_with_input_ids(inputs.word_ids(), example["pos_tags_ud"])

In [21]:
len(inputs["input_ids"])

22

In [22]:
len(new_labels)

22

Nice, our labels are the same length as our input_ids!

In [23]:
print(new_labels)

Now, let's create a function which takes in a group of examples, tokenize each example, and aligns their labels.

In [24]:
def tokenize_and_align(examples):
  # tokenize examples
  model_inputs = tokenizer(
      examples["tokens"],
      truncation=True,
      is_split_into_words=True
  )

  # align labels
  model_inputs["labels"] = []
  # iterate over each example
  for i in range(len(model_inputs["input_ids"])):
    # get word_ids
    word_ids = model_inputs.word_ids(i)
    # get labels
    pos_tags = examples["pos_tags_ud"][i]
    # compute new labels
    new_labels = align_labels_with_input_ids(word_ids, pos_tags)
    # store new labels
    model_inputs["labels"].append(new_labels)

  return model_inputs

Let's try this function on a group of examples.

In [25]:
examples = dataset["train"][:5]

In [26]:
inputs = tokenize_and_align(examples)

In [27]:
for ex_input_ids, ex_labels in zip(inputs["input_ids"], inputs["labels"]):
  print(f">>> Length of input_ids: {len(ex_input_ids)}")
  print(f">>> Length of labels: {len(ex_labels)}")
  print()

Nice! We have tokenized the sentences, and made sure that the labels for each sentence are the same length.

Notice how each separate example, however, is of different length. That's because we haven't applied **padding** yet, however we will take care of this later.

In [28]:
tokenized_datasets = dataset.map(
    tokenize_and_align,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [29]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10701
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

## Data Collation

Data Collation means taking our dataset and organanizing it in mini-batches.

You may have noticed that we haven't padded our dataset yet, as models require each tensor to be of equal length. Padding the entire dataset at once would be inefficient, as we would be padding each tensor to the length of the longest tensor in the dataset.

Instead, we can do this for each mini-batch, so each tensor is only padded up to the largest tensor in its mini-batch. This saves unnecessary RAM and computation.

The data collator takes care of this for us.

In [30]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

Let's test out our data collator on a small batch.

In [31]:
batch_pre_collation = [ tokenized_datasets["train"][i] for i in range(5) ]

In [32]:
for example in batch_pre_collation:
  print(f">>> Length: {len(example['input_ids'])}")

Right now, each example has a different length.

After we apply padding through the data collator, each example will have the length of the longest example in the mini-batch.

In [33]:
batch_collated = data_collator(batch_pre_collation)

In [34]:
for example in batch_collated["input_ids"]:
  print(f">>> Length: {len(example)}")

In [35]:
batch_collated["input_ids"].shape

torch.Size([5, 22])

In [36]:
batch_collated["labels"].shape

torch.Size([5, 22])

## Prepare dataset for fine-tuning

Now that we've set up our data collator, let's apply it to our entire dataset.

# Fine-tuning!

## Load `bert-base-cantonese` model

Before loading the model, let's create some maps which will be used to go back and forth between ids and labels.

In [37]:
id2label = { i:k for i, k in enumerate(label_names) }
label2id = { v:k for k, v in id2label.items() }

In [38]:
id2label

{0: 'V',
 1: 'NOUN',
 2: 'ADP',
 3: 'AUX',
 4: 'VERB',
 5: 'X',
 6: 'PRON',
 7: 'PROPN',
 8: 'PART',
 9: 'PUNCT',
 10: 'CCONJ',
 11: 'ADV',
 12: 'NUM',
 13: 'INTJ',
 14: 'DET',
 15: 'ADJ'}

In [39]:
label2id

{'V': 0,
 'NOUN': 1,
 'ADP': 2,
 'AUX': 3,
 'VERB': 4,
 'X': 5,
 'PRON': 6,
 'PROPN': 7,
 'PART': 8,
 'PUNCT': 9,
 'CCONJ': 10,
 'ADV': 11,
 'NUM': 12,
 'INTJ': 13,
 'DET': 14,
 'ADJ': 15}

Now let's load our model.

The reason we created those maps above is so that the Inference API on the Hugging Face website can show us the label names such as "DNA" and "protein" instead of the ids.

In [40]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indiejoseph/bert-base-cantonese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model knows how many classes there are because of the maps we provided earlier.

In [41]:
model.config.num_labels

16

In [42]:
assert model.config.num_labels == len(label_names)

## Prepare model

## PushToHubCallback

This callbacks tells HuggingFace to push the model to your HuggingFace profile while the model is training.

To make this work, ensure that you have connected your HuggingFace account to the notebook.

## Train!

In [43]:
import wandb
from transformers import Trainer, TrainingArguments

wandb.init(project="CantoBERT")

training_args = TrainingArguments(
    output_dir='./bert_base_cantonese_pos_hkcancor',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
wandb.finish()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingfac



  0%|          | 0/1338 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/a1/01/a10116c6137135aeaa6b2579dd8b4a85503da093c57d9f7cbaef07a3403afc15/821041ec3fe63ffb94a8c28636af3355779af64d16da078b564429b7081838fa?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20240807%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240807T080957Z&X-Amz-Expires=86400&X-Amz-Signature=147ebac3d8f9693c0e6510711ec29ea3e13e8ab7eec4195cfce822a43837da90&X-Amz-SignedHeaders=host&partNumber=5&uploadId=PcAgsY4ydni2t5YC6g1AFRWL92N4dAG29EulfQW1JoXjQTwNBLhEdmOvE8pFNiF8FWJ.4d5PvaZl34foAdG2Pig7a61GRoL506CtMz7QLVdxpoUv.UWBRnFZA7wNdDmu&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2406)')))"), '(Request ID: 8f3371f4-2b22-4e73-a4c4-38ae4b6d19f2)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/a1/01/a

  0%|          | 0/7 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [44]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/408M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AlienKevin/bert_base_cantonese_pos_hkcancor/commit/12e070acdbc2f3eefd49dbce73e75147a74c3f92', commit_message='End of training', commit_description='', oid='12e070acdbc2f3eefd49dbce73e75147a74c3f92', pr_url=None, pr_revision=None, pr_num=None)

# Inference

Now that our model is ready, we can try it out using the nifty `pipeline` API.

In [45]:
from transformers import pipeline

classifier = pipeline(
    "token-classification",
    "AlienKevin/bert_base_cantonese_pos_hkcancor",
    grouped_entities=True,
)

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/408M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/85.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/529k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/8.00k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [48]:
input = "你喺度做緊？"

In [49]:
classifier(input)

[{'entity_group': 'PRON',
  'score': 0.9993793,
  'word': '你',
  'start': 0,
  'end': 1},
 {'entity_group': 'ADV',
  'score': 0.41889605,
  'word': '喺',
  'start': 1,
  'end': 2},
 {'entity_group': 'NUM',
  'score': 0.3735045,
  'word': '度',
  'start': 2,
  'end': 3},
 {'entity_group': 'VERB',
  'score': 0.9975925,
  'word': '做',
  'start': 3,
  'end': 4},
 {'entity_group': 'PART',
  'score': 0.9583228,
  'word': '緊',
  'start': 4,
  'end': 5},
 {'entity_group': 'PUNCT',
  'score': 0.999387,
  'word': '？',
  'start': 5,
  'end': 6}]