# Install required libraries

In [1]:
!pip install transformers[torch] datasets huggingface_hub wandb accelerate

[0m

In [2]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable instead of hardcoding it in the
# notebook, where it would leak as soon as the notebook is shared or committed.
# When the variable is unset, `token=None` makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Explore the dataset

In [3]:
# Alternative corpus (kept for reference):
# dataset_names = ["nanyang-technological-university-singapore/hkcancor"]
# Hub datasets whose `train` splits are concatenated into one corpus.
dataset_names = [
    "AlienKevin/wiki-yue-long-tagged",
    "AlienKevin/lihkg-tagged",
    "AlienKevin/cc100-yue-tagged",
]

# Alternative model/checkpoint pair (kept for reference):
# model_name = "AlienKevin/bert_base_cantonese_pos_hkcancor"
# checkpoint = "indiejoseph/bert-base-cantonese"

# `checkpoint` is the pretrained model that gets fine-tuned; `model_name` names the output
# directory of the fine-tuned model and is the id it is later loaded from for inference.
checkpoint = "toastynews/electra-hongkongese-small-discriminator"
model_name = "AlienKevin/electra_hongkongese_small_pos_wiki_lihkg_cc100"

# Tokenized sequences are truncated to this many tokens.
max_length = 512
# Number of passes over the training data.
num_train_epochs = 3

## Load dataset

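For this task, we'll be using the `train` splits of the three POS-tagged Cantonese datasets listed in `dataset_names` above (`AlienKevin/wiki-yue-long-tagged`, `AlienKevin/lihkg-tagged` and `AlienKevin/cc100-yue-tagged`), concatenated into a single dataset.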

In [4]:
from datasets import load_dataset, concatenate_datasets

# Fetch the `train` split of every corpus in `dataset_names`, then stack them into one dataset.
# NOTE(review): `trust_remote_code=True` permits code shipped with a dataset repo to run
# locally — confirm these repos are trusted.
train_splits = [
    load_dataset(name, trust_remote_code=True)["train"]
    for name in dataset_names
]
dataset = concatenate_datasets(train_splits)

In [5]:
dataset

Dataset({
    features: ['sentence_preserved', 'tokens', 'pos_tags_ud', 'sentence'],
    num_rows: 326640
})

In [6]:
# Names of the POS classes; the integer ids stored in `pos_tags_ud` index into this list.
label_names = dataset.features["pos_tags_ud"].feature.names

label_names

['ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB',
 'X']

In [7]:
example = dataset[0]

example

{'sentence_preserved': True,
 'tokens': ['登坂廣臣',
  '細個',
  '本身',
  '係',
  '想',
  '做',
  '飛髮師傅',
  '，',
  '所以',
  '佢',
  '讀',
  '上窪田理容美容専業學校',
  '，',
  '畢業',
  '之後',
  '出嚟',
  '做',
  '飛髮師傅',
  '，',
  '但係',
  '做',
  '咗',
  '半年',
  '就',
  '辭職',
  '，',
  '之後',
  '走',
  '去',
  '做',
  '衫',
  '。',
  '響',
  '2010年',
  '佢',
  '參加',
  '選藝節目',
  'VOCALBATTLEAUDITION2',
  '，',
  '最後',
  '合格',
  '，',
  '畀',
  '編入',
  'JSoulBrothers',
  '。'],
 'pos_tags_ud': [11,
  7,
  2,
  15,
  3,
  15,
  7,
  12,
  4,
  10,
  15,
  11,
  12,
  15,
  1,
  15,
  15,
  7,
  12,
  4,
  15,
  9,
  7,
  2,
  15,
  12,
  1,
  15,
  15,
  15,
  7,
  12,
  1,
  7,
  10,
  15,
  7,
  11,
  12,
  7,
  15,
  12,
  1,
  15,
  11,
  12],
 'sentence': '登坂廣臣細個本身係想做飛髮師傅，所以佢讀上窪田理容美容専業學校，畢業之後出嚟做飛髮師傅，但係做咗半年就辭職，之後走去做衫。響2010年佢參加選藝節目VOCAL BATTLE AUDITION 2，最後合格，畀編入J Soul Brothers。'}

The `tokens` are the words in the sentence, and the `pos_tags_ud` are the corresponding labels.

In [8]:
# Optional inspection cell (disabled): prints each token of `example` next to its POS tag name.
# Padding uses `str.ljust`, which leaves tokens longer than `pre_length` untouched; a
# `while len(string) != pre_length` padding loop could never terminate for such tokens
# (e.g. 'VOCALBATTLEAUDITION2' has 20 characters).
#
# print("Token => Label Name\n")
#
# pre_length = 15
# for token, tag in zip(example["tokens"], example["pos_tags_ud"]):
#   tag_label = label_names[tag]
#   print(f"{token.ljust(pre_length)} => {tag_label}")

In [9]:
import random

from datasets import DatasetDict

# Number of rows held out for validation, and the seed that makes the split reproducible.
VALIDATION_SIZE = 100
SPLIT_SEED = 42

# This cell rebinds `dataset` to a DatasetDict at the end. Running it a second time would
# operate on that DatasetDict instead of the full corpus, so fail early with a clear message.
if isinstance(dataset, DatasetDict):
    raise RuntimeError(
        "`dataset` has already been split; re-run the dataset loading cell before this one."
    )

# Shuffle the row indices with a dedicated generator. The permutation is the same one that
# `random.seed(SPLIT_SEED)` followed by `random.shuffle` produces, but the state of the global
# `random` generator is left untouched.
shuffled_indices = list(range(len(dataset)))
random.Random(SPLIT_SEED).shuffle(shuffled_indices)

# The first VALIDATION_SIZE shuffled rows form the validation split, the rest the training split.
validation_indices = shuffled_indices[:VALIDATION_SIZE]
train_indices = shuffled_indices[VALIDATION_SIZE:]

validation_dataset = dataset.select(validation_indices)
train_dataset = dataset.select(train_indices)

# Bundle both splits; later cells access them as dataset["train"] and dataset["validation"].
dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
})

# Display the new dataset splits
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence_preserved', 'tokens', 'pos_tags_ud', 'sentence'],
        num_rows: 326540
    })
    validation: Dataset({
        features: ['sentence_preserved', 'tokens', 'pos_tags_ud', 'sentence'],
        num_rows: 100
    })
})


## Tokenization

The Tokenizer is used to convert sentences into [sub-words](https://towardsdatascience.com/a-comprehensive-guide-to-subword-tokenisers-4bbd3bad9a7c).

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)



Let's tokenize our example.

In [11]:
example

{'sentence_preserved': True,
 'tokens': ['登坂廣臣',
  '細個',
  '本身',
  '係',
  '想',
  '做',
  '飛髮師傅',
  '，',
  '所以',
  '佢',
  '讀',
  '上窪田理容美容専業學校',
  '，',
  '畢業',
  '之後',
  '出嚟',
  '做',
  '飛髮師傅',
  '，',
  '但係',
  '做',
  '咗',
  '半年',
  '就',
  '辭職',
  '，',
  '之後',
  '走',
  '去',
  '做',
  '衫',
  '。',
  '響',
  '2010年',
  '佢',
  '參加',
  '選藝節目',
  'VOCALBATTLEAUDITION2',
  '，',
  '最後',
  '合格',
  '，',
  '畀',
  '編入',
  'JSoulBrothers',
  '。'],
 'pos_tags_ud': [11,
  7,
  2,
  15,
  3,
  15,
  7,
  12,
  4,
  10,
  15,
  11,
  12,
  15,
  1,
  15,
  15,
  7,
  12,
  4,
  15,
  9,
  7,
  2,
  15,
  12,
  1,
  15,
  15,
  15,
  7,
  12,
  1,
  7,
  10,
  15,
  7,
  11,
  12,
  7,
  15,
  12,
  1,
  15,
  11,
  12],
 'sentence': '登坂廣臣細個本身係想做飛髮師傅，所以佢讀上窪田理容美容専業學校，畢業之後出嚟做飛髮師傅，但係做咗半年就辭職，之後走去做衫。響2010年佢參加選藝節目VOCAL BATTLE AUDITION 2，最後合格，畀編入J Soul Brothers。'}

In [12]:
inputs = tokenizer(
    example["tokens"],
    is_split_into_words=True,
)

inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
print(inputs["input_ids"])

[2, 15953, 9753, 11212, 18244, 17263, 8198, 12966, 20801, 8137, 11644, 8263, 22860, 23281, 11057, 8300, 24225, 11909, 7951, 8059, 20112, 7764, 16772, 15687, 15476, 10591, 17765, 10591, 10655, 13414, 10535, 13173, 24225, 15727, 13414, 7826, 11348, 8602, 9615, 8263, 22860, 23281, 11057, 8300, 24225, 8032, 8137, 8263, 9162, 8870, 11115, 10696, 21009, 17936, 24225, 7826, 11348, 20539, 8964, 8263, 19510, 2209, 22672, 29622, 26826, 11115, 8059, 8971, 8741, 21181, 19021, 16986, 16035, 64, 29689, 29609, 26821, 29603, 26811, 29624, 26808, 29789, 29608, 29621, 26816, 24225, 12927, 11348, 9030, 13194, 24225, 15702, 17410, 8475, 52, 29938, 29698, 26821, 26822, 29628, 29891, 26805, 2209, 3]


Since the tokenizer converts the words into sub-words, the number of tokens will be greater than the number of labels, because each word has been split into one or more sub-words.

In [14]:
len(inputs["input_ids"])

104

In [15]:
len(example["pos_tags_ud"])

46

Now, let's create a function which takes in a group of examples, tokenize each example, and aligns their labels.

In [16]:
# https://huggingface.co/docs/transformers/en/tasks/token_classification#preprocess
def _align_labels(word_ids, word_labels, ignore_index=-100):
    """Expand word-level labels to one label per sub-word token.

    Args:
        word_ids: For every sub-word token, the index of the word it belongs to,
            or None for special tokens.
        word_labels: Label ids of the words of one sentence.
        ignore_index: Label given to tokens that must not contribute to the loss.

    Returns:
        A list as long as `word_ids`: the first token of each word carries the word's label;
        special tokens and the remaining tokens of a word carry `ignore_index`.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            # Special token, or a continuation of the word that was just labelled.
            aligned.append(ignore_index)
        else:
            # First sub-word token of a new word.
            aligned.append(word_labels[word_idx])
        previous_word_idx = word_idx
    return aligned


def tokenize_and_align(examples):
    """Tokenize a batch of pre-split sentences and align the POS labels with the tokens.

    Uses the notebook-level `tokenizer` and `max_length`.

    Args:
        examples: Batch with a "tokens" entry (one list of words per sentence) and a
            "pos_tags_ud" entry (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry that holds, for every sentence,
        one label per sub-word token (see `_align_labels`).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=max_length,
    )

    tokenized_inputs["labels"] = [
        # word_ids maps every token of sentence i to the word it came from.
        _align_labels(tokenized_inputs.word_ids(batch_index=i), word_labels)
        for i, word_labels in enumerate(examples["pos_tags_ud"])
    ]
    return tokenized_inputs

Let's try this function on a group of examples.

In [17]:
examples = dataset["train"][:5]

In [18]:
inputs = tokenize_and_align(examples)

In [19]:
for ex_input_ids, ex_labels in zip(inputs["input_ids"], inputs["labels"]):
  print(f">>> Length of input_ids: {len(ex_input_ids)}")
  print(f">>> Length of labels: {len(ex_labels)}")
  print()

>>> Length of input_ids: 108
>>> Length of labels: 108

>>> Length of input_ids: 30
>>> Length of labels: 30

>>> Length of input_ids: 51
>>> Length of labels: 51

>>> Length of input_ids: 25
>>> Length of labels: 25

>>> Length of input_ids: 16
>>> Length of labels: 16



Nice! We have tokenized the sentences, and made sure that the labels for each sentence are the same length.

Notice, however, that each example has a different length. That's because we haven't applied **padding** yet; we will take care of this later.

In [20]:
tokenized_datasets = dataset.map(
    tokenize_and_align,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 326540
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [22]:
print(tokenized_datasets["train"][0]['input_ids'])
print(tokenized_datasets["train"][0]['labels'])

[2, 22546, 10670, 11115, 15211, 17735, 15961, 11866, 9674, 16100, 10104, 24225, 8032, 7811, 19712, 9151, 10588, 11167, 17289, 13487, 7767, 18352, 2208, 10666, 10535, 13173, 15961, 8110, 20423, 11476, 7767, 20586, 12936, 22249, 2209, 12936, 15211, 17735, 8675, 16607, 15961, 9710, 7797, 15673, 10112, 10104, 7844, 12936, 7797, 20890, 9151, 21083, 10588, 17341, 23155, 24225, 7938, 8203, 9734, 8634, 12954, 12930, 8602, 15469, 7755, 7883, 20880, 11162, 15961, 8256, 11006, 19484, 14858, 24225, 8094, 10186, 12019, 14961, 2208, 9394, 21362, 9151, 22840, 20821, 24225, 14858, 7865, 22688, 22323, 24225, 20094, 12578, 17742, 7844, 8741, 11278, 8650, 9710, 7797, 13173, 9716, 8054, 14079, 14055, 12598, 17984, 2209, 3]
[-100, 7, -100, -100, 15, -100, 9, 7, -100, 0, -100, 12, 4, 0, -100, 1, 7, -100, 7, -100, 0, -100, 12, 1, 7, -100, 9, 7, -100, -100, 15, -100, 15, -100, 12, 15, 15, -100, 7, -100, 9, 7, -100, -100, 2, -100, 2, 15, 15, -100, 4, 15, -100, 7, -100, 12, 10, -100, 1, 7, -100, 3, 15, -100, 5,

## Data Collation

Data collation means taking our dataset and organizing it into mini-batches.

You may have noticed that we haven't padded our dataset yet, as models require each tensor to be of equal length. Padding the entire dataset at once would be inefficient, as we would be padding each tensor to the length of the longest tensor in the dataset.

Instead, we can do this for each mini-batch, so each tensor is only padded up to the largest tensor in its mini-batch. This saves unnecessary RAM and computation.

The data collator takes care of this for us.

In [23]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

Let's test out our data collator on a small batch.

In [24]:
batch_pre_collation = [ tokenized_datasets["train"][i] for i in range(5) ]

In [25]:
for example in batch_pre_collation:
  print(f">>> Length: {len(example['input_ids'])}")

>>> Length: 108
>>> Length: 30
>>> Length: 51
>>> Length: 25
>>> Length: 16


Right now, each example has a different length.

After we apply padding through the data collator, each example will have the length of the longest example in the mini-batch.

In [26]:
batch_collated = data_collator(batch_pre_collation)

In [27]:
for example in batch_collated["input_ids"]:
  print(f">>> Length: {len(example)}")

>>> Length: 108
>>> Length: 108
>>> Length: 108
>>> Length: 108
>>> Length: 108


In [28]:
batch_collated["input_ids"].shape

torch.Size([5, 108])

In [29]:
batch_collated["labels"].shape

torch.Size([5, 108])

## Prepare dataset for fine-tuning

Now that we've set up our data collator, we don't need to apply it to the dataset ourselves: we pass it to the `Trainer` below, which uses it to build each padded mini-batch.

# Fine-tuning!

## Load the pretrained model (`checkpoint`)

Before loading the model, let's create some maps which will be used to go back and forth between ids and labels.

In [30]:
# Two-way mapping between integer class ids and POS tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [31]:
id2label

{0: 'ADJ',
 1: 'ADP',
 2: 'ADV',
 3: 'AUX',
 4: 'CCONJ',
 5: 'DET',
 6: 'INTJ',
 7: 'NOUN',
 8: 'NUM',
 9: 'PART',
 10: 'PRON',
 11: 'PROPN',
 12: 'PUNCT',
 13: 'SCONJ',
 14: 'SYM',
 15: 'VERB',
 16: 'X'}

In [32]:
label2id

{'ADJ': 0,
 'ADP': 1,
 'ADV': 2,
 'AUX': 3,
 'CCONJ': 4,
 'DET': 5,
 'INTJ': 6,
 'NOUN': 7,
 'NUM': 8,
 'PART': 9,
 'PRON': 10,
 'PROPN': 11,
 'PUNCT': 12,
 'SCONJ': 13,
 'SYM': 14,
 'VERB': 15,
 'X': 16}

Now let's load our model.

The reason we created those maps above is so that the Inference API on the Hugging Face website can show us the label names such as "NOUN" and "VERB" instead of the ids.

In [33]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint for token classification, passing both label maps.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, id2label=id2label, label2id=label2id
)

# ELECTRA runs only: replace every non-contiguous parameter tensor with a contiguous copy.
# NOTE(review): presumably needed so the weights can be serialized as safetensors — confirm.
if "electra" in model_name:
    for weight in model.parameters():
        if weight.is_contiguous():
            continue
        weight.data = weight.data.contiguous()

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at toastynews/electra-hongkongese-small-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model knows how many classes there are because of the maps we provided earlier.

In [34]:
model.config.num_labels

17

In [35]:
assert model.config.num_labels == len(label_names)

## Prepare model

## PushToHubCallback

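This callback tells Hugging Face to push the model to your Hugging Face profile while the model is training. This notebook does not use it: `push_to_hub=False` is set in the training arguments below, and the model is pushed once, after training, with `trainer.push_to_hub()`.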

To make this work, ensure that you have connected your HuggingFace account to the notebook.

## Train!

In [36]:
import wandb
from transformers import Trainer, TrainingArguments

# Start the Weights & Biases run that the Trainer reports to (see `report_to` below).
wandb.init(project="CantoBERT")

training_args = TrainingArguments(
    # Local output directory named after the last path component of `model_name`.
    output_dir=f'./{model_name.split("/")[-1]}',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",
    # A float below 1 is interpreted as a fraction of the total number of training steps,
    # i.e. the validation split is evaluated after every 10% of training.
    eval_steps=0.1,
    save_strategy="epoch",
    # Nothing is pushed during training; the model is pushed explicitly afterwards.
    push_to_hub=False,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Close the W&B run even when training raises or is interrupted, so the run is never left open.
try:
    trainer.train()
finally:
    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkevinxli[0m. Use [1m`wandb login --relogin`[0m to force relogin


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
6123,0.232,0.219173
12246,0.2465,0.205177
18369,0.2396,0.190614
24492,0.2128,0.189691
30615,0.2007,0.188319
36738,0.2019,0.183173
42861,0.1864,0.181279
48984,0.2051,0.179936
55107,0.1837,0.179582


VBox(children=(Label(value='0.031 MB of 0.031 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▃▃▃▂▁▁▁
eval/runtime,█▃▂▅▃▂▁▂▃
eval/samples_per_second,▁▅▇▄▅▇█▆▆
eval/steps_per_second,▁▅▇▄▅▇█▆▆
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▄▅▆▄▃▇█▄▄▃▆▄▃▃▃▅▂▅▇▃▄▂▁▃▂▃▄▂▃▃▄▂▃▄▂▃▄▄▅▄
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▂▂▂▂▂▂▂▂▂▁▁▂▁▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.17958
eval/runtime,0.0791
eval/samples_per_second,1264.167
eval/steps_per_second,88.492
total_flos,6755379957414264.0
train/epoch,3.0
train/global_step,61227.0
train/grad_norm,1.25613
train/learning_rate,0.0
train/loss,0.1959


In [37]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/53.7M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/AlienKevin/electra_hongkongese_small_pos_wiki_lihkg_cc100/commit/5b05c57b462116d0de363991005c3d2ed822392f', commit_message='End of training', commit_description='', oid='5b05c57b462116d0de363991005c3d2ed822392f', pr_url=None, pr_revision=None, pr_num=None)

# Inference

Now that our model is ready, we can try it out using the nifty `pipeline` API.

In [38]:
from transformers import pipeline

# Build a token-classification pipeline from the fine-tuned model identified by `model_name`.
# `aggregation_strategy="simple"` is the supported replacement for the deprecated
# `grouped_entities=True`: adjacent tokens predicted with the same tag are merged into one group.
classifier = pipeline(
    "token-classification",
    model_name,
    aggregation_strategy="simple",
)

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/53.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/128k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/600k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [39]:
input = "醫院喺邊度？"

In [40]:
classifier(input)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'NOUN',
  'score': 0.9984979,
  'word': '醫 院',
  'start': 0,
  'end': 2},
 {'entity_group': 'ADP',
  'score': 0.94674855,
  'word': '喺',
  'start': 2,
  'end': 3},
 {'entity_group': 'PRON',
  'score': 0.9991716,
  'word': '邊',
  'start': 3,
  'end': 4},
 {'entity_group': 'NOUN',
  'score': 0.9751618,
  'word': '度',
  'start': 4,
  'end': 5},
 {'entity_group': 'PUNCT',
  'score': 0.9999639,
  'word': '？',
  'start': 5,
  'end': 6}]