# Install required libraries

In [1]:
!pip install transformers datasets huggingface_hub



In [2]:
import os

from huggingface_hub import login

# Never hardcode the access token in the notebook: it would be saved and shared
# with the file. Read it from the HF_TOKEN environment variable instead; when the
# variable is unset, token=None makes login() prompt for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/kevin/.cache/huggingface/token
Login successful


# Explore the dataset

In [1]:
# --- Data --------------------------------------------------------------------
# Hub id of the corpus to fine-tune on.
# Alternative corpus: "AlienKevin/cc100-yue-tagged"
dataset_name = "nanyang-technological-university-singapore/hkcancor"

# --- Model -------------------------------------------------------------------
# `checkpoint` is the pretrained model the tokenizer and weights are loaded from;
# `model_name` names the fine-tuned model (its last path segment becomes the
# training output directory, and the inference pipeline loads it by this id).
#
# Alternative pair:
#   model_name = "AlienKevin/bert_base_cantonese_pos_hkcancor"
#   checkpoint = "indiejoseph/bert-base-cantonese"
checkpoint = "toastynews/electra-hongkongese-small-discriminator"
model_name = "AlienKevin/electra_hongkongese_small_pos_hkcancor"

# --- Training ----------------------------------------------------------------
num_train_epochs = 3
# Upper bound on the tokenized sequence length; longer inputs are truncated.
max_length = 512

## Load dataset

For this task, we'll be using the `hkcancor` [dataset](https://huggingface.co/datasets/nanyang-technological-university-singapore/hkcancor).

In [2]:
from datasets import load_dataset

# Load the corpus named by `dataset_name`.
# trust_remote_code=True allows `datasets` to execute the loading script shipped
# with the dataset repository, so only enable it for repositories you trust.
dataset = load_dataset(dataset_name, trust_remote_code=True)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'speaker', 'turn_number', 'tokens', 'transcriptions', 'pos_tags_prf', 'pos_tags_ud'],
        num_rows: 10801
    })
})

In [4]:
# Names of the POS classes; every id stored in `pos_tags_ud` is an index into
# this list.
label_names = dataset["train"].features["pos_tags_ud"].feature.names

label_names

['V',
 'NOUN',
 'ADP',
 'AUX',
 'VERB',
 'X',
 'PRON',
 'PROPN',
 'PART',
 'PUNCT',
 'CCONJ',
 'ADV',
 'NUM',
 'INTJ',
 'DET',
 'ADJ']

In [5]:
# First training row, used as the running example in the cells below.
example = dataset["train"][0]

example

{'conversation_id': 'TN001-DR300497-WAI3C',
 'speaker': 'A',
 'turn_number': 0,
 'tokens': ['喂',
  '遲',
  '啲',
  '去',
  '唔',
  '去',
  '旅行',
  '啊',
  '？',
  '你',
  '老公',
  '有冇',
  '平',
  '機票',
  '啊',
  '？'],
 'transcriptions': ['wai3',
  'ci4',
  'di1',
  'heoi3',
  'm4',
  'heoi3',
  'leoi5hang4',
  'aa3',
  'VQ6',
  'nei5',
  'lou5gung1',
  'jau5mou5',
  'peng4',
  'gei1piu3',
  'aa3',
  'VQ6'],
 'pos_tags_prf': [24,
  9,
  72,
  75,
  21,
  75,
  80,
  116,
  83,
  64,
  50,
  76,
  9,
  50,
  116,
  83],
 'pos_tags_ud': [13, 15, 8, 4, 11, 4, 1, 8, 9, 6, 1, 4, 15, 1, 8, 9]}

The `tokens` are the words in the sentence, and the `pos_tags_ud` are the corresponding labels.

In [6]:
print("Token => Label Name\n")

# Width of the token column. str.ljust pads shorter tokens with spaces and
# returns longer tokens unchanged; padding one space at a time in a
# `while len(...) != width` loop would never terminate for a token that is
# already longer than the width.
pre_length = 15
for token, tag in zip(example["tokens"], example["pos_tags_ud"]):
  tag_label = label_names[tag]
  print(f"{token.ljust(pre_length)} => {tag_label}")

Token => Label Name

喂               => INTJ
遲               => ADJ
啲               => PART
去               => VERB
唔               => ADV
去               => VERB
旅行              => NOUN
啊               => PART
？               => PUNCT
你               => PRON
老公              => NOUN
有冇              => VERB
平               => ADJ
機票              => NOUN
啊               => PART
？               => PUNCT


In [7]:
import random

# Set seed for reproducibility
random.seed(42)

# Separate 100 random rows for a new validation split.
# NOTE: this cell is not idempotent. Running it again without reloading
# `dataset` samples another 100 rows out of the already-reduced train split.
train_dataset = dataset["train"]
validation_indices = random.sample(range(len(train_dataset)), 100)
validation_dataset = train_dataset.select(validation_indices)

# Use a set for the membership test: the comprehension visits every row, and
# looking each index up in the list would rescan all sampled indices per row.
held_out = set(validation_indices)
train_dataset = train_dataset.select([i for i in range(len(train_dataset)) if i not in held_out])

# Update the dataset dictionary
dataset["train"] = train_dataset
dataset["validation"] = validation_dataset

# Display the new dataset splits
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'speaker', 'turn_number', 'tokens', 'transcriptions', 'pos_tags_prf', 'pos_tags_ud'],
        num_rows: 10701
    })
    validation: Dataset({
        features: ['conversation_id', 'speaker', 'turn_number', 'tokens', 'transcriptions', 'pos_tags_prf', 'pos_tags_ud'],
        num_rows: 100
    })
})

## Tokenization

The Tokenizer is used to convert sentences into [sub-words](https://towardsdatascience.com/a-comprehensive-guide-to-subword-tokenisers-4bbd3bad9a7c).

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Let's tokenize our example.

In [9]:
example

{'conversation_id': 'TN001-DR300497-WAI3C',
 'speaker': 'A',
 'turn_number': 0,
 'tokens': ['喂',
  '遲',
  '啲',
  '去',
  '唔',
  '去',
  '旅行',
  '啊',
  '？',
  '你',
  '老公',
  '有冇',
  '平',
  '機票',
  '啊',
  '？'],
 'transcriptions': ['wai3',
  'ci4',
  'di1',
  'heoi3',
  'm4',
  'heoi3',
  'leoi5hang4',
  'aa3',
  'VQ6',
  'nei5',
  'lou5gung1',
  'jau5mou5',
  'peng4',
  'gei1piu3',
  'aa3',
  'VQ6'],
 'pos_tags_prf': [24,
  9,
  72,
  75,
  21,
  75,
  80,
  116,
  83,
  64,
  50,
  76,
  9,
  50,
  116,
  83],
 'pos_tags_ud': [13, 15, 8, 4, 11, 4, 1, 8, 9, 6, 1, 4, 15, 1, 8, 9]}

In [10]:
# The example is already segmented into words, so pass the list of words and
# flag it with is_split_into_words=True rather than passing a raw string.
inputs = tokenizer(
    example["tokens"],
    is_split_into_words=True,
)

inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
print(inputs["input_ids"])

[2, 9370, 21175, 9354, 8964, 9273, 8964, 12679, 19484, 9321, 24244, 8058, 17855, 8482, 12936, 8505, 11114, 13596, 16527, 9321, 24244, 3]


Since the tokenizer converts the words into sub-words, the number of tokens will be greater than the number of labels, because each word has been split into one or more sub-words.

In [12]:
len(inputs["input_ids"])

22

In [13]:
len(example["pos_tags_ud"])

16

Now, let's create a function which takes in a group of examples, tokenizes each example, and aligns its labels.

In [14]:
# https://huggingface.co/docs/transformers/en/tasks/token_classification#preprocess
def tokenize_and_align(examples):
    """Tokenize a batch of pre-split sentences and align POS labels to the tokens.

    Relies on the module-level `tokenizer` and `max_length`.

    Args:
        examples: A batch of rows (dict of columns) containing "tokens" (one list
            of words per sentence) and "pos_tags_ud" (one list of label ids per
            sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry holding
        one list per sentence. The first sub-word token of every word carries
        that word's label; special tokens and the remaining sub-word tokens are
        set to -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, max_length=max_length)

    labels = []
    for i, label in enumerate(examples["pos_tags_ud"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Let's try this function on a group of examples.

In [15]:
examples = dataset["train"][:5]

In [16]:
inputs = tokenize_and_align(examples)

In [17]:
# For every tokenized example, show that the label list is as long as the id list.
for token_ids, aligned_labels in zip(inputs["input_ids"], inputs["labels"]):
  print(
    f">>> Length of input_ids: {len(token_ids)}",
    f">>> Length of labels: {len(aligned_labels)}",
    "",
    sep="\n",
  )

>>> Length of input_ids: 22
>>> Length of labels: 22

>>> Length of input_ids: 20
>>> Length of labels: 20

>>> Length of input_ids: 7
>>> Length of labels: 7

>>> Length of input_ids: 4
>>> Length of labels: 4

>>> Length of input_ids: 19
>>> Length of labels: 19



Nice! We have tokenized the sentences, and made sure that the labels for each sentence are the same length.

Notice, however, that each example has a different length. That's because we haven't applied **padding** yet; we will take care of this later.

In [18]:
# Tokenize and label-align every split, one batch of rows at a time.
# The original columns are removed so that only the tokenizer outputs and
# "labels" remain in the result.
tokenized_datasets = dataset.map(
    tokenize_and_align,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10701
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [20]:
print(tokenized_datasets["train"][0]['input_ids'])
print(tokenized_datasets["train"][0]['labels'])

[2, 9370, 21175, 9354, 8964, 9273, 8964, 12679, 19484, 9321, 24244, 8058, 17855, 8482, 12936, 8505, 11114, 13596, 16527, 9321, 24244, 3]
[-100, 13, 15, 8, 4, 11, 4, 1, -100, 8, 9, 6, 1, -100, 4, -100, 15, 1, -100, 8, 9, -100]


## Data Collation

Data collation means taking our dataset and organizing it into mini-batches.

You may have noticed that we haven't padded our dataset yet, even though models require every tensor in a batch to be of equal length. Padding the entire dataset at once would be inefficient, as we would be padding each tensor to the length of the longest tensor in the dataset.

Instead, we can do this for each mini-batch, so each tensor is only padded up to the largest tensor in its mini-batch. This saves unnecessary RAM and computation.

The data collator takes care of this for us.

In [21]:
from transformers import DataCollatorForTokenClassification

# Collator that pads the inputs and the labels of each mini-batch to a common
# length when the batch is assembled.
data_collator = DataCollatorForTokenClassification(tokenizer)

Let's test out our data collator on a small batch.

In [22]:
batch_pre_collation = [ tokenized_datasets["train"][i] for i in range(5) ]

In [23]:
# The loop variable is not called `example`, so the dataset row that an earlier
# cell stored under that name is not overwritten.
for features in batch_pre_collation:
  print(f">>> Length: {len(features['input_ids'])}")

>>> Length: 22
>>> Length: 20
>>> Length: 7
>>> Length: 4
>>> Length: 19


Right now, each example has a different length.

After we apply padding through the data collator, each example will have the length of the longest example in the mini-batch.

In [24]:
batch_collated = data_collator(batch_pre_collation)

In [25]:
# Iterate over the rows of the collated batch. The loop variable is not called
# `example`, so the dataset row that an earlier cell stored under that name is
# not replaced by a tensor.
for padded_ids in batch_collated["input_ids"]:
  print(f">>> Length: {len(padded_ids)}")

>>> Length: 22
>>> Length: 22
>>> Length: 22
>>> Length: 22
>>> Length: 22


In [26]:
batch_collated["input_ids"].shape

torch.Size([5, 22])

In [27]:
batch_collated["labels"].shape

torch.Size([5, 22])

## Prepare dataset for fine-tuning

Now that we've set up our data collator, let's apply it to our entire dataset.

# Fine-tuning!

## Load the pretrained `checkpoint` model

Before loading the model, let's create some maps which will be used to go back and forth between ids and labels.

In [28]:
# Forward map: class id -> class name, in the order given by `label_names`.
id2label = dict(enumerate(label_names))
# Reverse map: class name -> class id.
label2id = {name: idx for idx, name in id2label.items()}

In [29]:
id2label

{0: 'V',
 1: 'NOUN',
 2: 'ADP',
 3: 'AUX',
 4: 'VERB',
 5: 'X',
 6: 'PRON',
 7: 'PROPN',
 8: 'PART',
 9: 'PUNCT',
 10: 'CCONJ',
 11: 'ADV',
 12: 'NUM',
 13: 'INTJ',
 14: 'DET',
 15: 'ADJ'}

In [30]:
label2id

{'V': 0,
 'NOUN': 1,
 'ADP': 2,
 'AUX': 3,
 'VERB': 4,
 'X': 5,
 'PRON': 6,
 'PROPN': 7,
 'PART': 8,
 'PUNCT': 9,
 'CCONJ': 10,
 'ADV': 11,
 'NUM': 12,
 'INTJ': 13,
 'DET': 14,
 'ADJ': 15}

Now let's load our model.

The reason we created those maps above is so that the Inference API on the Hugging Face website can show us the label names such as "NOUN" and "VERB" instead of the ids.

In [31]:
from transformers import AutoModelForTokenClassification

# Load the pretrained `checkpoint` with a token-classification head. The label
# maps are passed along so the model config knows the classes; the classifier
# weights are newly initialized and are learned during fine-tuning.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): presumably required so the ELECTRA weights can be serialized
# when checkpoints are saved; confirm before removing.
if "electra" in model_name:
    # Make the model's parameters contiguous
    for param in model.parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at toastynews/electra-hongkongese-small-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model knows how many classes there are because of the maps we provided earlier.

In [32]:
model.config.num_labels

16

In [33]:
assert model.config.num_labels == len(label_names)

## Prepare model

## PushToHubCallback

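This callback tells Hugging Face to push the model to your Hugging Face profile while the model is training. (In this notebook, `push_to_hub=False` is set in `TrainingArguments`, and the model is pushed once after training with `trainer.push_to_hub()`.)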

To make this work, ensure that you have connected your HuggingFace account to the notebook.

## Train!

In [34]:
import wandb
from transformers import Trainer, TrainingArguments

# Start a Weights & Biases run; the Trainer reports to it via report_to="wandb".
wandb.init(project="CantoBERT")

training_args = TrainingArguments(
    # Checkpoints go to a local directory named after the last segment of `model_name`.
    output_dir=f'./{model_name.split("/")[-1]}',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",
    # A float below 1 is a fraction of the total number of training steps,
    # i.e. evaluate after every 10% of training.
    eval_steps=0.1,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Close the W&B run even when training raises or is interrupted, so a failed
# attempt does not leave the run open.
try:
    trainer.train()
finally:
    wandb.finish()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingfac



  0%|          | 0/2007 [00:00<?, ?it/s]

{'loss': 2.7471, 'grad_norm': 1.6966887712478638, 'learning_rate': 1.9900348779272545e-05, 'epoch': 0.01}
{'loss': 2.6772, 'grad_norm': 1.7285258769989014, 'learning_rate': 1.9800697558545096e-05, 'epoch': 0.03}
{'loss': 2.5853, 'grad_norm': 1.8951977491378784, 'learning_rate': 1.970104633781764e-05, 'epoch': 0.04}
{'loss': 2.5092, 'grad_norm': 1.7884441614151, 'learning_rate': 1.9601395117090184e-05, 'epoch': 0.06}
{'loss': 2.4149, 'grad_norm': 1.8379368782043457, 'learning_rate': 1.9501743896362734e-05, 'epoch': 0.07}
{'loss': 2.3312, 'grad_norm': 1.7341512441635132, 'learning_rate': 1.9402092675635278e-05, 'epoch': 0.09}
{'loss': 2.2581, 'grad_norm': 1.8665177822113037, 'learning_rate': 1.9302441454907822e-05, 'epoch': 0.1}
{'loss': 2.1931, 'grad_norm': 1.6170521974563599, 'learning_rate': 1.920279023418037e-05, 'epoch': 0.12}
{'loss': 2.1127, 'grad_norm': 1.9911563396453857, 'learning_rate': 1.9103139013452916e-05, 'epoch': 0.13}
{'loss': 2.0437, 'grad_norm': 1.6713836193084717, 'l

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.3203635215759277, 'eval_runtime': 1.7128, 'eval_samples_per_second': 58.383, 'eval_steps_per_second': 4.087, 'epoch': 0.3}
{'loss': 1.3994, 'grad_norm': 1.8292165994644165, 'learning_rate': 1.790732436472347e-05, 'epoch': 0.31}
{'loss': 1.3713, 'grad_norm': 1.63935124874115, 'learning_rate': 1.7807673143996016e-05, 'epoch': 0.33}
{'loss': 1.3052, 'grad_norm': 1.8651208877563477, 'learning_rate': 1.770802192326856e-05, 'epoch': 0.34}
{'loss': 1.2988, 'grad_norm': 1.6734015941619873, 'learning_rate': 1.7608370702541107e-05, 'epoch': 0.36}
{'loss': 1.2157, 'grad_norm': 1.8230482339859009, 'learning_rate': 1.7508719481813654e-05, 'epoch': 0.37}
{'loss': 1.1921, 'grad_norm': 1.8251394033432007, 'learning_rate': 1.7409068261086198e-05, 'epoch': 0.39}
{'loss': 1.1236, 'grad_norm': 1.669816493988037, 'learning_rate': 1.7309417040358745e-05, 'epoch': 0.4}
{'loss': 1.1378, 'grad_norm': 1.742148518562317, 'learning_rate': 1.7209765819631293e-05, 'epoch': 0.42}
{'loss': 1.0775, 'gr

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.7443869709968567, 'eval_runtime': 0.6796, 'eval_samples_per_second': 147.143, 'eval_steps_per_second': 10.3, 'epoch': 0.6}
{'loss': 0.8142, 'grad_norm': 1.8193641901016235, 'learning_rate': 1.591429995017439e-05, 'epoch': 0.61}
{'loss': 0.7977, 'grad_norm': 1.6800892353057861, 'learning_rate': 1.5814648729446936e-05, 'epoch': 0.63}
{'loss': 0.7796, 'grad_norm': 1.4025938510894775, 'learning_rate': 1.5714997508719484e-05, 'epoch': 0.64}
{'loss': 0.7695, 'grad_norm': 1.5352709293365479, 'learning_rate': 1.5615346287992027e-05, 'epoch': 0.66}
{'loss': 0.7294, 'grad_norm': 1.4970735311508179, 'learning_rate': 1.5515695067264575e-05, 'epoch': 0.67}
{'loss': 0.7273, 'grad_norm': 1.5480180978775024, 'learning_rate': 1.541604384653712e-05, 'epoch': 0.69}
{'loss': 0.6948, 'grad_norm': 1.9629114866256714, 'learning_rate': 1.531639262580967e-05, 'epoch': 0.7}
{'loss': 0.7332, 'grad_norm': 1.3671932220458984, 'learning_rate': 1.5216741405082214e-05, 'epoch': 0.72}
{'loss': 0.7071, 

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5415219664573669, 'eval_runtime': 0.6316, 'eval_samples_per_second': 158.322, 'eval_steps_per_second': 11.083, 'epoch': 0.9}
{'loss': 0.5894, 'grad_norm': 1.6453114748001099, 'learning_rate': 1.3921275535625311e-05, 'epoch': 0.91}
{'loss': 0.601, 'grad_norm': 1.1581220626831055, 'learning_rate': 1.382162431489786e-05, 'epoch': 0.93}
{'loss': 0.5887, 'grad_norm': 1.5204392671585083, 'learning_rate': 1.3721973094170404e-05, 'epoch': 0.94}
{'loss': 0.5595, 'grad_norm': 1.3672395944595337, 'learning_rate': 1.3622321873442953e-05, 'epoch': 0.96}
{'loss': 0.5751, 'grad_norm': 1.778099775314331, 'learning_rate': 1.3522670652715496e-05, 'epoch': 0.97}
{'loss': 0.5438, 'grad_norm': 2.2297160625457764, 'learning_rate': 1.3423019431988042e-05, 'epoch': 0.99}
{'loss': 0.5579, 'grad_norm': 1.2353484630584717, 'learning_rate': 1.332336821126059e-05, 'epoch': 1.0}
{'loss': 0.5256, 'grad_norm': 1.6056938171386719, 'learning_rate': 1.3223716990533135e-05, 'epoch': 1.02}
{'loss': 0.546, 

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.4398439824581146, 'eval_runtime': 0.5717, 'eval_samples_per_second': 174.918, 'eval_steps_per_second': 12.244, 'epoch': 1.2}
{'loss': 0.4881, 'grad_norm': 1.3361326456069946, 'learning_rate': 1.1928251121076233e-05, 'epoch': 1.21}
{'loss': 0.4704, 'grad_norm': 2.115931510925293, 'learning_rate': 1.182859990034878e-05, 'epoch': 1.23}
{'loss': 0.5047, 'grad_norm': 1.4886726140975952, 'learning_rate': 1.1728948679621325e-05, 'epoch': 1.24}
{'loss': 0.4561, 'grad_norm': 1.8569788932800293, 'learning_rate': 1.1629297458893873e-05, 'epoch': 1.26}
{'loss': 0.4433, 'grad_norm': 0.9695405960083008, 'learning_rate': 1.1529646238166418e-05, 'epoch': 1.27}
{'loss': 0.4515, 'grad_norm': 1.5292043685913086, 'learning_rate': 1.1429995017438964e-05, 'epoch': 1.29}
{'loss': 0.4763, 'grad_norm': 1.6215846538543701, 'learning_rate': 1.1330343796711511e-05, 'epoch': 1.3}
{'loss': 0.4526, 'grad_norm': 1.4697340726852417, 'learning_rate': 1.1230692575984056e-05, 'epoch': 1.32}
{'loss': 0.446

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.37440216541290283, 'eval_runtime': 0.6501, 'eval_samples_per_second': 153.828, 'eval_steps_per_second': 10.768, 'epoch': 1.5}
{'loss': 0.4106, 'grad_norm': 2.411201238632202, 'learning_rate': 9.935226706527156e-06, 'epoch': 1.51}
{'loss': 0.415, 'grad_norm': 1.6365492343902588, 'learning_rate': 9.835575485799702e-06, 'epoch': 1.52}
{'loss': 0.3969, 'grad_norm': 1.3893998861312866, 'learning_rate': 9.735924265072247e-06, 'epoch': 1.54}
{'loss': 0.4248, 'grad_norm': 1.9409549236297607, 'learning_rate': 9.636273044344795e-06, 'epoch': 1.55}
{'loss': 0.404, 'grad_norm': 1.6062811613082886, 'learning_rate': 9.53662182361734e-06, 'epoch': 1.57}
{'loss': 0.4226, 'grad_norm': 1.87498939037323, 'learning_rate': 9.436970602889887e-06, 'epoch': 1.58}
{'loss': 0.3953, 'grad_norm': 1.8845279216766357, 'learning_rate': 9.337319382162433e-06, 'epoch': 1.6}
{'loss': 0.4174, 'grad_norm': 1.2180052995681763, 'learning_rate': 9.237668161434978e-06, 'epoch': 1.61}
{'loss': 0.3978, 'grad_no

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.3419061303138733, 'eval_runtime': 0.5863, 'eval_samples_per_second': 170.548, 'eval_steps_per_second': 11.938, 'epoch': 1.8}
{'loss': 0.3791, 'grad_norm': 2.1985604763031006, 'learning_rate': 7.942202291978076e-06, 'epoch': 1.81}
{'loss': 0.3814, 'grad_norm': 1.1020593643188477, 'learning_rate': 7.842551071250624e-06, 'epoch': 1.82}
{'loss': 0.3996, 'grad_norm': 1.115397334098816, 'learning_rate': 7.742899850523171e-06, 'epoch': 1.84}
{'loss': 0.344, 'grad_norm': 1.2038968801498413, 'learning_rate': 7.643248629795715e-06, 'epoch': 1.85}
{'loss': 0.3726, 'grad_norm': 1.2313005924224854, 'learning_rate': 7.543597409068262e-06, 'epoch': 1.87}
{'loss': 0.3769, 'grad_norm': 1.5787543058395386, 'learning_rate': 7.443946188340808e-06, 'epoch': 1.88}
{'loss': 0.3631, 'grad_norm': 1.1126751899719238, 'learning_rate': 7.344294967613354e-06, 'epoch': 1.9}
{'loss': 0.3934, 'grad_norm': 1.3378781080245972, 'learning_rate': 7.2446437468859e-06, 'epoch': 1.91}
{'loss': 0.3652, 'grad_n

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.3124505877494812, 'eval_runtime': 0.6024, 'eval_samples_per_second': 166.002, 'eval_steps_per_second': 11.62, 'epoch': 2.1}
{'loss': 0.3254, 'grad_norm': 1.2780097723007202, 'learning_rate': 5.949177877428999e-06, 'epoch': 2.11}
{'loss': 0.3591, 'grad_norm': 1.10097336769104, 'learning_rate': 5.8495266567015455e-06, 'epoch': 2.12}
{'loss': 0.3655, 'grad_norm': 1.2193394899368286, 'learning_rate': 5.749875435974092e-06, 'epoch': 2.14}
{'loss': 0.3236, 'grad_norm': 1.230116844177246, 'learning_rate': 5.6502242152466365e-06, 'epoch': 2.15}
{'loss': 0.3393, 'grad_norm': 0.9994789958000183, 'learning_rate': 5.550572994519184e-06, 'epoch': 2.17}
{'loss': 0.3571, 'grad_norm': 1.1061352491378784, 'learning_rate': 5.45092177379173e-06, 'epoch': 2.18}
{'loss': 0.3535, 'grad_norm': 1.0031763315200806, 'learning_rate': 5.351270553064275e-06, 'epoch': 2.2}
{'loss': 0.3342, 'grad_norm': 1.4633980989456177, 'learning_rate': 5.251619332336821e-06, 'epoch': 2.21}
{'loss': 0.3335, 'grad_

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.29497379064559937, 'eval_runtime': 0.5101, 'eval_samples_per_second': 196.033, 'eval_steps_per_second': 13.722, 'epoch': 2.4}
{'loss': 0.3568, 'grad_norm': 1.3276641368865967, 'learning_rate': 3.956153462879921e-06, 'epoch': 2.41}
{'loss': 0.3215, 'grad_norm': 1.35435950756073, 'learning_rate': 3.8565022421524665e-06, 'epoch': 2.42}
{'loss': 0.3241, 'grad_norm': 1.1545326709747314, 'learning_rate': 3.7568510214250124e-06, 'epoch': 2.44}
{'loss': 0.332, 'grad_norm': 1.1092991828918457, 'learning_rate': 3.657199800697559e-06, 'epoch': 2.45}
{'loss': 0.3316, 'grad_norm': 1.5443695783615112, 'learning_rate': 3.5575485799701047e-06, 'epoch': 2.47}
{'loss': 0.2928, 'grad_norm': 0.925728976726532, 'learning_rate': 3.4578973592426514e-06, 'epoch': 2.48}
{'loss': 0.338, 'grad_norm': 1.109576940536499, 'learning_rate': 3.358246138515197e-06, 'epoch': 2.5}
{'loss': 0.3192, 'grad_norm': 1.0123553276062012, 'learning_rate': 3.258594917787743e-06, 'epoch': 2.51}
{'loss': 0.3136, 'gra

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.28457602858543396, 'eval_runtime': 0.5153, 'eval_samples_per_second': 194.066, 'eval_steps_per_second': 13.585, 'epoch': 2.7}
{'loss': 0.3014, 'grad_norm': 0.8153993487358093, 'learning_rate': 1.963129048330842e-06, 'epoch': 2.71}
{'loss': 0.3021, 'grad_norm': 1.1058731079101562, 'learning_rate': 1.8634778276033883e-06, 'epoch': 2.72}
{'loss': 0.3103, 'grad_norm': 2.494328022003174, 'learning_rate': 1.7638266068759345e-06, 'epoch': 2.74}
{'loss': 0.3247, 'grad_norm': 1.0259900093078613, 'learning_rate': 1.6641753861484806e-06, 'epoch': 2.75}
{'loss': 0.2986, 'grad_norm': 0.9228556156158447, 'learning_rate': 1.5645241654210267e-06, 'epoch': 2.77}
{'loss': 0.3221, 'grad_norm': 1.3900444507598877, 'learning_rate': 1.4648729446935724e-06, 'epoch': 2.78}
{'loss': 0.2941, 'grad_norm': 1.0223788022994995, 'learning_rate': 1.3652217239661186e-06, 'epoch': 2.8}
{'loss': 0.3094, 'grad_norm': 1.8018395900726318, 'learning_rate': 1.2655705032386647e-06, 'epoch': 2.81}
{'loss': 0.33

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▄▃▂▂▁▁▁▁
eval/runtime,█▂▂▁▂▁▂▁▁
eval/samples_per_second,▁▆▆▇▆▇▆██
eval/steps_per_second,▁▆▆▇▆▇▆██
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▅▅▅▅▄▅▃▄▄▄▃▃▂▄▂▆▆▃▅▃▄▂▂▂▁▃▆▁▂▃▂▁▁▁█▃▁▅▆▃
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▆▅▄▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.28458
eval/runtime,0.5153
eval/samples_per_second,194.066
eval/steps_per_second,13.585
total_flos,159616366705152.0
train/epoch,3.0
train/global_step,2007.0
train/grad_norm,1.33616
train/learning_rate,0.0
train/loss,0.3333


In [35]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/53.7M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AlienKevin/electra_hongkongese_small_pos_hkcancor/commit/7d96dd4113a154dff2ad1cd91d59aca9e8cfb526', commit_message='End of training', commit_description='', oid='7d96dd4113a154dff2ad1cd91d59aca9e8cfb526', pr_url=None, pr_revision=None, pr_num=None)

# Inference

Now that our model is ready, we can try it out using the nifty `pipeline` API.

In [36]:
from transformers import pipeline

# Load the fine-tuned model back from the Hub by its repository id.
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True
# flag and groups adjacent tokens that share a predicted tag in the same way.
classifier = pipeline(
    "token-classification",
    model_name,
    aggregation_strategy="simple",
)

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/53.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/128k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/600k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [37]:
input = "醫院喺邊度？"

In [38]:
classifier(input)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'NOUN',
  'score': 0.83947504,
  'word': '醫 院',
  'start': 0,
  'end': 2},
 {'entity_group': 'ADP',
  'score': 0.40372804,
  'word': '喺',
  'start': 2,
  'end': 3},
 {'entity_group': 'PRON',
  'score': 0.9002198,
  'word': '邊',
  'start': 3,
  'end': 4},
 {'entity_group': 'ADV',
  'score': 0.7424585,
  'word': '度',
  'start': 4,
  'end': 5},
 {'entity_group': 'PUNCT',
  'score': 0.91792315,
  'word': '？',
  'start': 5,
  'end': 6}]

: 