In [1]:
!pip install -qqq -U transformers datasets huggingface_hub accelerate bitsandbytes tqdm --progress-bar off
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install -qqq -U flash-attn --no-build-isolation pip install flash-attn --progress-bar off

In [2]:
from huggingface_hub import login
# Log in to the Hugging Face Hub. login() is called without arguments, so no access token
# is hard-coded in the notebook; the credentials are entered interactively instead.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import transformers
import torch
from peft import PeftModel

# Pick the compute dtype and attention backend from the GPU's compute capability:
# a major version of 8 or higher selects bfloat16 with FlashAttention-2, anything
# lower falls back to float16 with the eager attention implementation.
if torch.cuda.get_device_capability()[0] >= 8:
    # NOTE(review): flash-attn is also installed by the first cell, so this in-branch
    # install looks redundant — confirm before removing.
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

# Hub repositories: the base checkpoint plus two PEFT adapters that are applied in order below.
# Judging by the repo names, the first adapter covers language tags and the second POS tags.
base_model = "meta-llama/Meta-Llama-3-8B"
base2_model = "AlienKevin/Meta-Llama-3-8B-tagllm-lang-1-fixed-embed"
new_model = "AlienKevin/Meta-Llama-3-8B-tagllm-pos-1-fixed-embed"

# 4-bit NF4 quantization with double quantization; compute uses the dtype chosen above.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Tag tokens. `num_token_per_tag` is passed to get_dataset() later in the notebook;
# `pos_tokens` is also read as a global by get_dataset() and parse_translation().
num_token_per_tag = 1
lang_tokens = ['<|TOK0|>', '<|TOK1|>', '<|TOK2|>']
pos_tokens = ['<|TOK200|>']

# Register all tag tokens (language tokens first, then the POS token) as additional special tokens.
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model, additional_special_tokens=lang_tokens + pos_tokens)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)
# Stage 1: size the embedding matrix to one entry fewer than the tokenizer before loading
# the first adapter. Presumably this matches the vocabulary that adapter was trained with
# (every added token except the single POS token) — TODO confirm.
model.resize_token_embeddings(len(tokenizer) - 1)
model = PeftModel.from_pretrained(model, base2_model)
# Merge the adapter into the model so the next adapter is loaded on top of the merged result.
model = model.merge_and_unload()

# Stage 2: grow the embeddings to the full tokenizer size, then load and merge the second adapter.
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Text-generation pipeline around the merged model; the generation cell calls it on the eval inputs.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [4]:
from datasets import load_dataset, features

def patch_v(tag):
    """Return 'VERB' when `tag` is exactly 'V'; return every other tag unchanged."""
    return 'VERB' if tag == 'V' else tag

def get_dataset(num_existing_tokens=0, num_token_per_tag=1, num_examples=10, seed=42):
    """Build the few-shot prompt and the evaluation inputs for POS tagging.

    Few-shot examples come from the "hkcancor" train split; evaluation sentences come
    from the "universal_dependencies" / "yue_hk" test split. Every sentence line is
    prefixed with the 'yue' language tag and every tag line with the joined `pos_tokens`.

    Relies on module-level names: `load_dataset`, `patch_v` and `pos_tokens`.
    Prints the source tag set and the assembled prompt as a side effect.

    Args:
        num_existing_tokens: index of the first <|TOKi|> token to hand out.
        num_token_per_tag: number of consecutive <|TOKi|> tokens per language tag.
        num_examples: number of few-shot examples placed in the prompt.
        seed: seed for shuffling the train split before the examples are selected.

    Returns:
        (prompt, test_dataset, tag_name_dict): the few-shot prompt string, the test
        dataset reduced to a single "inputs" column, and the language -> tag mapping.
    """
    single_lang = ["eng", "yue", "cmn"]

    # Hand out consecutive <|TOKi|> tokens, `num_token_per_tag` per language, in list order.
    tag_name_dict = {}
    for lang in single_lang:
        token_ids = range(num_existing_tokens, num_existing_tokens + num_token_per_tag)
        tag_name_dict[lang] = "".join(f'<|TOK{i}|>' for i in token_ids)
        num_existing_tokens += num_token_per_tag

    yue_tag = tag_name_dict["yue"]
    pos_prefix = ''.join(pos_tokens)

    train_dataset = load_dataset("hkcancor", "default")['train']
    source_upos = train_dataset.features["pos_tags_ud"].feature
    print("Source upos:", source_upos)

    def format_train_example(example):
        # One example becomes "<lang tag>tokens\n<pos tag>lower-cased tag names".
        tag_names = ' '.join(patch_v(source_upos.int2str(tag)).lower()
                             for tag in example["pos_tags_ud"])
        example["input"] = yue_tag + ' '.join(example["tokens"]) + "\n" + pos_prefix + tag_names
        return example

    # Each column is listed once; only the new "input" column is kept.
    train_dataset = train_dataset.map(format_train_example, remove_columns=
        ['tokens', 'conversation_id', 'pos_tags_prf', 'pos_tags_ud', 'speaker', 'transcriptions', 'turn_number'])

    # Shuffle with a fixed seed, then keep the first `num_examples` rows as the few-shot block.
    few_shot = train_dataset.shuffle(seed=seed).select(range(num_examples))
    prompt = '\n'.join(example['input'] for example in few_shot) + '\n'
    print(prompt)

    test_dataset = load_dataset("universal_dependencies", "yue_hk")["test"]

    def format_test_batch(examples):
        # Each input is the few-shot prompt, the tagged sentence, and an open tag line
        # for the model to complete.
        examples["inputs"] = [prompt + yue_tag + ' '.join(tokens) + "\n" + pos_prefix
                              for tokens in examples["tokens"]]
        return examples

    test_dataset = test_dataset.map(format_test_batch, remove_columns=
        ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'], batched=True)

    return prompt, test_dataset, tag_name_dict

In [5]:
prompt, eval_dataset, tag_name_dict = get_dataset(num_token_per_tag=num_token_per_tag)

Source upos: ClassLabel(names=['NUM', 'ADP', 'INTJ', 'PROPN', 'ADJ', 'V', 'DET', 'ADV', 'CCONJ', 'PRON', 'X', 'PART', 'AUX', 'VERB', 'NOUN', 'PUNCT'], id=None)
<|TOK1|>啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …
<|TOK200|>noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct
<|TOK1|>唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。
<|TOK200|>intj punct verb pron noun part punct verb part part punct adv verb part punct
<|TOK1|>夠 嚹 。
<|TOK200|>verb part punct
<|TOK1|>噉 另外 就 同 佢 太太 之間 呢 亦都 有 個 中年 婚姻 危機 𡃉 。 即係 覺得 太太 ， 冇 理由 我 太太 會 重 - 重 愛 我 吖 。 即係 我 事業 又 唔 得 ， 要 樣 冇 樣 ， 要 錢 冇 錢 噉樣 。 噉 另外 有 個 黑人 嘅 。 就 係 鬍鬚 阿伯 ， 就 成 五十 歲 嘅 。 點解 會 揾 到 佢 ？ 因為 佢 要 揾 個 人 教 跳舞 吖 嗎 。 而 哩個 阿伯 係 識 跳 所有 爵士舞 ， 只不過 係 因為 骨頭 硬 。 即係 跳 起 身 ， 跌落 地下 之後 就 起 唔 到 身 𡃉 嘞 。 但係 之前 嗰 段 係 得 嘅 。
<|TOK200|>cconj cconj adv adp pron noun adv part adv verb noun noun noun noun part punct cconj verb noun punct verb noun pron noun aux adv punct adv verb pron part punct cconj pron noun adv adv verb punct aux noun verb noun punct aux noun

In [6]:
eval_dataset

Dataset({
    features: ['inputs'],
    num_rows: 1004
})

In [7]:
eval_dataset[:2]

{'inputs': ['<|TOK1|>啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …\n<|TOK200|>noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct\n<|TOK1|>唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。\n<|TOK200|>intj punct verb pron noun part punct verb part part punct adv verb part punct\n<|TOK1|>夠 嚹 。\n<|TOK200|>verb part punct\n<|TOK1|>噉 另外 就 同 佢 太太 之間 呢 亦都 有 個 中年 婚姻 危機 𡃉 。 即係 覺得 太太 ， 冇 理由 我 太太 會 重 - 重 愛 我 吖 。 即係 我 事業 又 唔 得 ， 要 樣 冇 樣 ， 要 錢 冇 錢 噉樣 。 噉 另外 有 個 黑人 嘅 。 就 係 鬍鬚 阿伯 ， 就 成 五十 歲 嘅 。 點解 會 揾 到 佢 ？ 因為 佢 要 揾 個 人 教 跳舞 吖 嗎 。 而 哩個 阿伯 係 識 跳 所有 爵士舞 ， 只不過 係 因為 骨頭 硬 。 即係 跳 起 身 ， 跌落 地下 之後 就 起 唔 到 身 𡃉 嘞 。 但係 之前 嗰 段 係 得 嘅 。\n<|TOK200|>cconj cconj adv adp pron noun adv part adv verb noun noun noun noun part punct cconj verb noun punct verb noun pron noun aux adv punct adv verb pron part punct cconj pron noun adv adv verb punct aux noun verb noun punct aux noun verb noun cconj punct cconj cconj verb noun noun part punct adv verb noun noun punct adv num num noun part punct pron aux verb part pron pu

In [8]:
# https://huggingface.co/PygmalionAI/pygmalion-6b/discussions/25#64387bf26c8841ba74e7d9c0
from transformers import StoppingCriteria

class TranslationStoppingCriteria(StoppingCriteria):
    """Stopping criterion that ends generation once the decoded text ends with a newline.

    Only the first sequence in `input_ids` is inspected. Decoding uses the
    module-level `tokenizer`.

    Args:
        prompt: text stripped from the front of the decoded sequence before the check.
    """

    def __init__(self, prompt):
        self.prompt = prompt

    def __call__(self, input_ids, scores, **kwargs):
        """Return True to stop generation, False to continue."""
        # Decode the whole first sequence (prompt plus everything generated so far).
        generated_text = tokenizer.decode(input_ids[0])
        # Strip the prompt this instance was built with, not whatever the global `prompt`
        # happens to be. This only changes the result when the decoded text is exactly
        # the prompt: the remainder is then empty and generation continues.
        generated_text = generated_text.removeprefix(self.prompt)
        return generated_text.endswith('\n')

    def __len__(self):
        # Together with __iter__, lets one instance stand in for a one-element
        # collection of criteria.
        return 1

    def __iter__(self):
        yield self

In [None]:
from transformers.pipelines.pt_utils import KeyDataset
from tqdm import tqdm
import json

# Run the text-generation pipeline over the "inputs" column of eval_dataset.
# Sampling is enabled (temperature 0.6, top-p 0.9), so repeated runs can produce different tags.
# NOTE(review): no random seed is set in the visible cells — confirm whether sampling
# without a seed is intended for this evaluation.
outputs = pipeline(
    KeyDataset(eval_dataset, "inputs"),
    max_new_tokens=128,  # upper bound on newly generated tokens per input
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    # Stops generation once the decoded text ends with a newline (see TranslationStoppingCriteria).
    stopping_criteria=TranslationStoppingCriteria(prompt),
    pad_token_id=tokenizer.eos_token_id,  # the EOS token id doubles as the padding id
)

def parse_translation(text, tag_dict=None):
    """Split generated text into tagged lines.

    Each non-blank line that starts with one of the tag prefixes is recorded: its key
    goes into 'langs' and the rest of the line, stripped, into 'sents'. Lines that
    match no prefix are ignored.

    Args:
        text: generated text, one tagged segment per line.
        tag_dict: optional mapping from key to line prefix. When omitted, the prefixes
            used to build the prompts are taken from the module-level names:
            `tag_name_dict['yue']` for 'yue' and the single entry of `pos_tokens` for 'pos'.

    Returns:
        dict with two parallel lists, 'langs' and 'sents'.
    """
    if tag_dict is None:
        # The default mapping assumes exactly one POS tag token.
        assert(len(pos_tokens) == 1)
        # The prompts prefix lines with the tag tokens themselves, so those are what
        # must be matched here. The former 'yue:' / 'pos:' prefixes never occur in the
        # generated text and produced empty results.
        tag_dict = { 'yue': tag_name_dict['yue'], 'pos': pos_tokens[0] }

    result = { 'langs': [], 'sents': [] }

    for line in text.strip().split('\n'):
        if len(line.strip()) == 0:
            continue
        # The first prefix that matches wins; at most one entry is recorded per line.
        for lang, tag in tag_dict.items():
            if line.startswith(tag):
                result['langs'].append(lang)
                result['sents'].append(line.removeprefix(tag).strip())
                break

    return result

# Write one JSON record per evaluated input to a JSON Lines file named after the
# adapter repo (the part of `new_model` after the "/").
with open(f'pos_{new_model.split("/")[1]}.jsonl', 'w+') as f:
    # total= is passed explicitly so tqdm can size the progress bar.
    for output in tqdm(outputs, total=len(eval_dataset)):
        generated_text = output[0]['generated_text']
        # Drop the shared few-shot prompt; what remains is the test sentence line
        # followed by the tag line.
        generated_text = generated_text.removeprefix(prompt)
        print(generated_text)
        f.write(json.dumps(parse_translation(generated_text)) + '\n')
        # Flush after every record — presumably so partial results survive an interrupted run.
        f.flush()

  0%|          | 1/1004 [00:07<2:03:03,  7.36s/it]

<|TOK1|>你 喺度 搵 乜嘢 呀 ？
<|TOK200|>cconj pron adv noun part punct cconj pron noun part num noun part punct coun noun part cconj pron noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun


  0%|          | 2/1004 [00:14<1:59:37,  7.16s/it]

<|TOK1|>咪 執 返 啲 嘢 去 阿哥 個 新 屋 度 囖 。
<|TOK200|>cconj pron verb part num noun part punct cconj pron verb part num noun verb part num noun part punct counj part verb num noun part punct counj part verb part num noun part num noun part punct counj part num noun part num noun part punct counj part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num num noun part num noun part num num noun part num num noun part num num noun part num num noun part num num noun part num noun part num num noun part num


  0%|          | 3/1004 [00:21<1:58:34,  7.11s/it]

<|TOK1|>該 拎 嗰啲 都 拎 走 咗 啦 ！
<|TOK200|>counj punct cconj pron noun part punct councer noun part num noun part punct councer noun part noun punct councer noun part noun part councer noun part noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part councer noun part


  0%|          | 4/1004 [00:28<1:57:51,  7.07s/it]

<|TOK1|>剩 落 呢啲 都 係 冇用 㗎 喇 ！
<|TOK200|>cconj pron verb part punct cconj pron verb part num noun part verb punct cconj pron verb part num noun part punct cconj pron verb part num noun part punctuation counj part num noun part verb part num noun part num verb part num noun part verb part num noun part num verb part num noun part num noun part num noun part num noun part num verb part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num num noun part num num noun part num noun part num num noun part num num noun part num noun


  0%|          | 5/1004 [00:35<1:57:23,  7.05s/it]

<|TOK1|>噉 都 要 執  ！
<|TOK200|>icounj punct cconj pron verb part num noun part punct cjon pro verb part num noun part num noun verb part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num num noun part num noun part num noun part num noun part num num noun part num num noun part num num num num noun part num num noun part num num num num num noun part num num num noun part num num noun part num num num num num num num noun part num


  1%|          | 6/1004 [00:42<1:57:06,  7.04s/it]

<|TOK1|>係 喇 ， 豪仔 今晚 返 唔 返嚟 食飯 呀 ？
<|TOK200|>cconj pron verb part num noun verb part punct cconj pron verb part num noun verb part punct cjon ver noun part num noun verb part num noun part punct cjon ver noun part num noun part cjon ver noun part num noun part num noun part num noun part cjon ver noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num num noun part num noun part


  1%|          | 7/1004 [00:49<1:57:02,  7.04s/it]

<|TOK1|>佢 日日 都 同 女朋友 去 玩 煮飯仔 ， 鬼 得閒 理 我哋 咩 ？
<|TOK200|>cconj pron verb part num noun part punct cconj pron verb part num noun part punct cconj pron verb part num noun part punct cconj pron verb part num noun part punct cunjoun part num noun part num noun part cunjoun counc pron verb part num noun part num noun part num noun part cunjoun counc pron verb part num noun part num noun part num noun part num noun part cunjoun counc pron verb part num noun part num noun part cunjoun cence pron verb part num noun part num noun part num noun part num noun part num noun part num noun part num noun part


  1%|          | 8/1004 [00:56<1:56:44,  7.03s/it]

<|TOK1|>呢啲 都 唔 要 㗎 啦 ， 可 ？
<|TOK200|>cconj pron verb part num noun punct cjon pron verb part num noun punct conjon verb part num noun part num noun part conjon verb part num noun part conjon verb part num noun part conjon verb part conjon verb part num noun part conjon verb part num noun part conjon verb part conjon verb part num noun part conjon verb part num noun part conjon verb part num noun part conjon verb part num noun part conjon verb part num noun part conjon verb part num noun part conjon verb part num noun part conjon


  1%|          | 9/1004 [01:03<1:56:30,  7.03s/it]

<|TOK1|>部 機 都 冇 咗 嘞 。
<|TOK200|>intj punct cconj pron verb part num noun punct cconj pron verb part num noun part cconj pron verb part num noun part cconj pron verb part num noun part cconj pron verb part num noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part cun cjon noun part c


  1%|          | 10/1004 [01:04<1:25:54,  5.19s/it]

<|TOK1|>爺爺 ， 我 可 唔 可以 睇 新 一 集 嘅 龍珠 呀 ？
<|TOK200|>counjunct cconj pron verb part num noun part punct cunjoun



  1%|          | 11/1004 [01:11<1:35:02,  5.74s/it]

<|TOK1|>等 你 媽咪 收工 返嚟 ， 同 你 對 晒 啲 功課 先 睇 啦 ！
<|TOK200|>counj punct cconj pron noun part punct cfnosf partn verb noun part punct cfnosf partn verb noun partnum noun part punct cfnosf partn verb noun partnum noun partnum verb noun partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partnum n verb partum n verb n partum n verb partum n verb n partum n verb n part


  1%|          | 12/1004 [01:18<1:41:30,  6.14s/it]

<|TOK1|>爺爺 ， 我 做 晒 功課 喇 ！
<|TOK200|>counj punct cconj pron verb part num noun noun part punct cconj pron noun part num noun part punct cconj pron noun part num noun part punct councj pron noun part num noun part punct councj pron noun part num noun part punct councj pron noun part num noun part punct councj pron noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num num noun part


  1%|▏         | 13/1004 [01:25<1:45:43,  6.40s/it]

<|TOK1|>默書 呢 ？
<|TOK200|> cunj punct cjjunct cunj cjjunct cjjunct cunj cjjunct cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj cjjunct cunj c


  1%|▏         | 14/1004 [01:32<1:48:47,  6.59s/it]

<|TOK1|>冇 呀 ， 爺爺 。
<|TOK200|>intj punct cjonous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous cajounous caj


  1%|▏         | 15/1004 [01:39<1:50:49,  6.72s/it]

<|TOK1|>冇 ？
<|TOK200|>conj pron adv punct counc pron verb part num noun noun part punct counc pro noun verb part num noun part punct counc pro noun verb part num noun noun part punct counc pro noun verb part num noun verb part num noun verb part num noun noun part num noun verb part num noun verb part num noun noun part num noun part num noun part num noun verb part num noun part num noun verb part num num noun noun part num num noun part num noun verb part num noun part num num noun verb part num noun part num num noun verb part num num noun part num noun num part num num num num noun part num noun part


  2%|▏         | 16/1004 [01:46<1:52:06,  6.81s/it]

<|TOK1|>爺爺 ， 好 好睇 㗎 ！
<|TOK200|>cconj pron noun part punct cconj pron verb part num noun noun part punct councjon part noun noun part verb punct counjon part num noun part noun part num noun part punct councjon part num noun part num noun part num noun part num noun part councjon part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num num noun part num num noun part num num noun part num noun part num num noun part num num noun part num num num num noun part num num noun part num num num noun part num num noun part


  2%|▏         | 17/1004 [01:53<1:53:01,  6.87s/it]

<|TOK1|>你 ⋯⋯ 同 我 睇 啦 ！
<|TOK200|>coun punct cconj pron verb part num noun punct counc pro verb part num noun noun part punct cconj pro verb part num noun pro verb part num noun verb part num noun pro verb part num noun part counc pro verb part num noun part num noun pro verb part num noun pro verb part num noun pro verb part num pro verb part num noun pro verb part num noun pro verb part num noun pro verb part num noun pro verb part num noun pro verb part num noun pro verb part num noun pro verb part num noun pro verb part num pro verb num noun pro verb part num noun pro verb part num noun pro verb part


  2%|▏         | 18/1004 [02:00<1:53:34,  6.91s/it]

<|TOK1|>唔 睇 。
<|TOK200|>coun noun part punct conj punct cconj pron noun part punct conj punct conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part conj pr noun part c


  2%|▏         | 19/1004 [02:07<1:53:54,  6.94s/it]

<|TOK1|>我 都 唔 睇 呢啲 公仔 嘅 。
<|TOK200|>cconj pron dver verb part num noun verb part num noun part punct cconj pron verb part num noun part punct cconj pron verb part num noun part verb punct conjoun part num noun part num noun part verb part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num num noun part num noun part num noun part num num noun part num noun part num num num noun part num num noun part num num noun part num num noun part num num num num noun part num


  2%|▏         | 20/1004 [02:14<1:54:04,  6.96s/it]

<|TOK1|>爺爺 ， 你 冇 tei1si2 嘅 ！
<|TOK200|>cunj punct cconj pron noun part num noun punct cconj pron verb noun part num noun part punct cconj pron verb noun part num noun punct cconj pron noun part num noun part num noun part num noun part cunj punct councer noun part num noun part counjunct councer noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num


  2%|▏         | 21/1004 [02:22<1:55:50,  7.07s/it]

<|TOK1|>好 好睇 㗎 ！
<|TOK200|>cconj pron adpunc verb part num noun part punct conj punctosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunkosimfunk


  2%|▏         | 22/1004 [02:29<1:55:47,  7.07s/it]

<|TOK1|>舊陣時 呢 ， 龍珠 係 好 興 㗎 。
<|TOK200|>intj punct cconj pron noun part num noun punct counj part num noun punct councj part num noun part num noun part punct counj part num noun part num noun part punct counj part num noun part num noun part num noun part num noun part councj part num noun part num noun part num noun part num noun part num noun part councj part num noun part num noun part num noun part councj part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part


  2%|▏         | 23/1004 [02:36<1:55:38,  7.07s/it]

<|TOK1|>成日 都 講 埋 啲 咩 龜波氣功 吖 ， 打交 啦 。
<|TOK200|>cjonjunct cconj pron verb part num noun part punct cconj pron noun part num noun part num noun part punct conjunct cconj pron noun part num noun part conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conjunct conj


  2%|▏         | 24/1004 [02:43<1:55:31,  7.07s/it]

<|TOK1|>三 歲 到 卅 歲 都 沉迷 。
<|TOK200|> cconj pron noun part punct cconj pron noun part num noun part punct counj pron noun part num noun part num noun part councj pron noun part num noun part num noun part num noun part councj pron noun part num noun part councj pron noun part num noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part counc


  2%|▏         | 25/1004 [02:50<1:55:24,  7.07s/it]

<|TOK1|>噓 ！ 好 誇張 呀 。
<|TOK200|>cconj pron verb partnum noun part punct coun pron verb part num noun part punct coun pron verb part num noun partnum noun part coun pron verb partnum noun part num noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum noun partnum partnum noun


  3%|▎         | 26/1004 [02:57<1:55:20,  7.08s/it]

<|TOK1|>軒軒 ， 走 啦 ！
<|TOK200|>intj punct cconj pron verb part num noun part cconj pron verb part num noun part counc pron verb part num noun part counc pron verb part num noun part counc pron verb part num noun part counc pron verb part counc pron verb part num noun part counc pron verb part counc pron verb part num noun part counc pron verb part num noun part counc pron verb part num noun part counc pron verb part counc noun part counc pron verb part num noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc


  3%|▎         | 27/1004 [03:04<1:55:10,  7.07s/it]

<|TOK1|>走 啦 ， 走 啦 ⋯⋯ 走 啦 ， 走 啦 。
<|TOK200|>intj punct cconj pron noun part punct cconj pron verb part num noun part punct counc pro noun part num noun part punct counc pro noun part num noun part punct counc pro noun part num noun part counc pro noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun


  3%|▎         | 28/1004 [03:11<1:55:00,  7.07s/it]

<|TOK1|>吖 ， 掰掰 啦 ， 同 小朋友 掰掰 。
<|TOK200|>counjunct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb part punct cjjunc p verb


  3%|▎         | 29/1004 [03:18<1:54:52,  7.07s/it]

<|TOK1|>掰掰 ！
<|TOK200|>itj punct cconj pron noun part punct coun noun part punct coun noun part num noun part punct coun noun part noun part counc noun part punct coun noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun part counc noun


  3%|▎         | 30/1004 [03:25<1:54:41,  7.06s/it]

<|TOK1|>爺爺 ， 我 行 嘞 ！
<|TOK200|>intj punct cconj pron verb part num noun part punct coun noun part num noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part c


  3%|▎         | 31/1004 [03:32<1:54:12,  7.04s/it]

<|TOK1|>爺爺 呀 ， 快 啲 啦 ！
<|TOK200|>counjunct cconj pron verb part num noun noun part punct cconj pron verb part num noun part num noun part punct councjunct cconj pron verb part num noun part num noun part num noun part punct councjunct cconj pron verb part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num num noun part num noun part num noun part num noun part num num noun part num num noun part num num noun part num num noun part num num noun part num num num noun part num num num num num noun part num num num num num num num


  3%|▎         | 32/1004 [03:39<1:53:54,  7.03s/it]

<|TOK1|>行 得 未 呀 ？
<|TOK200|>icoun noun part punct cjon noun part num noun punct cjon noun part cjon noun part cjon noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part cion noun part


  3%|▎         | 33/1004 [03:46<1:53:40,  7.02s/it]

<|TOK1|>得 㗎 啦 ， 得 㗎 啦 ， 係 ， 呢度 ！ 好 嘞 ！
<|TOK200|>cconj pron noun part punct cconj pron noun part num noun part punct councjoun noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num num noun part num noun part num noun part num num noun part num num noun part num num noun part num num noun


  3%|▎         | 34/1004 [03:53<1:53:28,  7.02s/it]

<|TOK1|>ＯＫ ！
<|TOK200|>coun punct cconj pron noun part num noun punct cconj pron verb noun part num noun punct councj pron verb noun part num noun punct councj pron verb noun part num noun punct councj pron verb noun part num noun part councj pron verb noun part num noun part councj pron verb noun part num noun part councj pron verb noun part num noun part councj pron verb noun part councj pron verb noun part num noun part num noun part councj pron verb noun part num noun part councj pron noun part num noun part councj pron verb noun part num noun


  3%|▎         | 35/1004 [04:00<1:53:27,  7.03s/it]

<|TOK1|>得 㗎 喇 ！
<|TOK200|>counj punct cconj pron verb part num noun part punct cconj pron noun part punct cunj pron noun part punct cunj pron noun part cunj cunj noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj pron noun part cunj


  4%|▎         | 36/1004 [04:07<1:53:21,  7.03s/it]

<|TOK1|>超級 撒亞 人 ！
<|TOK200|>cconj pron adpunct cconj pron noun part num noun part punct coun noun part punct cconj pron verb part num noun part punct coun noun part coun noun part punct coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun noun part coun


  4%|▎         | 37/1004 [04:14<1:53:09,  7.02s/it]

<|TOK1|>戰鬥力 超 勁 ！
<|TOK200|>icconj pron noun part punctuation cconj pron noun part punct cjon noun part punct cjon noun part num noun part punctuation cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun part cjon noun


  4%|▍         | 38/1004 [04:15<1:23:28,  5.19s/it]

<|TOK1|>爺爺 ， 我 行 得 喇 ！
<|TOK200|>counjunct cconj pron verb part num noun puncts



  4%|▍         | 39/1004 [04:22<1:32:05,  5.73s/it]

<|TOK1|>多多 ！
<|TOK200|>coun verb part num noun part punct cconj pron verb part num noun part punctuation coun verb part num noun part punctuation coun verb part num noun part num noun part num noun part num noun part num noun part punctuation coun verb part num noun part num noun part num noun part num num noun part num num noun part num noun part num noun part num num noun part num num noun part num num noun part num noun part num num noun part num num noun part num num noun part num num num noun part num num noun part num num num num num num num num noun part num num num num num num num num num num


  4%|▍         | 40/1004 [04:29<1:38:09,  6.11s/it]

<|TOK1|>你 成日 就 知道 多多 ！
<|TOK200|>intjunc cconjunc conjunc cconjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc conjunc c


  4%|▍         | 41/1004 [04:36<1:42:25,  6.38s/it]

<|TOK1|>唔好 整 呀 ！
<|TOK200|>conjpron adj noun part num noun part cconj pron verb noun part num noun part punct cconj pron verb part num noun part punct coun noun part num noun part counc pron verb part num noun part counc pron verb part num noun part num noun part counc pron verb part num noun part counc pron verb part num noun part councoun part num noun part councoun part councoun part councoun part num noun part councoun part councoun part num noun part councoun part num noun part councoun part num noun part councoun part num noun part councoun part num


  4%|▍         | 42/1004 [04:43<1:45:17,  6.57s/it]

<|TOK1|>整 出嚟 呀 ！
<|TOK200|>intj punct cconj pron noun part verb punct cjon noun part num noun noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num num noun part num noun part num noun part num num noun part num num num noun part num noun part num num num noun part num num noun part num noun part num num noun part num num noun part num noun part num num num noun part num num noun part num num noun part num num num num num num num num num


  4%|▍         | 43/1004 [04:50<1:47:15,  6.70s/it]

<|TOK1|>斤 半 ！
<|TOK200|>icoun punct cconj pron verb part num noun noun part num noun part num noun part punct cconj pron verb part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num num noun part num noun part num noun part num num noun part num num noun part num num noun part num noun part num num noun part num noun part num noun part num num num num noun part num noun part num num


  4%|▍         | 44/1004 [04:57<1:48:35,  6.79s/it]

<|TOK1|>好 ， 收 到 。
<|TOK200|>counj punct cconj pron verb part num noun noun part punct cjonoun noun part noun part punct councj pron noun part noun part num noun noun part punct councj pron noun part punct councj pron noun part noun part noun part punct councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj pron noun part councj


  4%|▍         | 45/1004 [05:04<1:49:43,  6.86s/it]

<|TOK1|>走 啦 ， 快 啲 ！
<|TOK200|>coun noun part punct cconjunct pron verb part num noun part punct cconj pron verb part num noun part punct counjunct conjunct conjunct councjunct councjunct conjunct councjunct conjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councjunct councj


  5%|▍         | 46/1004 [05:11<1:50:13,  6.90s/it]

<|TOK1|>六十六 個 四 。
<|TOK200|>coun pro noun part punct cconj pron verb part num noun part punct counc pro verb part num noun part verb punct counc pro noun part num noun part verb noun part num noun part noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part


  5%|▍         | 47/1004 [05:18<1:50:37,  6.94s/it]

<|TOK1|>幾多 呀 ？
<|TOK200|>coun pro noun part punct cconj pron verb part num noun part punct cconj pron noun part num noun part punct conj part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num noun part num noun part num noun part num noun part num noun part num num noun part num noun part num num noun part num num noun part num num noun part num noun part num num num noun part num num num noun part num num num noun


  5%|▍         | 48/1004 [05:25<1:50:48,  6.95s/it]

<|TOK1|>六十六 個 四 。
<|TOK200|>cconj pron verb part punct coun noun part punct counc pron verb part punct counc pron noun part punct counc pron verb part punct counc pron verb part noun part punct counc pron noun part counc pron noun part counc part noun part counc pron noun part counc pron verb part counc pron noun part counc pron noun part counc pron verb part counc part counc pron noun part counc pron verb part counc part counc pron noun part counc part counc part counc pron noun part counc part counc part counc part counc pron noun part counc part counc


  5%|▍         | 49/1004 [05:32<1:50:58,  6.97s/it]

<|TOK1|>一百 。
<|TOK200|>intj punct cjon pron noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion noun part cunion cunion noun part cunion noun part cunion cunion noun part cunion noun part cunion cunion noun part cunion


  5%|▍         | 50/1004 [05:39<1:50:58,  6.98s/it]

<|TOK1|>找 返 你 三十 ⋯⋯
<|TOK200|>cconj pron verb part num noun part punct cjonous part num noun part punct cjonous part num noun part punct cjonous part num noun part punct cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part num noun part cjonous part


  5%|▌         | 51/1004 [05:46<1:50:55,  6.98s/it]

<|TOK1|>姐姐 呀 ， 啲 錢 畀 我 得 㗎 嘞 ！
<|TOK200|>counj punct cconj pron verb part num noun part punct cconj pron noun part num noun part num noun part cjon pron noun part cunj punct cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cunj cjj part cun


  5%|▌         | 52/1004 [05:53<1:50:52,  6.99s/it]

<|TOK1|>爺爺 呀 ， 呢啲 畀 你 嘅 ！
<|TOK200|>cconj pron adv part num noun part punct cconj pron verb part num noun part cjon pron verb part num noun part num noun part cjon pron verb part num noun part cjon pron verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb part num noun part cjon pro verb


  5%|▌         | 53/1004 [06:00<1:50:54,  7.00s/it]

<|TOK1|>嗰啲 呢 ？
<|TOK200|>coun noun part verb punct cconj pron noun part verb part punct cconj pron verb noun part punct cunjousnoun part num noun part cunjousnoun partnum noun part cunjousnoun part num noun part cunjousnoun part cunjous noun part cunjous noun part cunjous noun part cunjous part cunjous noun part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunjous part cunj
