In [1]:
# How many consecutive <|TOK n|> placeholders make up one tag prefix.
num_token_per_tag = 10

# Placeholders <|TOK 200|> .. <|TOK 209|>; they are joined and placed in front
# of the POS-tag sequence when the prompt strings are built.
pos_tokens = ['<|TOK {}|>'.format(200 + offset) for offset in range(num_token_per_tag)]

In [12]:
from datasets import load_dataset, features

def patch_v(tag):
    """Map the tag name 'V' to 'VERB'; return any other tag unchanged."""
    return 'VERB' if tag == 'V' else tag

def get_dataset(num_existing_tokens=0, test_size=0.1, seed=42):
    """Load the ``hkcancor`` dataset and turn each row into one prompt string.

    Every example ends up as a single string in an ``inputs`` column, laid out as::

        <"yue" tag tokens><tokens joined by spaces>\\n<pos_tokens><lower-cased UD tags joined by spaces>

    The function depends on the notebook-level names ``num_token_per_tag``,
    ``pos_tokens`` and ``patch_v``, so the cells defining them must run first.

    Args:
        num_existing_tokens: Index of the first ``<|TOK n|>`` placeholder to hand
            out. Each language in ``["eng", "yue", "cmn"]`` receives the next
            ``num_token_per_tag`` consecutive indices, in that order.
        test_size: Passed to ``train_test_split``; fraction (or count) of rows
            held out for evaluation.
        seed: Passed to ``train_test_split`` so the split is reproducible.

    Returns:
        ``(train_dataset, eval_dataset, tag_name_dict)`` where ``tag_name_dict``
        maps each language code to its concatenated placeholder-token prefix.
    """
    # "default" is the only config option.
    corpus = load_dataset("hkcancor", "default")
    raw_train = corpus['train']

    single_lang = ["eng", "yue", "cmn"]

    # Assign each language its own run of placeholder tokens without mutating
    # the caller-supplied starting index.
    tag_name_dict = {}
    for position, lang in enumerate(single_lang):
        first_id = num_existing_tokens + position * num_token_per_tag
        tag_name_dict[lang] = "".join(
            f'<|TOK {i}|>' for i in range(first_id, first_id + num_token_per_tag)
        )

    source_upos = raw_train.features["pos_tags_ud"].feature
    print("Source upos:", source_upos)
    # Only printed for reference; the prompt strings below are built from the
    # source label names, not from this label set.
    target_upos = features.ClassLabel(
        names=[
            "NOUN",
            "PUNCT",
            "ADP",
            "NUM",
            "SYM",
            "SCONJ",
            "ADJ",
            "PART",
            "DET",
            "CCONJ",
            "PROPN",
            "PRON",
            "X",
            "_",
            "ADV",
            "INTJ",
            "VERB",
            "AUX",
        ]
    )
    print("Target upos:", target_upos)

    # Both prefixes are identical for every row, so build them once.
    prompt_prefix = tag_name_dict["yue"]
    answer_prefix = ''.join(pos_tokens)

    def preprocess_function(examples):
        """Build the ``inputs`` strings for one batch of rows."""
        inputs = []
        for words, tag_ids in zip(examples["tokens"], examples["pos_tags_ud"]):
            # 'V' in the source label set is rewritten to 'VERB' before lower-casing.
            tags = ' '.join(
                patch_v(source_upos.int2str(tag_id)).lower() for tag_id in tag_ids
            )
            inputs.append(prompt_prefix + ' '.join(words) + "\n" + answer_prefix + tags)
        # Return only the new column instead of mutating the incoming batch.
        return {"inputs": inputs}

    # Drop every original column (derived from the dataset itself rather than a
    # hand-maintained list, which previously named 'tokens' twice).
    formatted = raw_train.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_train.column_names,
    )

    splits = formatted.train_test_split(test_size=test_size, seed=seed)
    train_dataset, eval_dataset = splits['train'], splits['test']
    return train_dataset, eval_dataset, tag_name_dict

In [13]:
# Build the train/eval splits using the default starting token index (num_existing_tokens=0).
train_dataset, eval_dataset, tag_name_dict = get_dataset()

Source upos: ClassLabel(names=['NUM', 'ADP', 'INTJ', 'PROPN', 'ADJ', 'V', 'DET', 'ADV', 'CCONJ', 'PRON', 'X', 'PART', 'AUX', 'VERB', 'NOUN', 'PUNCT'], id=None)
Target upos: ClassLabel(names=['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX'], id=None)


In [14]:
# Show the training split summary (column names and row count).
train_dataset

Dataset({
    features: ['inputs'],
    num_rows: 9720
})

In [15]:
# Inspect the first ten formatted training examples.
train_dataset[:10]

{'inputs': ['<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK 18|><|TOK 19|>噉 考 唔 考 到 啊 ？\n<|TOK 200|><|TOK 201|><|TOK 202|><|TOK 203|><|TOK 204|><|TOK 205|><|TOK 206|><|TOK 207|><|TOK 208|><|TOK 209|>cconj verb adv verb part part punct',
  '<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK 18|><|TOK 19|>誒 ， 一 隻 西施 ， 一 隻 拉薩 。\n<|TOK 200|><|TOK 201|><|TOK 202|><|TOK 203|><|TOK 204|><|TOK 205|><|TOK 206|><|TOK 207|><|TOK 208|><|TOK 209|>intj punct num noun propn punct num noun propn punct',
  '<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK 18|><|TOK 19|>我 記得 - 記得 嗰陣時 返 羅師 都 係 - 都 係 唔使 一 個 鐘頭 咋 喎 。 即係 開頭 - 係 開頭 返 返 - 即係 最 開頭 嘅 時候 返學 就 要 最 - 最 多 時間 囖 。 成 一 個 鐘頭 零 三 個 字 ， 但係 返 熟 咗 呢 知道 啲 路 點 行 嚹 嗰啲 呢 …\n<|TOK 200|><|TOK 201|><|TOK 202|><|TOK 203|><|TOK 204|><|TOK 205|><|TOK 206|><|TOK 207|><|TOK 208|><|TOK 209|>pron verb punct verb pron verb propn adv verb punct adv verb aux num n

In [16]:
# Inspect the first ten formatted evaluation examples.
eval_dataset[:10]

{'inputs': ['<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK 18|><|TOK 19|>啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …\n<|TOK 200|><|TOK 201|><|TOK 202|><|TOK 203|><|TOK 204|><|TOK 205|><|TOK 206|><|TOK 207|><|TOK 208|><|TOK 209|>noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct',
  '<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK 18|><|TOK 19|>唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。\n<|TOK 200|><|TOK 201|><|TOK 202|><|TOK 203|><|TOK 204|><|TOK 205|><|TOK 206|><|TOK 207|><|TOK 208|><|TOK 209|>intj punct verb pron noun part punct verb part part punct adv verb part punct',
  '<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK 18|><|TOK 19|>夠 嚹 。\n<|TOK 200|><|TOK 201|><|TOK 202|><|TOK 203|><|TOK 204|><|TOK 205|><|TOK 206|><|TOK 207|><|TOK 208|><|TOK 209|>verb part punct',
  '<|TOK 10|><|TOK 11|><|TOK 12|><|TOK 13|><|TOK 14|><|TOK 15|><|TOK 16|><|TOK 17|><|TOK