In [1]:
!pip install -qqq -U transformers datasets huggingface_hub accelerate bitsandbytes tqdm --progress-bar off
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install -qqq -U flash-attn --no-build-isolation pip install flash-attn --progress-bar off

In [2]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; in a notebook this renders an
# interactive login widget.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import transformers
import torch
from peft import PeftModel

# Pick the compute dtype and attention kernel from the GPU generation.
# bfloat16 and FlashAttention-2 are only selected on compute capability >= 8;
# older GPUs (or a runtime without CUDA) fall back to float16 + eager attention.
# flash-attn itself is installed by the first cell, so nothing is installed here.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

base_model = "meta-llama/Meta-Llama-3-8B"
# Adapter identifier passed to PeftModel.from_pretrained below; also reused
# later in the results file name.
new_model = "Meta-Llama-3-8B-qlora-pos-no-tag"

# 4-bit NF4 quantization with double quantization; matmuls run in `torch_dtype`.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Left padding keeps the end of every prompt adjacent to the generated tokens.
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model, padding_side='left')

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    # Load the non-quantized modules in the same dtype used for compute so the
    # model dtype agrees with the selected attention implementation.
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
)
# Attach the fine-tuned adapter, then fold it into the base weights so the
# pipeline runs a plain causal LM.
# NOTE(review): merging into 4-bit quantized weights may introduce small
# rounding differences -- confirm this is acceptable for the evaluation.
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [4]:
from datasets import load_dataset, features

def patch_v(tag):
    """Map the corpus-specific 'V' label to 'VERB'; return any other tag untouched."""
    return 'VERB' if tag == 'V' else tag

def get_dataset(num_existing_tokens=0, num_shots=10, seed=42):
    """Build the few-shot POS-tagging prompt and the prompted evaluation split.

    Few-shot demonstrations are sampled from the ``hkcancor`` training split
    and rendered as ``input:<tokens>`` / ``output:<lower-cased tags>`` pairs.
    Every sentence of the ``universal_dependencies`` ``yue_hk`` test split is
    then prefixed with that prompt and ends in an open ``output:`` line.

    Args:
        num_existing_tokens: Unused; kept so existing callers keep working.
        num_shots: Number of demonstrations placed in the prompt.
        seed: Seed for the shuffle that selects the demonstrations.

    Returns:
        A ``(prompt, test_dataset, tag_name_dict)`` tuple, where ``prompt`` is
        the newline-terminated demonstration text, ``test_dataset`` has a single
        ``inputs`` column, and ``tag_name_dict`` maps each language code to
        ``"<code>:"``.
    """
    train = load_dataset("hkcancor", "default")["train"]

    single_lang = ["eng", "yue", "cmn"]
    tag_name_dict = {lang: f'{lang}:' for lang in single_lang}

    source_upos = train.features["pos_tags_ud"].feature
    print("Source upos:", source_upos)

    def format_demonstration(example):
        # Integer tag ids -> label names, with 'V' normalised to 'VERB',
        # lower-cased to match the format used in the prompt.
        tags = ' '.join(patch_v(source_upos.int2str(tag)).lower()
                        for tag in example["pos_tags_ud"])
        return 'input:' + ' '.join(example["tokens"]) + "\n" + "output:" + tags

    # Shuffle and select first, so only the chosen demonstrations are
    # formatted instead of the whole training split.
    few_shot = train.shuffle(seed=seed).select(range(num_shots))
    prompt = '\n'.join(format_demonstration(example) for example in few_shot) + '\n'
    print(prompt)

    test_dataset = load_dataset("universal_dependencies", "yue_hk")["test"]

    def build_inputs(examples):
        # Batched map: `examples["tokens"]` is a list of token lists.
        examples["inputs"] = [
            prompt + "input:" + ' '.join(tokens) + "\n" + "output:"
            for tokens in examples["tokens"]
        ]
        return examples

    test_dataset = test_dataset.map(
        build_inputs,
        batched=True,
        remove_columns=['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos',
                        'feats', 'head', 'deprel', 'deps', 'misc'],
    )

    return prompt, test_dataset, tag_name_dict

In [5]:
# Build the shared few-shot prompt and the prompted evaluation split once;
# `prompt` and `eval_dataset` are reused by the generation cells below.
prompt, eval_dataset, tag_name_dict = get_dataset()

Source upos: ClassLabel(names=['NUM', 'ADP', 'INTJ', 'PROPN', 'ADJ', 'V', 'DET', 'ADV', 'CCONJ', 'PRON', 'X', 'PART', 'AUX', 'VERB', 'NOUN', 'PUNCT'], id=None)


Map:   0%|          | 0/10801 [00:00<?, ? examples/s]

input:啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …
output:noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct
input:唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。
output:intj punct verb pron noun part punct verb part part punct adv verb part punct
input:夠 嚹 。
output:verb part punct
input:噉 另外 就 同 佢 太太 之間 呢 亦都 有 個 中年 婚姻 危機 𡃉 。 即係 覺得 太太 ， 冇 理由 我 太太 會 重 - 重 愛 我 吖 。 即係 我 事業 又 唔 得 ， 要 樣 冇 樣 ， 要 錢 冇 錢 噉樣 。 噉 另外 有 個 黑人 嘅 。 就 係 鬍鬚 阿伯 ， 就 成 五十 歲 嘅 。 點解 會 揾 到 佢 ？ 因為 佢 要 揾 個 人 教 跳舞 吖 嗎 。 而 哩個 阿伯 係 識 跳 所有 爵士舞 ， 只不過 係 因為 骨頭 硬 。 即係 跳 起 身 ， 跌落 地下 之後 就 起 唔 到 身 𡃉 嘞 。 但係 之前 嗰 段 係 得 嘅 。
output:cconj cconj adv adp pron noun adv part adv verb noun noun noun noun part punct cconj verb noun punct verb noun pron noun aux adv punct adv verb pron part punct cconj pron noun adv adv verb punct aux noun verb noun punct aux noun verb noun cconj punct cconj cconj verb noun noun part punct adv verb noun noun punct adv num num noun part punct pron aux verb part pron punct cconj pron aux verb noun noun verb v

Downloading data:   0%|          | 0.00/143k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

In [6]:
# Inspect the prepared evaluation split (column names and row count).
eval_dataset

Dataset({
    features: ['inputs'],
    num_rows: 1004
})

In [7]:
# Peek at the first two prompted inputs to check the prompt layout.
eval_dataset[:2]

{'inputs': ['input:啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …\noutput:noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct\ninput:唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。\noutput:intj punct verb pron noun part punct verb part part punct adv verb part punct\ninput:夠 嚹 。\noutput:verb part punct\ninput:噉 另外 就 同 佢 太太 之間 呢 亦都 有 個 中年 婚姻 危機 𡃉 。 即係 覺得 太太 ， 冇 理由 我 太太 會 重 - 重 愛 我 吖 。 即係 我 事業 又 唔 得 ， 要 樣 冇 樣 ， 要 錢 冇 錢 噉樣 。 噉 另外 有 個 黑人 嘅 。 就 係 鬍鬚 阿伯 ， 就 成 五十 歲 嘅 。 點解 會 揾 到 佢 ？ 因為 佢 要 揾 個 人 教 跳舞 吖 嗎 。 而 哩個 阿伯 係 識 跳 所有 爵士舞 ， 只不過 係 因為 骨頭 硬 。 即係 跳 起 身 ， 跌落 地下 之後 就 起 唔 到 身 𡃉 嘞 。 但係 之前 嗰 段 係 得 嘅 。\noutput:cconj cconj adv adp pron noun adv part adv verb noun noun noun noun part punct cconj verb noun punct verb noun pron noun aux adv punct adv verb pron part punct cconj pron noun adv adv verb punct aux noun verb noun punct aux noun verb noun cconj punct cconj cconj verb noun noun part punct adv verb noun noun punct adv num num noun part punct pron aux verb part pron punct cconj pron aux v

In [8]:
# https://huggingface.co/PygmalionAI/pygmalion-6b/discussions/25#64387bf26c8841ba74e7d9c0
from transformers import StoppingCriteria

class TranslationStoppingCriteria(StoppingCriteria):
    """Stop generation once the decoded text ends with a newline.

    The few-shot prompt itself ends with a newline, so it is stripped from the
    front of the decoded text before the check.

    Args:
        prompt: Few-shot prefix to strip from the decoded sequence.
        tokenizer: Tokenizer used to decode ``input_ids``. When omitted, the
            module-level ``tokenizer`` is looked up at call time.
    """

    def __init__(self, prompt, tokenizer=None):
        self.prompt = prompt
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        # Resolve lazily so the notebook-level `tokenizer` only has to exist
        # once generation actually starts.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        # Only the first sequence of the batch is inspected.
        generated_text = active_tokenizer.decode(input_ids[0])
        # Use the prompt given to the constructor rather than a global.
        generated_text = generated_text.removeprefix(self.prompt)
        # True stops generation, False lets it continue.
        return generated_text.endswith('\n')

    # __len__/__iter__ make a single instance behave like a one-element
    # collection of criteria, so it can be passed directly as `stopping_criteria`.
    def __len__(self):
        return 1

    def __iter__(self):
        yield self

In [9]:
import json
import os

from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Sampling is enabled below, so seed the RNGs first to make a rerun of this
# evaluation produce the same generations.
GENERATION_SEED = 42
transformers.set_seed(GENERATION_SEED)

# Passing a dataset makes the pipeline stream results; they are consumed by
# the loop that writes the results file.
outputs = pipeline(
    KeyDataset(eval_dataset, "inputs"),
    max_new_tokens=128,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    # Stop at the first newline after the prompt, i.e. after one tag line.
    stopping_criteria=TranslationStoppingCriteria(prompt),
    # Reuse the EOS token id as the padding id for generation.
    pad_token_id=tokenizer.eos_token_id,
)

def parse_translation(text):
    """Split generated text into parallel lists of labels and sentences.

    Each line of the form ``input:<content>`` or ``output:<content>`` adds one
    entry: the label is ``'yue'`` for ``input`` and ``'pos'`` for ``output``,
    and the content is stored stripped of surrounding whitespace. Lines with no
    colon, or with any other prefix, are ignored.

    Returns:
        ``{'langs': [...], 'sents': [...]}`` with both lists in line order.
    """
    label_for = {'input': 'yue', 'output': 'pos'}
    langs, sents = [], []

    for raw_line in text.strip().split('\n'):
        prefix, separator, content = raw_line.partition(':')
        if not separator or prefix not in label_for:
            continue
        langs.append(label_for[prefix])
        sents.append(content.strip())

    return {'langs': langs, 'sents': sents}

# Write one JSON record per evaluation sentence as generations arrive.
results_path = f'experiment_results/pos_{new_model}.jsonl'
# Create the output directory so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w', encoding='utf-8') as f:
    for output in tqdm(outputs, total=len(eval_dataset)):
        # Drop the shared few-shot prefix; what remains is this sentence's
        # `input:` line followed by the generated `output:` line.
        generated_text = output[0]['generated_text'].removeprefix(prompt)
        f.write(json.dumps(parse_translation(generated_text)) + '\n')
        # Flush per record so partial results survive an interrupted run.
        f.flush()

100%|██████████| 1004/1004 [25:26<00:00,  1.52s/it]
