In [1]:
!pip install -qqq -U transformers datasets huggingface_hub accelerate bitsandbytes --progress-bar off
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install -qqq -U flash-attn --no-build-isolation pip install flash-attn --progress-bar off

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import transformers
import torch
from peft import PeftModel

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

num_token_per_tag = 1

base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "Meta-Llama-3-8B-tagllm-lang-1-reserved"

tokenizer = transformers.AutoTokenizer.from_pretrained(base_model, padding_side='left')

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)

model = PeftModel.from_pretrained(model, new_model)
model.to("cuda")
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(128256, 4096)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.05, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 16x128256 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 4096x16 (cuda:0)])
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaFlashAttention2(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
           

In [4]:
from datasets import load_dataset, interleave_datasets

def get_dataset(num_existing_tokens=0):
    lm_datasets_test = []

    single_lang = ["eng", "yue", "cmn"]
    lang_datasets = ["eng-yue", "cmn-yue"]
    lang_pairs = ["eng-yue", "yue-cmn"]

    tag_name_dict = {}
    for lang in single_lang:
        tag_name_dict[lang] = f'{lang}:'

    prompt_examples = {}

    for i, lang_dataset in enumerate(lang_datasets):

        lm_dataset = load_dataset("AlienKevin/yue-cmn-eng", lang_dataset)
        lm_dataset_train = lm_dataset["train"]
        lm_dataset_train = lm_dataset_train.shuffle(seed=42)

        source_lang, target_lang = lang_pairs[i].split("-")

        def preprocess_train(example):
            example = example['translation']
            return {"input": tag_name_dict[source_lang] + example[source_lang] + '\n' + tag_name_dict[target_lang] + 'translate:' + example[target_lang]}

        lm_dataset_train = lm_dataset_train.select(range(10))
        prompt_examples[lang_pairs[i]] = [example['input'] for example in lm_dataset_train.map(preprocess_train, remove_columns=['translation']).take(10)]

    prompts = {pair: '\n'.join(examples) + '\n' for pair, examples in prompt_examples.items()}
    print(prompts)
    
    for i, lang_dataset in enumerate(lang_datasets):

        lm_dataset = load_dataset("AlienKevin/yue-cmn-eng", lang_dataset)
        lm_dataset_test = lm_dataset["test"]

        source_lang, target_lang = lang_pairs[i].split("-")

        def preprocess_eval(examples):
            examples["inputs"] = [prompts[lang_pairs[i]] + tag_name_dict[source_lang] + example[source_lang] + '\n' + tag_name_dict[target_lang] + 'translate:' for example in examples["translation"]]
            del examples['translation']
            return examples
        
        lm_dataset_test = lm_dataset_test.map(preprocess_eval, batched=True)
        lm_datasets_test.append(lm_dataset_test)
    
    eval_dataset = interleave_datasets(lm_datasets_test)
    return prompts, eval_dataset, tag_name_dict

In [5]:
prompts, eval_dataset, tag_name_dict = get_dataset()

{'eng-yue': "eng:Please don't put toilet paper into the urinal, so as to avoid clogging it, thanks for your cooperation.\nyue:translate:請勿將廁紙放在尿兜内，以免淤塞，多謝合作。\neng:This guy is very greedy for money; he was caught stealing money from his company before.\nyue:translate:呢條友好貪錢㗎，之前俾人發現佢偷公司錢。\neng:nostril\nyue:translate:鼻哥窿\neng:As your informer, I'll certainly pass on any information to you.\nyue:translate:我做得你條針，實會過料畀你。\neng:This website was designed by me.\nyue:translate:呢個係我自己設計嘅網站。\neng:to see the world\nyue:translate:見世面\neng:Mum! Are you fine?\nyue:translate:媽！你有冇事啊？\neng:I am becoming clumsier as I get older.\nyue:translate:我老咗做嘢係論盡啲。\neng:This shirt doesn't have even one pocket.\nyue:translate:呢件裇衫一個衫袋都冇。\neng:a rain shower\nyue:translate:一陣雨\n", 'yue-cmn': 'yue:見衫係紅色嘅\ncmn:translate:衣服是红色的\nyue:嗰個地方好多時有賊劏死牛，冇乜事唔好行去嗰度\ncmn:translate:那个地方经常有贼烂路抢劫，没什么事不要走到那儿去\nyue:唔成功都唔使心淡吖\ncmn:translate:不成功也不用著心灰意冷\nyue:睇呢啲濕星嘢你要唔要呀\ncmn:translate:看你要不要这些琐碎的东西\nyue:叫你整闊啲，你又闊過龍\ncmn:translate:叫你弄宽点儿，你

In [6]:
eval_dataset

Dataset({
    features: ['inputs'],
    num_rows: 3000
})

In [7]:
eval_dataset[:2]

{'inputs': ["eng:Please don't put toilet paper into the urinal, so as to avoid clogging it, thanks for your cooperation.\nyue:translate:請勿將廁紙放在尿兜内，以免淤塞，多謝合作。\neng:This guy is very greedy for money; he was caught stealing money from his company before.\nyue:translate:呢條友好貪錢㗎，之前俾人發現佢偷公司錢。\neng:nostril\nyue:translate:鼻哥窿\neng:As your informer, I'll certainly pass on any information to you.\nyue:translate:我做得你條針，實會過料畀你。\neng:This website was designed by me.\nyue:translate:呢個係我自己設計嘅網站。\neng:to see the world\nyue:translate:見世面\neng:Mum! Are you fine?\nyue:translate:媽！你有冇事啊？\neng:I am becoming clumsier as I get older.\nyue:translate:我老咗做嘢係論盡啲。\neng:This shirt doesn't have even one pocket.\nyue:translate:呢件裇衫一個衫袋都冇。\neng:a rain shower\nyue:translate:一陣雨\neng:This is really amusing, a radio controlled car that can climb on walls.\nyue:translate:",
  'yue:見衫係紅色嘅\ncmn:translate:衣服是红色的\nyue:嗰個地方好多時有賊劏死牛，冇乜事唔好行去嗰度\ncmn:translate:那个地方经常有贼烂路抢劫，没什么事不要走到那儿去\nyue:唔成功都唔使心淡吖\ncmn:translate:不成功也不用著心灰意冷\nyu

In [8]:
# https://huggingface.co/PygmalionAI/pygmalion-6b/discussions/25#64387bf26c8841ba74e7d9c0
from transformers import StoppingCriteria

class TranslationStoppingCriteria(StoppingCriteria):
    def __init__(self, prompts):
        self.prompts = prompts
        
    def __call__(self, input_ids, scores, **kwargs):
        # Get the generated text as a string
        generated_text = tokenizer.decode(input_ids[0])
        for prompt in prompts.values():
            generated_text = generated_text.removeprefix(prompt)
        if generated_text.endswith('\n'):
            return True  # Stop generation
        return False  # Continue generation
    
    def __len__(self):
        return 1
    
    def __iter__(self):
        yield self

In [9]:
from transformers.pipelines.pt_utils import KeyDataset
import json
from tqdm import tqdm

def parse_translation(text):
    lines = text.strip().split('\n')
    result = { 'langs': [], 'sents': [] }
    
    for line in lines:
        if len(line.strip()) > 0:
            for lang_tags in tag_name_dict.values():
                if lang_tags in line:
                    lang = next(lang for lang, tags in tag_name_dict.items() if tags == lang_tags)
                    content = line.removeprefix(lang_tags)
                    result['langs'].append(lang)
                    result['sents'].append(content.strip().removeprefix('translate:'))
                    break
    
    return result

with open(f'translations_{new_model}.jsonl', 'w+') as f:
    for inputs in tqdm(eval_dataset.select(range(10))["inputs"]):
        input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to('cuda')
        output = model.generate(
            input_ids,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            stopping_criteria=TranslationStoppingCriteria(prompts),
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )[0]
        generated_text = tokenizer.decode(output, skip_special_tokens=True)
        for prompt in prompts.values():
            generated_text = generated_text.removeprefix(prompt)
        f.write(json.dumps(parse_translation(generated_text)) + '\n')
        f.flush()

  0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 1/10 [00:03<00:27,  3.01s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 2/10 [00:16<01:12,  9.08s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 30%|███       | 3/10 [00:18<00:41,  5.86s/it]The attention mask and the pad token id were not set. As a consequen

KeyboardInterrupt: 