# SCB-10-x-Trainer-LLM

https://github.com/scb-10x/sft-trainer-example

In [1]:
!git clone https://github.com/scb-10x/sft-trainer-example

Cloning into 'sft-trainer-example'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 50 (delta 22), reused 41 (delta 13), pack-reused 0[K
Unpacking objects: 100% (50/50), 9.07 KiB | 774.00 KiB/s, done.


In [4]:
!pip install -r "/kaggle/working/sft-trainer-example/requirements.txt"

Collecting bitsandbytes==0.43.1 (from -r /kaggle/working/sft-trainer-example/requirements.txt (line 1))
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting trl==0.8.6 (from -r /kaggle/working/sft-trainer-example/requirements.txt (line 2))
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting transformers==4.40.1 (from -r /kaggle/working/sft-trainer-example/requirements.txt (line 3))
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting datasets==2.19.0 (from -r /kaggle/working/sft-trainer-example/requirements.txt (line 5))
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.1 (from -r /kaggle/working/sft-trainer-example/requirements.txt (line 6))
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting peft==0

In [6]:
!pip install transformers==4.30



## Generate_dataset

In [None]:
import json
import os
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()


def call_openai(
    user_prompt: str,
    model="typhoon-instruct",
    max_tokens=1000,
    top_p=0.1,
    temperature=1.0,
):
    client = OpenAI(base_url=os.environ['OPENAI_BASE_URL'], api_key=os.environ['OPENAI_API_KEY'])
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        top_p=top_p,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    content = response.choices[0].message.content
    finish_reason = response.choices[0].finish_reason
    return content


def translate_task(text: str, outpath: str):
    assert isinstance(text, str)
    user_prompt = f"""
Translate message below to Thai:
---
{text}
---
Output only translation result
"""
    translate_resp = call_openai(user_prompt)
    row = {"en": text, "th": translate_resp}
    with open(outpath, "a") as w:
        w.write(f"{json.dumps(row, ensure_ascii=False)}\n")


def process_row(example, outpath):
    for conv in example["conversations"]:
        translate_task(conv['value'], outpath=outpath)


def main():
    ds = load_dataset("openaccess-ai-collective/oasst1-guanaco-extended-sharegpt", split="train")
    ds = ds.select(range(100))
    print(ds)
    for row in tqdm(iter(ds)):
        process_row(row, outpath="output.jsonl")


if __name__ == "__main__":
    main()


## Train

In [12]:
from dataclasses import dataclass, field
from typing import Optional
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import load_dataset
from peft import LoraConfig
import torch
import os

@dataclass
class ScriptArguments:
    """
    Define the arguments used in this script.
    """

    model_name: Optional[str] = field(default="scb10x/typhoon-7b", metadata={"help": "the model name"})
    dataset_name: Optional[str] = field(default='output.jsonl', metadata={"help": "the dataset name"})
    use_4_bit: Optional[bool] = field(default=True, metadata={"help": "use 4 bit precision"})
    batch_size: Optional[int] = field(default=4, metadata={"help": "input batch size"})
    lr: Optional[float] = field(default=4e-4, metadata={"help": "learning rate"})
    gradient_accumulation_steps: Optional[int] = field(default=1, metadata={"help": "input grad accum step"})
    max_seq_length: Optional[int] = field(default=2048, metadata={"help": "max sequence length"})
    output_dir: Optional[str] = field(default="ckpt", metadata={"help": "ckpt output"})

def main():
    parser = HfArgumentParser(ScriptArguments)
    args = parser.parse_args_into_dataclasses()[0]
    torch_dtype = torch.bfloat16
    # Load model and tokenizer
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(args.model_name, quantization_config=bnb_config, torch_dtype=torch_dtype, attn_implementation="flash_attention_2")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    peft_config = LoraConfig(
        r=32,
        lora_alpha=8,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    tokenizer.pad_token_id = tokenizer.unk_token_id
    # Update the model config to use the new eos & bos token
    if getattr(model, "config", None) is not None:
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.bos_token_id = tokenizer.bos_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
    if getattr(model, "generation_config", None) is not None:
        model.generation_config.bos_token_id = tokenizer.bos_token_id
        model.generation_config.eos_token_id = tokenizer.eos_token_id
        model.generation_config.pad_token_id = tokenizer.pad_token_id

    if os.path.exists(args.dataset_name):
        dataset = load_dataset('json', data_files=args.dataset_name)['train']
    else:
        dataset = load_dataset(args.dataset_name, split="train")

    def formatting_prompts_func(examples):
        INPUT_COLUMN = "en"
        OUTPUT_COLUMN = "th"
        output_texts = []
        for i in range(len(examples[INPUT_COLUMN])):
            input = examples[INPUT_COLUMN][i]
            output = examples[OUTPUT_COLUMN][i]
            text = f'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n
### Instruction:
Translate message to Thai
### Input:
{input}
### Response:
{output}{tokenizer.eos_token}''' # <-- make sure there are eos_token in the format_prompt; sfttrainer doesn't add eos token internally.
            output_texts.append(text)
        return output_texts

    # we need to make sure it
    response_template = "\n### Response:"
    response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
    print(args)
    training_arguments = TrainingArguments(
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        per_device_eval_batch_size=args.batch_size,
        report_to=['tensorboard'],
        optim='adamw_torch',
        learning_rate=args.lr,
        logging_steps=1,
        bf16=True,
        fp16=False,
        save_steps=1,
        save_strategy='epoch',
        gradient_checkpointing=True,
        output_dir=args.output_dir
    )

    collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
    trainer = SFTTrainer(
        model,
        args=training_arguments,
        data_collator=collator,
        tokenizer=tokenizer,
        train_dataset=dataset,
        peft_config=peft_config,
        max_seq_length=args.max_seq_length,
        formatting_func=formatting_prompts_func
    )

    trainer.train()
    trainer.save_model(args.output_dir)

if __name__ == '__main__':
    main()

ValueError: Some specified arguments are not used by the HfArgumentParser: ['-f', '/root/.local/share/jupyter/runtime/kernel-43ac8e06-ff8c-45a4-9b2e-bafe09c0ca78.json']

## Evaluate

In [11]:
import json
import os
from datasets import load_dataset
from sacrebleu.metrics import BLEU
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
from tqdm import tqdm
import argparse
bleu = BLEU(tokenize="flores200")


def get_prompt(input: str):
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n
### Instruction:
Translate message to Thai
### Input:
{input}
### Response:"""
    return prompt


def main(base_model: str, lora_path: str, eval_dataset):
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    device = torch.device("cuda")
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=False,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    print(f"loaded: {base_model}")
    model = PeftModel.from_pretrained(model, lora_path)
    model.eval()
    model.to(device)
    if os.path.exists(eval_dataset):
        ds = load_dataset("json", data_files={"validation": eval_dataset}, split="validation")
    else:
        ds = load_dataset(eval_dataset, split="validation")
    results = []
    references = []
    inputs = []

    for row in tqdm(iter(ds), total=len(ds)):
        prompt = get_prompt(row["en"])
        references.append(row["th"])
        inputs.append(row['en'])
        input = tokenizer([prompt], return_tensors="pt").to(device)
        output = model.generate(**input, max_new_tokens=256)
        output = tokenizer.decode(output[0][input['input_ids'].shape[-1]:], skip_special_tokens=True).strip()
        results.append(output)

    print(
        {
            "bleu": str(bleu.corpus_score(results, [references])),
        }
    )
    with open("eval_results.json", "w") as w:
        json.dump(
            [{"pred": pred, "ref": ref, "input": ip} for pred, ref, ip in zip(results, references, inputs)], w, ensure_ascii=False
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-model', type=str, default='scb10x/typhoon-7b')
    parser.add_argument('--lora-path', type=str)
    parser.add_argument ('--eval-dataset', type=str, default='scb_mt_enth_2020_wiki_1k_test.jsonl')
    args = parser.parse_args()
    main(
        args.base_model,
        lora_path=args.lora_path,
        eval_dataset=args.eval_dataset,
    )


usage: colab_kernel_launcher.py [-h] [--base-model BASE_MODEL] [--lora-path LORA_PATH]

                                [--eval-dataset EVAL_DATASET]

colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-43ac8e06-ff8c-45a4-9b2e-bafe09c0ca78.json


SystemExit: 2

In [8]:
!python /kaggle/working/sft-trainer-example/train.py --dataset_name scb10x/scb_mt_enth_2020_aqdf_1k --gradient_accumulation_steps 4

2024-04-29 08:51:57.335033: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 08:51:57.335145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 08:51:57.438407: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
config.json: 100%|█████████████████████████████| 595/595 [00:00<00:00, 3.24MB/s]
Traceback (most recent call last):
  File "/kaggle/working/sft-trainer-example/train.py", line 116, in <module>
    main()
  File "/kaggle/working/sft-trainer-example/train.py", line 38, in main
    model = AutoModelForCausalLM.from_pretrained(args.model_name, quantization_config=bnb