# Fine-tuning Llama 3.2 for casual Vietnamese understanding

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig,
    Trainer
    , TrainingArguments
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import os 
from datasets import load_dataset
import pandas as pd

# Re-export the Hugging Face Hub token for libraries that read it from the environment.
# os.environ only accepts strings, so assigning os.getenv(...) directly raises
# TypeError when the variable is unset; write it back only when it has a value.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
else:
    print("HUGGING_FACE_HUB_TOKEN is not set; gated models may fail to download.")

In [2]:
# Hugging Face Hub identifiers used by this notebook:
# the dataset repository and the base checkpoint to fine-tune.
dataset_path = "VTSNLP/instruct_general_dataset"
base_model = "meta-llama/Llama-3.2-3B-Instruct"


In [3]:
# Load the tokenizer and the causal-LM weights for the checkpoint named by `base_model`.
# The two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]


In [4]:
# LoRA adapter settings for causal language modelling.
# Rank and alpha are set equal (alpha / r = 1); bias terms are left out of the adapter.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.01,
    bias="none",
)

In [5]:
# Attach the LoRA adapters described by `config` to the base model.
# This cell rebinds `model`, so re-running it would wrap an already wrapped
# model a second time; the isinstance guard keeps the cell idempotent.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, config)

  return torch._C._cuda_getDeviceCount() > 0
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [6]:
# Prompt template with three positional `{}` slots, in order:
# instruction, input (extra context), response.
prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence token string of the loaded tokenizer; formatting_prompts_func
# appends it to every formatted training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training prompts.

    Args:
        examples: Batched mapping of column name to a list of values. Must
            provide the "instruct", "input" and "output" columns.

    Returns:
        A dict with a single "text" key holding one string per row: the
        module-level `prompt` template filled with the row's instruction,
        input and output, followed by `EOS_TOKEN`.

    Raises:
        KeyError: if one of the three required columns is missing.
    """
    def _as_text(value):
        # A missing field would otherwise be rendered as the literal "None".
        return "" if value is None else value

    instructions = examples["instruct"]
    contexts = examples["input"]  # renamed local: avoids shadowing the builtin `input`
    responses = examples["output"]

    texts = [
        prompt.format(_as_text(instruction), _as_text(context), _as_text(response)) + EOS_TOKEN
        for instruction, context, response in zip(instructions, contexts, responses)
    ]
    return {"text": texts}

In [7]:
# Load the training split of the dataset named in the config cell (`dataset_path`)
# instead of repeating the repository id, so editing the constant takes effect here.
# `load_dataset` is already imported in the first cell.
raw_dataset = load_dataset(dataset_path, split="train")

# Add a "text" column holding the fully formatted prompt for every row.
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

In [8]:
# fp16 mixed precision and the bitsandbytes 8-bit optimizer both need a CUDA device.
# Fall back to full precision and the stock AdamW when none is visible, so the
# cell also runs on CPU-only hosts (this notebook's own output shows such a host).
use_cuda = torch.cuda.is_available()

# NOTE(review): relies on SFTTrainer picking up the "text" column produced by
# formatting_prompts_func — confirm for the installed TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,  # 8 * 2 = 16 samples per optimizer step per device
        warmup_steps=4,
        # max_steps overrides num_train_epochs, so only the step budget is given.
        max_steps=100,
        learning_rate=2e-4,
        fp16=use_cuda,
        logging_steps=1,
        optim="adamw_8bit" if use_cuda else "adamw_torch",
        weight_decay=0.01,
        # Must not exceed max_steps, otherwise no checkpoint is ever written.
        save_steps=50,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="llama_v1",
        report_to="none",
    ),
)



In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()