In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer comet_ml==3.48.1
!pip install --no-deps unsloth

In [None]:
import os
from getpass import getpass
# Secrets must never be stored in the notebook. Each credential is taken from
# the environment when already set, and requested interactively (without echo)
# otherwise. Submitting an empty value disables the corresponding integration.
# NOTE: any token or API key that was previously committed in this cell should
# be treated as compromised and revoked.
hf_token = (
    os.environ.get("HF_TOKEN")
    or getpass("Hugging Face token (leave empty to skip): ")
).strip()
enable_hf = bool(hf_token)
print(f"Is Hugging Face enabled? '{enable_hf}'")

comet_api_key = (
    os.environ.get("COMET_API_KEY")
    or getpass("Comet API key (leave empty to skip): ")
).strip()
enable_comet = bool(comet_api_key)
comet_project_name = "second-brain-course"
print(f"Is Comet enabled? '{enable_comet}'")

# Export the credentials so that libraries reading these environment variables
# can pick them up in later cells.
if enable_hf:
    os.environ["HF_TOKEN"] = hf_token
if enable_comet:
    os.environ["COMET_API_KEY"] = comet_api_key
    os.environ["COMET_PROJECT_NAME"] = comet_project_name

In [None]:
import torch


def get_gpu_info() -> str | None:
    """Gets GPU device name if available.

    Returns:
        str | None: Name of the GPU device if available, None if no GPU is found.
    """
    if not torch.cuda.is_available():
        return None

    gpu_name = torch.cuda.get_device_properties(0).name

    return gpu_name


# Detect the GPU once; the configuration cell branches on this value.
active_gpu_name = get_gpu_info()

# Header and value go on separate lines.
print("GPU type:", active_gpu_name, sep="\n")

In [None]:
# Identifier of the dataset that is passed to `load_dataset` in a later cell.
dataset_id = "PhanDai/luat-viet-nam-qa_small"

In [None]:
# Sequence length handed to both the model loader and the trainer.
max_seq_length = 4096
# None asks the model loader to auto-detect the dtype (original note: float16
# for Tesla T4 / V100, bfloat16 for Ampere or newer).
dtype = None

# Training needs a GPU; stop right away when none was detected.
if not active_gpu_name:
    raise ValueError("No Nvidia GPU found.")

# Choose quantization and training length according to the detected GPU.
if "T4" in active_gpu_name:
    # 4-bit quantization to reduce memory usage, and few steps to keep the
    # run short.
    load_in_4bit, max_steps = True, 25
elif "A100" in active_gpu_name or "L4" in active_gpu_name:
    # No 4-bit quantization for faster training, so more steps are affordable.
    load_in_4bit, max_steps = False, 250
else:
    # Any other GPU: no 4-bit quantization, intermediate number of steps.
    load_in_4bit, max_steps = False, 150

print("--- Parameters ---")
print(f"{max_steps=}")
print(f"{load_in_4bit=}")
print(f"{dtype=}")

In [None]:
from unsloth import FastLanguageModel

base_model = "1TuanPham/T-VisStar-7B-v0.1"  # or unsloth/Qwen2.5-7B-Instruct

# Gather the loader options from the configuration cell, then load the model
# together with its tokenizer.
model_load_options = dict(
    model_name=base_model,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

In [None]:
# LoRA adapter settings, collected in one place before wrapping the model.
lora_settings = dict(
    # Adapter rank; any number > 0 works (suggested: 8, 16, 32, 64, 128).
    r=32,
    # Names of the modules that receive an adapter.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    # Any value is supported, but 0 is the optimized setting.
    lora_dropout=0,
    # Any value is supported, but "none" is the optimized setting.
    bias="none",
    # True or "unsloth"; the "unsloth" mode is advertised as using less VRAM
    # and is meant for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    # Rank-stabilized LoRA is supported but not used here.
    use_rslora=False,
    # LoftQ is supported but not used here.
    loftq_config=None,
)

model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
from datasets import load_dataset

# Alpaca-style prompt template, written in Vietnamese. It holds a fixed
# preamble and a fixed instruction (act as an assistant that answers the
# user's question concisely and accurately), followed by two `{}` placeholders:
# the first sits under "### Input:" and the second under "### Response:".
# `formatting_prompts_func` fills them with the question and the context text.
alpaca_prompt = """Dưới đây là hướng dẫn mô tả một nhiệm vụ, kết hợp với thông tin đầu vào cung cấp thêm ngữ cảnh. Hãy viết phản hồi hoàn thành yêu cầu một cách phù hợp.

### Instruction:
Bạn là một trợ lý thông minh, hãy trả lời câu hỏi hiện tại của user dựa trên lịch sử chat và các tài liệu liên quan. Câu trả lời phải ngắn gọn, chính xác nhưng vẫn đảm bảo đầy đủ các ý chính.
### Input:
{}

### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; it is appended to every
# formatted training example.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    """Render a batch of examples into prompt strings for training.

    Args:
        examples: Batch mapping with a "question" column and a "context"
            column of equal length. A context entry may be a string or a list
            of strings; a list is joined with single spaces.

    Returns:
        dict: {"text": [...]} with one rendered prompt per example. Each
        prompt is `alpaca_prompt` filled with the stripped question and the
        stripped context, followed by `EOS_TOKEN`.
    """
    questions = examples["question"]
    contexts = examples["context"]

    rendered = [
        alpaca_prompt.format(
            question.strip(),
            # Flatten a list-valued context into one string before stripping.
            (" ".join(context) if isinstance(context, list) else context).strip(),
        )
        + EOS_TOKEN  # Append the EOS token to the formatted prompt.
        for question, context in zip(questions, contexts)
    ]

    return {"text": rendered}


In [None]:
# Load the dataset and add the rendered "text" column in one chained step.
# The formatter receives whole batches because `batched=True`.
dataset = load_dataset(dataset_id).map(formatting_prompts_func, batched=True)

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Check bf16 support once; exactly one of fp16 / bf16 is enabled below.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # For one full pass over the data, set `num_train_epochs=1` here and drop
    # `max_steps`.
    max_steps=max_steps,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    # Report to Comet only when an API key was provided.
    report_to="comet_ml" if enable_comet else "none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    # Column produced by `formatting_prompts_func`.
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    # Packing is meant to speed up training on short sequences.
    packing=True,
    args=training_args,
)

In [None]:
# @title Show current memory stats
# Number of bytes in one GiB; dividing by it converts byte counts to GB as
# reported below.
BYTES_PER_GIB = 1024**3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak reserved memory so far, recorded before training starts.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
# Total memory of device 0.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning job and keep the value returned by `train()` for later
# inspection.
trainer_stats = trainer.train()

In [None]:
from huggingface_hub import HfApi

# Name used both for the local save target and for the Hub repository.
model_name = "Chatbot_VietNamese_Law"
print(f"Model name: {model_name}")

# The local copy and the Hub upload use the same export format.
merged_save_method = "merged_16bit"

# Local saving
model.save_pretrained_merged(model_name, tokenizer, save_method=merged_save_method)

if enable_hf:
    # Look up the account that owns the token; its name prefixes the repo id.
    api = HfApi()
    user_info = api.whoami(token=hf_token)
    huggingface_user = user_info["name"]
    print(f"Current Hugging Face user: {huggingface_user}")

    # Online saving to Hugging Face
    hub_repo_id = f"{huggingface_user}/{model_name}"
    model.push_to_hub_merged(
        hub_repo_id,
        tokenizer=tokenizer,
        save_method=merged_save_method,
        token=hf_token,
    )

In [None]:
# from huggingface_hub import HfApi

# hf_token = os.environ["HF_TOKEN"]  # Read the token from the environment; never paste it into the notebook
# repo_id = "AIPROENGINEER/Chatbot_VietNamese_Law"  # VD: "AIPROENGINEER/Chatbot_VietNamese_Law"

# api = HfApi()
# api.delete_repo(
#     repo_id = repo_id,
#     token = hf_token,
#     repo_type = "model",  # Rất quan trọng
# )

# print(f"✅ Đã xoá model: {repo_id}")


In [None]:
# from transformers import TextStreamer

# FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
# text_streamer = TextStreamer(tokenizer)


# def generate_text(
#     instruction, streaming: bool = True, trim_input_message: bool = False
# ):
#     message = alpaca_prompt.format(
#         instruction,
#         "",  # output - leave this blank for generation!
#     )
#     inputs = tokenizer([message], return_tensors="pt").to("cuda")

#     if streaming:
#         return model.generate(
#             **inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True
#         )
#     else:
#         output_tokens = model.generate(**inputs, max_new_tokens=256, use_cache=True)
#         output = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

#         if trim_input_message:
#             return output[len(message) :]
#         else:
#             return output

In [None]:
# generate_text(dataset["validation"][0]["instruction"], streaming=True)

In [None]:
# from huggingface_hub import HfApi

# model_name = f"{base_model}-Chatbot_VietNamese_Law"
# print(f"Model name: {model_name}")
# model.save_pretrained_merged(
#     model_name,
#     tokenizer,
#     save_method="merged_16bit",
# )  # Local saving

# if enable_hf:
#     api = HfApi()
#     user_info = api.whoami(token=hf_token)
#     huggingface_user = user_info["name"]
#     print(f"Current Hugging Face user: {huggingface_user}")

#     model.push_to_hub_merged(
#         f"{huggingface_user}/{model_name}",
#         tokenizer=tokenizer,
#         save_method="merged_16bit",
#         token=hf_token,
#     )  # Online saving to Hugging Face