In [None]:
# Mount Google Drive at /content/drive so files stored there are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -qq --upgrade pip
!pip install -qq --upgrade peft transformers accelerate bitsandbytes datasets trl huggingface_hub evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/1.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook itself: read it from the HF_TOKEN
# environment variable when available, otherwise ask for it with a hidden prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [None]:
import torch
import numpy as np

from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model, get_peft_config
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer
# config cho quantization. Vai trò: QLoRA setup.
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import evaluate
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Hugging Face Hub id of the base checkpoint that is loaded and fine-tuned below.
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"
# Local directory passed as `cache_dir` when downloading the tokenizer and the model.
cache_dir = "./cache"

In [None]:
# 4-bit quantization settings for the QLoRA setup.
# Storing a weight in 4 bits takes about 0.5 bytes per parameter, compared with
# 4 bytes per parameter for fp32 weights.
quantization_config = BitsAndBytesConfig(
    # Data type used for computation on the quantized weights.
    bnb_4bit_compute_dtype=torch.float16,
    # "Double quantization" additionally quantizes the quantization constants,
    # saving roughly 0.4 bits per parameter. The saving comes out of the storage
    # overhead of those constants; the weights themselves remain 4-bit.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# **Load Model**

In [None]:
# Total number of training steps; also the horizon of the learning-rate schedule.
MAX_TRAIN_STEPS = 5_000
# Interval (in steps) used for both evaluation and checkpoint saving.
NUM_EVAL_STEPS = 500
# Upper bounds on how many rows are kept from the train split and from each non-train split.
MAX_TRAIN_SAMPLES = 20_000
MAX_EVAL_SAMPLES = 2_000

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    cache_dir=cache_dir,
    trust_remote_code=True,
)

# Place the whole model on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    target_device = "cuda:0"
else:
    target_device = "cpu"

# Load the checkpoint with the 4-bit quantization config, then pass it through the
# PEFT helper that readies a k-bit quantized model for training.
base_model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        device_map=target_device,
        cache_dir=cache_dir,
        trust_remote_code=True,
    )
)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# NOTE(review): this repeats the tokenizer load from the model-loading cell with the
# same arguments; running it simply rebinds `tokenizer` to a freshly loaded object.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, cache_dir=cache_dir)

In [None]:
# Batching needs a pad token. When the tokenizer has none, reuse the EOS token for
# padding; otherwise just report the existing pad token.
pad_is_missing = tokenizer.pad_token is None or tokenizer.pad_token_id is None
if not pad_is_missing:
    print(f'Pad token: {tokenizer.pad_token}')
    print(f'Pad token id: {tokenizer.pad_token_id}')
else:
    print("Pad token is not set. Setting it to EOS token.")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Always show which token marks the end of a turn.
print(f'EOS token: {tokenizer.eos_token}')
print(f'EOS token id: {tokenizer.eos_token_id}')

Pad token is not set. Setting it to EOS token.
EOS token: <|eot_id|>
EOS token id: 128009


In [None]:
# Fallback chat template, assigned only when the tokenizer does not already define one.
# The template renders: the BOS token, a system header containing two date lines and the
# (optional) system message, then every remaining message wrapped as
# <|start_header_id|>role<|end_header_id|> ... <|eot_id|>, and finally an empty assistant
# header when `add_generation_prompt` is true.
if tokenizer.chat_template is None:
    tokenizer.chat_template = """{{- bos_token }}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}{%- set date_string = strftime_now("%d %b %Y") %}{%- else %}{%- set date_string = "26 Jul 2024" %}{%- endif %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{{- system_message }}
{{- "<|eot_id|>" }}

{%- for message in messages %}
    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
"""

In [None]:
# LoRA adapter configuration.
peft_config = LoraConfig(
    # Adapter hyper-parameters: rank, scaling factor and dropout.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # The adapters are going to be trained, not only used for inference.
    inference_mode=False,
    # Task the adapters target: causal language modeling (next-token prediction).
    task_type=TaskType.CAUSAL_LM,
)

In [None]:
# Wrap the prepared base model with the adapters described by `peft_config`.
peft_model = get_peft_model(base_model, peft_config)

# **Load Data**

In [None]:
# Read the three CSV splits from Google Drive into a single DatasetDict.
dataset = load_dataset("csv", data_files=
    {
        "train": "/content/drive/MyDrive/ML_DL_datasets/data_qlora/train.csv",
        "validation": "/content/drive/MyDrive/ML_DL_datasets/data_qlora/dev.csv",
        "test": "/content/drive/MyDrive/ML_DL_datasets/data_qlora/test.csv"
    }
)

# Keep at most the configured number of leading rows from each split.
# The cap is computed into a local variable on purpose: overwriting MAX_TRAIN_SAMPLES /
# MAX_EVAL_SAMPLES would make this cell non-idempotent and would let the size of the
# validation split silently limit the test split as well.
for split in dataset:
    split_cap = MAX_TRAIN_SAMPLES if split == "train" else MAX_EVAL_SAMPLES
    rows_to_keep = min(split_cap, len(dataset[split]))
    dataset[split] = dataset[split].select(range(rows_to_keep))
    print(f"{split}: {len(dataset[split])}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

train: 11426
validation: 1583
test: 1583


In [None]:
# Gather every distinct value of the "sentiment" column across all splits.
label_set = {row["sentiment"] for split in dataset for row in dataset[split]}
label_set

{0, 1, 2}

In [None]:
# Label names, listed so that a name's position in the list is its integer id.
all_labels = ['negative', 'neutral', 'positive']
print(f'There are {len(all_labels)} labels in the dataset, including {all_labels}')

# Build the id -> name table first, then invert it to get name -> id.
id2label = dict(enumerate(all_labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

There are 3 labels in the dataset, including ['negative', 'neutral', 'positive']
label2id: {'negative': 0, 'neutral': 1, 'positive': 2}
id2label: {0: 'negative', 1: 'neutral', 2: 'positive'}


In [None]:
# Text of the user turn. The `{input}` placeholder is filled with the sentence through
# `str.format` when the examples are tokenized.
USER_PROMPT_TEMPLATE = """
Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: {input}
"""

In [None]:
import os

def tokenize_function(examples):
    """Convert a batch of rows into chat-formatted training features.

    Args:
        examples: Batch mapping with a 'sentence' list and a parallel 'sentiment'
            list holding label ids that are keys of ``id2label``.

    Returns:
        A dict with 'input_ids', 'labels' and 'attention_mask', each a list with one
        1-D tensor per example. The labels hold -100 over the prompt part, so only
        the assistant answer contributes to the loss.
    """
    batch_input_ids, batch_labels, batch_masks = [], [], []

    for sentence, sentiment_id in zip(examples['sentence'], examples['sentiment']):
        user_text = USER_PROMPT_TEMPLATE.format(input=sentence)
        # The model generates text, so the target is the label name, not its id.
        answer_text = id2label[sentiment_id]

        # Conversation up to (and excluding) the assistant answer.
        prompt_turns = [
            {"role": "system", "content": "You are a helpful assistant. You must fulfill the user request."},
            {"role": "user", "content": user_text},
        ]
        # Same conversation with the expected answer appended.
        full_turns = prompt_turns + [{"role": "assistant", "content": f"Sentiment: {answer_text}"}]

        prompt_ids = tokenizer.apply_chat_template(conversation=prompt_turns, return_tensors="pt", add_generation_prompt=True)[0]
        full_ids = tokenizer.apply_chat_template(conversation=full_turns, return_tensors="pt")[0]

        # Labels: -100 for every prompt token, followed by the ids that come after the
        # prompt in the full conversation.
        masked_prompt = torch.full_like(prompt_ids, fill_value=-100)
        answer_ids = full_ids[len(prompt_ids):]

        batch_input_ids.append(full_ids)
        batch_labels.append(torch.cat([masked_prompt, answer_ids]))
        batch_masks.append(torch.ones_like(full_ids))

    return {
        "input_ids": batch_input_ids,
        "labels": batch_labels,
        "attention_mask": batch_masks,
    }

# Tokenize every split in batches and drop the original columns (taken from the train
# split) so that only the features returned by `tokenize_function` remain.
col_names = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=col_names,
    # Use as many worker processes as the CPU count reported by the OS.
    num_proc=os.cpu_count(),
)
tokenized_dataset

Map (num_proc=2):   0%|          | 0/11426 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1583 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1583 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1583
    })
})

In [None]:
from transformers import DataCollatorWithPadding
from typing import Any, Dict, List

class RightPaddingDataCollator(DataCollatorWithPadding):
    """Collator that right-pads inputs, labels and attention mask to one length.

    The default data collator pads only inputs, not including the labels. This
    one pads all three tensors itself: ``input_ids`` with the tokenizer's pad
    token id, ``labels`` with -100 and ``attention_mask`` with 0.

    Sequences longer than ``max_length`` are cut on the right before padding.
    """

    def __init__(self, tokenizer, max_length: int = 1024):
        super().__init__(tokenizer, max_length=max_length)

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build one batch from a list of tokenized examples.

        Args:
            features: Examples that each provide "input_ids" and "labels" sequences
                of equal length.

        Returns:
            A dict with "input_ids", "labels" and "attention_mask", each a
            ``torch.long`` tensor of shape (len(features), batch_length).

        Raises:
            ValueError: If the tokenizer has no pad token id.
        """
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            raise ValueError("The tokenizer has no pad_token_id; set a pad token before collating.")

        # Pad to the longest example in the batch, but never beyond `max_length`.
        # Previously `max_length` was stored and then ignored, so the truncation
        # step could never shorten anything.
        longest = max(len(f["input_ids"]) for f in features)
        target_len = longest if self.max_length is None else min(longest, self.max_length)

        input_ids, labels, attention_mask = [], [], []
        for sample in features:
            # Convert to tensors and cut on the right. Note that right truncation
            # removes tokens from the end of the sequence, i.e. from the answer.
            cur_input_ids = torch.tensor(sample["input_ids"], dtype=torch.long)[:target_len]
            cur_labels = torch.tensor(sample["labels"], dtype=torch.long)[:target_len]
            # The mask is rebuilt here: 1 for every real token kept.
            cur_attention_mask = torch.ones_like(cur_input_ids)

            padding_length = target_len - len(cur_input_ids)
            # Pad ids with the pad token, labels with -100 (used as the "ignore"
            # marker for the labels throughout this notebook), the mask with 0.
            cur_input_ids = torch.cat([cur_input_ids, torch.full((padding_length,), fill_value=pad_token_id, dtype=torch.long)])
            cur_labels = torch.cat([cur_labels, torch.full((padding_length,), fill_value=-100, dtype=torch.long)])
            cur_attention_mask = torch.cat([cur_attention_mask, torch.zeros((padding_length,), dtype=torch.long)])

            input_ids.append(cur_input_ids)
            labels.append(cur_labels)
            attention_mask.append(cur_attention_mask)

        return {
            "input_ids": torch.stack(input_ids),
            "labels": torch.stack(labels),
            "attention_mask": torch.stack(attention_mask)
        }

# Collator instance handed to the trainer; uses the class default max_length of 1024.
data_collator = RightPaddingDataCollator(tokenizer)


In [None]:
# Load the four classification metrics from the `evaluate` library, in this order.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted ids before they are gathered for metrics.

    Args:
        logits: A logits tensor, or a tuple whose first element is that tensor.
        labels: Unused; accepted because the trainer passes it.

    Returns:
        The argmax over the last dimension of the logits.
    """
    primary = logits[0] if isinstance(logits, tuple) else logits
    return primary.argmax(dim=-1)


def _answer_prediction_ids(pred_row, label_row, eos_token_id, pad_token_id):
    """Return the predicted ids that line up with one example's answer tokens."""
    answer_positions = np.flatnonzero(label_row != -100)
    if answer_positions.size == 0:
        # Nothing is supervised in this row, so there is nothing to compare.
        return pred_row[:0]

    first, last = int(answer_positions[0]), int(answer_positions[-1])
    # The prediction at position t is the model's guess for token t + 1, so the guesses
    # for labels[first..last] sit at preds[first - 1 .. last - 1].
    answer_preds = pred_row[max(first - 1, 0):last]
    # -100 cannot be decoded; swap it for the pad token id.
    answer_preds = np.where(answer_preds != -100, answer_preds, pad_token_id)

    # Keep only what comes before the first end-of-sequence token.
    eos_hits = np.flatnonzero(answer_preds == eos_token_id)
    if eos_hits.size > 0:
        answer_preds = answer_preds[:eos_hits[0]]
    return answer_preds


def compute_metrics(eval_preds):
    """Compute accuracy and macro F1 / precision / recall from generated answers.

    Args:
        eval_preds: Pair ``(preds, labels)``. ``preds`` holds predicted token ids
            (or a tuple whose first element does); ``labels`` holds label ids with
            -100 at positions that are not part of the answer.

    Returns:
        The merged result dicts of the accuracy, F1, precision and recall metrics.

    Raises:
        KeyError: If a decoded reference label is not a key of ``label2id``.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Find the answer span separately for every row. Prompt length depends on the input
    # sentence, so the offset of the first row must not be reused for the other rows.
    processed_preds = [
        _answer_prediction_ids(pred_row, label_row, tokenizer.eos_token_id, tokenizer.pad_token_id)
        for pred_row, label_row in zip(preds, labels)
    ]
    decoded_preds = tokenizer.batch_decode(processed_preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them, then decode the references.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Map the keyword after the last ":" to its label id.
    int_preds, int_labels = [], []
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        cur_label_id = label2id[label_text.split(":")[-1].strip()]
        int_labels.append(cur_label_id)

        cur_pred_id = label2id.get(pred_text.split(":")[-1].strip())
        if cur_pred_id is None:
            # Malformed or unknown prediction: score it as an error by assigning a
            # class that differs from the reference.
            cur_pred_id = (cur_label_id + 1) % len(label2id)
        int_preds.append(cur_pred_id)

    accuracy_results = accuracy_metric.compute(predictions=int_preds, references=int_labels)
    f1_results = f1_metric.compute(predictions=int_preds, references=int_labels, average="macro")
    precision_results = precision_metric.compute(predictions=int_preds, references=int_labels, average="macro")
    recall_results = recall_metric.compute(predictions=int_preds, references=int_labels, average="macro")

    return {
        **accuracy_results,
        **f1_results,
        **precision_results,
        **recall_results
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# **Training**

In [None]:
training_args = TrainingArguments(
    # Output locations.
    output_dir="./output",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # Per-device batch sizes.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Training length is set in steps; num_train_epochs is not set here.
    max_steps=MAX_TRAIN_STEPS,
    # Log every 10 steps; evaluate and save on the same step interval.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=NUM_EVAL_STEPS,
    save_steps=NUM_EVAL_STEPS,
    save_total_limit=2,
    # No experiment tracker and no upload to the Hub.
    report_to="none",
    push_to_hub=False,
    # Leave the dataset columns untouched for the custom data collator.
    remove_unused_columns=False,
)

In [None]:
import bitsandbytes as bnb
from transformers import get_linear_schedule_with_warmup

# Collect the parameters that require gradients into a list. A `filter` object is a
# one-shot iterator: once the optimizer has consumed it, any later use of
# `trainable_params` (for example counting the parameters) would silently see nothing.
trainable_params = [param for param in peft_model.parameters() if param.requires_grad]

# Paged AdamW from bitsandbytes over the trainable parameters only.
paged_optimizer = bnb.optim.PagedAdamW(
    trainable_params,
    lr=3e-4,
    weight_decay=0.0
)

# Linear schedule whose warmup spans the first 10% of the training steps.
scheduler = get_linear_schedule_with_warmup(
    paged_optimizer,
    num_warmup_steps=int(MAX_TRAIN_STEPS*0.1),
    num_training_steps=MAX_TRAIN_STEPS
)

In [None]:
# Assemble the trainer from the pieces built in the previous cells.
trainer = SFTTrainer(
    # Model and tokenizer.
    model=peft_model,
    processing_class=tokenizer,
    # Run configuration plus the hand-built optimizer / scheduler pair.
    args=training_args,
    optimizers=(paged_optimizer, scheduler),
    # Pre-tokenized data and the collator that pads it.
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    # Evaluation hooks: reduce logits to ids first, then score the decoded answers.
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

Truncating train dataset:   0%|          | 0/11426 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1583 [00:00<?, ? examples/s]

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss,Validation Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-4032920361.py", line 1, in <cell line: 0>
    trainer.train()
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2325, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2755, in _inner_training_loop
    self.control = self.callback_handler.on_step_end(args, self.state, self.control)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer_callback.py", line 534, in on_step_end
    return self.call_event("on_step_end", args, state, control)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-pack

TypeError: object of type 'NoneType' has no len()