## 1. Tải Dữ Liệu từ CSV

In [2]:
!pip install datasets



In [3]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModel
import numpy as np
import torch
from datasets import load_dataset
import torch.nn as nn
import os
from typing import List
from tqdm import tqdm


# Restrict CUDA to the device with index 1 (i.e. the second GPU on the host).
# NOTE(review): on a single-GPU machine index 1 does not exist and this setting
# would hide the only GPU; the "/content/..." path used later suggests a hosted
# notebook runtime — confirm the target machine really has two GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "1" ## Setup CUDA GPU 1



In [4]:

class BERTIntentClassification(nn.Module):
    """Pretrained encoder followed by a small feed-forward intent classifier.

    The encoder's last hidden states are mean-pooled over the non-padding
    tokens and passed through Linear -> LayerNorm -> ReLU -> Dropout -> Linear
    to produce one raw logit per class.
    """

    def __init__(self, model_name="bert-base-uncased", num_classes=10, dropout_rate=0.1, cache_dir = "huggingface"):
        """
        Args:
            model_name: Name or path handed to ``AutoModel.from_pretrained``.
            num_classes: Size of the output layer (number of intent classes).
            dropout_rate: Dropout probability used inside the classifier head.
            cache_dir: Directory used by ``from_pretrained`` to cache the weights.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
        # The classifier head is sized from the encoder's hidden dimension.
        hidden_size = self.bert.config.hidden_size
        self.ffnn = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_classes),
        )

    def freeze_bert(self):
        """Disable gradients for every encoder parameter so only the head is trained."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def get_pooling(self, hidden_state, attention_mask):
        """
        Mean-pool the encoder's last hidden states over the sequence dimension.

        Args:
            hidden_state: Encoder output exposing ``last_hidden_state`` with
                shape [batch_size, seq_len, hidden_size].
            attention_mask: Tensor of shape [batch_size, seq_len] that is 1 for
                real tokens and 0 for padding, or None to average all positions.
        Returns:
            Tensor of shape [batch_size, hidden_size]. A row whose mask is all
            zeros pools to a zero vector instead of NaN.
        """
        last_hidden_state = hidden_state.last_hidden_state  # [batch_size, seq_len, hidden_size]

        if attention_mask is None:
            # No mask available: plain average over every position.
            return torch.mean(last_hidden_state, dim=1)

        # [batch_size, seq_len, 1], in the hidden-state dtype so the product
        # and the division below stay in a single floating-point type.
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)

        # Zero out padding positions, then sum over the sequence.
        sum_hidden = torch.sum(last_hidden_state * mask, dim=1)  # [batch_size, hidden_size]

        # Number of real tokens per row; clamped so an all-padding row divides
        # by 1 (giving zeros) rather than by 0 (giving NaN).
        count_tokens = torch.sum(mask, dim=1).clamp(min=1.0)  # [batch_size, 1]
        return sum_hidden / count_tokens

    def forward(self, input_ids, attention_mask, **kwargs):
        """
        Forward pass of the model.

        Args:
            input_ids: Input token IDs.
            attention_mask: Attention mask marking real tokens vs. padding.
            **kwargs: Extra batch entries (e.g. other tokenizer outputs); accepted
                and ignored so a whole tokenizer batch can be splatted in.
        Returns:
            logits: Raw (unnormalised) logits tensor, one column per class.
        """
        # Encode the tokens.
        hidden_state = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Collapse the sequence dimension into one vector per example.
        hidden_state_pooling = self.get_pooling(hidden_state=hidden_state, attention_mask=attention_mask)

        # Classify the pooled representation.
        logits = self.ffnn(hidden_state_pooling)

        return logits


In [5]:
class TrainerCustom(Trainer):
    """Trainer variant whose loss is cross-entropy over the logits returned by the model."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the cross-entropy loss for one batch.

        The "labels" entry (if any) is removed from ``inputs`` before the
        remaining entries are passed to the model as keyword arguments.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, otherwise ``loss``.
        """
        # Take the targets out of the batch so they are not forwarded to the model.
        labels = inputs.pop("labels") if "labels" in inputs else None

        criterion = nn.CrossEntropyLoss()

        # The model output is used as the logits as-is (no tuple unpacking).
        outputs = model(**inputs)
        loss = criterion(outputs, labels)

        if return_outputs:
            return loss, outputs
        return loss


# 1. Load Dataset with DataLoader

In [6]:






# # B∆∞·ªõc 1: T·∫£i d·ªØ li·ªáu
# # S·ª≠ d·ª•ng dataset s·∫µn c√≥ t·ª´ Hugging Face ho·∫∑c t·∫£i t·ª´ file c·ª•c b·ªô
# dataset = load_dataset("imdb", cache_dir = "huggingface")  # V√≠ d·ª•: D·ªØ li·ªáu IMDB ƒë·ªÉ ph√¢n lo·∫°i sentiment
# # Thay th·∫ø tr∆∞·ªùng 'text' th√†nh 'input_ids' trong train_dataset v√† test_dataset
# def preprocess_dataset(dataset):
#     return dataset.map(lambda example: {
#             "input_ids": example['text'],
#             "label": example['label']
#         },
#         remove_columns=["text"],
#         num_proc=4  # S·ª≠ d·ª•ng 4 ti·∫øn tr√¨nh song song ƒë·ªÉ x·ª≠ l√Ω nhanh h∆°n
#     )

# train_dataset = preprocess_dataset(dataset["train"])
# test_dataset = preprocess_dataset(dataset["test"])



In [7]:
# print(train_dataset)
# # Truy c·∫≠p m·∫´u c·ª• th·ªÉ
# train_sample = train_dataset[:10]
# test_sample = test_dataset[:2]
# print(train_sample)


# from datasets import Dataset

# train_sample = train_dataset[:10]

# # Chuy·ªÉn t·ª´ dict v·ªÅ Dataset
# train_sample_dataset = Dataset.from_dict(train_sample)
# test_sample_dataset = Dataset.from_dict(test_sample)
# print(train_sample_dataset)
# print(type(train_sample_dataset))
# # Output: <class 'datasets.arrow_dataset.Dataset'>


# # In th·ª≠ 1 h√†ng trong test_sample_dataset
# print("First row in test_sample_dataset:")
# print(test_sample_dataset[0])




In [8]:
from datasets import Dataset

def load_csv_dataset(csv_path, text_column, label_column):
    """
    Read a CSV file into a Dataset and rename its text / label columns.

    Args:
        csv_path (str): Path to the .csv file.
        text_column (str): Name of the column holding the text.
        label_column (str): Name of the column holding the label.

    Returns:
        Dataset: The loaded data, with the text column named "input_ids"
        and the label column named "label".
    """
    column_renames = {text_column: "input_ids", label_column: "label"}
    return Dataset.from_csv(csv_path).rename_columns(column_renames)

# Use the loader
csv_path = "/content/chatbot_intent_data_v1_En.csv"             # Path to the CSV file
text_column = "input_ids"       # Column holding the text
label_column = "label"        # Column holding the label

# Load the dataset
dataset = load_csv_dataset(csv_path, text_column, label_column)

# Inspect the data
print(dataset)

# Look at a few concrete samples
sample_dataset = dataset.select(range(10))  # Take the first 10 samples
print(sample_dataset)


# Print one row of sample_dataset (the printed label still says "test_sample_dataset")
print("First row in test_sample_dataset:")
print(sample_dataset[0])


Dataset({
    features: ['label', 'input_ids'],
    num_rows: 27
})
Dataset({
    features: ['label', 'input_ids'],
    num_rows: 10
})
First row in test_sample_dataset:
{'label': 'Agree', 'input_ids': 'Yes, I want to show you the picture.'}


In [9]:
def check_invalid_samples(dataset):
    """Return (index, sample) pairs whose "input_ids" value is not a string or is blank.

    A sample is flagged when its "input_ids" entry is not a ``str`` or contains
    only whitespace. Pairs are returned in iteration order.
    """
    def is_unusable(text):
        return not isinstance(text, str) or text.strip() == ""

    return [
        (position, row)
        for position, row in enumerate(dataset)
        if is_unusable(row["input_ids"])
    ]

# Scan the loaded dataset for invalid samples and print the flagged (index, sample) pairs
invalid_samples = check_invalid_samples(dataset)
print("\n===== Invalid Samples =====")
print(invalid_samples)



===== Invalid Samples =====
[]


In [10]:
# Discover the labels automatically and build the label -> integer mapping
def create_label_mapping(dataset_list):
    """
    Collect every distinct label across a list of datasets and map each to an integer.

    Ids are assigned in the sorted order of the labels, starting at 0.
    The resulting mapping is printed and returned.
    """
    unique_labels = set()
    for current in dataset_list:
        # Each dataset exposes its labels under the "label" key.
        unique_labels |= set(current["label"])

    label_to_int = {}
    for index, name in enumerate(sorted(unique_labels)):
        label_to_int[name] = index
    print(f"√Ånh x·∫° nh√£n: {label_to_int}")
    return label_to_int

# Label conversion helper
def preprocess_labels(example, label_to_int):
    """Replace ``example["label"]`` in place with its integer id (-1 when unknown) and return the example."""
    original_label = example["label"]
    if original_label in label_to_int:
        example["label"] = label_to_int[original_label]
    else:
        # Labels missing from the mapping are marked with -1.
        example["label"] = -1
    return example

# Build the label mapping
label_mapping = create_label_mapping([dataset])

# Apply the label conversion
dataset = dataset.map(lambda example: preprocess_labels(example, label_mapping))

# Check the result
print(dataset)

# Look at a few concrete samples
sample_dataset = dataset.select(range(10))  # Take the first 10 samples
print(sample_dataset)

# Print one row of sample_dataset
print("First row in sample_dataset:")
print(sample_dataset[0])

√Ånh x·∫° nh√£n: {'Agree': 0, 'Decline': 1, 'Fallback': 2, 'Silence': 3, 'Uncertain': 4}
Dataset({
    features: ['label', 'input_ids'],
    num_rows: 27
})
Dataset({
    features: ['label', 'input_ids'],
    num_rows: 10
})
First row in sample_dataset:
{'label': 0, 'input_ids': 'Yes, I want to show you the picture.'}


In [11]:
def split_dataset(dataset, test_size=0.2, seed=42):
    """
    Randomly partition a dataset into a train part and a test part.

    Args:
        dataset (Dataset): The full dataset.
        test_size (float): Fraction of samples placed in the test part;
            must lie strictly between 0.0 and 1.0.
        seed (int): Seed passed to the random split.

    Returns:
        tuple: (train_dataset, test_dataset).

    Raises:
        ValueError: If ``test_size`` is outside (0.0, 1.0) or the dataset
            has fewer than 2 samples.
    """
    fraction_is_valid = 0.0 < test_size < 1.0
    if not fraction_is_valid:
        raise ValueError("test_size ph·∫£i n·∫±m trong kho·∫£ng (0.0, 1.0)")
    if len(dataset) < 2:
        raise ValueError("Dataset ph·∫£i c√≥ √≠t nh·∫•t 2 m·∫´u ƒë·ªÉ chia.")

    parts = dataset.train_test_split(test_size=test_size, seed=seed)
    train_part, test_part = parts["train"], parts["test"]
    print(f"Chia dataset: {len(train_part)} m·∫´u train, {len(test_part)} m·∫´u test")
    return train_part, test_part

# Split the dataset
train_dataset, test_dataset = split_dataset(dataset, test_size=0.3)

# Inspect the two splits
print("Train dataset:", train_dataset)
print("Test dataset:", test_dataset)

# Take small subsets (up to 8 train rows and up to 9 test rows). The sizes are
# capped at the split length so select() cannot index past the end when the
# dataset or test_size changes.
sample_train_dataset = train_dataset.select(range(min(8, len(train_dataset))))
sample_test_dataset = test_dataset.select(range(min(9, len(test_dataset))))

print("Sample train dataset:", sample_train_dataset)
print("Sample test dataset:", sample_test_dataset)

Chia dataset: 18 m·∫´u train, 9 m·∫´u test
Train dataset: Dataset({
    features: ['label', 'input_ids'],
    num_rows: 18
})
Test dataset: Dataset({
    features: ['label', 'input_ids'],
    num_rows: 9
})
Sample train dataset: Dataset({
    features: ['label', 'input_ids'],
    num_rows: 8
})
Sample test dataset: Dataset({
    features: ['label', 'input_ids'],
    num_rows: 9
})


# 2. Tokenizer

In [12]:





# Step 2: Prepare the tokenizer and the model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = "huggingface")
model = BERTIntentClassification(
    model_name=model_name,
    # Size the output layer from the label mapping built above, so the head
    # always matches the number of distinct labels found in the data.
    num_classes=len(label_mapping)
)
model.freeze_bert() # Freeze the encoder; only the classifier head is trained
max_seq_length = 512  # Maximum number of tokens per example passed to the tokenizer


def collate_fn(features):
    """Turn a list of raw samples into one tokenized batch with labels.

    Each element of ``features`` is read with ``.get("input_ids")`` (the raw
    text) and ``.get("label")`` (the class id). The texts are tokenized with
    the module-level ``tokenizer`` (padded, truncated to ``max_seq_length``)
    and the labels are attached under the "labels" key as a long tensor.
    """
    texts = [sample.get("input_ids") for sample in features]
    targets = [sample.get("label") for sample in features]

    label_tensor = torch.tensor(targets, dtype=torch.long)

    batch = tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        padding=True,
        max_length=max_seq_length,
        return_overflowing_tokens=False,
        return_length=False,
        return_tensors="pt",
    )
    batch["labels"] = label_tensor
    return batch


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# 3. Train Model

## 3.1 Log Wandb

In [None]:
!pip install --upgrade wandb



In [None]:
!pip install python-dotenv

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Read the WandB API key from the environment. Only report whether it was
# found: no part of the secret is echoed into the notebook output, and a
# missing key produces a readable message instead of a TypeError from slicing None.
wandb_api_key = os.getenv("WANDB_API_KEY")
if wandb_api_key:
    print(f"WANDB_API_KEY loaded ({len(wandb_api_key)} characters)")
else:
    print("WANDB_API_KEY is not set; add it to .env or the environment before logging in to WandB")

In [None]:
import wandb
import os

# Read the API key from the environment and log in to WandB
wandb.login(key=os.getenv("WANDB_API_KEY"))


Cách thiết lập thông qua TrainingArguments
Khi sử dụng Trainer, bạn có thể bật ghi log lên WandB và đặt tên phiên chạy trực tiếp trong TrainingArguments bằng các tham số report_to và run_name. Tuy nhiên, để đặt tên project, bạn cần khởi tạo một phiên wandb trước, tức là truyền cấu hình này thông qua wandb.init().

Điều chỉnh TrainingArguments:
```python
training_args = TrainingArguments(
    output_dir="./results_",          # Th∆∞ m·ª•c l∆∞u k·∫øt qu·∫£
    eval_strategy="epoch",           # ƒê√°nh gi√° sau m·ªói epoch
    learning_rate=2e-4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",            # Th∆∞ m·ª•c l∆∞u log
    logging_strategy="steps",        # Log theo steps
    logging_steps=10,                # Log sau m·ªói 10 b∆∞·ªõc
    save_strategy="epoch",           # L∆∞u checkpoint sau m·ªói epoch
    save_total_limit=3,              # L∆∞u t·ªëi ƒëa 3 checkpoint
    report_to="wandb",               # B√°o c√°o log t·ªõi wandb
    run_name="bert_run_1"            # T√™n phi√™n ch·∫°y tr√™n wandb
)
```

## 3.2 Train

### Ver 1.2.3

D∆∞·ªõi ƒë√¢y l√† b·∫£ng t√≥m t·∫Øt chi ti·∫øt c√°ch l∆∞u m√¥ h√¨nh d·ª±a tr√™n chi·∫øn l∆∞·ª£c ƒë∆∞·ª£c ƒë·ªÅ xu·∫•t:

| **Lo·∫°i Model**    | **ƒêi·ªÅu Ki·ªán L∆∞u**                                                                 | **Th∆∞ M·ª•c L∆∞u Tr√™n Local**       | **S·ªë L∆∞·ª£ng L∆∞u Tr√™n Local**        | **Th√¥ng Tin Th√™m**                              | **ƒê·ªìng B·ªô L√™n WandB**                  |
|--------------------|-----------------------------------------------------------------------------------|-----------------------------------|------------------------------------|-----------------------------------------------|-----------------------------------------|
| **Best Model**     | Khi `eval_loss` gi·∫£m                                                             | `output_dir/best_model`           | Ch·ªâ l∆∞u m·ªôt b·∫£n duy nh·∫•t           | L∆∞u th√¥ng tin `epoch` v√† `eval_loss`.          | C√≥: Artifact `best_model`. Th√™m `epoch` v√† `loss` v√†o `metadata`. |
| **Final Checkpoint** | Sau m·ªói epoch (checkpoint cu·ªëi c·ªßa epoch)                                        | `output_dir/checkpoint-epoch-<n>` | T·ªëi ƒëa 3 checkpoint g·∫ßn nh·∫•t       | Kh√¥ng c√≥ th√¥ng tin ƒë·∫∑c bi·ªát.                   | Kh√¥ng ƒë·ªìng b·ªô (tr√°nh tr√πng l·∫∑p d·ªØ li·ªáu l·ªõn). |
| **Custom Checkpoint** (t√πy ch·ªçn) | Sau m·ªôt s·ªë b∆∞·ªõc c·ªë ƒë·ªãnh ho·∫∑c m·ªëc quan tr·ªçng (n·∫øu c·∫ßn thi·∫øt, v√≠ d·ª•: m·ªói 5 epoch) | T√πy ch·ªânh, v√≠ d·ª•: `output_dir/checkpoint-step-<n>` | Theo √Ω mu·ªën, ho·∫∑c kh√¥ng gi·ªõi h·∫°n | Th√™m c√°c m·ªëc quan tr·ªçng ƒë·ªÉ ph√¢n t√≠ch sau n√†y. | T√πy ch·ªçn (kh√¥ng b·∫Øt bu·ªôc).              |

---

### **Chi tiết về bảng**
1. **Best Model**:
   - Điều kiện: `eval_loss` giảm.
   - Chỉ lưu một phiên bản tốt nhất.
   - Lưu thông tin epoch và loss để dễ dàng tham khảo hoặc tải xuống sau này.

2. **Final Checkpoint**:
   - Được lưu sau mỗi epoch.
   - Giới hạn số lượng checkpoint lưu trên local để tiết kiệm bộ nhớ (ví dụ: tối đa 3 checkpoint).
   - Không lưu thông tin thêm vào checkpoint.

3. **Custom Checkpoint** (tùy chọn):
   - Có thể sử dụng nếu bạn muốn lưu checkpoint tại các mốc thời gian cụ thể, chẳng hạn như mỗi 5 epoch hoặc sau một số bước huấn luyện (steps).
   - Thích hợp khi bạn cần kiểm tra tiến độ huấn luyện chi tiết hơn hoặc muốn lưu backup.

---

### **Tóm tắt logic**
- **Best Model**:
  - Lưu vào thư mục cố định (`best_model`).
  - Ghi đè khi có `eval_loss` mới tốt hơn.
  - Đồng bộ lên WandB.

- **Final Checkpoint**:
  - Lưu sau mỗi epoch.
  - Xóa checkpoint cũ nhất nếu vượt giới hạn `save_total_limit`.
  - Không đồng bộ lên WandB (tránh lãng phí không gian lưu trữ).

- **Custom Checkpoint**:
  - Tùy chọn nếu bạn cần lưu thêm để phục vụ các mục đích cụ thể.

Nếu bạn cần thêm bất kỳ chi tiết nào khác, hãy cho mình biết nhé! 😊

### **Bảng Tóm Tắt: Lưu Best Model và Last Model**

| **Lo·∫°i Model**    | **Khi N√†o C·∫ßn L∆∞u**                                                                                         | **∆Øu ƒêi·ªÉm**                                                                                       | **H·∫°n Ch·∫ø**                                                                                      |
|--------------------|------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
| **Best Model**     | - Khi mu·ªën tri·ªÉn khai m√¥ h√¨nh t·ªët nh·∫•t v·ªõi `eval_loss` th·∫•p nh·∫•t ho·∫∑c `accuracy` cao nh·∫•t.                   | - ƒê·∫£m b·∫£o l∆∞u l·∫°i m√¥ h√¨nh c√≥ hi·ªáu su·∫•t t·ªët nh·∫•t tr√™n t·∫≠p validation.<br>- Ph√π h·ª£p ƒë·ªÉ tri·ªÉn khai.   | - Kh√¥ng l∆∞u tr·∫°ng th√°i ƒë·∫ßy ƒë·ªß (optimizer, scheduler).<br>- Kh√¥ng ti·∫øp t·ª•c hu·∫•n luy·ªán t·ª´ tr·∫°ng th√°i n√†y. |
| **Last Model**     | - Khi c·∫ßn ti·∫øp t·ª•c hu·∫•n luy·ªán (fine-tuning) ho·∫∑c kh√¥i ph·ª•c tr·∫°ng th√°i sau khi hu·∫•n luy·ªán k·∫øt th√∫c.         | - L∆∞u ƒë·∫ßy ƒë·ªß tr·∫°ng th√°i (weights, optimizer, scheduler).<br>- Ph√π h·ª£p ƒë·ªÉ ti·∫øp t·ª•c hu·∫•n luy·ªán.    | - C√≥ th·ªÉ kh√¥ng ph·∫£i l√† m√¥ h√¨nh t·ªët nh·∫•t (do overfitting ho·∫∑c underfitting).                     |
| **Ch·ªâ L∆∞u Best**   | - Khi ch·ªâ quan t√¢m ƒë·∫øn tri·ªÉn khai m√¥ h√¨nh t·ªët nh·∫•t, kh√¥ng c·∫ßn ti·∫øp t·ª•c hu·∫•n luy·ªán sau n√†y.                  | - Ti·∫øt ki·ªám t√†i nguy√™n l∆∞u tr·ªØ.<br>- T·∫≠p trung v√†o m√¥ h√¨nh t·ªëi ∆∞u cho tri·ªÉn khai.                | - Kh√¥ng th·ªÉ ti·∫øp t·ª•c hu·∫•n luy·ªán n·∫øu c·∫ßn.                                                         |
| **Ch·ªâ L∆∞u Last**   | - Khi mu·ªën ƒë·∫£m b·∫£o kh·∫£ nƒÉng kh√¥i ph·ª•c tr·∫°ng th√°i ƒë·ªÉ ti·∫øp t·ª•c hu·∫•n luy·ªán.                                    | - Kh√¥i ph·ª•c ho√†n to√†n qu√° tr√¨nh hu·∫•n luy·ªán.<br>- Ph√π h·ª£p cho fine-tuning ho·∫∑c th·ª≠ nghi·ªám sau n√†y. | - Kh√¥ng ƒë·∫£m b·∫£o ƒë√¢y l√† m√¥ h√¨nh t·ªët nh·∫•t ƒë·ªÉ tri·ªÉn khai.                                           |
| **L∆∞u C·∫£ Hai**     | - Khi c·∫ßn c·∫£ tri·ªÉn khai m√¥ h√¨nh t·ªët nh·∫•t v√† ti·∫øp t·ª•c hu·∫•n luy·ªán sau n√†y.                                    | - K·∫øt h·ª£p ∆∞u ƒëi·ªÉm c·ªßa c·∫£ Best Model v√† Last Model.<br>- Linh ho·∫°t trong s·ª≠ d·ª•ng.                 | - T·ªën th√™m t√†i nguy√™n l∆∞u tr·ªØ v√† th·ªùi gian.                                                     |

---

### **Chiến Lược Tối Ưu**
| **Lo·∫°i L∆∞u** | **T·∫ßn Su·∫•t**                          | **Chi·∫øn L∆∞·ª£c**                                                                                             |
|--------------|---------------------------------------|-----------------------------------------------------------------------------------------------------------|
| **Best Model** | Khi `eval_loss` gi·∫£m                 | L∆∞u m·ªói l·∫ßn `eval_loss` gi·∫£m ƒë·ªÉ ƒë·∫£m b·∫£o m√¥ h√¨nh t·ªët nh·∫•t lu√¥n ƒë∆∞·ª£c l∆∞u.                                    |
| **Last Model** | Sau khi hu·∫•n luy·ªán k·∫øt th√∫c          | L∆∞u tr·∫°ng th√°i cu·ªëi c√πng c·ªßa qu√° tr√¨nh hu·∫•n luy·ªán (weights + optimizer + scheduler).                      |
| **K·∫øt h·ª£p**   | Best Model: M·ªói khi `eval_loss` gi·∫£m<br>Last Model: Sau khi k·∫øt th√∫c | L∆∞u c·∫£ Best Model ƒë·ªÉ tri·ªÉn khai v√† Last Model ƒë·ªÉ ti·∫øp t·ª•c hu·∫•n luy·ªán khi c·∫ßn thi·∫øt.                      |

---

### **Lựa Chọn Phù Hợp**
- **Dự án triển khai mô hình nhanh**: Lưu **Best Model**.
- **Dự án nghiên cứu hoặc fine-tuning tiếp**: Lưu **Last Model**.
- **Dự án quy mô lớn, cần cả triển khai và mở rộng**: Lưu **cả hai**.

Hãy chọn chiến lược lưu phù hợp với mục tiêu dự án của bạn! 🚀

Thôi, không lưu local nữa, lưu tất cả trên WandB.
- Với best model: lưu lên WandB khi loss giảm và đã qua ít nhất 10 epochs.
- Với last model: lưu lên WandB sau mỗi 10 epochs.
- Trong quá trình lưu, việc training vẫn diễn ra song song (parallel).

Cả hai đều lưu đầy đủ toàn bộ tham số để có thể train tiếp từ cả best model lẫn last model.

In [None]:
import os
# Set the WANDB_LOG_MODEL environment variable to "checkpoint".
# NOTE(review): presumably this only has an effect when the Trainer itself
# writes checkpoints; the TrainingArguments used later set save_strategy="no"
# and models are uploaded manually — confirm this setting is still needed.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [None]:
# class TrainerCustom(Trainer):

#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         """
#         How the loss is computed by Trainer. By default, all models return the loss in the first element.

#         Subclass and override for custom behavior.
#         """
#         if "labels" in inputs:
#             labels = inputs.pop("labels")
#         else:
#             labels = None

#         # S·ª≠ d·ª•ng nn.CrossEntropyLoss() thay v√¨ nn.CrossEntropy
#         cross_entropy_loss = nn.CrossEntropyLoss()

#         # Ch·∫°y m√¥ h√¨nh v√† nh·∫≠n ƒë·∫ßu ra (logits)
#         outputs = model(**inputs)

#         # ƒê·∫£m b·∫£o l·∫•y logits t·ª´ outputs (m√¥ h√¨nh tr·∫£ v·ªÅ tuple, l·∫•y ph·∫ßn t·ª≠ ƒë·∫ßu ti√™n l√† logits)
#         logits = outputs

#         if labels is None:
#             print("Labels are None during compute_loss.")
#         if logits is None:
#             print("Logits are None during compute_loss.")

#         # T√≠nh to√°n loss
#         loss = cross_entropy_loss(logits, labels)

#         # Tr·∫£ v·ªÅ loss v√† outputs n·∫øu c·∫ßn
#         return (loss, outputs) if return_outputs else loss


In [None]:

# import wandb

# # Kh·ªüi t·∫°o wandb
# wandb.init(
#     project="bert-intent-classification",  # T√™n d·ª± √°n
#     name="bert_run_3"                     # T√™n phi√™n ch·∫°y
# )


# # B∆∞·ªõc 6: C√†i ƒë·∫∑t tham s·ªë hu·∫•n luy·ªán
# training_args = TrainingArguments(
#     output_dir="./result__s",          # Th∆∞ m·ª•c l∆∞u k·∫øt qu·∫£
#     eval_strategy="epoch",    # ƒê√°nh gi√° sau m·ªói epoch
#     learning_rate=2e-4,
#     per_device_train_batch_size=128,
#     per_device_eval_batch_size=128,
#     num_train_epochs=50,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_strategy="steps",
#     logging_steps=1,  # Ghi logs m·ªói 500 b∆∞·ªõc hu·∫•n luy·ªán
#     save_strategy="no",          # L∆∞u tr·ªçng s·ªë sau m·ªói epoch
#     save_total_limit=3,
#     label_names = ["labels"],
#     report_to="wandb",
#     run_name="bert_run_3"
# )


# batch = collate_fn([sample_test_dataset[0]]) # T·∫°o m·ªôt batch t·ª´ m·ªôt m·∫´u ƒë∆°n l·∫ª (sample_test_dataset[0]) ƒë·ªÉ ki·ªÉm tra xem h√†m collate_fn c√≥ ho·∫°t ƒë·ªông ƒë√∫ng kh√¥ng.
# print(batch)

# # metrics = trainer.evaluate()
# # M·ª•c ƒë√≠ch: Ch·∫°y giai ƒëo·∫°n evaluation (ƒë√°nh gi√°) tr√™n eval_dataset (sample_test_dataset) v√† t√≠nh to√°n c√°c metrics nh∆∞:
# trainer = TrainerCustom(
#     model=model,
#     args=training_args,
#     train_dataset=sample_train_dataset,
#     eval_dataset=sample_test_dataset,
#     tokenizer=tokenizer,
#     data_collator=collate_fn,
# )

# metrics = trainer.evaluate()
# print(metrics)  # Ki·ªÉm tra xem c√≥ "eval_loss" hay kh√¥ng


In [None]:

# # B∆∞·ªõc 7: T·∫°o Trainer
# trainer = TrainerCustom(
#     model=model,
#     args=training_args,
#     train_dataset=sample_train_dataset,
#     eval_dataset=sample_test_dataset,
#     tokenizer=tokenizer,
#     data_collator = collate_fn,
# )

# # B∆∞·ªõc 8: Hu·∫•n luy·ªán
# trainer.train()

# # K·∫øt th√∫c phi√™n wandb
# wandb.finish()




In [None]:
from concurrent.futures import ThreadPoolExecutor
import wandb
import os
import shutil

class TrainerCustom(Trainer):
    """Trainer with a cross-entropy loss and background model uploads to WandB.

    Two kinds of snapshots are uploaded as WandB artifacts, both gated on the
    completed epoch count being a positive multiple of ``save_every_n_epochs``:

    * "best model": when an evaluation improves on the best ``eval`` loss seen;
    * "last model": the current weights, at most once per epoch.

    Saving and uploading run on a small thread pool so training can continue.
    ``train()`` waits for all outstanding uploads before returning, so a
    following ``wandb.finish()`` cannot cut them off.

    NOTE(review): ``save_model`` runs in a worker thread while training may
    still be updating the weights, so a snapshot is not guaranteed to be taken
    at an exact step boundary.
    """

    def __init__(self, *args, save_every_n_epochs=10, **kwargs):
        """
        Args:
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
            save_every_n_epochs: Epoch interval at which models are uploaded.
        """
        super().__init__(*args, **kwargs)
        self.best_eval_loss = float("inf")  # Best (lowest) evaluation loss seen so far
        self.save_every_n_epochs = save_every_n_epochs  # Upload interval, in epochs
        self.best_model_info = {"epoch": None, "loss": None}
        self.last_saved_epoch = 0  # Last epoch for which the "last model" was uploaded
        self.executor = ThreadPoolExecutor(max_workers=2)  # At most 2 concurrent save/upload jobs
        self._pending_saves = []  # Futures of submitted save jobs that were not awaited yet

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the cross-entropy loss for one batch.

        The "labels" entry (if any) is removed from ``inputs`` before the
        remaining entries are passed to the model as keyword arguments. The
        model output is used directly as the logits.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, otherwise ``loss``.
        """
        labels = inputs.pop("labels") if "labels" in inputs else None

        cross_entropy_loss = nn.CrossEntropyLoss()

        # Run the model; its return value is the logits tensor.
        outputs = model(**inputs)
        logits = outputs

        # Diagnostics printed before the loss call fails on a missing operand.
        if labels is None:
            print("Labels are None during compute_loss.")
        if logits is None:
            print("Logits are None during compute_loss.")

        loss = cross_entropy_loss(logits, labels)

        return (loss, outputs) if return_outputs else loss

    def _completed_epochs(self):
        """Return the whole number of epochs recorded in the trainer state (0 if unset)."""
        epoch = self.state.epoch
        return 0 if epoch is None else int(epoch)

    def _is_save_epoch(self):
        """Return True when the completed epoch count is a positive multiple of the save interval."""
        completed = self._completed_epochs()
        interval = self.save_every_n_epochs
        return bool(interval) and interval > 0 and completed > 0 and completed % interval == 0

    def async_save_model(self, model_dir, artifact_name, metadata=None):
        """
        Save the model to ``model_dir`` and upload it as a WandB artifact in the background.

        Args:
            model_dir: Temporary directory to write the model into; it is
                removed once the upload finished or failed.
            artifact_name: Name of the WandB artifact (type "model").
            metadata: Optional dict stored as the artifact metadata.
        """
        # Imported here because the file has no top-level ``import time``; without
        # it every background job would die with a NameError nobody ever sees.
        import time

        def save():
            start_time = time.time()
            try:
                self.save_model(model_dir)
                artifact = wandb.Artifact(artifact_name, type="model")
                artifact.add_dir(model_dir)
                if metadata:
                    artifact.metadata = metadata
                # Block this worker until the artifact is fully logged, so the
                # temporary directory is not deleted under a pending upload.
                wandb.log_artifact(artifact).wait()
            finally:
                # Clean up the temporary copy whether or not the upload succeeded.
                shutil.rmtree(model_dir, ignore_errors=True)
            elapsed_time = time.time() - start_time
            print(f"Model saved and uploaded to WandB: {artifact_name} in {elapsed_time:.2f} seconds")

        # Keep the future so failures can be reported and train() can wait for it.
        self._pending_saves.append(self.executor.submit(save))

    def wait_for_pending_saves(self):
        """Block until every submitted save job finished; report (do not raise) failures."""
        pending, self._pending_saves = self._pending_saves, []
        for future in pending:
            try:
                future.result()
            except Exception as exc:  # uploads are best-effort; training results stay valid
                print(f"Background model save/upload failed: {exc!r}")

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix: str = "eval"):
        """
        Run the standard evaluation, then track the best loss and trigger uploads.

        With per-epoch evaluation this is the hook that runs once per epoch, so
        the periodic "last model" upload is also triggered from here.

        Returns:
            The metrics dict returned by ``Trainer.evaluate``.
        """
        metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        # The loss key follows the prefix ("eval_loss" with the default prefix).
        eval_loss = metrics.get(f"{metric_key_prefix}_loss")

        # Record a new best model whenever the evaluation loss improves.
        if eval_loss is not None and eval_loss < self.best_eval_loss:
            print(f"New best eval_loss: {eval_loss}")
            self.best_eval_loss = eval_loss
            self.best_model_info = {"epoch": self.state.epoch, "loss": eval_loss}

            # Upload the best model only on save epochs (every N epochs).
            if self._is_save_epoch():
                completed = self._completed_epochs()
                best_model_dir = f"./tmp_best_model_epoch_{completed}"
                artifact_name = f"best_model_epoch_{completed}"
                self.async_save_model(best_model_dir, artifact_name, self.best_model_info)

        # Periodic "last model" upload; a no-op outside save epochs or when
        # this epoch was already uploaded.
        self.save_last_model()

        return metrics

    def save_last_model(self):
        """
        Upload the current ("last") model to WandB every N epochs.

        Does nothing unless the completed epoch count is a positive multiple of
        ``save_every_n_epochs`` that has not been uploaded yet.
        """
        completed = self._completed_epochs()
        if self._is_save_epoch() and completed != self.last_saved_epoch:
            print(f"Saving Last Model at epoch {self.state.epoch} to WandB...")
            last_model_dir = f"./tmp_last_model_epoch_{completed}"
            artifact_name = f"last_model_epoch_{completed}"
            self.async_save_model(last_model_dir, artifact_name)

            # Remember the epoch so the same snapshot is not uploaded twice.
            self.last_saved_epoch = completed

    def train(self, *args, **kwargs):
        """Run training, upload the final "last model" if due, and wait for all uploads."""
        result = super().train(*args, **kwargs)

        # Final snapshot (skipped if this epoch was already uploaded during evaluation).
        self.save_last_model()

        # Make sure no upload is still in flight when the caller closes the WandB run.
        self.wait_for_pending_saves()

        return result


# Step 6: Configure the training hyper-parameters
training_args = TrainingArguments(
    output_dir="./result__s",          # Directory for training outputs
    eval_strategy="epoch",    # Evaluate at the end of every epoch
    learning_rate=2e-4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=1,  # Log every training step
    save_strategy="no",          # The Trainer writes no checkpoints itself
    save_total_limit=3,  # NOTE(review): presumably unused while save_strategy="no" — confirm
    label_names = ["labels"],
    report_to="wandb",
    run_name="bert_run_3"
)


import wandb

# Start the WandB run
wandb.init(
    project="bert-intent-classification",  # Project name
    name="bert_run_3"                     # Run name
)


# NOTE(review): training and evaluation use the small sample_* subsets, not the
# full train_dataset / test_dataset — confirm this is intended.
trainer = TrainerCustom(
    model=model,
    args=training_args,
    train_dataset=sample_train_dataset,
    eval_dataset=sample_test_dataset,
    data_collator=collate_fn,
    save_every_n_epochs=10  # Upload the best model and the last model every 10 epochs
)

trainer.train()
wandb.finish()


In [None]:
# Step 9: Evaluate on the evaluation set given to the trainer
# NOTE(review): the training cell ends with wandb.finish(), so this call runs
# after the WandB run was closed — confirm that is intended.
trainer.evaluate()

# Inference

In [None]:
# Example utterance to classify
sentence = "What is the weather like today?"


# Tokenize the single sentence into PyTorch tensors (truncated to at most 512 tokens)
inputs = tokenizer(
    sentence,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512
)



In [None]:
model.eval()  # Evaluation mode (disables dropout)
# After training the model may live on the GPU while the tokenizer output is on
# the CPU, so move every input tensor to the model's device first.
device = next(model.parameters()).device
with torch.no_grad():  # No gradients are needed for inference
    outputs = model(**{name: tensor.to(device) for name, tensor in inputs.items()})
    # BERTIntentClassification.forward returns the logits tensor itself,
    # so there is no `.logits` attribute to read.
    logits = outputs
    predicted_class = torch.argmax(logits, dim=1).item()  # Index of the highest-scoring class
    print(f"Predicted class: {predicted_class}")
