In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import re
import random
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE =", DEVICE)

MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 10
LR = 1e-5
WEIGHT_DECAY = 0.01


DEVICE = cuda


In [3]:
class QAProcessorPhoBERT:
    def __init__(self, tokenizer, max_length=256):
        self.tok = tokenizer
        self.max_length = max_length

    def encode_example(self, context, question, answer):
        # 1. exact char span
        start_char = context.find(answer)
        if start_char == -1:
            return None
        end_char = start_char + len(answer)

        # 2. tokenize context but KEEP char alignment using regex split
        #    (PhoBERT's tokenize() breaks spacing, so we split manually into words)
        import re
        words = re.findall(r"\S+|\s+", context)

        tokens = []
        offsets = []
        char_idx = 0

        for w in words:
            if w.isspace():
                char_idx += len(w)
                continue

            sub_toks = self.tok.tokenize(w)
            for st in sub_toks:
                clean = st.lstrip("‚ñÅ")
                s = context.find(clean, char_idx)
                if s == -1:
                    # fallback to sequential char index
                    s = char_idx
                e = s + len(clean)
                tokens.append(st)
                offsets.append((s, e))
            char_idx = context.find(w, char_idx) + len(w)

        # map char span ‚Üí token span
        start_tok = end_tok = None
        for i, (s, e) in enumerate(offsets):
            if s <= start_char < e:
                start_tok = i
            if s < end_char <= e:
                end_tok = i

        if start_tok is None or end_tok is None:
            return None

        # encode pair
        enc = self.tok(
            question,
            context,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors=None,
            return_overflowing_tokens=False
        )

        enc.pop("token_type_ids", None)

        # question token count
        q_tokens = self.tok.tokenize(question)

        offset = 1 + len(q_tokens) + 1  # <s> Q </s>

        start_pos = start_tok + offset
        end_pos = end_tok + offset

        if end_pos >= self.max_length:
            return None

        enc["start_positions"] = start_pos
        enc["end_positions"] = end_pos
        return enc


In [4]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        self.processor = QAProcessorPhoBERT(tokenizer, max_length)
        self.features = []

        for ex in data:
            item = self.processor.encode_example(
                ex["context"], ex["question"], ex["answer"]
            )
            if item is not None:
                self.features.append(item)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feat = self.features[idx]
        return {k: torch.tensor(v) for k, v in feat.items()}


In [5]:
import re

def normalize_text(s: str) -> str:
    """Chu·∫©n h√≥a: unicode, kho·∫£ng tr·∫Øng, d·∫•u."""
    if s is None:
        return ""
    s = s.replace("‚Äì", "-").replace("‚Äî", "-")
    s = s.replace("‚Äú", "\"").replace("‚Äù", "\"")
    s = s.replace("‚Äô", "'")
    s = s.replace("‚Ä¶", "...")
    s = re.sub(r"\s+", " ", s.strip())
    return s


from difflib import SequenceMatcher

def extract_best_span(context: str, answer: str):
    """
    Tr·∫£ v·ªÅ m·ªôt ƒëo·∫°n trong context gi·ªëng answer nh·∫•t.
    N·∫øu answer ƒë√£ l√† substring c·ªßa context -> d√πng lu√¥n.
    N·∫øu kh√¥ng -> d√πng fuzzy matching.
    """
    ctx = normalize_text(context)
    ans = normalize_text(answer)

    if not ctx or not ans:
        return None

    # 1) exact match
    idx = ctx.find(ans)
    if idx != -1:
        return ctx[idx: idx + len(ans)]

    # 2) fuzzy match
    matcher = SequenceMatcher(None, ctx, ans)
    match = matcher.find_longest_match(0, len(ctx), 0, len(ans))

    # l·∫•y span trong context
    span = ctx[match.a: match.a + match.size]

    # c·∫ßn t·ªëi thi·ªÉu 2 t·ª´ ƒë·ªÉ coi nh∆∞ h·ª£p l·ªá
    if len(span.split()) >= 2:
        return span

    return None

In [6]:
def load_jsonl(path: str):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows


def clean_split(raw_list):
    """
    - Chu·∫©n h√≥a context/question/answer
    - Bi·∫øn answer th√†nh span n·∫±m trong context (n·∫øu ƒë∆∞·ª£c)
    - Lo·∫°i c√°c m·∫´u kh√¥ng t·∫°o ƒë∆∞·ª£c span
    """
    cleaned = []
    dropped = 0

    for ex in raw_list:
        ctx = normalize_text(ex.get("context", ""))
        ques = normalize_text(ex.get("question", ""))
        ans = normalize_text(ex.get("answer", ""))

        if not ctx or not ques or not ans:
            dropped += 1
            continue

        span = extract_best_span(ctx, ans)
        if span is None:
            dropped += 1
            continue

        cleaned.append({
            "context": ctx,
            "question": ques,
            "answer": span
        })

    print(f"Original={len(raw_list)}, Kept={len(cleaned)}, Dropped={dropped}")
    return cleaned


In [7]:
# ƒê∆∞·ªùng d·∫´n th∆∞ m·ª•c ch·ª©a train.jsonl / val.jsonl / test.jsonl
base_dir = "/content/drive/MyDrive/NLP/qa_splits_fixed"

train_path = f"{base_dir}/train_full.jsonl"
val_path   = f"{base_dir}/val.jsonl"
test_path  = f"{base_dir}/test.jsonl"

# 1. Load raw
train_raw = load_jsonl(train_path)
val_raw   = load_jsonl(val_path)
test_raw  = load_jsonl(test_path)

print("Raw sizes:", len(train_raw), len(val_raw), len(test_raw))

# 2. Clean -> bi·∫øn answer th√†nh span trong context
train_clean = clean_split(train_raw)
val_clean   = clean_split(val_raw)
test_clean  = clean_split(test_raw)

# 3. Tokenizer PhoBERT
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

# 4. Build Dataset
train_ds = QADataset(train_clean, tokenizer, MAX_LEN)
val_ds   = QADataset(val_clean, tokenizer, MAX_LEN)
test_ds  = QADataset(test_clean, tokenizer, MAX_LEN)

# 5. DataLoader
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)



Raw sizes: 38167 2256 2507
Original=38167, Kept=37438, Dropped=729
Original=2256, Kept=2215, Dropped=41
Original=2507, Kept=2468, Dropped=39


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [8]:
class PhoBERTForQA(nn.Module):
    def __init__(self, name="vinai/phobert-base"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(name)
        H = self.encoder.config.hidden_size
        self.qa_head = nn.Linear(H, 2)

    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = out.last_hidden_state

        logits = self.qa_head(last_hidden)  # [B, L, 2]
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        loss = None
        if start_positions is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(start_logits, start_positions) + loss_fn(end_logits, end_positions)

        return {"loss": loss, "start_logits": start_logits, "end_logits": end_logits}


In [9]:
class QATrainer:
    def __init__(self, model, train_loader, val_loader, lr=1e-5, epochs=5, weight_decay=0.01):
        self.model = model.to(DEVICE)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.epochs = epochs

        # Optimizer
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=lr,
            weight_decay=weight_decay
        )

        # Scheduler (warmup 10%)
        warmup_steps = int(0.1 * len(train_loader) * epochs)
        total_steps = len(train_loader) * epochs

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

    # ============================
    #        TRAIN EPOCH
    # ============================
    def train_epoch(self, epoch_idx):
        self.model.train()
        total_loss = 0.0

        progress = tqdm(self.train_loader, desc=f"Train Epoch {epoch_idx+1}/{self.epochs}")

        for batch in progress:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            self.optimizer.zero_grad()
            out = self.model(**batch)
            loss = out["loss"]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.optimizer.step()
            self.scheduler.step()

            total_loss += loss.item()
            avg_loss = total_loss / (progress.n + 1)

        return total_loss / len(self.train_loader)

    # ============================
    #          VALIDATION
    # ============================
    @torch.no_grad()
    def val_epoch(self, epoch_idx):
        self.model.eval()
        total_loss = 0.0

        progress = tqdm(self.val_loader, desc=f"Val Epoch {epoch_idx+1}/{self.epochs}")

        for batch in progress:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            out = self.model(**batch)
            total_loss += out["loss"].item()

        return total_loss / len(self.val_loader)

    # ============================
    #              FIT
    # ============================
    def fit(self):
        best_val = float("inf")
        best_state = None

        print("üöÄ B·∫Øt ƒë·∫ßu training PhoBERT QA...")

        for ep in range(self.epochs):
            train_loss = self.train_epoch(ep)
            val_loss = self.val_epoch(ep)

            print(f"\nEpoch {ep+1}/{self.epochs} | Train={train_loss:.4f} | Val={val_loss:.4f}")

            if val_loss < best_val:
                best_val = val_loss
                best_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}

        # Load best checkpoint
        if best_state is not None:
            self.model.load_state_dict(best_state)
            print(f"\nüî• Loaded best checkpoint (val_loss={best_val:.4f})")

        return self.model


In [11]:

model = PhoBERTForQA()
trainer = QATrainer(model, train_loader, val_loader, lr=LR, epochs=EPOCHS, weight_decay=WEIGHT_DECAY)
model = trainer.fit()

üöÄ B·∫Øt ƒë·∫ßu training PhoBERT QA...


Train Epoch 1/10:   0%|          | 0/2708 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Train Epoch 1/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [17:45<00:00,  2.54it/s]
Val Epoch 1/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.78it/s]



Epoch 1/10 | Train=5.9703 | Val=2.9989


Train Epoch 2/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 2/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.79it/s]



Epoch 2/10 | Train=2.9231 | Val=2.3983


Train Epoch 3/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 3/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.75it/s]



Epoch 3/10 | Train=2.2375 | Val=2.3205


Train Epoch 4/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 4/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:20<00:00,  8.74it/s]



Epoch 4/10 | Train=1.8362 | Val=2.3325


Train Epoch 5/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:02<00:00,  2.50it/s]
Val Epoch 5/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.77it/s]



Epoch 5/10 | Train=1.5195 | Val=2.4106


Train Epoch 6/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 6/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.77it/s]



Epoch 6/10 | Train=1.2895 | Val=2.7029


Train Epoch 7/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:04<00:00,  2.50it/s]
Val Epoch 7/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:20<00:00,  8.74it/s]



Epoch 7/10 | Train=1.1065 | Val=2.8249


Train Epoch 8/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 8/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.75it/s]



Epoch 8/10 | Train=0.9603 | Val=2.8991


Train Epoch 9/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 9/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:19<00:00,  8.76it/s]



Epoch 9/10 | Train=0.8544 | Val=3.0625


Train Epoch 10/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2708/2708 [18:03<00:00,  2.50it/s]
Val Epoch 10/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [00:20<00:00,  8.74it/s]



Epoch 10/10 | Train=0.7767 | Val=3.1512

üî• Loaded best checkpoint (val_loss=2.3205)


In [12]:
class ExtractiveQAModel:
    def __init__(self, model, tokenizer, max_length=256, top_k=8, max_answer_len=32):
        self.model = model
        self.tok = tokenizer
        self.max_length = max_length
        self.top_k = top_k
        self.max_answer_len = max_answer_len

    @torch.no_grad()
    def predict_span(self, ctx, ques):
        self.model.eval()

        enc = self.tok(
            ques, ctx,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        enc.pop("token_type_ids", None)
        enc = {k: v.to(DEVICE) for k, v in enc.items()}

        out = self.model(**enc)
        s_log = out["start_logits"][0]
        e_log = out["end_logits"][0]

        s_top = torch.topk(s_log, self.top_k)
        e_top = torch.topk(e_log, self.top_k)

        best = (-1e10, 0, 0)

        for i, s_idx in enumerate(s_top.indices):
            for j, e_idx in enumerate(e_top.indices):
                s = s_idx.item()
                e = e_idx.item()

                if e < s: continue
                if (e - s + 1) > self.max_answer_len: continue

                score = s_top.values[i] + e_top.values[j]
                if score > best[0]:
                    best = (score, s, e)

        _, s, e = best
        ids = enc["input_ids"][0][s:e+1]
        return self.tok.decode(ids, skip_special_tokens=True)


In [13]:
def normalize(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def em(pred, gold):
    return int(normalize(pred) == normalize(gold))

def f1(pred, gold):
    pt = normalize(pred).split()
    gt = normalize(gold).split()
    if len(pt) == 0 or len(gt) == 0: return 0
    c = Counter(pt) & Counter(gt)
    num_same = sum(c.values())
    if num_same == 0: return 0
    precision = num_same / len(pt)
    recall = num_same / len(gt)
    return 2 * precision * recall / (precision + recall)

def evaluate_extractive(model, data):
    EM, F1 = [], []
    for ex in tqdm(data, desc="Eval"):
        pred = model.predict_span(ex["context"], ex["question"])
        EM.append(em(pred, ex["answer"]))
        F1.append(f1(pred, ex["answer"]))
    return np.mean(EM), np.mean(F1)


In [14]:
from collections import Counter

extractive = ExtractiveQAModel(model, tokenizer)
EM, F1 = evaluate_extractive(extractive, test_clean)



Eval:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 1699/2468 [00:32<00:15, 49.01it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Eval:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 2007/2468 [00:37<00:08, 54.07it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Eval:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 2289/2468 [00:43<00:03, 53.72it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Eval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2468/2468 [00:46<00:00, 52.63it/s]


In [15]:
print("PHOBERT FINAL ‚Äî EM:", EM, "F1:", F1)

PHOBERT FINAL ‚Äî EM: 0.023095623987034037 F1: 0.5465792930594179


In [53]:
import random

def test_random_samples(extractive_model, dataset, num_samples=5):
    print(f"\nüîç Testing {num_samples} random samples...\n")
    samples = random.sample(dataset, num_samples)

    for i, ex in enumerate(samples):
        ctx   = ex["context"]
        ques  = ex["question"]
        gold  = ex["answer"]

        pred = extractive_model.predict_span(ctx, ques)

        print(f"===== SAMPLE {i+1} =====")
        print("Context:", ctx)
        print("Q:", ques)
        print("Pred:", pred)
        print("Gold:", gold)
        print()
test_random_samples(extractive, test_clean, num_samples=5)



üîç Testing 5 random samples...

===== SAMPLE 1 =====
Context: T·ªëng VƒÉn BƒÉng l√† ƒë·∫£ng vi√™n ƒê·∫£ng C·ªông s·∫£n Vi·ªát Nam, h·ªçc v·ªã C·ª≠ nh√¢n ti·∫øng Anh, Th·∫°c sƒ© Lu·∫≠t Qu·ªëc t·∫ø, Cao c·∫•p l√Ω lu·∫≠n ch√≠nh tr·ªã. √îng c√≥ kh·ªüi ƒë·∫ßu t·ª´ ngh·ªÅ gi√°o tr∆∞·ªùng ƒë·∫°i h·ªçc r·ªìi chuy·ªÉn sang ng√†nh c√¥ng ƒëo√†n Vi·ªát Nam.
Q: Kh·ªüi ƒë·∫ßu s·ª± nghi·ªáp c·ªßa T·ªëng VƒÉn BƒÉng l√† g√¨?
Pred: . √îng c√≥ kh·ªüi ƒë·∫ßu t·ª´ ngh·ªÅ gi√°o tr∆∞·ªùng ƒë·∫°i
Gold: √îng c√≥ kh·ªüi ƒë·∫ßu t·ª´ ngh·ªÅ gi√°o tr∆∞·ªùng ƒë·∫°i h·ªçc

===== SAMPLE 2 =====
Context: Th√°ng 1 nƒÉm 2007, Qu·ªëc h·ªôi Nh·∫≠t B·∫£n th√¥ng qua quy·∫øt ƒë·ªãnh th√†nh l·∫≠p B·ªô qu·ªëc ph√≤ng tr√™n c∆° s·ªü C·ª•c Ph√≤ng v·ªá tr∆∞·ªõc ƒë√¢y, l√† b∆∞·ªõc ti·∫øn m·ªõi theo h∆∞·ªõng gi·∫£i th√≠ch l·∫°i hi·∫øn ph√°p v√† cho ph√©p Nh·∫≠t ph√≤ng th·ªß t·∫≠p th·ªÉ, ƒë∆∞a qu√¢n ra n∆∞·ªõc ngo√†i trong c√°c chi·∫øn d·ªãch gi·ªØ g√¨n ho√† b√¨nh, gi·∫£i quy·∫øt c√°c xung ƒë·ªôt khu v·ª±c.
Q: Nh·∫≠t B·∫£n th√¥ng 

In [54]:
import os
save_path = "/content/drive/MyDrive/NLP/phobert-qa-extend"
os.makedirs(save_path, exist_ok=True)

# 1Ô∏è‚É£ L∆∞u tr·ªçng s·ªë
torch.save(model.state_dict(), f"{save_path}/pytorch_model.bin")

# 2Ô∏è‚É£ L∆∞u c·∫•u h√¨nh encoder + th√¥ng s·ªë QA head
config_dict = {
    "encoder_config": model.encoder.config.to_dict(),
    "hidden_size": model.encoder.config.hidden_size,
    "max_length": MAX_LEN,
}
with open(f"{save_path}/qa_config.json", "w") as f:
    json.dump(config_dict, f, indent=2)

# 3Ô∏è‚É£ L∆∞u tokenizer
tokenizer.save_pretrained(save_path)

print("Saved model to", save_path)


Saved model to /content/drive/MyDrive/NLP/phobert-qa-extend
