In [41]:
!pip install torch transformers evaluate seqeval datasets



In [42]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments 
from torch.utils.data import Dataset
import evaluate
from collections import Counter
from random import sample, choice
from transformers import RobertaForTokenClassification, RobertaConfig
import torch.nn as nn
from collections import Counter
from transformers import EarlyStoppingCallback
import os

In [None]:
def read_bio_file(file_path):
    sentences = []
    current_sentence = {'tokens': [], 'ner_labels': []}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) >= 4:
                    # Tách các từ ghép bằng dấu _
                    token = parts[0].replace('_', ' ')
                    pos, chunk, ner = parts[1], parts[2], parts[3]
                    current_sentence['tokens'].append(token)
                    current_sentence['ner_labels'].append(ner)
            else:
                if current_sentence['tokens']:
                    sentences.append(current_sentence)
                    current_sentence = {'tokens': [], 'ner_labels': []}
    if current_sentence['tokens']:
        sentences.append(current_sentence)
    return sentences


In [48]:
# Đọc dữ liệu
train_data = read_bio_file('/kaggle/input/dataset-dev-test-train/train_data.txt')
dev_data = read_bio_file('/kaggle/input/dataset-dev-test-train/dev_data.txt')
test_data = read_bio_file('/kaggle/input/dataset-dev-test-train/test_data.txt')
# Sau khi gọi read_bio_file
# print("Train data sample (first 2 sentences, after normalization):", train_data[:2])
# print("Train data sample (first 2 sentences):", train_data[:2])

# print("Unique labels in train data:", unique_labels)
# print("Label2id mapping:", label2id)
# # In 2 mẫu cuối để kiểm tra
# print_last_two_samples(train_data, "train_data")
# print_last_two_samples(dev_data, "dev_data")
# print_last_two_samples(test_data, "test_data")

In [64]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
print("Is fast tokenizer:", tokenizer.is_fast)
unique_labels = set(label for sentence in train_data for label in sentence['ner_labels'])
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print("Mapping NER labels to IDs:", label2id)

Is fast tokenizer: False
Mapping NER labels to IDs: {'I-ĐT': 0, 'B-NG': 1, 'B-VBPL': 2, 'I-VBPL': 3, 'O': 4, 'I-NG': 5, 'B-SL': 6, 'B-CQ': 7, 'I-SL': 8, 'I-CQ': 9, 'B-ĐT': 10}


In [66]:
unique_labels = ['O', 'B-SL', 'I-SL', 'B-CQ', 'I-CQ', 'B-ĐT', 'I-ĐT', 'B-VBPL', 'I-VBPL', 'B-NG', 'I-NG']
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'O': 0, 'B-SL': 1, 'I-SL': 2, 'B-CQ': 3, 'I-CQ': 4, 'B-ĐT': 5, 'I-ĐT': 6, 'B-VBPL': 7, 'I-VBPL': 8, 'B-NG': 9, 'I-NG': 10}
id2label: {0: 'O', 1: 'B-SL', 2: 'I-SL', 3: 'B-CQ', 4: 'I-CQ', 5: 'B-ĐT', 6: 'I-ĐT', 7: 'B-VBPL', 8: 'I-VBPL', 9: 'B-NG', 10: 'I-NG'}


In [74]:
def convert_to_features(sentences, max_len=128):
    input_ids, attention_masks, label_ids = [], [], []
    for sentence in sentences:
        tokens = sentence['tokens']  # Dữ liệu đã được chuẩn hóa
        labels = sentence['ner_labels']
        if not tokens or not labels:
            print(f"Warning: Empty sentence found - Tokens: {tokens}, Labels: {labels}")
            continue
        
        # Chuẩn hóa tokens: thêm khoảng trắng giữa các từ
        tokenized_input = " ".join(tokens)
        
        # Mã hóa tokens bằng PhoBERT
        encoding = tokenizer(tokenized_input, return_tensors='pt', max_length=max_len, padding='max_length', truncation=True)
        attention_mask = encoding['attention_mask'][0]
        input_ids_batch = encoding['input_ids'][0]
        
        # Tạo label_encoding với -100
        label_encoding = [-100] * max_len
        
        # Lấy input_tokens
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids_batch)
        
        # Tính word_ids bằng cách so sánh input_tokens với tokens
        word_ids = []
        token_idx = 0
        for i, token in enumerate(input_tokens):
            if token in ['<s>', '</s>', '[PAD]']:
                word_ids.append(None)
                continue
            if token_idx >= len(tokens):  # Dừng lại nếu đã ánh xạ hết tokens
                word_ids.append(None)
                continue
            # Loại bỏ ký tự đặc biệt (nếu có) và so sánh với token gốc
            token_clean = token.replace('@@', '')
            # Tìm token gốc tương ứng
            found = False
            for j in range(token_idx, len(tokens)):
                if token_clean.lower() in tokens[j].lower():
                    word_ids.append(j)
                    token_idx = j + 1
                    found = True
                    break
            if not found:
                word_ids.append(None)
        
        # Ánh xạ nhãn
        for j, word_id in enumerate(word_ids):
            if word_id is not None and word_id < len(labels):
                if labels[word_id] not in label2id:
                    print(f"Warning: Unknown label '{labels[word_id]}' at sentence: {tokens}")
                    label_encoding[j] = -100
                else:
                    label_encoding[j] = label2id[labels[word_id]]
        
        input_ids.append(input_ids_batch)
        attention_masks.append(attention_mask)
        label_ids.append(torch.tensor(label_encoding))
    
    if not input_ids:
        raise ValueError("No valid data to convert!")
    
    # In thông tin để kiểm tra
    print("Sample tokens (first sentence):", sentences[0]['tokens'])
    print("Sample labels (first sentence):", sentences[0]['ner_labels'])
    print("Sample input_tokens (first sentence):", input_tokens)
    print("Sample word_ids (first sentence):", word_ids)
    print("Sample label_ids (first sentence):", label_ids[0].tolist())
    return torch.stack(input_ids), torch.stack(attention_masks), torch.stack(label_ids)

In [75]:
train_inputs, train_masks, train_labels = convert_to_features(train_data, max_len=128)
dev_inputs, dev_masks, dev_labels = convert_to_features(dev_data, max_len=128)
test_inputs, test_masks, test_labels = convert_to_features(test_data, max_len=128)

# print("Train inputs shape:", train_inputs.shape)
# print("Train labels shape:", train_labels.shape)
# print("Sample train_labels (first 3 sentences):")
# for i in range(min(3, len(train_labels))):
# print(f"Sentence {i}: {train_labels[i].tolist()}")
# print("Dev inputs shape:", dev_inputs.shape)
# print("Dev labels shape:", dev_labels.shape)

Sample tokens (first sentence): ['0', 'đồng', 'c', 'Phạm tội', 'thuộc', 'trường hợp', 'quy định', 'tại', 'khoản', '3', 'Điều', 'này', ',', 'thì', 'bị', 'phạt', 'tiền', 'từ', '1', '.']
Sample labels (first sentence): ['B-SL', 'I-SL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sample input_tokens (first sentence): ['<s>', 'Cấp', 'điều', 'chỉnh', ',', 'bổ', 'sung', 'nội', 'dung', 'chứng', 'chỉ', 'năng', 'lực', 'hoạt', 'động', 'xây', 'dựng', 'hạng', 'II', ',', 'hạng', 'III', 'do', 'cấp', 'Cấp', 'Tỉnh', 'thực', 'hiện', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<

In [18]:
# Dataset class
class NERDataset(Dataset):
    def __init__(self, input_ids, attention_masks, label_ids):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.label_ids = label_ids
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx], 'attention_mask': self.attention_masks[idx], 'labels': self.label_ids[idx]}

# Tạo dataset
train_dataset = NERDataset(train_inputs, train_masks, train_labels)
dev_dataset = NERDataset(dev_inputs, dev_masks, dev_labels)
test_dataset = NERDataset(test_inputs, test_masks, test_labels)

In [69]:
unique_labels = set(label for sentence in train_data for label in sentence['ner_labels'])
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
print("Mapping NER labels to IDs:", label2id)
print("id2label:", id2label)

Mapping NER labels to IDs: {'I-ĐT': 0, 'B-NG': 1, 'B-VBPL': 2, 'I-VBPL': 3, 'O': 4, 'I-NG': 5, 'B-SL': 6, 'B-CQ': 7, 'I-SL': 8, 'I-CQ': 9, 'B-ĐT': 10}
id2label: {0: 'I-ĐT', 1: 'B-NG', 2: 'B-VBPL', 3: 'I-VBPL', 4: 'O', 5: 'I-NG', 6: 'B-SL', 7: 'B-CQ', 8: 'I-SL', 9: 'I-CQ', 10: 'B-ĐT'}


In [19]:

all_labels = [label for sentence in train_data for label in sentence['ner_labels']]
label_counts = Counter(all_labels)
total_labels = sum(label_counts.values())
print("Phân bố nhãn trong dữ liệu huấn luyện:")
for label, count in label_counts.items():
    print(f"{label}: {count} ({count/total_labels*100:.2f}%)")

Phân bố nhãn trong dữ liệu huấn luyện:
B-SL: 11060 (0.34%)
I-SL: 12798 (0.40%)
O: 2890348 (89.70%)
B-CQ: 48418 (1.50%)
B-ĐT: 52500 (1.63%)
B-VBPL: 23827 (0.74%)
I-VBPL: 78787 (2.45%)
B-NG: 27642 (0.86%)
I-NG: 27639 (0.86%)
I-CQ: 12915 (0.40%)
I-ĐT: 36200 (1.12%)


In [20]:
# Thiết lập để giảm fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Giải phóng bộ nhớ GPU
torch.cuda.empty_cache()
print("Memory allocated before training:", torch.cuda.memory_allocated() / 1024**3, "GiB")
print("Memory reserved before training:", torch.cuda.memory_reserved() / 1024**3, "GiB")

Memory allocated before training: 0.0 GiB
Memory reserved before training: 0.0 GiB


In [21]:
# Oversampling và undersampling
only_o_sentences = [s for s in train_data if all(label == 'O' for label in s['ner_labels'])]
entity_sentences = [s for s in train_data if any(label != 'O' for label in s['ner_labels'])]
dt_sentences = [s for s in entity_sentences if any(label in ['I-ĐT', 'B-ĐT'] for label in s['ner_labels'])]
reduced_o_sentences = sample(only_o_sentences, int(0.1 * len(only_o_sentences)))
oversampled_entity_sentences = [choice(entity_sentences) for _ in range(2 * len(entity_sentences))]
oversampled_dt_sentences = [choice(dt_sentences) for _ in range(2 * len(dt_sentences))]
balanced_train_data = reduced_o_sentences + oversampled_entity_sentences + oversampled_dt_sentences

# Cập nhật dataset
train_inputs, train_masks, train_labels = convert_to_features(balanced_train_data)
train_dataset = NERDataset(train_inputs, train_masks, train_labels)

Sample label_ids (first sentence): [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [22]:

class CustomPhoBERTForTokenClassification(RobertaForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        label_counts = Counter([label for sentence in balanced_train_data for label in sentence['ner_labels']])
        total = sum(label_counts.values())
        class_weights = torch.tensor([total / (len(label2id) * label_counts.get(label, 1)) * (20 if label in ['I-ĐT', 'B-ĐT'] else 10 if label != 'O' else 1) for label in label2id], dtype=torch.float)
        self.register_buffer('class_weights', class_weights)
        print("Class weights:", self.class_weights)

In [23]:
def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits) if loss is not None else logits

In [24]:
config = RobertaConfig.from_pretrained("vinai/phobert-base-v2")
config.num_labels = len(label2id)
config.hidden_dropout_prob = 0.2  # Tùy chọn: điều chỉnh dropout mặc định
config.attention_probs_dropout_prob = 0.2  # Tùy chọn: điều chỉnh dropout mặc định

model = CustomPhoBERTForTokenClassification.from_pretrained(
    "vinai/phobert-base-v2",
    config=config
)
model.train()

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Class weights: tensor([7.4158e+01, 7.4889e+01, 8.5020e+01, 2.6256e+01, 1.0908e-01, 7.4904e+01,
        1.9303e+02, 4.3297e+01, 1.6164e+02, 1.6773e+02, 5.0882e+01])


Some weights of CustomPhoBERTForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['class_weights', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomPhoBERTForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
      

In [25]:
# Metric đánh giá
metric = evaluate.load("seqeval")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    results = metric.compute(predictions=pred_labels, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Thiết lập huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    # eval_steps=5000,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=1000,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.1,
    logging_dir='./logs',
    save_strategy="epoch",
    # save_setps=5000,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    logging_steps=500,  # In loss mỗi 500 bước
    max_grad_norm=1.0,
    fp16=True,
    dataloader_num_workers=2,
    dataloader_pin_memory=False,
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Huấn luyện và đánh giá
trainer.train()
trainer.save_model("/kaggle/working/results/final_model")
tokenizer.save_pretrained("/kaggle/working/results/final_model")
torch.save(training_args, "/kaggle/working/results/final_model/training_args.bin")
test_results = trainer.evaluate(test_dataset)
print("Test results:", test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0344,0.070374,0.833004,0.878643,0.855215,0.986096
2,0.0054,0.073409,0.820818,0.868078,0.843787,0.986641
3,0.0023,0.068509,0.831307,0.878093,0.85406,0.988354


Test results: {'eval_loss': 0.07566389441490173, 'eval_precision': 0.7816128020081582, 'eval_recall': 0.8527901403628895, 'eval_f1': 0.8156516044531763, 'eval_accuracy': 0.9845376654018292, 'eval_runtime': 55.9405, 'eval_samples_per_second': 183.731, 'eval_steps_per_second': 1.448, 'epoch': 3.0}


In [27]:
from seqeval.metrics import classification_report
predictions, labels, _ = trainer.predict(test_dataset)
predictions, labels, _ = trainer.predict(dev_dataset)
pred_labels = np.argmax(predictions, axis=-1)
true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
pred_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(pred_labels, labels)]
print(classification_report(true_labels, pred_labels))

from collections import Counter
true_counts = Counter([l for sublist in true_labels for l in sublist])
pred_counts = Counter([l for sublist in pred_labels for l in sublist])
print("True label counts:", true_counts)
print("Predicted label counts:", pred_counts)

              precision    recall  f1-score   support

          CQ       0.89      0.92      0.91     19516
          NG       0.97      1.00      0.99       481
          SL       0.97      1.00      0.98       496
        VBPL       1.00      1.00      1.00        44
          ĐT       0.60      0.69      0.64      4925

   micro avg       0.83      0.88      0.86     25462
   macro avg       0.89      0.92      0.90     25462
weighted avg       0.84      0.88      0.86     25462

True label counts: Counter({'O': 1148544, 'I-SL': 63488, 'I-NG': 61568, 'B-CQ': 19456, 'I-ĐT': 7808, 'I-CQ': 7680, 'I-VBPL': 5632, 'B-ĐT': 4864})
Predicted label counts: Counter({'O': 1141100, 'I-SL': 63578, 'I-NG': 63232, 'B-CQ': 20095, 'I-ĐT': 11527, 'I-CQ': 8372, 'I-VBPL': 5632, 'B-ĐT': 5504})


In [30]:
# Đọc dữ liệu test
test_data = read_bio_file('/kaggle/input/dataset-dev-test-train/test_data.txt')

# Tiền xử lý dữ liệu test
test_inputs, test_masks, test_labels = convert_to_features(test_data)
test_dataset = NERDataset(test_inputs, test_masks, test_labels)

Sample label_ids (first sentence): [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


In [31]:
from transformers import RobertaForTokenClassification, AutoTokenizer, Trainer, TrainingArguments

# Định nghĩa lại lớp mô hình
class CustomPhoBERTForTokenClassification(RobertaForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        label_counts = Counter([label for sentence in balanced_train_data for label in sentence['ner_labels']])
        total = sum(label_counts.values())
        class_weights = torch.tensor([total / (len(label2id) * label_counts.get(label, 1)) * (20 if label in ['I-ĐT', 'B-ĐT'] else 10 if label != 'O' else 1) for label in label2id], dtype=torch.float)
        self.register_buffer('class_weights', class_weights)
    
    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits) if loss is not None else logits

# Tải mô hình và tokenizer
model_path = "/kaggle/working/results/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = CustomPhoBERTForTokenClassification.from_pretrained(model_path, local_files_only=True)
model.eval()

CustomPhoBERTForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
      

In [32]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    results = metric.compute(predictions=pred_labels, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [33]:
# Định nghĩa lại TrainingArguments (chỉ cần để đánh giá, không huấn luyện)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=128,
    fp16=True,
    dataloader_num_workers=4,
    report_to="none",
)

# Tạo Trainer để đánh giá
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Đánh giá trên tập test
test_results = trainer.evaluate(test_dataset)
print("Test results:", test_results)

Test results: {'eval_loss': 7.501443386077881, 'eval_model_preparation_time': 0.0033, 'eval_precision': 9.227475731738825e-06, 'eval_recall': 3.803872342044201e-05, 'eval_f1': 1.485211011354438e-05, 'eval_accuracy': 0.0023320441720179022, 'eval_runtime': 58.8444, 'eval_samples_per_second': 174.664, 'eval_steps_per_second': 1.377}


In [None]:
from collections import Counter

# Dự đoán trên tập test
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=-1)
true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
pred_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
results = metric.compute(predictions=pred_labels, references=true_labels, zero_division=0)
print("\nDetailed Test Results:")
for key, value in results.items():
    print(f"{key}: {value}")

true_counts = Counter([label for sentence in true_labels for label in sentence])
pred_counts = Counter([label for sentence in pred_labels for label in sentence])
print("\nTrue label counts:", true_counts)
print("Predicted label counts:", pred_counts)

In [35]:
dev_results = trainer.evaluate(dev_dataset)
print("Dev results:", dev_results)

Dev results: {'eval_loss': 0.2277332842350006, 'eval_model_preparation_time': 0.0033, 'eval_precision': 0.9350668800838692, 'eval_recall': 0.9619906974952572, 'eval_f1': 0.9483377318726384, 'eval_accuracy': 0.986095948568656, 'eval_runtime': 62.6519, 'eval_samples_per_second': 164.48, 'eval_steps_per_second': 1.293}


In [36]:
print(model.classifier.weight.data)

tensor([[-0.0203,  0.0399, -0.0246,  ..., -0.0367, -0.0092,  0.0497],
        [ 0.0265, -0.0450,  0.0081,  ..., -0.0038, -0.0024, -0.0053],
        [ 0.0243, -0.0044, -0.0061,  ...,  0.0212, -0.0468, -0.0015],
        ...,
        [ 0.0554, -0.0478, -0.0294,  ...,  0.0069, -0.0335,  0.0336],
        [-0.0003,  0.0242, -0.0268,  ...,  0.0210, -0.0036,  0.0217],
        [ 0.0441,  0.0248, -0.0114,  ..., -0.0440, -0.0191,  0.0306]],
       device='cuda:0')


In [76]:
# Lấy mẫu từ tập test
sample_sentence = test_data[0]

# Đảm bảo input_ids và attention_mask có batch dimension
input_ids, attention_mask, _ = convert_to_features([sample_sentence], max_len=128)
print("input_ids shape:", input_ids.shape)  # Phải là [1, 128]
print("attention_mask shape:", attention_mask.shape)  # Phải là [1, 128]

# Đưa dữ liệu lên GPU
input_ids = input_ids.to('cuda')
attention_mask = attention_mask.to('cuda')

# Dự đoán
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask)[0]
    print("logits shape:", logits.shape)  # Phải là [1, 128, 11]
    if len(logits.shape) == 2:  # Nếu logits có shape [128, 11]
        logits = logits.unsqueeze(0)  # Thêm batch dimension: [1, 128, 11]
        print("Adjusted logits shape:", logits.shape)
    predictions = torch.argmax(logits, dim=-1)
    print("predictions shape:", predictions.shape)  # Phải là [1, 128]

# Chuyển đổi dự đoán thành nhãn
pred_labels = [id2label[p.item()] for p in predictions[0] if p != -100]
print("True labels:", sample_sentence['ner_labels'])
print("Predicted labels:", pred_labels)

Sample tokens (first sentence): ['Cấp', 'lại', 'Giấy', 'chứng nhận', 'huấn luyện', 'kỹ thuật', 'an toàn', 'vật liệu', 'nổ', 'công nghiệp', 'thuộc', 'thẩm quyền', 'giải quyết', 'của', 'sở công thương', 'thực hiện', 'Trực tiếp', '3', 'Ngày', 'Dịch vụ', 'bưu chính', '3', 'Ngày']
Sample labels (first sentence): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CQ', 'O', 'O', 'B-NG', 'I-NG', 'O', 'O', 'B-NG', 'I-NG']
Sample input_tokens (first sentence): ['<s>', 'Cấp', 'lại', 'Giấy', 'chứng', 'nhận', 'huấn', 'luyện', 'kỹ', 'thuật', 'an', 'toàn', 'vật', 'liệu', 'nổ', 'công', 'nghiệp', 'thuộc', 'thẩm', 'quyền', 'giải', 'quyết', 'của', 'sở', 'công', 'thương', 'thực', 'hiện', 'Trực', 'tiếp', '3', 'Ngày', 'Dịch', 'vụ', 'b@@', 'ưu', 'chính', '3', 'Ngày', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'