In [1]:
!pip install transformers datasets seqeval accelerate evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=f3feb1f86ac23a57bf9d113f59be43652245df56b22d448553abf6960f25944f
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.6 seqeval-1.2.2


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from datasets import Dataset, load_dataset
from evaluate import load
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
import evaluate

In [11]:
!ls /content/drive/MyDrive/CNLTHD/

test.json  train.json  Untitled  validation.json


In [14]:
data_files = {
    "train": "/content/drive/MyDrive/CNLTHD/train.json",
    "validation": "/content/drive/MyDrive/CNLTHD/validation.json",
    "test": "/content/drive/MyDrive/CNLTHD/test.json",
}

In [15]:
datasets = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [16]:
labels = sorted(list({label for row in datasets["train"] for label in row["ner_tags"]}))
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

In [17]:
print("Labels:", labels)

Labels: ['B-CONTRACT_VALUE', 'B-EFFECTIVE_DATE', 'B-PARTY', 'I-CONTRACT_VALUE', 'I-EFFECTIVE_DATE', 'I-PARTY', 'O']


In [18]:
model_name = "xlm-roberta-base"  # hoặc "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = []
    for i, labels_per_example in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(label2id[labels_per_example[word_idx]])
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, remove_columns=datasets["train"].column_names)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [20]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

In [23]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/CNLTHD/ner_contract_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/CNLTHD/logs",
    logging_strategy="epoch",
    fp16=True,  # Dùng GPU hỗ trợ
    report_to = "none"
)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0029,0.000211,1.0,1.0,1.0,1.0


TrainOutput(global_step=120, training_loss=0.0029487999776999156, metrics={'train_runtime': 495.9155, 'train_samples_per_second': 1.936, 'train_steps_per_second': 0.242, 'total_flos': 8206722231840.0, 'train_loss': 0.0029487999776999156, 'epoch': 1.0})

In [29]:
results = trainer.evaluate(tokenized_datasets["test"])
print(results)



{'eval_loss': 0.00021442430443130434, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 13.4203, 'eval_samples_per_second': 8.942, 'eval_steps_per_second': 1.118, 'epoch': 1.0}


In [30]:
from transformers import pipeline
ner = pipeline("ner", model=model, tokenizer="xlm-roberta-base", aggregation_strategy="simple")
text = "Hợp đồng có hiệu lực từ ngày 01/01/2024 giữa Công ty A và Công ty B với giá trị 50 triệu đồng."
print(ner(text))

Device set to use cpu


[{'entity_group': 'EFFECTIVE_DATE', 'score': np.float32(0.9993724), 'word': '01', 'start': 29, 'end': 31}, {'entity_group': 'EFFECTIVE_DATE', 'score': np.float32(0.9993449), 'word': '/01/', 'start': 31, 'end': 35}, {'entity_group': 'EFFECTIVE_DATE', 'score': np.float32(0.99906594), 'word': '20', 'start': 35, 'end': 37}, {'entity_group': 'EFFECTIVE_DATE', 'score': np.float32(0.9992632), 'word': '24', 'start': 37, 'end': 39}, {'entity_group': 'PARTY', 'score': np.float32(0.99940515), 'word': 'Công ty A', 'start': 45, 'end': 54}, {'entity_group': 'PARTY', 'score': np.float32(0.9992799), 'word': 'Công ty B', 'start': 58, 'end': 67}, {'entity_group': 'CONTRACT_VALUE', 'score': np.float32(0.9984477), 'word': '50 triệu đồng', 'start': 80, 'end': 93}]


In [31]:
ner_results = ner(text)
grouped_entities = []
current_group = None

for item in ner_results:
    entity_group = item['entity_group']
    word = item['word'].strip() # Loại bỏ khoảng trắng thừa nếu có

    # Nếu đây là thực thể đầu tiên hoặc thực thể này khác loại
    # hoặc bị cách biệt (start position không tiếp nối end position của thực thể trước)
    if (current_group is None or
        entity_group != current_group['entity_group'] or
        item['start'] > current_group['end']):

        # Lưu lại thực thể đã hoàn thành (nếu có)
        if current_group is not None:
            grouped_entities.append(current_group)

        # Bắt đầu một nhóm mới
        current_group = {
            'entity_group': entity_group,
            'word': word,
            'start': item['start'],
            'end': item['end']
        }
    else:
        # Nếu cùng nhóm và tiếp nối, thì gộp từ và cập nhật vị trí kết thúc
        current_group['word'] += word
        current_group['end'] = item['end']

# Thêm nhóm cuối cùng sau khi lặp xong
if current_group is not None:
    grouped_entities.append(current_group)

# -----------------------------------------------
# ĐỊNH DẠNG KẾT QUẢ CUỐI CÙNG
# -----------------------------------------------

final_results = []
for entity in grouped_entities:
    final_results.append({
        'entity_group': entity['entity_group'],
        'word': entity['word'].replace('##', '') # Loại bỏ ký tự sub-word nếu có
    })

# In ra kết quả đã gộp
print(final_results)

[{'entity_group': 'EFFECTIVE_DATE', 'word': '01/01/2024'}, {'entity_group': 'PARTY', 'word': 'Công ty A'}, {'entity_group': 'PARTY', 'word': 'Công ty B'}, {'entity_group': 'CONTRACT_VALUE', 'word': '50 triệu đồng'}]


In [32]:
trainer.save_model("/content/drive/MyDrive/CNLTHD/ner_contract_model")