In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict

dataset = load_dataset('json', data_files='data/train_data.json')
model_checkpoint = 'VietAI/vit5-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_length=256)

train_size = int(0.9 * len(dataset['train']))
dataset['train'].shuffle()
train_dataset = dataset['train'].select(range(train_size))
val_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))

raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Intent', 'ner_labels'],
        num_rows: 2245
    })
    validation: Dataset({
        features: ['Tokens', 'Intent', 'ner_labels'],
        num_rows: 250
    })
})

In [2]:
all_labels = ['O', 'B-balcony_direction','I-balcony_direction','B-city','I-city','B-district','I-district','B-house_direction','I-house_direction', 'B-legal','I-legal', 'B-max_acreage','I-max_acreage', 'B-max_price','I-max_price','B-min_acreage','I-min_acreage','B-min_price','I-min_price', 'B-type_of_land','I-type_of_land']

def create_ner_tags(examples):
    
    ner_tags = [[all_labels.index(label) for label in labels] for labels in examples["ner_labels"]]

    return {"ner_tags": ner_tags}
    

    

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

raw_datasets = raw_datasets.map(create_ner_tags, batched=True)

labels = raw_datasets["train"][45]["ner_tags"]
inputs = tokenizer(raw_datasets["train"][45]["Tokens"], is_split_into_words=True)
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 19, 20, 20, 20, 0, 5, 6, 0, 0, 3, 4, 0, 0, 0, 0, 1, 0, 0]
[0, 19, 20, 20, 20, 0, 5, 6, 0, 0, 3, 4, 0, 0, 0, 0, 1, 0, 0, -100]


In [3]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["Tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
tokenized_datasets

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2245
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 250
    })
})

In [4]:
from transformers import DataCollatorForTokenClassification
import evaluate

metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[   0,    0,    0,    0,    0,    3,    4,    0,    7,    0,    0,   17,
           18,    0,   13,   14, -100],
        [  19,   20,    0,    0,    0,    3,    4,    0,    7,    0,    0,    9,
           10,   10,    0, -100, -100]])

In [5]:
import numpy as np
from transformers import AutoModelForTokenClassification

id2label = {i: label for i, label in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[all_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at VietAI/vit5-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    "ViT5-real-estate-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  0%|          | 0/843 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4586713910102844, 'eval_precision': 0.9241573033707865, 'eval_recall': 0.996969696969697, 'eval_f1': 0.9591836734693877, 'eval_accuracy': 0.9877712031558186, 'eval_runtime': 3.6307, 'eval_samples_per_second': 68.858, 'eval_steps_per_second': 8.814, 'epoch': 1.0}
{'loss': 1.568, 'grad_norm': 24.983171463012695, 'learning_rate': 8.137603795966786e-06, 'epoch': 1.78}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5133143663406372, 'eval_precision': 0.9189944134078212, 'eval_recall': 0.996969696969697, 'eval_f1': 0.9563953488372093, 'eval_accuracy': 0.9881656804733728, 'eval_runtime': 3.4152, 'eval_samples_per_second': 73.202, 'eval_steps_per_second': 9.37, 'epoch': 2.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4648110568523407, 'eval_precision': 0.9189944134078212, 'eval_recall': 0.996969696969697, 'eval_f1': 0.9563953488372093, 'eval_accuracy': 0.9877712031558186, 'eval_runtime': 3.6696, 'eval_samples_per_second': 68.127, 'eval_steps_per_second': 8.72, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


{'train_runtime': 1051.1203, 'train_samples_per_second': 6.407, 'train_steps_per_second': 0.802, 'train_loss': 0.933494241495008, 'epoch': 3.0}


TrainOutput(global_step=843, training_loss=0.933494241495008, metrics={'train_runtime': 1051.1203, 'train_samples_per_second': 6.407, 'train_steps_per_second': 0.802, 'total_flos': 90513448126200.0, 'train_loss': 0.933494241495008, 'epoch': 3.0})

In [28]:
from transformers import pipeline

model_checkpoint = "ViT5-real-estate-ner"

ner = pipeline("ner", model=model_checkpoint, aggregation_strategy="simple", device=0)
ner("Tôi tên là Hùng")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'city',
  'score': 1.0,
  'word': 'Hùng',
  'start': 10,
  'end': 15}]

In [15]:
from huggingface_hub import InferenceClient
import os
import copy
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

data = pd.read_excel('data/router.xlsx')

client = InferenceClient(api_key=os.environ["HF_TOKEN"])
template = [
	{ "role": "system", "content": "Bạn là một trợ lý ảo chuyên phân loại ý định dựa trên câu chat của người dùng (intent classification). Nếu người dùng muốn tìm nhà, hãy trả lời \"[SEARCH_HOUSE]\". Còn lại thì trả lời \"[NORMAL_CHAT]\"" },
	{ "role": "user", "content": "Bạn là ai" },
	{ "role": "assistant", "content": "[NORMAL_CHAT]" },
	{ "role": "user", "content": "Tìm nhà ở Hà Nội" },
	{ "role": "assistant", "content": "[SEARCH_HOUSE]" },
	{ "role": "user", "content": "Nhà cấp 4 có dành cho người thu nhập thấp?" },
	{ "role": "assistant", "content": "[NORMAL_CHAT]" },
	{ "role": "user", "content": "Cho tôi thông tin về chung cư mini ở Hồ Chí Minh" },
	{ "role": "assistant", "content": "[SEARCH_HOUSE]" },
	{ "role": "user", "content": "Bạn nghĩ làm thế nào để tiết kiệm chi phí mua nhà?" },
	{ "role": "assistant", "content": "[NORMAL_CHAT]" },
	{ "role": "user", "content": "Theo bạn, nên chọn chung cư tầng cao hay tầng thấp thì tốt hơn?" },
	{ "role": "assistant", "content": "[NORMAL_CHAT]" },
]

def predict_intent(input):
	messages = copy.deepcopy(template)
	messages.append({ "role": "user", "content": input })
	result = client.chat.completions.create(
		model="Qwen/Qwen2.5-72B-Instruct", 
		messages=messages, 
		temperature=0.1,
		max_tokens=1024,
		top_p=0.7,
		stream=False
	)
	return result.choices[0].message.content

def metric(sample):
	if sample['label'] in sample['predicted_intent']:
		return True
	else:
		return False


data['predicted_intent'] = data['text'].progress_apply(predict_intent)
data['status'] = data.apply(metric, axis=1)
print("accuracy:",  (data['status'].sum() / len(data)) * 100)
data.to_excel('data/router_generated.xlsx', index=False)

100%|██████████| 100/100 [00:23<00:00,  4.19it/s]

accuracy: 97.0



