In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict

dataset = load_dataset('json', data_files='data/train_data.json')
model_checkpoint = 'VietAI/vit5-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_length=256)

train_size = int(0.9 * len(dataset['train']))
dataset['train'].shuffle()
train_dataset = dataset['train'].select(range(train_size))
val_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))

raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Intent', 'ner_labels'],
        num_rows: 2245
    })
    validation: Dataset({
        features: ['Tokens', 'Intent', 'ner_labels'],
        num_rows: 250
    })
})

In [2]:
all_labels = ['O', 'B-balcony_direction','I-balcony_direction','B-city','I-city','B-district','I-district','B-house_direction','I-house_direction', 'B-legal','I-legal', 'B-max_acreage','I-max_acreage', 'B-max_price','I-max_price','B-min_acreage','I-min_acreage','B-min_price','I-min_price', 'B-type_of_land','I-type_of_land']

def create_ner_tags(examples):
    
    ner_tags = [[all_labels.index(label) for label in labels] for labels in examples["ner_labels"]]

    return {"ner_tags": ner_tags}
    

    

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

raw_datasets = raw_datasets.map(create_ner_tags, batched=True)

labels = raw_datasets["train"][45]["ner_tags"]
inputs = tokenizer(raw_datasets["train"][45]["Tokens"], is_split_into_words=True)
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 19, 20, 20, 20, 0, 5, 6, 0, 0, 3, 4, 0, 0, 0, 0, 1, 0, 0]
[0, 19, 20, 20, 20, 0, 5, 6, 0, 0, 3, 4, 0, 0, 0, 0, 1, 0, 0, -100]


In [3]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["Tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
tokenized_datasets

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2245
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 250
    })
})

In [4]:
from transformers import DataCollatorForTokenClassification
import evaluate

metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[   0,    0,    0,    0,    0,    3,    4,    0,    7,    0,    0,   17,
           18,    0,   13,   14, -100],
        [  19,   20,    0,    0,    0,    3,    4,    0,    7,    0,    0,    9,
           10,   10,    0, -100, -100]])

In [5]:
import numpy as np
from transformers import AutoModelForTokenClassification

id2label = {i: label for i, label in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[all_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at VietAI/vit5-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    "ViT5-real-estate-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  0%|          | 0/843 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4586713910102844, 'eval_precision': 0.9241573033707865, 'eval_recall': 0.996969696969697, 'eval_f1': 0.9591836734693877, 'eval_accuracy': 0.9877712031558186, 'eval_runtime': 3.6307, 'eval_samples_per_second': 68.858, 'eval_steps_per_second': 8.814, 'epoch': 1.0}
{'loss': 1.568, 'grad_norm': 24.983171463012695, 'learning_rate': 8.137603795966786e-06, 'epoch': 1.78}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5133143663406372, 'eval_precision': 0.9189944134078212, 'eval_recall': 0.996969696969697, 'eval_f1': 0.9563953488372093, 'eval_accuracy': 0.9881656804733728, 'eval_runtime': 3.4152, 'eval_samples_per_second': 73.202, 'eval_steps_per_second': 9.37, 'epoch': 2.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4648110568523407, 'eval_precision': 0.9189944134078212, 'eval_recall': 0.996969696969697, 'eval_f1': 0.9563953488372093, 'eval_accuracy': 0.9877712031558186, 'eval_runtime': 3.6696, 'eval_samples_per_second': 68.127, 'eval_steps_per_second': 8.72, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


{'train_runtime': 1051.1203, 'train_samples_per_second': 6.407, 'train_steps_per_second': 0.802, 'train_loss': 0.933494241495008, 'epoch': 3.0}


TrainOutput(global_step=843, training_loss=0.933494241495008, metrics={'train_runtime': 1051.1203, 'train_samples_per_second': 6.407, 'train_steps_per_second': 0.802, 'total_flos': 90513448126200.0, 'train_loss': 0.933494241495008, 'epoch': 3.0})

In [28]:
from transformers import pipeline

model_checkpoint = "ViT5-real-estate-ner"

ner = pipeline("ner", model=model_checkpoint, aggregation_strategy="simple", device=0)
ner("Tôi tên là Hùng")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'city',
  'score': 1.0,
  'word': 'Hùng',
  'start': 10,
  'end': 15}]

In [43]:
from typing import Any, Dict, List, Optional
from langchain_core.language_models import BaseChatModel
from langchain_core.outputs import ChatGeneration, ChatResult
from typing import Any, List, Optional, Dict
from gradio_client import Client
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from langchain_core.callbacks import CallbackManagerForLLMRun


class LLama_3(BaseChatModel):
    """A custom chat model that echoes the first `n` characters of the input.

    When contributing an implementation to LangChain, carefully document
    the model including the initialization parameters, include
    an example of how to initialize the model and include any relevant
    links to the underlying models documentation or API.

    Example:

        .. code-block:: python

            model = CustomChatModel(n=2)
            result = model.invoke([HumanMessage(content="hello")])
            result = model.batch([[HumanMessage(content="hello")],
                                 [HumanMessage(content="world")]])
    """

    client: Client

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Override the _generate method to implement the chat model logic.

        This can be a call to an API, a call to a local model, or any other
        implementation that generates a response to the input prompt.

        Args:
            messages: the prompt composed of a list of messages.
            stop: a list of strings on which the model should stop generating.
                  If generation stops due to a stop token, the stop token itself
                  SHOULD BE INCLUDED as part of the output. This is not enforced
                  across models right now, but it's a good practice to follow since
                  it makes it much easier to parse the output of the model
                  downstream and understand why generation stopped.
            run_manager: A run manager with callbacks for the LLM.
        """
        latest_message = messages[-1].content
        
        chat_history = []
        for message in messages[:-1]:
            chat_history.append({"role": "user", "metadata": {"title": None}, "content": message.content})

        result = self.client.predict(
            message=latest_message,
            chat_history=chat_history,
            max_new_tokens=1024,
            temperature=0.6,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.2,
            api_name="/generate"
        )
        message = AIMessage(content=result[1][-1]['content'])
        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])
    

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model."""
        return "llama"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters.

        This information is used by the LangChain callback system, which
        is used for tracing purposes make it possible to monitor LLMs.
        """
        return {
            # The model name allows users to specify custom token counting
            # rules in LLM monitoring applications (e.g., in LangSmith users
            # can provide per token pricing for their model and monitor
            # costs for the given LLM.)
            "model_name": "llama-3.2-instruct-3b"
        }
    
client = Client("ysharma/Llama3-2_with_Gradio-5")
custom_llm = LLama_3(client=client)

chat_history = [
    # HumanMessage(content="Hello!!"),
    # AIMessage(content="It's nice to meet you. Is there something I can help you with or would you like to chat?"),
    HumanMessage(content="what is the multiplication of 2 and 3?"),
]
result = custom_llm.invoke(chat_history)
print(result)

Loaded as API: https://ysharma-llama3-2-with-gradio-5.hf.space ✔
content='Để tìm kiếm thông tin về các căn nhà ở quận 1 có giá dưới 1 tỷ đồng, bạn có thể sử dụng câu lệnh sau:\n\n[CALL_TOOL_QUERY_REAL_ESTATE "quận 1" "dưới 1 tỷ"]\n\nHoặc nếu bạn muốn yêu cầu cụ thể hơn là tìm kiếm căn nhà trong khu vực trung tâm hoặc gần các điểm du lịch nổi tiếng, bạn có thể thêm một số thông tin khác vào câu lệnh.\n\nVí dụ:\n[CALL_TOOL_QUERY_REAL_ESTATE "quận 1" "dưới 1 tỷ" "trung tâm"]\nhoặc \n[CALL_TOOL_QUERY_REAL_ESTATE "quận 1" "dưới 1 tỷ" "gần Bến Thành"]' id='run-46ea4077-5a38-4efd-9eea-a559d83663ad-0'


In [None]:
from huggingface_hub import InferenceClient
import os

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

messages = [
	{ "role": "system", "content": "Bạn là một trợ lý ảo. Nếu người dùng hỏi những câu liên quan đến bất động sản, tìm nhà, hãy trả lời \"[CALL_TOOL]\". Còn lại thì trả lời như bình thường" },
	{ "role": "user", "content": "Xin chào" },
	{ "role": "assistant", "content": "Xin chào! Rất vui được gặp bạn. Tôi có thể giúp gì cho bạn hôm nay?" },
	{ "role": "user", "content": "Tìm nhà ở Hà Nội" },
	{ "role": "assistant", "content": "[CALL_TOOL]" },
	{ "role": "user", "content": "Bạn là ai" },
	{ "role": "assistant", "content": "Xin chào! Tôi là trợ lý ảo của bạn, được tạo ra để hỗ trợ bạn trong nhiều vấn đề khác nhau." },
	{ "role": "user", "content": "Cho tôi thông tin về chung cư mini ở Hồ Chí Minh" },
	{ "role": "assistant", "content": "[CALL_TOOL]" }
]

result = client.chat.completions.create(
    model="Qwen/Qwen2.5-72B-Instruct", 
	messages=messages, 
	temperature=0.1,
	max_tokens=1024,
	top_p=0.7,
	stream=False
)

print(result.choices[0].message.content)

Tôi đã ghi nhận yêu cầu của bạn về thông tin chung cư mini ở Hồ Chí Minh. Tuy nhiên, để cung cấp thông tin chính xác và phù hợp nhất, tôi cần một số thông tin thêm:

1. **Vị trí cụ thể**: Bạn quan tâm đến khu vực nào ở Hồ Chí Minh?
2. **Số phòng**: Bạn cần bao nhiêu phòng ngủ?
3. **Giá cả**: Bạn có ngân sách cụ thể không?
4. **Tiện ích**: Bạn có yêu cầu đặc biệt về tiện ích như an ninh, gym, hồ bơi, v.v.?

Vui lòng cung cấp thêm thông tin để tôi có thể hỗ trợ bạn tốt hơn.
