#Install library

In [None]:
!pip install -q datasets==2.16.0
!pip install -q bitsandbytes
!pip install -q tiktoken
!pip install -q peft
!pip install -q trl
!pip install -q transformers
!pip install -q openpyxl
!pip install -q pandas
!pip install -q scikit-learn
#!pip install -q flash-attn
#pip install -q transformers==4.38.1

[0m

#Import library

In [None]:
import json
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from accelerate import PartialState
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from peft import prepare_model_for_kbit_training
from transformers import TrainingArguments

#Hyperparameters

In [None]:
from IPython.display import clear_output

# Base checkpoint that is loaded for fine-tuning and again for evaluation.
modelpath = "Qwen/Qwen2.5-0.5B-Instruct"
lr=2e-4      # learning rate (TrainingArguments.learning_rate)
bs=8            # per-device training batch size
bs_eval=8      # per-device evaluation batch size
ga_steps=2     # gradient accumulation steps
epochs=4       # number of training epochs; also feeds the save_step computation
max_length=2048      # max. sample length with 24GB VRAM (passed to SFTTrainer as max_seq_length)
output_dir="out"     # directory where the trainer writes its checkpoints

In [None]:
from getpass import getpass

# Read the Hugging Face access token without echoing it into the notebook;
# a plain input() shows the secret on screen while it is being typed.
huggingface_token = getpass("Hugging Face token: ")
# Clear whatever the prompt left in the cell output so nothing is saved with the notebook.
clear_output()

#Create Dataset
- Because I always use the latest checkpoint and do not need a separate evaluation set for model selection during training, I merge the train and dev datasets.
- The dataset is downloaded from [PhoNER_COVID19](https://github.com/VinAIResearch/PhoNER_COVID19).
- First, I read each split from its JSON file (actually JSON Lines: one JSON object per line).
- After listing all tags and reviewing the dataset, I convert the list of tags (one tag per word) into a list of entities `[(entity_name, entity_tag)]`, for example `[('Trung tâm Chống độc , Bệnh viện Bạch Mai', 'ORGANIZATION'), ('7/3', 'DATE'), ('ngộ độc thuốc', 'SYMPTOM_AND_DISEASE')]`. I add this list to each sample dictionary under the key `'outputs'`.
- I then create the dataset files (so that the `datasets` package can load them easily):
    - for each sample, create the input and output from the sample's `words` and `outputs`
    - dump the dataset to JSONL
    - use `load_dataset` to load the train and validation datasets.

In [None]:
!git clone https://github.com/VinAIResearch/PhoNER_COVID19

Cloning into 'PhoNER_COVID19'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 61 (delta 24), reused 41 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (61/61), 3.61 MiB | 15.29 MiB/s, done.
Resolving deltas: 100% (24/24), done.


In [None]:
# Load the training split. The file is JSON Lines: one JSON object per line.
# The encoding is given explicitly because the corpus is Vietnamese text and
# the platform default encoding is not guaranteed to be UTF-8.
train_data_json = []

with open('PhoNER_COVID19/data/syllable/train_syllable.json', "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            continue
        train_data_json.append(json.loads(line))

In [None]:
# Load the dev split. The file is JSON Lines: one JSON object per line.
# The encoding is given explicitly because the corpus is Vietnamese text and
# the platform default encoding is not guaranteed to be UTF-8.
dev_data_json = []

with open('PhoNER_COVID19/data/syllable/dev_syllable.json', "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            continue
        dev_data_json.append(json.loads(line))

In [None]:
# Load the test split. The file is JSON Lines: one JSON object per line.
# The encoding is given explicitly because the corpus is Vietnamese text and
# the platform default encoding is not guaranteed to be UTF-8.
test_data_json = []

with open('PhoNER_COVID19/data/syllable/test_syllable.json', "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            continue
        test_data_json.append(json.loads(line))

In [None]:
# Fold the dev samples into the training data (dev is not used for model selection).
# NOTE: this rebinding is not idempotent - running this cell again without first
# re-running the loading cells appends the dev samples a second time.
train_data_json = [*train_data_json, *dev_data_json]

In [None]:
# Show the first test sample, then its word/tag alignment one pair per line.
first_test_sample = test_data_json[0]
print(first_test_sample)

words, tags = first_test_sample['words'], first_test_sample['tags']

for word, tag in zip(words, tags):
    print(f"word: {word} --- have tag: {tag}")

{'words': ['Từ', '24', '-', '7', 'đến', '31', '-', '7', ',', 'bệnh', 'nhân', 'được', 'mẹ', 'là', 'bà', 'H.T.P', '(', '47', 'tuổi', ')', 'đón', 'về', 'nhà', 'ở', 'phường', 'Phước', 'Hoà', '(', 'bằng', 'xe', 'máy', ')', ',', 'không', 'đi', 'đâu', 'chỉ', 'ra', 'Tạp', 'hoá', 'Phượng', ',', 'chợ', 'Vườn', 'Lài', ',', 'phường', 'An', 'Sơn', 'cùng', 'mẹ', 'bán', 'tạp', 'hoá', 'ở', 'đây', '.'], 'tags': ['O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'B-JOB', 'I-JOB', 'I-JOB', 'O', 'O', 'O']}
word: Từ --- have tag: O
word: 24 --- have tag: B-DATE
word: - --- have tag: I-DATE
word: 7 --- have tag: I-DATE
word: đến --- have tag: O
word: 31 --- have

In [None]:
def extract_entities(words, tags):
    """Convert parallel word / BIO-tag lists into a list of (entity_text, entity_type) tuples.

    The words of one entity are joined with single spaces and the type is the
    tag with its two-character prefix ('B-' / 'I-') removed.

    Rules:
      * A 'B-' tag closes any open entity and starts a new one.
      * Any other non-'O' tag extends the open entity; if no entity is open it
        starts a new one using its own type (instead of producing an entity
        with a leading space and a stale type).
      * An 'O' tag closes any open entity.
      * An entity still open after the last token is emitted, including one
        that consists only of a final 'B-' token.
    """
    entities = []
    current_words = []
    current_type = ""
    for word, tag in zip(words, tags):
        starts_entity = tag[:2] == 'B-' or (tag != 'O' and not current_words)
        if tag == 'O' or starts_entity:
            # Close whatever entity was in progress before this token.
            if current_words:
                entities.append((" ".join(current_words), current_type))
            current_words = []
            if starts_entity:
                current_words = [word]
                current_type = tag[2:]
        else:
            current_words.append(word)
    if current_words:
        entities.append((" ".join(current_words), current_type))
    return entities


def count_entity_types(samples):
    """Count entities per type in `samples` by counting 'B-' tags.

    Returns a plain dict mapping entity type -> count, with keys in order of
    first appearance.
    """
    counts = dict()
    for sample in samples:
        for tag in sample['tags']:
            if tag[:2] == 'B-':
                entity_type = tag[2:]
                counts[entity_type] = counts.get(entity_type, 0) + 1
    return counts


def annotate_with_entities(samples):
    """Replace each sample in `samples` (in place) with a dict holding its
    'words', its 'tags' and the derived 'outputs' entity list."""
    for i, sample in enumerate(samples):
        samples[i] = {
            'words': sample['words'],
            'tags': sample['tags'],
            'outputs': extract_entities(sample['words'], sample['tags']),
        }


# train_data_json already contains the dev samples at this point, so the
# "train" counts include the dev entities as well.
train_tag_labels = count_entity_types(train_data_json)
dev_tag_labels = count_entity_types(dev_data_json)
test_tag_labels = count_entity_types(test_data_json)

for split in (train_data_json, dev_data_json, test_data_json):
    annotate_with_entities(split)

print(train_tag_labels)
print(dev_tag_labels)
print(test_tag_labels)
tag_labels = list(set(list(train_tag_labels.keys()) + list(dev_tag_labels.keys()) + list(test_tag_labels.keys())))
print(tag_labels)

{'ORGANIZATION': 1688, 'SYMPTOM_AND_DISEASE': 2205, 'LOCATION': 8135, 'DATE': 3652, 'PATIENT_ID': 4516, 'AGE': 1043, 'NAME': 537, 'JOB': 337, 'TRANSPORTATION': 313, 'GENDER': 819}
{'ORGANIZATION': 551, 'DATE': 1103, 'SYMPTOM_AND_DISEASE': 766, 'PATIENT_ID': 1276, 'GENDER': 277, 'AGE': 361, 'JOB': 132, 'LOCATION': 2737, 'NAME': 188, 'TRANSPORTATION': 87}
{'DATE': 1654, 'NAME': 318, 'AGE': 582, 'LOCATION': 4441, 'JOB': 173, 'ORGANIZATION': 771, 'PATIENT_ID': 2005, 'SYMPTOM_AND_DISEASE': 1136, 'GENDER': 462, 'TRANSPORTATION': 193}
['TRANSPORTATION', 'LOCATION', 'NAME', 'ORGANIZATION', 'JOB', 'GENDER', 'PATIENT_ID', 'SYMPTOM_AND_DISEASE', 'DATE', 'AGE']


In [None]:
dev_data_json[0]

{'words': ['Bác',
  'sĩ',
  'Nguyễn',
  'Trung',
  'Nguyên',
  ',',
  'Giám',
  'đốc',
  'Trung',
  'tâm',
  'Chống',
  'độc',
  ',',
  'Bệnh',
  'viện',
  'Bạch',
  'Mai',
  ',',
  'cho',
  'biết',
  'bệnh',
  'nhân',
  'được',
  'chuyển',
  'đến',
  'bệnh',
  'viện',
  'ngày',
  '7/3',
  ',',
  'chẩn',
  'đoán',
  'ngộ',
  'độc',
  'thuốc',
  'điều',
  'trị',
  'sốt',
  'rét',
  'chloroquine',
  '.'],
 'tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'I-ORGANIZATION',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-DATE',
  'O',
  'O',
  'O',
  'B-SYMPTOM_AND_DISEASE',
  'I-SYMPTOM_AND_DISEASE',
  'I-SYMPTOM_AND_DISEASE',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'outputs': [('Trung tâm Chống độc , Bệnh viện Bạch Mai', 'ORGANIZATION'),
  ('7/3', 'DATE'),
  ('ngộ độc thuốc', '

In [None]:
# One row per entity type, one column per split; a type that is missing from
# a split is shown as 0.
split_names = ['train', 'dev', 'test']
df = pd.DataFrame([train_tag_labels, dev_tag_labels, test_tag_labels], index=split_names).T
df = df.fillna(0).astype(int)
print(df)

                     train   dev  test
ORGANIZATION          1688   551   771
SYMPTOM_AND_DISEASE   2205   766  1136
LOCATION              8135  2737  4441
DATE                  3652  1103  1654
PATIENT_ID            4516  1276  2005
AGE                   1043   361   582
NAME                   537   188   318
JOB                    337   132   173
TRANSPORTATION         313    87   193
GENDER                 819   277   462


In [None]:
tag_labels

['TRANSPORTATION',
 'LOCATION',
 'NAME',
 'ORGANIZATION',
 'JOB',
 'GENDER',
 'PATIENT_ID',
 'SYMPTOM_AND_DISEASE',
 'DATE',
 'AGE']

In [None]:
# Running maximum sample length across the exported splits, counted in
# whitespace-separated words (input + output + 2), not in tokenizer tokens.
max_length_dataset = 0

# Export the training samples as JSON Lines with an "input" prompt and an "output" target.
with open('train.jsonl', 'w', encoding='utf-8') as outfile:
    for x in train_data_json:
        words = " ".join(x['words'])
        # The target text is the Python repr of the list of (entity, type) tuples.
        outputs = str(x['outputs'])
        data = {
            "input": f'''Named Entity Recognition Task

Sentence: {words}

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)
''',
            "output": outputs
        }
        max_length_dataset = max(max_length_dataset, len(data['input'].split()) + len(data['output'].split()) + 2)
        # ensure_ascii=False keeps the Vietnamese text readable in the file.
        json.dump(data, outfile, ensure_ascii=False)
        outfile.write('\n')

In [None]:
# Export the dev samples as JSON Lines with an "input" prompt and an "output" target.
# Continues updating max_length_dataset, which the train export cell initialises.
with open('dev.jsonl', 'w', encoding='utf-8') as outfile:
    for x in dev_data_json:
        words = " ".join(x['words'])
        # The target text is the Python repr of the list of (entity, type) tuples.
        outputs = str(x['outputs'])
        data = {
            "input": f'''Named Entity Recognition Task

Sentence: {words}

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)
''',
            "output": outputs
        }
        max_length_dataset = max(max_length_dataset, len(data['input'].split()) + len(data['output'].split()) + 2)
        # ensure_ascii=False keeps the Vietnamese text readable in the file.
        json.dump(data, outfile, ensure_ascii=False)
        outfile.write('\n')

In [None]:
# Export the test samples as JSON Lines with an "input" prompt and an "output" target.
# Continues updating max_length_dataset, which the train export cell initialises.
with open('test.jsonl', 'w', encoding='utf-8') as outfile:
    for x in test_data_json:
        words = " ".join(x['words'])
        # The target text is the Python repr of the list of (entity, type) tuples.
        outputs = str(x['outputs'])
        data = {
            "input": f'''Named Entity Recognition Task

Sentence: {words}

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)
''',
            "output": outputs
        }
        max_length_dataset = max(max_length_dataset, len(data['input'].split()) + len(data['output'].split()) + 2)
        # ensure_ascii=False keeps the Vietnamese text readable in the file.
        json.dump(data, outfile, ensure_ascii=False)
        outfile.write('\n')

In [None]:
max_length_dataset

372

In [None]:
# Note that the "validation" split is read from test.jsonl: dev was merged into
# train, so the test split is what gets evaluated during training.
data_files = dict(train="train.jsonl", validation="test.jsonl")

dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 7027
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 3000
    })
})

#Create prompt format
To train the LLM, we need to convert each input/output pair into a single prompt string. I use the `formatting_prompts_func` function for this; it simply concatenates the input and the output. Note that I append " <|im_end|>" to the end of the prompt so that the LLM learns when to stop generating (<|im_end|> is the EOS token of Qwen2).

In [None]:
#from google.colab import userdata

# Tokenizer of the base model, configured with right-side padding for training.
# The access token comes from the prompt cell (a Colab `userdata` lookup is the alternative).
tokenizer_kwargs = dict(
    padding_side="right",
    trust_remote_code=True,
    token=huggingface_token,
)
tokenizer = AutoTokenizer.from_pretrained(modelpath, **tokenizer_kwargs)

In [None]:
# Quick sanity check of the tokenizer on a short Vietnamese phrase: print the
# encoded tensors, then decode every token id on its own to see how the text is split.
prompt = "test thử mô hình"
tokens = tokenizer(prompt, return_tensors="pt")
print(tokens)
tokenizer.batch_decode(tokenizer.encode(prompt))

{'input_ids': tensor([[  1944, 131885, 130179, 128338]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


['test', ' thử', ' mô', ' hình']

In [None]:
def formatting_prompts_func(example):
    """Build the training texts for a batch of samples.

    `example` is a batch with parallel 'input' and 'output' sequences. Each
    returned string is the input, a newline, the output and finally
    ' <|im_end|>' so the model sees an end marker after every answer.
    """
    return [
        f"{example['input'][i]}\n{example['output'][i]} <|im_end|>"
        for i in range(len(example['input']))
    ]

In [None]:
print(dev_data_json[0]['words'], len(dev_data_json[0]['words']))
print(dev_data_json[0]['tags'], len(dev_data_json[0]['tags']))

['Bác', 'sĩ', 'Nguyễn', 'Trung', 'Nguyên', ',', 'Giám', 'đốc', 'Trung', 'tâm', 'Chống', 'độc', ',', 'Bệnh', 'viện', 'Bạch', 'Mai', ',', 'cho', 'biết', 'bệnh', 'nhân', 'được', 'chuyển', 'đến', 'bệnh', 'viện', 'ngày', '7/3', ',', 'chẩn', 'đoán', 'ngộ', 'độc', 'thuốc', 'điều', 'trị', 'sốt', 'rét', 'chloroquine', '.'] 41
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O', 'O', 'O'] 41


In [None]:
dataset["validation"]['input'][0], dataset["validation"]['output'][0]

('Named Entity Recognition Task\n\nSentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .\n\nIdentify and label the named entities in the above sentence. The possible entity types are:\nNAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n\nFormat your response as a list of tuples: (entity, entity_type)\n',
 "[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION'), ('bán tạp hoá', 'JOB')]")

In [None]:
formatting_prompts_func({'input': [dataset["validation"]['input'][0]],
                         'output': [dataset["validation"]['output'][0]]})

["Named Entity Recognition Task\n\nSentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .\n\nIdentify and label the named entities in the above sentence. The possible entity types are:\nNAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n\nFormat your response as a list of tuples: (entity, entity_type)\n\n[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION'), ('bán tạp hoá', 'JOB')] <|im_end|>"]

In [None]:
print(formatting_prompts_func({'input': [dataset["validation"]['input'][0]],
                         'output': [dataset["validation"]['output'][0]]})[0])

Named Entity Recognition Task

Sentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)

[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION'), ('bán tạp hoá', 'JOB')] <|im_end|>


#Create model

In [None]:
# Load the base model in bfloat16 and let `device_map="auto"` decide placement.
# No quantization is used here; a 4-bit BitsAndBytesConfig (nf4, double quant,
# bfloat16 compute) passed as `quantization_config=` and
# `attn_implementation="flash_attention_2"` are the optional knobs left disabled.
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    token=huggingface_token,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

#Create peft

In [None]:
# LoRA settings: rank-8 adapters on every linear layer.
# NOTE(review): `modules_to_save` lists "embed_tokens" and "rotary_emb", so these
# are kept as extra saved modules next to the adapters - presumably this is why
# the trainable-parameter share reported below is so high; confirm it is intended.
lora_settings = dict(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules='all-linear',
    modules_to_save=["embed_tokens", "rotary_emb"],
)
peft_config = LoraConfig(**lora_settings)
model = get_peft_model(model, peft_config)

In [None]:
model.print_trainable_parameters()

trainable params: 140,533,760 || all params: 634,566,528 || trainable%: 22.1464


#Create TrainingArguments

In [None]:
print(len(train_data_json)//bs//ga_steps*epochs//4)

439


In [None]:
# Interval (in optimizer steps) used for logging, evaluation and checkpointing:
# the integer number of steps per epoch, scaled by epochs and split into 4 parts.
steps_per_epoch = len(train_data_json) // bs // ga_steps
save_step = steps_per_epoch * epochs // 4
print(save_step)

439


In [None]:
training_arguments = TrainingArguments(
    # --- run length and batching ---
    output_dir=output_dir,
    num_train_epochs=epochs,
    max_steps=-1,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    gradient_accumulation_steps=ga_steps,
    group_by_length=True,
    # --- optimisation ---
    optim="paged_adamw_32bit",
    learning_rate=lr,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): with lr_scheduler_type="constant" the warmup_ratio presumably
    # has no effect ("constant_with_warmup" would apply it) - confirm the intent.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # --- precision ---
    fp16=False,
    bf16=True,
    # --- logging, evaluation and checkpointing all share the same interval ---
    logging_steps=save_step,
    evaluation_strategy="steps",
    eval_steps=save_step,
    save_strategy="steps",
    save_steps=save_step,
    save_total_limit=1,
    report_to="none",
)



#Create trainer

In [None]:
# Supervised fine-tuning trainer. Samples are rendered to text by
# formatting_prompts_func and are not packed together.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    formatting_func=formatting_prompts_func,
    max_seq_length=max_length,
    packing=False,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/7027 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
trainer.train()

Step,Training Loss,Validation Loss
439,0.6207,0.619406
878,0.5092,0.581862
1317,0.4527,0.574239
1756,0.403,0.576836


TrainOutput(global_step=1756, training_loss=0.49642897412554277, metrics={'train_runtime': 909.2407, 'train_samples_per_second': 30.914, 'train_steps_per_second': 1.931, 'total_flos': 9580549900993536.0, 'train_loss': 0.49642897412554277, 'epoch': 3.995449374288965})

# Evaluate

In [None]:
# Recompute the checkpoint interval exactly as in the training section:
# the integer number of steps per epoch, scaled by epochs and split into 4 parts.
steps_per_epoch = len(train_data_json) // bs // ga_steps
save_step = steps_per_epoch * epochs // 4
print(save_step)

439


In [None]:
import gc

# Drop the notebook's reference to the training model, run a garbage-collection
# pass and ask PyTorch to release its cached CUDA memory before reloading.
# NOTE(review): `trainer` was constructed with this model and is not deleted here,
# so it presumably still keeps the model alive - confirm the memory is really freed.
del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
import torch

# Locate the newest checkpoint directory on disk instead of deriving its number
# as save_step*epochs: that product only names an existing checkpoint when the
# step arithmetic happens to line up with the steps at which the trainer saved.
peft_model_id = get_last_checkpoint(output_dir)
if peft_model_id is None:
    raise FileNotFoundError(
        f"No 'checkpoint-*' directory found in {output_dir!r}; run the training cell first."
    )
# Numeric step of the selected checkpoint, kept under its original name for any
# later code that still refers to it.
_id = int(peft_model_id.rsplit("-", 1)[-1])

# Reads the adapter configuration stored in the checkpoint directory.
config = PeftConfig.from_pretrained(peft_model_id)

# Reload the base model in bfloat16 (no quantization), as was done for training.
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    token = huggingface_token#userdata.get('huggingface_token')
)

# Left padding so that, in batched generation, every prompt ends right where
# generation starts. `return_tensors` is an argument of the tokenizer call, not
# of from_pretrained, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id,
                                          trust_remote_code=True,
                                          padding_side='left',
                                          token=huggingface_token)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

## Test with 1 sample

In [None]:
def create_prompt(input_, output_):
    """Return the inference prompt: `input_` followed by a single newline.

    `output_` is accepted but not used; the answer is what the model is asked
    to generate after the prompt.
    """
    return "{}\n".format(input_)

In [None]:
# Build the inference prompt for the first test sentence using the same
# instruction text that was written into the training data.
words = " ".join(test_data_json[0]['words'])
input_ = (
    "Named Entity Recognition Task\n"
    "\n"
    f"Sentence: {words}\n"
    "\n"
    "Identify and label the named entities in the above sentence. The possible entity types are:\n"
    "NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n"
    "\n"
    "Format your response as a list of tuples: (entity, entity_type)\n"
)
prompt = create_prompt(input_, "")
print(prompt)

Named Entity Recognition Task

Sentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)




In [None]:
inputs = torch.tensor([tokenizer.encode(prompt)])
inputs

tensor([[ 15810,  10390,  47598,   5430,    271,  84564,     25, 130232,    220,
             17,     19,    481,    220,     22, 128263,    220,     18,     16,
            481,    220,     22,   1154, 128453, 128310,  63478, 128850,  37915,
         129302,    472,    836,   1069,    320,    220,     19,     22, 128790,
            873,  14854,   3165, 128265, 128275, 128269, 140515,   2350, 124603,
          17275,   6362,    320, 128411,  82790, 128392,    873,   1154,  53037,
          73586, 129625,  98127,  15122,    350, 127432, 137692,   2350,  98210,
           1154, 136625,    647,  51990,     77,    444,  52551,   1154, 140515,
           1527, 131279, 128340, 128850, 128627, 138628, 137692, 128269, 128358,
           6762,  28301,   1437,    323,   2383,    279,   6941,  14744,    304,
            279,   3403,  11652,     13,    576,   3204,   5387,   4494,    525,
            510,   7535,     11,    479,  43604,     11,  40347,  54800,   3495,
             11,  16079,    

In [None]:
# Sample a completion for the single prompt.
# The prompt is one unpadded sequence, so an all-ones attention mask is correct;
# passing it together with pad_token_id avoids the "attention mask is not set"
# warning, which arises because the pad and EOS tokens are the same.
input_ids = inputs.to(model.device)
tokens = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=1024,
    temperature=0.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [None]:
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

Named Entity Recognition Task

Sentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)

[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION')] <|im_end|>


In [None]:
# Greedy decoding for the single prompt. `temperature` only applies when
# sampling, so it is not passed together with do_sample=False.
# The prompt is one unpadded sequence, so an all-ones attention mask is correct.
input_ids = inputs.to(model.device)
tokens = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=256,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(tokens[0], skip_special_tokens=False))



Named Entity Recognition Task

Sentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)

[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION')] <|im_end|>


In [None]:
print(test_data_json[0]['outputs'])

[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION'), ('bán tạp hoá', 'JOB')]


In [None]:
tokenizer.decode(tokens[0], skip_special_tokens=False).split("\n")[-1].split(" <|im_end|>")[0]

"[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION')]"

# test batch generate with 2 samples

In [None]:
# Instruction text shared by both prompts; the placeholder receives the sentence.
ner_template = (
    "Named Entity Recognition Task\n"
    "\n"
    "Sentence: {}\n"
    "\n"
    "Identify and label the named entities in the above sentence. The possible entity types are:\n"
    "NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n"
    "\n"
    "Format your response as a list of tuples: (entity, entity_type)\n"
)

# Prompts for the first two test sentences, used to try batched generation.
words1 = " ".join(test_data_json[0]['words'])
input_1 = ner_template.format(words1)

words2 = " ".join(test_data_json[1]['words'])
input_2 = ner_template.format(words2)

prompts = [create_prompt(text, "") for text in (input_1, input_2)]
print(prompts)

['Named Entity Recognition Task\n\nSentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .\n\nIdentify and label the named entities in the above sentence. The possible entity types are:\nNAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n\nFormat your response as a list of tuples: (entity, entity_type)\n\n', 'Named Entity Recognition Task\n\nSentence: Bác sĩ Trần Thanh Linh , từ Bệnh viện Chợ Rẫy chi viện phụ trách đơn nguyên hồi sức tích cực , cho biết " bệnh nhân 416 " vẫn đang duy trì ECMO , thở máy , hiện xơ phổi rất nhiều .\n\nIdentify and label the named entities in the above sentence. The possible entity types are:\nNAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n\nFormat your response as a list of tuples: (entity, 

In [None]:
prompts[0]

'Named Entity Recognition Task\n\nSentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .\n\nIdentify and label the named entities in the above sentence. The possible entity types are:\nNAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE\n\nFormat your response as a list of tuples: (entity, entity_type)\n\n'

In [None]:
# With return_tensors="pt" the tokenizer already returns a tensor; wrapping it
# in torch.tensor() again only copies it and triggers a UserWarning.
tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).input_ids.shape

  torch.tensor(tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).input_ids).shape


torch.Size([2, 149])

In [None]:
tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

{'input_ids': tensor([[ 15810,  10390,  47598,   5430,    271,  84564,     25, 130232,    220,
             17,     19,    481,    220,     22, 128263,    220,     18,     16,
            481,    220,     22,   1154, 128453, 128310,  63478, 128850,  37915,
         129302,    472,    836,   1069,    320,    220,     19,     22, 128790,
            873,  14854,   3165, 128265, 128275, 128269, 140515,   2350, 124603,
          17275,   6362,    320, 128411,  82790, 128392,    873,   1154,  53037,
          73586, 129625,  98127,  15122,    350, 127432, 137692,   2350,  98210,
           1154, 136625,    647,  51990,     77,    444,  52551,   1154, 140515,
           1527, 131279, 128340, 128850, 128627, 138628, 137692, 128269, 128358,
           6762,  28301,   1437,    323,   2383,    279,   6941,  14744,    304,
            279,   3403,  11652,     13,    576,   3204,   5387,   4494,    525,
            510,   7535,     11,    479,  43604,     11,  40347,  54800,   3495,
             1

In [None]:
# Tokenize both prompts as one padded batch and move the resulting tensors to the
# device the model lives on, so they can be passed straight to model.generate.
inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

In [None]:
# Greedy decoding (do_sample=False). `temperature` is only used by sampling-based
# generation, so it is not passed here: combined with do_sample=False it has no
# effect and only makes transformers emit a warning.
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

In [None]:
outputs

tensor([[ 15810,  10390,  47598,   5430,    271,  84564,     25, 130232,    220,
             17,     19,    481,    220,     22, 128263,    220,     18,     16,
            481,    220,     22,   1154, 128453, 128310,  63478, 128850,  37915,
         129302,    472,    836,   1069,    320,    220,     19,     22, 128790,
            873,  14854,   3165, 128265, 128275, 128269, 140515,   2350, 124603,
          17275,   6362,    320, 128411,  82790, 128392,    873,   1154,  53037,
          73586, 129625,  98127,  15122,    350, 127432, 137692,   2350,  98210,
           1154, 136625,    647,  51990,     77,    444,  52551,   1154, 140515,
           1527, 131279, 128340, 128850, 128627, 138628, 137692, 128269, 128358,
           6762,  28301,   1437,    323,   2383,    279,   6941,  14744,    304,
            279,   3403,  11652,     13,    576,   3204,   5387,   4494,    525,
            510,   7535,     11,    479,  43604,     11,  40347,  54800,   3495,
             11,  16079,    

In [None]:
# Turn the generated token ids back into text (special tokens dropped). The decoded
# text contains the prompt followed by the model's answer; show the second sample.
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(decoded_outputs[1])

Named Entity Recognition Task

Sentence: Bác sĩ Trần Thanh Linh , từ Bệnh viện Chợ Rẫy chi viện phụ trách đơn nguyên hồi sức tích cực , cho biết " bệnh nhân 416 " vẫn đang duy trì ECMO , thở máy , hiện xơ phổi rất nhiều .

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)

[('Bệnh viện Chợ Rẫy', 'LOCATION'), ('416', 'PATIENT_ID')] 


In [None]:
test_data_json[1]['outputs']

[('Bệnh viện Chợ Rẫy', 'ORGANIZATION'),
 ('416', 'PATIENT_ID'),
 ('xơ phổi rất nhiều', 'SYMPTOM_AND_DISEASE')]

In [None]:
print(decoded_outputs[0])

Named Entity Recognition Task

Sentence: Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)

[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION')] 


In [None]:
# The answer is the last line of the decoded text; [:-1] drops its final character
# (the response printed above ends with a trailing space after the closing bracket).
print(decoded_outputs[0].split("\n")[-1][:-1])

[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T.P', 'NAME'), ('47', 'AGE'), ('phường Phước Hoà', 'LOCATION'), ('Tạp hoá Phượng', 'LOCATION'), ('chợ Vườn Lài', 'LOCATION'), ('phường An Sơn', 'LOCATION')]


In [None]:
test_data_json[1]['outputs']

[('Bệnh viện Chợ Rẫy', 'ORGANIZATION'),
 ('416', 'PATIENT_ID'),
 ('xơ phổi rất nhiều', 'SYMPTOM_AND_DISEASE')]

# Predict on the test dataset
The model gets 65.1% of the test samples exactly right, i.e. the predicted entity list matches the label entity list exactly.

In [None]:
from datetime import datetime

start = datetime.now()

prediction = []   # parsed answer string (last line of each response)
response = []     # full decoded text (prompt + answer) per sample
accuracy = []     # True when the answer string equals str(gold entity list)
labels = []       # gold entity lists, in the same order as `prediction`

batchsize = 16

# Walk over the test set in consecutive batches; the final batch may be shorter.
for i, offset in enumerate(range(0, len(test_data_json), batchsize)):
    batch_data = test_data_json[offset: offset + batchsize]
    batch_words = [" ".join(x['words']) for x in batch_data]
    batch_outputs = [x['outputs'] for x in batch_data]
    batch_input_ = [f'''Named Entity Recognition Task

Sentence: {words}

Identify and label the named entities in the above sentence. The possible entity types are:
NAME, GENDER, TRANSPORTATION, SYMPTOM_AND_DISEASE, DATE, ORGANIZATION, LOCATION, PATIENT_ID, JOB, AGE

Format your response as a list of tuples: (entity, entity_type)
''' for words in batch_words]
    # NOTE(review): `tags` is not defined in this cell, so it is whatever value an
    # earlier cell left in the kernel namespace; confirm this is what create_prompt
    # expects at inference time.
    batch_prompts = [create_prompt(input_, tags) for input_ in batch_input_]

    inputs =  tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Greedy decoding: `temperature` is only used when sampling, so it is not passed.
    with torch.no_grad():
        batch_tokens = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=1024,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    decoded_outputs = tokenizer.batch_decode(batch_tokens, skip_special_tokens=True)
    # The answer is the last line of the decoded text, minus its final character.
    batch_answer = [answer.split("\n")[-1][:-1] for answer in decoded_outputs]
    prediction += batch_answer
    response += decoded_outputs

    # Exact-match check on the string form of the whole entity list.
    accuracy += [answer == str(output) for answer, output in zip(batch_answer, batch_outputs)]
    labels += batch_outputs

    # Every 10th batch: batch index, running exact-match %, elapsed time, time per sample.
    if i % 10 == 0:
        elapsed = datetime.now() - start
        print(i, np.array(accuracy).sum()/len(prediction)*100, elapsed, elapsed/len(prediction))

0 56.25 0:00:10.097161 0:00:00.631073
10 46.02272727272727 0:01:01.016046 0:00:00.346682
20 45.535714285714285 0:02:06.597968 0:00:00.376780
30 45.564516129032256 0:03:02.472894 0:00:00.367889
40 45.27439024390244 0:03:56.073506 0:00:00.359868
50 46.32352941176471 0:04:57.886063 0:00:00.365056
60 46.41393442622951 0:05:56.116896 0:00:00.364874
70 46.478873239436616 0:06:55.350422 0:00:00.365625
80 48.99691358024691 0:08:00.314906 0:00:00.370613
90 51.579670329670336 0:08:46.687924 0:00:00.361736
100 54.146039603960396 0:09:33.387144 0:00:00.354819
110 56.70045045045045 0:10:23.414660 0:00:00.351022
120 58.16115702479338 0:11:02.255238 0:00:00.342074
130 59.78053435114504 0:11:53.228222 0:00:00.340281
140 60.54964539007093 0:12:35.831597 0:00:00.335032
150 61.75496688741722 0:13:22.193459 0:00:00.332034
160 63.04347826086957 0:14:10.280399 0:00:00.330078
170 63.888888888888886 0:14:55.931582 0:00:00.327460
180 64.74447513812154 0:15:43.989315 0:00:00.325963


In [None]:
np.array(accuracy).sum()/len(prediction)*100

65.10000000000001

In [None]:
# Persist the raw decoded responses. ensure_ascii=False writes the Vietnamese text
# as-is instead of \uXXXX escapes, which is why the file is opened as UTF-8.
with open("test_response.json", "w", encoding="utf-8") as file:
    json.dump(response, file, ensure_ascii=False)

In [None]:
# One row per test sample: tokens, gold entities (as objects and as their string
# form), the full model response, the extracted answer, and an exact-match flag.
df = pd.DataFrame(
    data={
        'words': [x['words'] for x in test_data_json],
        'outputs': labels,
        'outputs_string': [str(x) for x in labels],
        'response': response,
        'prediction': prediction,
    }
).assign(check=lambda frame: frame['outputs_string'] == frame['prediction'])

## Evaluate each tag
The model performs poorly on JOB entities.

In [None]:
import ast
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_ner(outputs, true_outputs, entity_labels):
    """
    Entity-level NER evaluation of predicted entity lists against gold entity lists.

    Entities are aligned per sentence by their text: a gold entity whose text was
    also predicted contributes (gold label, predicted label); a gold entity that was
    not predicted contributes (gold label, 'O'); a predicted entity whose text is not
    a gold entity contributes ('O', predicted label). Predicted entities are keyed by
    text, so a text predicted several times in one sentence keeps only its last label.

    Args:
    - outputs: predicted entities, one list per sentence, e.g. [[('entity', 'label')]].
    - true_outputs: gold entities, in the same format and order as `outputs`.
    - entity_labels: labels to report (e.g. ['DATE', 'NAME', 'AGE', 'LOCATION']).

    Returns:
    - A dict mapping each label to its Precision, Recall and F1-score, plus an
      'Average' entry holding the macro average over `entity_labels`.
    """
    y_true, y_pred = [], []

    for predicted_sentence, true_sentence in zip(outputs, true_outputs):
        label_by_text = {item[0]: item[1] for item in predicted_sentence}

        # Gold entities: matched by text, or counted as missed ('O').
        for gold in true_sentence:
            y_true.append(gold[1])
            y_pred.append(label_by_text.get(gold[0], 'O'))

        # Predicted entities with no gold counterpart are spurious.
        gold_texts = {item[0] for item in true_sentence}
        for text, guessed_label in label_by_text.items():
            if text not in gold_texts:
                y_true.append('O')
                y_pred.append(guessed_label)

    # One-vs-rest scores for each requested label.
    results = {}
    for label in entity_labels:
        true_flags = [int(y == label) for y in y_true]
        pred_flags = [int(y == label) for y in y_pred]
        results[label] = {
            'Precision': precision_score(true_flags, pred_flags, zero_division=0),
            'Recall': recall_score(true_flags, pred_flags, zero_division=0),
            'F1-score': f1_score(true_flags, pred_flags, zero_division=0),
        }

    # Macro average over the requested labels.
    macro = dict(labels=entity_labels, average='macro', zero_division=0)
    results['Average'] = {
        'Precision': precision_score(y_true, y_pred, **macro),
        'Recall': recall_score(y_true, y_pred, **macro),
        'F1-score': f1_score(y_true, y_pred, **macro),
    }

    return results

# Score the real entity types only. In evaluate_ner, 'O' appears solely as the
# placeholder for a missed gold entity (in y_pred) or a spurious prediction (in
# y_true), never on both sides of the same pair, so its precision/recall are
# structurally zero and including it would only drag the macro 'Average' down.
entity_labels = list(tag_labels)

results = evaluate_ner([ast.literal_eval(x) for x in prediction], labels, entity_labels)
print(results)

{'TRANSPORTATION': {'Precision': 0.9838709677419355, 'Recall': 0.9481865284974094, 'F1-score': 0.9656992084432717}, 'LOCATION': {'Precision': 0.8805903398926654, 'Recall': 0.8913535536441829, 'F1-score': 0.8859392575928009}, 'NAME': {'Precision': 0.8884892086330936, 'Recall': 0.7942122186495176, 'F1-score': 0.8387096774193549}, 'ORGANIZATION': {'Precision': 0.8112244897959183, 'Recall': 0.8249027237354085, 'F1-score': 0.8180064308681672}, 'JOB': {'Precision': 0.4880952380952381, 'Recall': 0.47398843930635837, 'F1-score': 0.4809384164222874}, 'GENDER': {'Precision': 0.9239130434782609, 'Recall': 0.9199134199134199, 'F1-score': 0.9219088937093276}, 'PATIENT_ID': {'Precision': 0.9745762711864406, 'Recall': 0.9181636726546906, 'F1-score': 0.9455292908530318}, 'SYMPTOM_AND_DISEASE': {'Precision': 0.842, 'Recall': 0.7411971830985915, 'F1-score': 0.7883895131086143}, 'DATE': {'Precision': 0.975181598062954, 'Recall': 0.9740024183796856, 'F1-score': 0.9745916515426497}, 'AGE': {'Precision': 0.

In [None]:
results

{'TRANSPORTATION': {'Precision': 0.9838709677419355,
  'Recall': 0.9481865284974094,
  'F1-score': 0.9656992084432717},
 'LOCATION': {'Precision': 0.8805903398926654,
  'Recall': 0.8913535536441829,
  'F1-score': 0.8859392575928009},
 'NAME': {'Precision': 0.8884892086330936,
  'Recall': 0.7942122186495176,
  'F1-score': 0.8387096774193549},
 'ORGANIZATION': {'Precision': 0.8112244897959183,
  'Recall': 0.8249027237354085,
  'F1-score': 0.8180064308681672},
 'JOB': {'Precision': 0.4880952380952381,
  'Recall': 0.47398843930635837,
  'F1-score': 0.4809384164222874},
 'GENDER': {'Precision': 0.9239130434782609,
  'Recall': 0.9199134199134199,
  'F1-score': 0.9219088937093276},
 'PATIENT_ID': {'Precision': 0.9745762711864406,
  'Recall': 0.9181636726546906,
  'F1-score': 0.9455292908530318},
 'SYMPTOM_AND_DISEASE': {'Precision': 0.842,
  'Recall': 0.7411971830985915,
  'F1-score': 0.7883895131086143},
 'DATE': {'Precision': 0.975181598062954,
  'Recall': 0.9740024183796856,
  'F1-score': 

## Evaluate in the original tag format
I need to convert the LLM output back to the original tag format. If a sample cannot be converted because the LLM output has the wrong format, I set its tags to a list of 'O' (i.e. no entities are predicted).

In [None]:
def predictions_to_tags(sentence, predictions):
    """
    Convert model predictions (list of entity tuples) back to tags format.

    Each entity is split on whitespace and located in the sentence as a contiguous
    run of tokens. The first occurrence whose tokens are all still 'O' is tagged, so
    an entity mentioned several times (and predicted several times) is assigned to
    successive mentions. If every occurrence is already tagged, the first occurrence
    is re-tagged. Entities that do not occur in the sentence, or that contain no
    tokens, are ignored.

    :param sentence: List of tokens in the original sentence
    :param predictions: List of (entity, entity_type) tuples
    :return: List of tags in BIO format
    :raises ValueError: if an item of `predictions` is not a 2-tuple (callers rely on
        this to detect malformed model output)
    """
    tags = ['O'] * len(sentence)  # Initialize all tags as 'O'

    for entity, entity_type in predictions:
        entity_tokens = entity.split()
        span = len(entity_tokens)
        if span == 0:
            # An empty token list would "match" at index 0 and wrongly tag token 0.
            continue

        # Every start index at which the entity's tokens occur in the sentence.
        matches = [
            start for start in range(len(sentence) - span + 1)
            if sentence[start:start + span] == entity_tokens
        ]
        if not matches:
            continue

        # Prefer an occurrence that no earlier prediction has claimed.
        start = next(
            (m for m in matches if all(tag == 'O' for tag in tags[m:m + span])),
            matches[0],
        )
        tags[start] = f'B-{entity_type}'
        for offset in range(1, span):
            tags[start + offset] = f'I-{entity_type}'

    return tags

# Example usage: rebuild BIO tags for the first test sample from its parsed prediction
sentence = test_data_json[0]['words']
model_predictions = ast.literal_eval(prediction[0])
predicted_tags = predictions_to_tags(sentence, model_predictions)

# Print the results, one "token: tag" pair per line
for token, tag in zip(sentence, predicted_tags):
    print(f"{token}: {tag}")

# Optionally, compare with the original tags
original_tags = test_data_json[0]['tags']

# Only the positions where the rebuilt tag differs from the gold tag are printed
print("\nComparison with original tags:")
for token, pred_tag, orig_tag in zip(sentence, predicted_tags, original_tags):
    if pred_tag != orig_tag:
        print(f"{token}: Predicted: {pred_tag}, Original: {orig_tag}")

Từ: O
24: B-DATE
-: I-DATE
7: I-DATE
đến: O
31: B-DATE
-: I-DATE
7: I-DATE
,: O
bệnh: O
nhân: O
được: O
mẹ: O
là: O
bà: O
H.T.P: B-NAME
(: O
47: B-AGE
tuổi: O
): O
đón: O
về: O
nhà: O
ở: O
phường: B-LOCATION
Phước: I-LOCATION
Hoà: I-LOCATION
(: O
bằng: O
xe: O
máy: O
): O
,: O
không: O
đi: O
đâu: O
chỉ: O
ra: O
Tạp: B-LOCATION
hoá: I-LOCATION
Phượng: I-LOCATION
,: O
chợ: B-LOCATION
Vườn: I-LOCATION
Lài: I-LOCATION
,: O
phường: B-LOCATION
An: I-LOCATION
Sơn: I-LOCATION
cùng: O
mẹ: O
bán: O
tạp: O
hoá: O
ở: O
đây: O
.: O

Comparison with original tags:
bán: Predicted: O, Original: B-JOB
tạp: Predicted: O, Original: I-JOB
hoá: Predicted: O, Original: I-JOB


In [None]:
print(predicted_tags)
print(original_tags)

['O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'B-JOB', 'I-JOB', 'I-JOB', 'O', 'O', 'O']


In [None]:
# NOTE(review): repeats the first three statements of the "Example usage" cell above
# (same sample, same result); looks like a leftover and can probably be removed.
sentence = test_data_json[0]['words']
model_predictions = ast.literal_eval(prediction[0])
predicted_tags = predictions_to_tags(sentence, model_predictions)

In [None]:
# NOTE(review): `words` and `predict` are the loop variables of the conversion loop in
# the following cell, so this cell presumably only works after that loop has run
# (out-of-order execution). The recorded output shows a prediction that is a single
# 3-element tuple rather than a list of (entity, entity_type) pairs.
words, predict

(['Người',
  'phụ',
  'nữ',
  'đi',
  'bộ',
  'trên',
  'phố',
  '"',
  'thời',
  'trang',
  'bình',
  'dân',
  '"',
  'Chùa',
  'Bộc',
  ',',
  'quận',
  'Đống',
  'Đa',
  'ngày',
  '13/4',
  '.'],
 [('phố thôn Chùa Bộc', 'quận Đống Đa', '13/4')])

In [None]:
df['tags'] = [x['tags'] for x in test_data_json]
predict_tags = []
sentences = [x['words'] for x in test_data_json]
bugs = []   # True for samples whose prediction could not be converted

for words, predict in zip(sentences, prediction):
    try:
        # Parsing is inside the try so an unparsable answer falls back to all-'O'
        # like any other malformed prediction, instead of aborting the whole loop.
        predict = ast.literal_eval(predict)
        predict_tag = predictions_to_tags(words, predict)
        bugs.append(False)
    except Exception:
        # Malformed model output: predict no entities for this sample.
        # `Exception` (not a bare except) lets KeyboardInterrupt still stop the cell.
        predict_tag = ['O' for _ in words]
        bugs.append(True)
    predict_tags.append(predict_tag)

df['predict_tags'] = predict_tags
df['bugs'] = bugs

In [None]:
df.head()

Unnamed: 0,words,outputs,outputs_string,response,prediction,check,tags,predict_tags,bugs
0,"[Từ, 24, -, 7, đến, 31, -, 7, ,, bệnh, nhân, đ...","[(24 - 7, DATE), (31 - 7, DATE), (H.T.P, NAME)...","[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T...",Named Entity Recognition Task\n\nSentence: Từ ...,"[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T...",False,"[O, B-DATE, I-DATE, I-DATE, O, B-DATE, I-DATE,...","[O, B-DATE, I-DATE, I-DATE, O, B-DATE, I-DATE,...",False
1,"[Bác, sĩ, Trần, Thanh, Linh, ,, từ, Bệnh, viện...","[(Bệnh viện Chợ Rẫy, ORGANIZATION), (416, PATI...","[('Bệnh viện Chợ Rẫy', 'ORGANIZATION'), ('416'...",Named Entity Recognition Task\n\nSentence: Bác...,"[('Bệnh viện Chợ Rẫy', 'LOCATION'), ('416', 'P...",False,"[O, O, O, O, O, O, O, B-ORGANIZATION, I-ORGANI...","[O, O, O, O, O, O, O, B-LOCATION, I-LOCATION, ...",False
2,"[Theo, đó, ,, Sở, Y, tế, Bình, Thuận, cho, biế...","[(Sở Y tế Bình Thuận, ORGANIZATION), (34, PATI...","[('Sở Y tế Bình Thuận', 'ORGANIZATION'), ('34'...",Named Entity Recognition Task\n\nSentence: The...,"[('Sở Y tế Bình Thuận', 'ORGANIZATION'), ('34'...",True,"[O, O, O, B-ORGANIZATION, I-ORGANIZATION, I-OR...","[O, O, O, B-ORGANIZATION, I-ORGANIZATION, I-OR...",False
3,"[Bệnh, nhân, 218, :, nữ, ,, 43, tuổi, ,, quốc,...","[(218, PATIENT_ID), (nữ, GENDER), (43, AGE), (...","[('218', 'PATIENT_ID'), ('nữ', 'GENDER'), ('43...",Named Entity Recognition Task\n\nSentence: Bện...,"[('218', 'PATIENT_ID'), ('nữ', 'GENDER'), ('43...",False,"[O, O, B-PATIENT_ID, O, B-GENDER, O, B-AGE, O,...","[O, O, B-PATIENT_ID, O, B-GENDER, O, B-AGE, O,...",False
4,"[Ông, cùng, 4, người, khác, hôm, 4/3, từ, Mala...","[(4/3, DATE), (Malaysia, LOCATION), (sân bay T...","[('4/3', 'DATE'), ('Malaysia', 'LOCATION'), ('...",Named Entity Recognition Task\n\nSentence: Ông...,"[('4/3', 'DATE'), ('Malaysia', 'LOCATION'), ('...",True,"[O, O, O, O, O, O, B-DATE, O, B-LOCATION, O, B...","[O, O, O, O, O, O, B-DATE, O, B-LOCATION, O, B...",False


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

def evaluate_ner_multiple_samples(predicted_tags_list, original_tags_list, entity_labels):
    """
    Token-level NER evaluation over many sentences of predicted and gold BIO tags.

    All sentences are flattened into one predicted and one gold tag sequence.
    Predicted tags that are not in `entity_labels` are replaced by 'O'; gold tags are
    used unchanged. A full classification report is printed as a side effect.

    Args:
    - predicted_tags_list: predicted tags, one list of tags per sentence.
    - original_tags_list: gold tags, one list of tags per sentence.
    - entity_labels: tags to report (e.g. ['B-DATE', 'I-DATE', 'B-NAME', 'B-AGE', 'B-LOCATION', ...]).

    Returns:
    - A dict mapping each label to its Precision, Recall and F1-score, plus an
      'Average' entry with the macro-averaged scores and the overall Accuracy.
    """
    known_labels = set(entity_labels)

    # Flatten predictions, collapsing unknown tags to 'O'.
    flattened_predicted_tags = []
    for sample in predicted_tags_list:
        for tag in sample:
            flattened_predicted_tags.append(tag if tag in known_labels else 'O')

    # Flatten gold tags unchanged.
    flattened_original_tags = []
    for sample in original_tags_list:
        flattened_original_tags.extend(sample)

    # One-vs-rest scores for each requested label.
    results = {}
    for label in entity_labels:
        gold_flags = [int(tag == label) for tag in flattened_original_tags]
        pred_flags = [int(tag == label) for tag in flattened_predicted_tags]
        results[label] = {
            'Precision': precision_score(gold_flags, pred_flags, zero_division=0),
            'Recall': recall_score(gold_flags, pred_flags, zero_division=0),
            'F1-score': f1_score(gold_flags, pred_flags, zero_division=0),
        }

    # Accuracy over every token, whatever its label.
    overall_accuracy = accuracy_score(flattened_original_tags, flattened_predicted_tags)

    # Macro average over the requested labels.
    macro = dict(labels=entity_labels, average='macro', zero_division=0)
    results['Average'] = {
        'Precision': precision_score(flattened_original_tags, flattened_predicted_tags, **macro),
        'Recall': recall_score(flattened_original_tags, flattened_predicted_tags, **macro),
        'F1-score': f1_score(flattened_original_tags, flattened_predicted_tags, **macro),
        'Accuracy': overall_accuracy,
    }

    print(classification_report(flattened_original_tags, flattened_predicted_tags, digits = 4))

    return results

# Token-level evaluation: rebuilt tags vs. gold tags for the whole test set.
predicted_tags_list = predict_tags
original_tags_list = [sample['tags'] for sample in test_data_json]
# All B-* labels first, then all I-* labels, then the outside tag.
entity_labels = [f'{prefix}-{name}' for prefix in ('B', 'I') for name in tag_labels] + ['O']

results = evaluate_ner_multiple_samples(predicted_tags_list, original_tags_list, entity_labels)
print(results)

                       precision    recall  f1-score   support

                B-AGE     0.9533    0.9124    0.9324       582
               B-DATE     0.9829    0.9716    0.9772      1654
             B-GENDER     0.9404    0.8203    0.8763       462
                B-JOB     0.6309    0.5434    0.5839       173
           B-LOCATION     0.9177    0.8633    0.8897      4441
               B-NAME     0.9449    0.7547    0.8392       318
       B-ORGANIZATION     0.8564    0.8353    0.8457       771
         B-PATIENT_ID     0.9772    0.8349    0.9005      2005
B-SYMPTOM_AND_DISEASE     0.9370    0.7861    0.8550      1136
     B-TRANSPORTATION     0.9838    0.9430    0.9630       193
                I-AGE     0.4000    0.3333    0.3636         6
               I-DATE     0.9854    0.9640    0.9746      1752
             I-GENDER     0.0000    0.0000    0.0000         1
                I-JOB     0.7027    0.4496    0.5483       347
           I-LOCATION     0.9514    0.8652    0.9063  

In [None]:
results

{'B-TRANSPORTATION': {'Precision': 0.9837837837837838,
  'Recall': 0.9430051813471503,
  'F1-score': 0.9629629629629629},
 'B-LOCATION': {'Precision': 0.9176639540449976,
  'Recall': 0.8633190722810178,
  'F1-score': 0.8896623738252698},
 'B-NAME': {'Precision': 0.9448818897637795,
  'Recall': 0.7547169811320755,
  'F1-score': 0.8391608391608392},
 'B-ORGANIZATION': {'Precision': 0.8563829787234043,
  'Recall': 0.8352788586251622,
  'F1-score': 0.8456992777413},
 'B-JOB': {'Precision': 0.6308724832214765,
  'Recall': 0.5433526011560693,
  'F1-score': 0.5838509316770186},
 'B-GENDER': {'Precision': 0.9404466501240695,
  'Recall': 0.8203463203463204,
  'F1-score': 0.8763005780346821},
 'B-PATIENT_ID': {'Precision': 0.9772329246935202,
  'Recall': 0.8349127182044888,
  'F1-score': 0.900484131253362},
 'B-SYMPTOM_AND_DISEASE': {'Precision': 0.9370409233997902,
  'Recall': 0.7860915492957746,
  'F1-score': 0.8549545236955481},
 'B-DATE': {'Precision': 0.982874617737003,
  'Recall': 0.971584

In [None]:
df.head()

Unnamed: 0,words,outputs,outputs_string,response,prediction,check,tags,predict_tags,bugs
0,"[Từ, 24, -, 7, đến, 31, -, 7, ,, bệnh, nhân, đ...","[(24 - 7, DATE), (31 - 7, DATE), (H.T.P, NAME)...","[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T...",Named Entity Recognition Task\n\nSentence: Từ ...,"[('24 - 7', 'DATE'), ('31 - 7', 'DATE'), ('H.T...",False,"[O, B-DATE, I-DATE, I-DATE, O, B-DATE, I-DATE,...","[O, B-DATE, I-DATE, I-DATE, O, B-DATE, I-DATE,...",False
1,"[Bác, sĩ, Trần, Thanh, Linh, ,, từ, Bệnh, viện...","[(Bệnh viện Chợ Rẫy, ORGANIZATION), (416, PATI...","[('Bệnh viện Chợ Rẫy', 'ORGANIZATION'), ('416'...",Named Entity Recognition Task\n\nSentence: Bác...,"[('Bệnh viện Chợ Rẫy', 'LOCATION'), ('416', 'P...",False,"[O, O, O, O, O, O, O, B-ORGANIZATION, I-ORGANI...","[O, O, O, O, O, O, O, B-LOCATION, I-LOCATION, ...",False
2,"[Theo, đó, ,, Sở, Y, tế, Bình, Thuận, cho, biế...","[(Sở Y tế Bình Thuận, ORGANIZATION), (34, PATI...","[('Sở Y tế Bình Thuận', 'ORGANIZATION'), ('34'...",Named Entity Recognition Task\n\nSentence: The...,"[('Sở Y tế Bình Thuận', 'ORGANIZATION'), ('34'...",True,"[O, O, O, B-ORGANIZATION, I-ORGANIZATION, I-OR...","[O, O, O, B-ORGANIZATION, I-ORGANIZATION, I-OR...",False
3,"[Bệnh, nhân, 218, :, nữ, ,, 43, tuổi, ,, quốc,...","[(218, PATIENT_ID), (nữ, GENDER), (43, AGE), (...","[('218', 'PATIENT_ID'), ('nữ', 'GENDER'), ('43...",Named Entity Recognition Task\n\nSentence: Bện...,"[('218', 'PATIENT_ID'), ('nữ', 'GENDER'), ('43...",False,"[O, O, B-PATIENT_ID, O, B-GENDER, O, B-AGE, O,...","[O, O, B-PATIENT_ID, O, B-GENDER, O, B-AGE, O,...",False
4,"[Ông, cùng, 4, người, khác, hôm, 4/3, từ, Mala...","[(4/3, DATE), (Malaysia, LOCATION), (sân bay T...","[('4/3', 'DATE'), ('Malaysia', 'LOCATION'), ('...",Named Entity Recognition Task\n\nSentence: Ông...,"[('4/3', 'DATE'), ('Malaysia', 'LOCATION'), ('...",True,"[O, O, O, O, O, O, B-DATE, O, B-LOCATION, O, B...","[O, O, O, O, O, O, B-DATE, O, B-LOCATION, O, B...",False


In [None]:
# Export the per-sample results table to an Excel workbook; index=False leaves out
# the DataFrame's row index column.
df.to_excel("predict.xlsx", index=False)