## Import các thư viện cần thiết

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForTokenClassification, AutoModelForSequenceClassification, AutoModelForTokenClassification, Trainer, TrainingArguments, pipeline
import json
import numpy as np
import re
import evaluate
from collections import defaultdict

In [2]:
val_list_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/valListFile.json'
test_list_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/testListFile.json'
data_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/data.json'
dialogue_acts_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/dialogue_acts.json'
ontology_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/ontology.json'

## Đọc danh sách mã hội thoại val và test

In [3]:
with open(val_list_path) as f:
    val_list = [line.strip() for line in f]
    
with open(test_list_path) as f:
    test_list = [line.strip() for line in f]

## Đọc dữ liệu hội thoại

In [4]:
with open(data_path) as f:
    data = json.load(f)
    
with open(dialogue_acts_path) as f:
    dialogue_acts = json.load(f)
    
with open(ontology_path) as f:
    ontology = json.load(f)

In [5]:
dialogue_ids = list(data.keys())

train_list = [dialogue_id for dialogue_id in dialogue_ids if dialogue_id not in val_list and dialogue_id not in test_list]

In [6]:
# Lấy tất cả các dialogue acts
acts = set()

for dialogue in dialogue_acts.values():
    for turn in dialogue.values():
        if turn == 'No Annotation':
            continue
        for act in turn.keys():
            acts.add(act)
            
acts.add('No Annotation')
acts = list(acts)

In [7]:
print('Số lượng acts:', len(acts))
print(acts)

Số lượng acts: 32
['Taxi-Inform', 'Restaurant-NoOffer', 'Train-NoOffer', 'Attraction-Request', 'general-welcome', 'general-bye', 'Attraction-NoOffer', 'Booking-NoBook', 'Attraction-Select', 'general-greet', 'general-reqmore', 'Restaurant-Inform', 'Train-Inform', 'Restaurant-Request', 'Hotel-Request', 'Train-OfferBook', 'Hotel-Select', 'Attraction-Inform', 'Taxi-Request', 'Train-Select', 'Booking-Inform', 'Booking-Request', 'Train-OfferBooked', 'Hotel-NoOffer', 'Attraction-Recommend', 'No Annotation', 'Restaurant-Recommend', 'Hotel-Recommend', 'Restaurant-Select', 'Booking-Book', 'Train-Request', 'Hotel-Inform']


In [8]:
# Tạo một từ điển chuyển đổi từ act sang index và ngược lại
act2idx = {act: idx for idx, act in enumerate(acts)}
idx2act = {idx: act for act, idx in act2idx.items()}

In [9]:
# Danh sách các slot dạng phân loại và không phân loại
# Danh sách này được định nghĩa theo ý hiểu cá nhân
# Với biến phân loại, ta sẽ chuyển thành one-hot vector
# Với biến không phân loại, ta sẽ xác định span của nó trong câu

categorical_slots = {
    'attraction-area',
    'attraction-type',
    'bus-day',
    'hotel-area',
    'hotel-internet',
    'hotel-parking',
    'hotel-pricerange',
    'hotel-stars',
    'hotel-type',
    'restaurant-area',
    'restaurant-pricerange',
    'train-departure',
    'train-destination',
}

non_categorical_slots = {
    'attraction-name',
    'bus-arriveBy',
    'bus-book people',
    'bus-departure',
    'bus-destination',
    'bus-leaveAt',
    'hospital-department',
    'hotel-book day',
    'hotel-book people',
    'hotel-book stay',
    'hotel-name',
    'restaurant-book day',
    'restaurant-book people',
    'restaurant-book time',
    'restaurant-food',
    'restaurant-name',
    'taxi-arriveBy',
    'taxi-departure',
    'taxi-destination',
    'taxi-leaveAt',
    'train-arriveBy',
    'train-book people',
    'train-day',
    'train-leaveAt'
}

In [10]:
# Tạo nhãn BIO cho các slot không phân loại
bio_list = ['O']
bio_list.extend([item for slot in non_categorical_slots for item in [f'B-{slot}', f'I-{slot}']])
print('Số lượng nhãn BIO:', len(bio_list))
print(bio_list)

Số lượng nhãn BIO: 49
['O', 'B-taxi-arriveBy', 'I-taxi-arriveBy', 'B-bus-book people', 'I-bus-book people', 'B-bus-destination', 'I-bus-destination', 'B-restaurant-book day', 'I-restaurant-book day', 'B-taxi-leaveAt', 'I-taxi-leaveAt', 'B-restaurant-book people', 'I-restaurant-book people', 'B-taxi-destination', 'I-taxi-destination', 'B-bus-leaveAt', 'I-bus-leaveAt', 'B-restaurant-food', 'I-restaurant-food', 'B-bus-departure', 'I-bus-departure', 'B-train-book people', 'I-train-book people', 'B-hotel-name', 'I-hotel-name', 'B-restaurant-book time', 'I-restaurant-book time', 'B-hospital-department', 'I-hospital-department', 'B-train-arriveBy', 'I-train-arriveBy', 'B-train-day', 'I-train-day', 'B-train-leaveAt', 'I-train-leaveAt', 'B-hotel-book day', 'I-hotel-book day', 'B-taxi-departure', 'I-taxi-departure', 'B-restaurant-name', 'I-restaurant-name', 'B-hotel-book stay', 'I-hotel-book stay', 'B-bus-arriveBy', 'I-bus-arriveBy', 'B-attraction-name', 'I-attraction-name', 'B-hotel-book people

In [11]:
# Tạo một từ điển chuyển đổi từ nhãn BIO sang index và ngược lại
bio2idx = {bio: idx for idx, bio in enumerate(bio_list)}
idx2bio = {idx: bio for bio, idx in bio2idx.items()}

In [12]:
# Tạo nhãn value cho các slot phân loại
categorical_value_list = []

for slot in categorical_slots:
    for value in ontology[slot]:
        for v in re.split(r'\||>', value):
            categorical_value_list.append(f'{slot} {v}')
            
categorical_value_list = list(set(categorical_value_list))
print('Số lượng nhãn value:', len(categorical_value_list))
print(categorical_value_list)

Số lượng nhãn value: 143
['train-destination london kings cross', 'hotel-area north', 'attraction-type hiking', 'attraction-type churchills college', 'train-destination city centre north', 'attraction-type cinema', 'hotel-parking none', 'attraction-type museums', 'train-departure leicester', 'hotel-type guest house', 'train-destination dontcare', 'train-departure city hall', 'hotel-internet none', 'attraction-type historical', 'train-destination bournemouth', 'hotel-type guesthouse', 'hotel-type dontcare', 'attraction-type sports', 'attraction-type architecture', 'restaurant-area centre', 'attraction-type concert hall', 'hotel-internet no', 'hotel-internet yes', 'attraction-type museum', 'hotel-pricerange expensive', 'train-destination curry prince', 'train-destination norwich', 'attraction-type boat', 'train-destination broxbourne', 'hotel-area east', 'attraction-type entertainment', 'hotel-parking free', 'attraction-type camboats', 'hotel-internet dontcare', 'train-departure east lon

In [13]:
cv2idx = {cv: idx for idx, cv in enumerate(categorical_value_list)}
idx2cv = {idx: cv for cv, idx in cv2idx.items()}

## Dataset

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [15]:
class MultiWozDataset(Dataset):
    def __init__(self, dialogue_data, acts_data, tokenizer, act2idx, bio2idx, cv2idx, max_turn=-1, dialogue_ids=None):
        self.data = self._process_data(
            dialogue_data, acts_data, max_turn, dialogue_ids)
        self.tokenizer = tokenizer
        self.act2idx = act2idx
        self.bio2idx = bio2idx
        self.cv2idx = cv2idx
        self.problem = 1 # 2, 3

    def _process_data(self, dialogue_data, acts_data, max_turn, dialogue_ids):
        data = []
        for dialogue_id, dialogue in dialogue_data.items():
            if dialogue_ids is not None and dialogue_id not in dialogue_ids:
                continue
            turns = dialogue['log']
            history = []
            for i in range(0, len(turns) - 1, 2):
                user_turn = turns[i]
                history.append(user_turn['text'])

                system_turn = turns[i + 1]
                history.append(system_turn['text'])

                # Lấy act của system
                system_acts = acts_data[dialogue_id[:-5]
                                        ].get(str(i//2 + 1), 'No Annotation')
                if system_acts == 'No Annotation':
                    system_acts = ['No Annotation']
                else:
                    system_acts = list(system_acts.keys())

                # Lấy slot, value của user
                slot_values = []
                for domain, domain_value in system_turn['metadata'].items():
                    for slot, value in domain_value['book'].items():
                        if slot == 'booked':
                            continue
                        if value and value != 'not mentioned':
                            slot_values.append(
                                [f'{domain}-book {slot}', value])
                    for slot, value in domain_value['semi'].items():
                        if value and value != 'not mentioned':
                            slot_values.append([f'{domain}-{slot}', value])

                data.append({
                    'dialogue_id': dialogue_id,
                    'history': history[max(0, i - 2 * max_turn):i] if max_turn > 0 else history[:i],
                    'utterance': user_turn['text'],
                    'system_acts': system_acts,
                    'slot_values': slot_values
                })
        return data
    
    def set_problem(self, problem):
        self.problem = problem

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]

        history_text = '[SEP]'.join(data['history'])
        full_text = f'{history_text}[SEP]{data["utterance"]}' if history_text else data['utterance']

        # Tokenize text
        encoding = self.tokenizer(full_text, return_tensors='pt')

        if self.problem == 1:
            act_labels = torch.zeros(len(self.act2idx))
            for act in data['system_acts']:
                act_labels[self.act2idx[act]] = 1
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': act_labels
            }

        if self.problem == 2:
            slot_labels = torch.tensor([self.bio2idx['O']] * len(encoding['input_ids'].squeeze(0)))
            for slot, value in data['slot_values']:
                if slot in non_categorical_slots:
                    start, end = self._get_value_start_end(full_text, value)
                    if start == 0 and end == 0:
                        continue
                    slot_labels[start] = self.bio2idx[f'B-{slot}']
                    slot_labels[start + 1:end + 1] = self.bio2idx[f'I-{slot}']
            # Chuyển label của [SEP] và [CLS] thành -100
            sep_idx = (encoding['input_ids'] == self.tokenizer.sep_token_id).nonzero(as_tuple=True)[1]
            cls_idx = (encoding['input_ids'] == self.tokenizer.cls_token_id).nonzero(as_tuple=True)[1]
            slot_labels[sep_idx] = -100
            slot_labels[cls_idx] = -100
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': slot_labels
            }
            
        if self.problem == 3:
            categorical_labels = torch.zeros(len(self.cv2idx))
            for slot, value in data['slot_values']:
                if slot in categorical_slots:
                    for v in re.split(r'\||>', value):
                        categorical_labels[self.cv2idx[f'{slot} {v}']] = 1
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': categorical_labels
            }

    def _get_value_start_end(self, text, value):
        tokenized_text = self.tokenizer.tokenize(text, add_special_tokens=True)
        tokenized_value = self.tokenizer.tokenize(value)

        start, end = 0, 0
        for id_v, token_v in enumerate(tokenized_value):
            for id_u, token_u in enumerate(tokenized_text):
                if token_v == token_u:
                    # nếu value được tìm thấy trong text
                    if tokenized_value == tokenized_text[id_u:id_u+len(tokenized_value)]:
                        start, end = id_u, id_u+len(tokenized_value) - 1
                        break
                    # nếu số lượng token còn lại trong text ít hơn số lượng token của value
                    elif len(tokenized_text) - id_u + 1 <= len(tokenized_value):
                        break
        return torch.tensor([start, end])

In [16]:
train_dataset = MultiWozDataset(data, dialogue_acts, tokenizer, act2idx, bio2idx, cv2idx, max_turn=3, dialogue_ids=train_list)
val_dataset = MultiWozDataset(data, dialogue_acts, tokenizer, act2idx, bio2idx, cv2idx, max_turn=3, dialogue_ids=val_list)
test_dataset = MultiWozDataset(data, dialogue_acts, tokenizer, act2idx, bio2idx, cv2idx, max_turn=3, dialogue_ids=test_list)

In [17]:
data_collator = DataCollatorWithPadding(tokenizer)

## Mô hình phát hiện system acts

In [18]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def sigmoid(x):
    return 1/(1 + np.exp(-x))


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))

In [19]:
system_acts_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = len(acts) , id2label=idx2act, label2id=act2idx, problem_type='multi_label_classification')

training_args = TrainingArguments(
   output_dir="model/system_acts_model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=3,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

trainer = Trainer(
   model=system_acts_model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=val_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0828,0.07849,0.968381,0.6109,0.757795,0.511707
2,0.0773,0.073625,0.970106,0.652718,0.747688,0.579154
3,0.0728,0.072773,0.970475,0.660726,0.746397,0.592696


TrainOutput(global_step=10647, training_loss=0.08612494631262355, metrics={'train_runtime': 495.9328, 'train_samples_per_second': 343.462, 'train_steps_per_second': 21.469, 'total_flos': 7296220874484864.0, 'train_loss': 0.08612494631262355, 'epoch': 3.0})

In [21]:
trainer.save_model()

In [22]:
# Thử nghiệm trên tập test
print(trainer.evaluate(test_dataset))

{'eval_loss': 0.07327355444431305, 'eval_accuracy': 0.9703057175800326, 'eval_f1': 0.6580090807010692, 'eval_precision': 0.742344128662701, 'eval_recall': 0.5908811924594476, 'eval_runtime': 7.4223, 'eval_samples_per_second': 993.227, 'eval_steps_per_second': 62.11, 'epoch': 3.0}


In [8]:
system_acts_classifier = pipeline('text-classification', model='model/system_acts_model', device=0, top_k=None)

In [9]:
sample_text = [
    "I need train reservations from norwich to cambridge",
    "I have 133 trains matching your request. Is there a specific day and time you would like to travel?",
    "I'd like to leave on Monday and arrive by 18:00.",
    "There are 12 trains for the day and time you request.  Would you like to book it now?",
    "Before booking, I would also like to know the travel time, price, and departure time please.",
    "There are 12 trains meeting your needs with the first leaving at 05:16 and the last one leaving at 16:16. Do you want to book one of these? ",
    "No hold off on booking for now.  Can you help me find an attraction called cineworld cinema?",
    "Yes it is a cinema located in the south part of town what information would you like on it?",
    "Yes, that was all I needed. Thank you very much!",
    "Thank you for using our system."
]

history = []
max_turn = 3
threshold = 0.5
for turn in range(0, len(sample_text) - 1, 2):
    history_text = '[SEP]'.join(history[max(0, turn - 2 * max_turn):turn])
    full_text = f'{history_text}[SEP]{sample_text[turn]}' if history_text else sample_text[turn]
    print(sample_text[turn])
    print([out['label'] for out in system_acts_classifier(full_text)[0] if out['score'] > threshold])
    print(sample_text[turn + 1])

I need train reservations from norwich to cambridge
['Train-Request']
I have 133 trains matching your request. Is there a specific day and time you would like to travel?
I'd like to leave on Monday and arrive by 18:00.
['Train-Request']
There are 12 trains for the day and time you request.  Would you like to book it now?
Before booking, I would also like to know the travel time, price, and departure time please.
['Train-Inform']
There are 12 trains meeting your needs with the first leaving at 05:16 and the last one leaving at 16:16. Do you want to book one of these? 
No hold off on booking for now.  Can you help me find an attraction called cineworld cinema?
['Attraction-Inform']
Yes it is a cinema located in the south part of town what information would you like on it?
Yes, that was all I needed. Thank you very much!
['general-bye']
Thank you for using our system.


## Mô hình phát hiện slot value

### Với slot phân loại

In [25]:
data_collator_for_cv = DataCollatorWithPadding(tokenizer)

In [26]:
train_dataset.set_problem(3)
val_dataset.set_problem(3)
test_dataset.set_problem(3)

In [27]:
categorical_value_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = len(categorical_value_list), id2label=idx2cv, label2id=cv2idx, problem_type='multi_label_classification')

training_args_cv = TrainingArguments(
   output_dir="model/categorical_value_model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=5,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

trainer_cv = Trainer(
   model=categorical_value_model,
   args=training_args_cv,
   train_dataset=train_dataset,
   eval_dataset=val_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator_for_cv,
   compute_metrics=compute_metrics,
)

trainer_cv.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cv = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0418,0.04911,0.986075,0.507678,0.805768,0.370582
2,0.0307,0.042797,0.989661,0.6831,0.840968,0.575135
3,0.026,0.041945,0.990417,0.713472,0.847948,0.61581
4,0.0236,0.043003,0.990563,0.719366,0.848626,0.624278
5,0.0222,0.043739,0.99064,0.722316,0.849345,0.628341


TrainOutput(global_step=17745, training_loss=0.03500862968038525, metrics={'train_runtime': 853.5385, 'train_samples_per_second': 332.604, 'train_steps_per_second': 20.79, 'total_flos': 1.2178201492754736e+16, 'train_loss': 0.03500862968038525, 'epoch': 5.0})

In [28]:
trainer_cv.save_model()

In [29]:
trainer_cv.evaluate(test_dataset)

{'eval_loss': 0.03998860344290733,
 'eval_accuracy': 0.9905036634553727,
 'eval_f1': 0.7101537392512811,
 'eval_precision': 0.8407486117776102,
 'eval_recall': 0.6146752205292703,
 'eval_runtime': 12.1907,
 'eval_samples_per_second': 604.722,
 'eval_steps_per_second': 37.816,
 'epoch': 5.0}

### Với slot không phân loại

In [30]:
train_dataset.set_problem(2)
val_dataset.set_problem(2)
test_dataset.set_problem(2)

In [31]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [32]:
seqeval = evaluate.load("seqeval")

def compute_metrics_tf(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [idx2bio[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [idx2bio[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [33]:
non_categorical_value_model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(bio_list), id2label=idx2bio, label2id=bio2idx
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
training_args_ncv = TrainingArguments(
    output_dir="model/non_categorical_value_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_ncv = Trainer(
    model=non_categorical_value_model,
    args=training_args_ncv,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_tf,
)

trainer_ncv.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0275,0.022133,0.862717,0.882219,0.872359,0.992526
2,0.0208,0.01968,0.886945,0.901309,0.894069,0.993486
3,0.0169,0.019418,0.894676,0.911509,0.903014,0.993845


TrainOutput(global_step=10647, training_loss=0.03067349930755452, metrics={'train_runtime': 607.2189, 'train_samples_per_second': 280.515, 'train_steps_per_second': 17.534, 'total_flos': 7198554702861252.0, 'train_loss': 0.03067349930755452, 'epoch': 3.0})

In [37]:
trainer_ncv.save_model()

In [39]:
trainer_ncv.evaluate(test_dataset)

{'eval_loss': 0.019307278096675873,
 'eval_precision': 0.8939373389393734,
 'eval_recall': 0.9118074289271633,
 'eval_f1': 0.9027839605519979,
 'eval_accuracy': 0.9940508945322825,
 'eval_runtime': 14.9565,
 'eval_samples_per_second': 492.897,
 'eval_steps_per_second': 30.823,
 'epoch': 3.0}

In [10]:
categorical_value_clf = pipeline('text-classification', model='model/categorical_value_model', device=0, top_k=None)
non_categorical_value_tclf = pipeline('ner', model='model/non_categorical_value_model', device=0)

In [11]:
def slot_vlaue_predict(full_text):
    state = defaultdict(list)
    categorical_value_result = categorical_value_clf(full_text)
    non_categorical_value_result = non_categorical_value_tclf(full_text)
    
    for out in categorical_value_result[0]:
        if out['score'] > 0.5:
            slot, value = out['label'].split(' ', 1)
            state[slot].append(value)
            
    current_entity = None
    current_value = ""
    
    for item in non_categorical_value_result:
        entity_type = item['entity'][2:]  # Remove the B- or I- prefix
        if item['entity'].startswith('B-'):
            if current_entity:  # Save the previous entity-value pair if exists
                if current_value.find(':') != -1:
                    current_value = current_value.replace(' ', '')
                state[current_entity].append(current_value)
            current_entity = entity_type
            current_value = item['word']
        elif item['entity'].startswith('I-') and current_entity == entity_type:
            if item['word'].startswith('##'):
                current_value += item['word'][2:]
            else:
                current_value += ' ' + item['word']  # Concatenate words for the same entity

    # Append the last entity-value pair
    if current_entity:
        if current_value.find(':') != -1:
            current_value = current_value.replace(' ', '')
        state[current_entity].append(current_value)
        
    return state

In [13]:
state = {}
for turn in range(0, len(sample_text) - 1, 2):
    history_text = '[SEP]'.join(history[max(0, turn - 2 * max_turn):turn])
    full_text = f'{history_text}[SEP]{sample_text[turn]}' if history_text else sample_text[turn]
    print(sample_text[turn])
    state = dict(state | slot_vlaue_predict(full_text))
    print(state)
    print(sample_text[turn + 1])

I need train reservations from norwich to cambridge
{'train-destination': ['cambridge'], 'train-departure': ['norwich']}
I have 133 trains matching your request. Is there a specific day and time you would like to travel?
I'd like to leave on Monday and arrive by 18:00.
{'train-destination': ['cambridge'], 'train-departure': ['norwich'], 'train-day': ['monday'], 'train-arriveBy': ['18:00']}
There are 12 trains for the day and time you request.  Would you like to book it now?
Before booking, I would also like to know the travel time, price, and departure time please.
{'train-destination': ['cambridge'], 'train-departure': ['norwich'], 'train-day': ['monday'], 'train-arriveBy': ['18:00']}
There are 12 trains meeting your needs with the first leaving at 05:16 and the last one leaving at 16:16. Do you want to book one of these? 
No hold off on booking for now.  Can you help me find an attraction called cineworld cinema?
{'train-destination': ['cambridge'], 'train-departure': ['norwich'], 't