## Import các thư viện cần thiết

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForTokenClassification, AutoModelForSequenceClassification, AutoModelForTokenClassification, Trainer, TrainingArguments
import json
import numpy as np
import re
import evaluate

In [2]:
val_list_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/valListFile.json'
test_list_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/testListFile.json'
data_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/data.json'
dialogue_acts_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/dialogue_acts.json'
ontology_path = 'MultiWOZ2.4-main/data/mwz24/MULTIWOZ2.4/ontology.json'

## Đọc danh sách mã hội thoại val và test

In [3]:
with open(val_list_path) as f:
    val_list = [line.strip() for line in f]
    
with open(test_list_path) as f:
    test_list = [line.strip() for line in f]

## Đọc dữ liệu hội thoại

In [4]:
with open(data_path) as f:
    data = json.load(f)
    
with open(dialogue_acts_path) as f:
    dialogue_acts = json.load(f)
    
with open(ontology_path) as f:
    ontology = json.load(f)

In [5]:
dialogue_ids = list(data.keys())

train_list = [dialogue_id for dialogue_id in dialogue_ids if dialogue_id not in val_list and dialogue_id not in test_list]

In [6]:
# Lấy tất cả các dialogue acts
acts = set()

for dialogue in dialogue_acts.values():
    for turn in dialogue.values():
        if turn == 'No Annotation':
            continue
        for act in turn.keys():
            acts.add(act)
            
acts.add('No Annotation')
acts = list(acts)

In [7]:
print('Số lượng acts:', len(acts))
print(acts)

Số lượng acts: 32
['Train-NoOffer', 'Attraction-Request', 'Taxi-Inform', 'Attraction-Recommend', 'Booking-Request', 'general-reqmore', 'Restaurant-Request', 'Booking-Inform', 'Attraction-NoOffer', 'Train-Request', 'Train-OfferBooked', 'Hotel-Recommend', 'Restaurant-Recommend', 'Taxi-Request', 'general-bye', 'Booking-NoBook', 'Hotel-Request', 'Train-Select', 'Restaurant-Inform', 'Hotel-Inform', 'Hotel-NoOffer', 'Restaurant-NoOffer', 'Train-OfferBook', 'Booking-Book', 'general-greet', 'Hotel-Select', 'No Annotation', 'Attraction-Select', 'Train-Inform', 'Attraction-Inform', 'general-welcome', 'Restaurant-Select']


In [8]:
# Tạo một từ điển chuyển đổi từ act sang index và ngược lại
act2idx = {act: idx for idx, act in enumerate(acts)}
idx2act = {idx: act for act, idx in act2idx.items()}

In [9]:
# Danh sách các slot dạng phân loại và không phân loại
# Danh sách này được định nghĩa theo ý hiểu cá nhân
# Với biến phân loại, ta sẽ chuyển thành one-hot vector
# Với biến không phân loại, ta sẽ xác định span của nó trong câu

categorical_slots = {
    'attraction-area',
    'attraction-type',
    'bus-day',
    'hotel-area',
    'hotel-internet',
    'hotel-parking',
    'hotel-pricerange',
    'hotel-stars',
    'hotel-type',
    'restaurant-area',
    'restaurant-pricerange',
    'train-departure',
    'train-destination',
}

non_categorical_slots = {
    'attraction-name',
    'bus-arriveBy',
    'bus-book people',
    'bus-departure',
    'bus-destination',
    'bus-leaveAt',
    'hospital-department',
    'hotel-book day',
    'hotel-book people',
    'hotel-book stay',
    'hotel-name',
    'restaurant-book day',
    'restaurant-book people',
    'restaurant-book time',
    'restaurant-food',
    'restaurant-name',
    'taxi-arriveBy',
    'taxi-departure',
    'taxi-destination',
    'taxi-leaveAt',
    'train-arriveBy',
    'train-book people',
    'train-day',
    'train-leaveAt'
}

In [10]:
# Tạo nhãn BIO cho các slot không phân loại
bio_list = ['O']
bio_list.extend([item for slot in non_categorical_slots for item in [f'B-{slot}', f'I-{slot}']])
print('Số lượng nhãn BIO:', len(bio_list))
print(bio_list)

Số lượng nhãn BIO: 49
['O', 'B-hotel-name', 'I-hotel-name', 'B-bus-destination', 'I-bus-destination', 'B-taxi-leaveAt', 'I-taxi-leaveAt', 'B-restaurant-book day', 'I-restaurant-book day', 'B-restaurant-food', 'I-restaurant-food', 'B-bus-leaveAt', 'I-bus-leaveAt', 'B-hotel-book day', 'I-hotel-book day', 'B-restaurant-book people', 'I-restaurant-book people', 'B-train-day', 'I-train-day', 'B-train-arriveBy', 'I-train-arriveBy', 'B-restaurant-name', 'I-restaurant-name', 'B-train-book people', 'I-train-book people', 'B-taxi-arriveBy', 'I-taxi-arriveBy', 'B-taxi-destination', 'I-taxi-destination', 'B-bus-arriveBy', 'I-bus-arriveBy', 'B-restaurant-book time', 'I-restaurant-book time', 'B-attraction-name', 'I-attraction-name', 'B-bus-departure', 'I-bus-departure', 'B-train-leaveAt', 'I-train-leaveAt', 'B-hotel-book people', 'I-hotel-book people', 'B-taxi-departure', 'I-taxi-departure', 'B-hospital-department', 'I-hospital-department', 'B-bus-book people', 'I-bus-book people', 'B-hotel-book st

In [11]:
# Tạo một từ điển chuyển đổi từ nhãn BIO sang index và ngược lại
bio2idx = {bio: idx for idx, bio in enumerate(bio_list)}
idx2bio = {idx: bio for bio, idx in bio2idx.items()}

In [12]:
# Tạo nhãn value cho các slot phân loại
categorical_value_list = []

for slot in categorical_slots:
    for value in ontology[slot]:
        for v in re.split(r'\||>', value):
            categorical_value_list.append(f'{slot} {v}')
            
categorical_value_list = list(set(categorical_value_list))
print('Số lượng nhãn value:', len(categorical_value_list))
print(categorical_value_list)

Số lượng nhãn value: 143
['restaurant-area dontcare', 'attraction-type nightclub', 'attraction-type boating', 'attraction-type hotel', 'hotel-pricerange moderate', 'hotel-area dontcare', 'attraction-type night club', 'attraction-type church', 'hotel-type none', 'attraction-type dontcare', 'train-destination cambridge', 'hotel-pricerange none', 'restaurant-pricerange dontcare', 'hotel-internet yes', 'train-destination city centre north', 'attraction-type park', 'attraction-type college', 'hotel-area east', 'train-departure city hall', 'attraction-type churchills college', 'train-departure none', 'train-destination stevenage', 'attraction-type sports', 'hotel-parking dontcare', 'attraction-type architecture', 'attraction-area west', 'train-departure brookshite', 'hotel-type hotel', 'train-departure stevenage', 'attraction-type museum', 'train-departure stratford', 'attraction-type cinemas', 'attraction-type theatre', 'hotel-stars 5', 'attraction-area south', 'attraction-type concert hall

In [13]:
cv2idx = {cv: idx for idx, cv in enumerate(categorical_value_list)}
idx2cv = {idx: cv for cv, idx in cv2idx.items()}

## Dataset

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [15]:
class MultiWozDataset(Dataset):
    def __init__(self, dialogue_data, acts_data, tokenizer, act2idx, bio2idx, cv2idx, max_turn=-1, dialogue_ids=None):
        self.data = self._process_data(
            dialogue_data, acts_data, max_turn, dialogue_ids)
        self.tokenizer = tokenizer
        self.act2idx = act2idx
        self.bio2idx = bio2idx
        self.cv2idx = cv2idx
        self.problem = 1 # 2, 3

    def _process_data(self, dialogue_data, acts_data, max_turn, dialogue_ids):
        data = []
        for dialogue_id, dialogue in dialogue_data.items():
            if dialogue_ids is not None and dialogue_id not in dialogue_ids:
                continue
            turns = dialogue['log']
            history = []
            for i in range(0, len(turns) - 1, 2):
                user_turn = turns[i]
                history.append(user_turn['text'])

                system_turn = turns[i + 1]
                history.append(system_turn['text'])

                # Lấy act của system
                system_acts = acts_data[dialogue_id[:-5]
                                        ].get(str(i//2 + 1), 'No Annotation')
                if system_acts == 'No Annotation':
                    system_acts = ['No Annotation']
                else:
                    system_acts = list(system_acts.keys())

                # Lấy slot, value của user
                slot_values = []
                for domain, domain_value in system_turn['metadata'].items():
                    for slot, value in domain_value['book'].items():
                        if slot == 'booked':
                            continue
                        if value and value != 'not mentioned':
                            slot_values.append(
                                [f'{domain}-book {slot}', value])
                    for slot, value in domain_value['semi'].items():
                        if value and value != 'not mentioned':
                            slot_values.append([f'{domain}-{slot}', value])

                data.append({
                    'dialogue_id': dialogue_id,
                    'history': history[max(0, i - 2 * max_turn):i] if max_turn > 0 else history[:i],
                    'utterance': user_turn['text'],
                    'system_acts': system_acts,
                    'slot_values': slot_values
                })
        return data
    
    def set_problem(self, problem):
        self.problem = problem

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]

        history_text = '[SEP]'.join(data['history'])
        full_text = f'{history_text}[SEP]{data["utterance"]}' if history_text else data['utterance']

        # Tokenize text
        encoding = self.tokenizer(full_text, return_tensors='pt')

        if self.problem == 1:
            act_labels = torch.zeros(len(self.act2idx))
            for act in data['system_acts']:
                act_labels[self.act2idx[act]] = 1
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': act_labels
            }

        if self.problem == 2:
            slot_labels = torch.tensor([self.bio2idx['O']] * len(encoding['input_ids'].squeeze(0)))
            for slot, value in data['slot_values']:
                if slot in non_categorical_slots:
                    start, end = self._get_value_start_end(full_text, value)
                    if start == 0 and end == 0:
                        continue
                    slot_labels[start] = self.bio2idx[f'B-{slot}']
                    slot_labels[start + 1:end + 1] = self.bio2idx[f'I-{slot}']
            # Chuyển label của [SEP] và [CLS] thành -100
            sep_idx = (encoding['input_ids'] == self.tokenizer.sep_token_id).nonzero(as_tuple=True)[1]
            cls_idx = (encoding['input_ids'] == self.tokenizer.cls_token_id).nonzero(as_tuple=True)[1]
            slot_labels[sep_idx] = -100
            slot_labels[cls_idx] = -100
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': slot_labels
            }
            
        if self.problem == 3:
            categorical_labels = torch.zeros(len(self.cv2idx))
            for slot, value in data['slot_values']:
                if slot in categorical_slots:
                    categorical_labels[self.cv2idx[f'{slot} {value}']] = 1
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': categorical_labels
            }

    def _get_value_start_end(self, text, value):
        tokenized_text = self.tokenizer.tokenize(text, add_special_tokens=True)
        tokenized_value = self.tokenizer.tokenize(value)

        start, end = 0, 0
        for id_v, token_v in enumerate(tokenized_value):
            for id_u, token_u in enumerate(tokenized_text):
                if token_v == token_u:
                    # nếu value được tìm thấy trong text
                    if tokenized_value == tokenized_text[id_u:id_u+len(tokenized_value)]:
                        start, end = id_u, id_u+len(tokenized_value) - 1
                        break
                    # nếu số lượng token còn lại trong text ít hơn số lượng token của value
                    elif len(tokenized_text) - id_u + 1 <= len(tokenized_value):
                        break
        return torch.tensor([start, end])

In [16]:
train_dataset = MultiWozDataset(data, dialogue_acts, tokenizer, act2idx, bio2idx, cv2idx, dialogue_ids=train_list)
val_dataset = MultiWozDataset(data, dialogue_acts, tokenizer, act2idx, bio2idx, cv2idx, dialogue_ids=val_list)
test_dataset = MultiWozDataset(data, dialogue_acts, tokenizer, act2idx, bio2idx, cv2idx, dialogue_ids=test_list)

In [17]:
data_collator = DataCollatorWithPadding(tokenizer)
data_collator_for_tc = DataCollatorForTokenClassification(tokenizer)

## Mô hình phát hiện system acts

In [18]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def sigmoid(x):
    return 1/(1 + np.exp(-x))


def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))

In [21]:
system_acts_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', id2label=idx2act, label2id=act2idx, problem_type='multi_label_classification')

# training_args = TrainingArguments(
#    output_dir="model",
#    learning_rate=2e-5,
#    per_device_train_batch_size=16,
#    per_device_eval_batch_size=16,
#    num_train_epochs=3,
#    weight_decay=0.01,
#    evaluation_strategy="epoch",
#    save_strategy="epoch",
#    load_best_model_at_end=True,
# )

# trainer = Trainer(
#    model=system_acts_model,
#    args=training_args,
#    train_dataset=train_dataset,
#    eval_dataset=val_dataset,
#    tokenizer=tokenizer,
#    data_collator=data_collator,
#    compute_metrics=compute_metrics,
# )

# trainer.train()

total_params = sum(p.numel() for p in system_acts_model.parameters())
print(f'Total number of parameters: {total_params}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total number of parameters: 66978080
