In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AutoTokenizer
import json
import numpy as np
from collections import defaultdict

In [2]:
train_path = "MultiWOZ2.4-main/data/mwz2.4/train_dials.json"
dev_path = "MultiWOZ2.4-main/data/mwz2.4/dev_dials.json"
test_path = "MultiWOZ2.4-main/data/mwz2.4/test_dials.json"

with open(train_path, 'r') as f:
    train_data = json.load(f)
    
with open(dev_path, 'r') as f:
    dev_data = json.load(f)

with open(test_path, 'r') as f:
    test_data = json.load(f)

In [3]:
len(train_data), len(dev_data), len(test_data)

(8420, 1000, 999)

In [7]:
labels = set()

for dialouge in train_data:
    for turn in dialouge['dialogue']:
        for key, _ in turn['turn_label']:
            labels.add(key)
            
for dialouge in dev_data:
    for turn in dialouge['dialogue']:
        for key, _ in turn['turn_label']:
            labels.add(key)
            
for dialouge in test_data:
    for turn in dialouge['dialogue']:
        for key, _ in turn['turn_label']:
            labels.add(key)

In [8]:
len(labels)

31

In [6]:
labels

{'attraction-area',
 'attraction-name',
 'attraction-type',
 'hospital-department',
 'hotel-area',
 'hotel-book day',
 'hotel-book people',
 'hotel-book stay',
 'hotel-internet',
 'hotel-name',
 'hotel-parking',
 'hotel-pricerange',
 'hotel-stars',
 'hotel-type',
 'restaurant-area',
 'restaurant-book day',
 'restaurant-book people',
 'restaurant-book time',
 'restaurant-food',
 'restaurant-name',
 'restaurant-pricerange',
 'taxi-arriveby',
 'taxi-departure',
 'taxi-destination',
 'taxi-leaveat',
 'train-arriveby',
 'train-book people',
 'train-day',
 'train-departure',
 'train-destination',
 'train-leaveat'}

In [4]:
label_list = ['O']
label_list.extend([item for slot in labels for item in [f'B-{slot}', f'I-{slot}']])
label_list

['O',
 'B-hotel-pricerange',
 'I-hotel-pricerange',
 'B-train-destination',
 'I-train-destination',
 'B-restaurant-book time',
 'I-restaurant-book time',
 'B-hotel-parking',
 'I-hotel-parking',
 'B-restaurant-name',
 'I-restaurant-name',
 'B-hotel-area',
 'I-hotel-area',
 'B-restaurant-area',
 'I-restaurant-area',
 'B-train-book people',
 'I-train-book people',
 'B-hotel-book stay',
 'I-hotel-book stay',
 'B-restaurant-pricerange',
 'I-restaurant-pricerange',
 'B-train-departure',
 'I-train-departure',
 'B-attraction-type',
 'I-attraction-type',
 'B-hotel-book people',
 'I-hotel-book people',
 'B-taxi-departure',
 'I-taxi-departure',
 'B-train-leaveat',
 'I-train-leaveat',
 'B-attraction-area',
 'I-attraction-area',
 'B-taxi-destination',
 'I-taxi-destination',
 'B-restaurant-book people',
 'I-restaurant-book people',
 'B-restaurant-book day',
 'I-restaurant-book day',
 'B-restaurant-food',
 'I-restaurant-food',
 'B-hospital-department',
 'I-hospital-department',
 'B-attraction-name',


In [5]:
label2idx = {label: idx for idx, label in enumerate(label_list)}
idx2label = {idx: label for label, idx in label2idx.items()}

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
def get_value_from_utterance(utterance, value):
    tokenized_utterance = tokenizer.tokenize(utterance, add_special_tokens=True)
    # print(tokenized_utterance)
    tokenized_value = tokenizer.tokenize(value)
    # print(tokenized_value)

    start, end = 0, 0
    for id_v, token_v in enumerate(tokenized_value):
        for id_u, token_u in enumerate(tokenized_utterance):
            if token_v == token_u:
                # if the value is found in the utterance
                if tokenized_value == tokenized_utterance[id_u:id_u+len(tokenized_value)]:
                    start, end = id_u, id_u+len(tokenized_value) - 1
                    break
                # if the remaining tokens in utterance is less than the value
                elif len(tokenized_utterance) - id_u + 1 <= len(tokenized_value):
                    break
    return torch.tensor([start, end])

In [8]:
class MultiWOZDataset(Dataset):
    def __init__(self, data, tokenizer, label2idx, idx2label):
        self.data = data
        self.tokenizer = tokenizer
        self.label2idx = label2idx
        self.idx2label = idx2label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        dialouge = self.data[idx]
        turn = dialouge['dialogue'][0]  # only the first turn is considered
        utterance = turn['transcript']
        results = self.tokenizer(
            utterance,  return_tensors="pt", truncation=True)
        labels = torch.tensor([self.label2idx['O']] * len(results['input_ids'].flatten()))
        for slot, value in turn['turn_label']:
            start, end = get_value_from_utterance(utterance, value)
            labels[start] = self.label2idx[f'B-{slot}']
            labels[start+1:end+1] = self.label2idx[f'I-{slot}']
        labels[0] = -100  # CLS token
        labels[-1] = -100 # SEP token

        return {
            'input_ids': results['input_ids'].flatten(),
            'attention_mask': results['attention_mask'].flatten(),
            'labels': labels
        }

In [9]:
train_dataset = MultiWOZDataset(train_data, tokenizer, label2idx, idx2label)
dev_dataset = MultiWOZDataset(dev_data, tokenizer, label2idx, idx2label)
test_dataset = MultiWOZDataset(test_data, tokenizer, label2idx, idx2label)

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
import evaluate

seqeval = evaluate.load("seqeval")

In [12]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [idx2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [idx2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [1]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(label_list), id2label=idx2label, label2id=label2idx
)

NameError: name 'label_list' is not defined

In [15]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/1054 [00:00<?, ?it/s]

{'loss': 0.3262, 'grad_norm': 1.0806093215942383, 'learning_rate': 1.0512333965844403e-05, 'epoch': 0.95}


  0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.10356897115707397, 'eval_precision': 0.8090778097982709, 'eval_recall': 0.861857252494244, 'eval_f1': 0.8346339650687478, 'eval_accuracy': 0.9734402405412177, 'eval_runtime': 2.2691, 'eval_samples_per_second': 440.7, 'eval_steps_per_second': 27.764, 'epoch': 1.0}
{'loss': 0.0789, 'grad_norm': 0.30119651556015015, 'learning_rate': 1.0246679316888046e-06, 'epoch': 1.9}


  0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.06719104200601578, 'eval_precision': 0.8944610778443114, 'eval_recall': 0.9171143514965464, 'eval_f1': 0.9056460780598712, 'eval_accuracy': 0.9844650463542972, 'eval_runtime': 3.138, 'eval_samples_per_second': 318.67, 'eval_steps_per_second': 20.076, 'epoch': 2.0}
{'train_runtime': 117.4161, 'train_samples_per_second': 143.422, 'train_steps_per_second': 8.977, 'train_loss': 0.19537621001364835, 'epoch': 2.0}


TrainOutput(global_step=1054, training_loss=0.19537621001364835, metrics={'train_runtime': 117.4161, 'train_samples_per_second': 143.422, 'train_steps_per_second': 8.977, 'total_flos': 126543229774920.0, 'train_loss': 0.19537621001364835, 'epoch': 2.0})

In [16]:
results = trainer.evaluate(test_dataset)
print(results)

  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.06670460850000381, 'eval_precision': 0.8960739030023095, 'eval_recall': 0.9107981220657277, 'eval_f1': 0.9033760186263097, 'eval_accuracy': 0.9844116342437108, 'eval_runtime': 2.3019, 'eval_samples_per_second': 433.981, 'eval_steps_per_second': 27.368, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
from transformers import pipeline

classifier = pipeline("ner", model="./results/checkpoint-1054")
classifier("am looking for a place to to stay that has cheap price range it should be in a type of hotel")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'B-hotel-pricerange',
  'score': np.float32(0.95633507),
  'index': 11,
  'word': 'cheap',
  'start': 43,
  'end': 48},
 {'entity': 'B-hotel-type',
  'score': np.float32(0.4760056),
  'index': 21,
  'word': 'hotel',
  'start': 87,
  'end': 92}]

In [43]:
def predict(utterance):
    slots = []
    results = classifier(utterance)
    
    current_entity = None
    current_value = ""
    
    for item in results:
        entity_type = item['entity'][2:]  # Remove the B- or I- prefix
        if item['entity'].startswith('B-'):
            if current_entity:  # Save the previous entity-value pair if exists
                slots.append(f"{current_entity}={current_value}")
            current_entity = entity_type
            current_value = item['word']
        elif item['entity'].startswith('I-') and current_entity == entity_type:
            current_value += item['word']  # Concatenate words for the same entity

    # Append the last entity-value pair
    if current_entity:
        slots.append(f"{current_entity}={current_value}")
    
            
    return slots

In [44]:
predict("i am looking for information in cambridge. i am staying in the west and i want to find a place near here that serves real british food .")

['restaurant-area=west', 'restaurant-food=british']

In [45]:
predict("i need to catch a train out of cambridge after 15:45 .")

['train-departure=cambridge', 'train-leaveat=15:45']

In [48]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 66411327
