In [None]:
! pip install datasets pandas matplotlib scikit-learn transformers rouge evaluate tqdm

In [None]:
# If you use google colab
!wget https://gitlab.dsi.universite-paris-saclay.fr/thomas.gerald/textminingandchatbot/-/raw/469b1e352a5a65a743aeb13b9705157e341fc21e/tp/TP-2/TP-2-data.zip
!unzip TP-2-data.zip

In [1]:
from tqdm.notebook import trange, tqdm # The progress bar

import torch # DeepLearning Framework
from torch import optim
from torch import nn
from torch.utils.data import Dataset, DataLoader

import os
import numpy as np
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM # Model repository
from datasets import load_dataset # Dataset Repository
%load_ext autoreload
%autoreload 2

In [2]:
# Load the `multi_woz_v22` dataset and expose its three splits under short names.
woz_dataset = load_dataset("multi_woz_v22", trust_remote_code=True)
training_set, validation_set, test_set = (
    woz_dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Build a task oriented chatbot

In this mini-project we provide several tools to extract information from dialogues. The objective is to develop a chatbot that makes use of the extracted information.

**A first step** is to generate an answer given the dialogue history and a dialogue act, such as:
```
[history]
'I need train reservations from norwich to cambridge',
'I have 133 trains matching your request. Is there a specific day and time you would like to travel?',
"I'd like to leave on Monday and arrive by 18:00."
[history]
```
`[dialogue_act]train_choice 12[end_dialogue_act]`

Generate an answer such as :

```There are 12 trains for the day and time you request. Would you like to book it now?```

These results should be compared with the approach proposed the previous week.

**As a second step**, you should take inspiration from the article provided in the directory to propose an approach that does not use the ground truth to generate the answer but, unlike last week, takes into account the data stored in the database. Note that very good performance is not expected for this step. Depending on your progress, you may restrict yourself to generating delexicalised sentences, i.e. sentences without database entry values. For instance, you could predict a sentence such as:

```There are [train_choice] trains meeting your needs with the first leaving at [train_leaveat] and the last one leaving at [train_leaveat]```

Note that information from the database is still required to formulate consistent answers.

## 1. Available tools and preprocessing

In [3]:
from woz_tools import MultiWoZDatabase, MultiWoZTools

### Access to dialogue state
To access the dialogue state, two functions are available:
* `encode_frame_state(state)` : encoding a state into a characters string 
* `decode_frame_state(str_state)` : decoding the char string to a list (for instance to query the database)

In [4]:
# Round-trip example: encode a dialogue state into a string, then decode it back.
# The printed labels name the helpers that are actually called (`MultiWoZTools`,
# with `decode_frame_state` for the last step).
x = training_set[50]['turns']['frames'][4]['state']
print(f'Considering the following state : \n\n x = {x}\n\n')
y = MultiWoZTools.encode_frame_state(x)
print(f'MultiWoZTools.encode_frame_state(state) : \n\n"{y}"\n\n')
z = MultiWoZTools.decode_frame_state(y)
print(f'MultiWoZTools.decode_frame_state(encoded_state) : \n\n {z}')

Considering the following state : 

 x = [{'active_intent': 'find_hotel', 'requested_slots': ['hotel-name'], 'slots_values': {'slots_values_name': ['hotel-area', 'hotel-pricerange'], 'slots_values_list': [['centre'], ['cheap']]}}]


MultiWoZState.encode_frame_state(state) : 

"[state][intent:find_hotel][hotel-area:['centre']][hotel-pricerange:['cheap']][end_intent][end_state]"


MultiWoZState.encode_frame_state(encoded_state) : 

 [{'active_intent': 'find_hotel', 'slots_values': {'slots_values_name': ['hotel-area', 'hotel-pricerange'], 'slots_values_list': [['centre'], ['cheap']]}}]


### Access to dialogue act
For the dialogue act the following function is designed :
* `encode_dialogue_act` : encoding a dialogue act into a characters string 

In [5]:
# Take the dialogue act of turn 5 of test dialogue 85 and show its string encoding.
# `dialogue_act` is reused by the delexicalisation example in the next code cell.
dialogue_act = test_set[85]['turns']['dialogue_acts'][5]
MultiWoZTools.encode_dialogue_act(dialogue_act)

'[dialogue_act]hotel_choice 3,hotel_name bridge guest house,hotel_name hamilton lodge,hotel_name hobsons house[end_dialogue_act]'

*Using the dialogue act and the correct utterance we can* **delexicalize the utterance** with the function
* `MultiWoZTools.delexicalize_answer`

In [6]:
# Delexicalise the utterance of the same turn (test dialogue 85, turn 5).
# Relies on `dialogue_act` defined in the previous code cell.
utterance = test_set[85]['turns']['utterance'][5]
MultiWoZTools.delexicalize_answer(utterance, dialogue_act)

'Yes, there are [hotel_choice]: [hotel_name], [hotel_name], and [hotel_name].'

## Transform the state into a query
The database of the dataset is composed of several JSON files in the `data/db` folder:
* attraction_db.json
* hospital_db.json
* police_db.json
* taxi_db.json
* bus_db.json
* hotel_db.json
* restaurant_db.json
* train_db.json

You can load the database with the `MultiWoZDatabase` object and query it with the `search` method. A query has the following format:

```
[
    ("domain_1",[
        ("field_1","=",["value_1", "value_2"]),
        ("field_2","=",["value_1"])
    ]),
    ("domain_2",[
        ...
    ])
]
```

You can also create a query from a state using the `state_to_db` function


In [7]:
# Database wrapper used by every `search` call below.
# NOTE(review): presumably loads the JSON files of `data/db` — confirm in woz_tools.py.
multiwoz_db = MultiWoZDatabase()

#### Example: building a query from a state

In [8]:
# Convert the dialogue state of training dialogue 50, turn 4 into a database query.
# `query` is reused by the search example in the next code cell.
x = training_set[50]['turns']['frames'][4]['state']
query = MultiWoZTools.state_to_db(x)
print(query)

[('hotel', [('area', '=', ['centre']), ('pricerange', '=', ['cheap'])])]


#### Example: querying the database

In [9]:
# Run the query built in the previous cell; the result is a list of
# (domain, (number of matches, matching rows)) tuples, as shown in the output.
multiwoz_db.search(query)

[('hotel',
  (2,
   [{'address': '56 saint barnabas road',
     'area': 'centre',
     'internet': 'yes',
     'parking': 'yes',
     'id': '2',
     'location': [52.1986444444444, 0.138133333333333],
     'name': 'alexander bed and breakfast',
     'phone': '01223525725',
     'postcode': 'cb12de',
     'price': {'double': '50', 'single': '40'},
     'pricerange': 'cheap',
     'stars': '4',
     'takesbookings': 'yes',
     'type': 'guesthouse'},
    {'address': '41 warkworth street',
     'area': 'centre',
     'internet': 'yes',
     'parking': 'yes',
     'id': '15',
     'location': [52.20439812598512, 0.13059139251708984],
     'name': 'el shaddai',
     'phone': '01223327978',
     'postcode': 'cb11eg',
     'price': {'double': '60', 'family': '62', 'single': '40'},
     'pricerange': 'cheap',
     'stars': '0',
     'takesbookings': 'yes',
     'type': 'guesthouse'}]))]

## 2. Generation with ground truth dialogue act

#### Fill the dataset object to give the previous utterance and the dialogue act

In [10]:
from torch.utils.data import Dataset

class MultiWoZDataset(Dataset):
    """Expose every system (bot) turn of a MultiWOZ split as one sample.

    Parameters
    ----------
    dataset : indexable collection of dialogues; each dialogue is read through
        ``dial['turns']`` with the per-turn lists ``speaker``, ``utterance``,
        ``dialogue_acts`` and ``frames``.
    database : object with a ``search(query)`` method (e.g. ``MultiWoZDatabase``).
    history_size : maximum number of turns preceding the answer that are kept in
        the ``history`` string.

    Each item is a dict with the keys ``history``, ``label``, ``act``,
    ``label_delex``, ``db`` and ``state``, all string-valued.
    """

    def __init__(self, dataset, database, history_size=5):
        self.dataset = dataset
        self.database = database
        self.history_size = history_size
        # (dialogue index, turn index) of every turn uttered by speaker 1, i.e. the
        # turns whose utterance is used as the answer to generate.
        self.index = [
            (dial_id, turn_id)
            for dial_id, dial in enumerate(dataset)
            for turn_id, speaker in enumerate(dial['turns']['speaker'])
            if speaker == 1
        ]

    def __len__(self):
        """Number of system turns in the split."""
        return len(self.index)

    def __getitem__(self, index):
        """Build the sample attached to the ``index``-th system turn."""
        dial_id, turn_id = self.index[index]
        turns = self.dataset[dial_id]['turns']

        # History: the `history_size` turns before the answer, each prefixed by its
        # speaker tag. The tags are deliberately asymmetric ('[USER]:' with a colon,
        # '[BOT]' without) to reproduce the sample output shown in this notebook.
        first_turn = max(0, turn_id - self.history_size)
        history = ''.join(
            ('[BOT]' if turns['speaker'][k] == 1 else '[USER]:') + turns['utterance'][k]
            for k in range(first_turn, turn_id)
        )

        utterance = turns['utterance'][turn_id]
        dialogue_act = turns['dialogue_acts'][turn_id]

        # NOTE(review): assumes the dialogue state to condition on is the one attached
        # to the turn just before the system turn (the notebook's examples read states
        # from user turns) — confirm against the dataset.
        state = turns['frames'][turn_id - 1]['state'] if turn_id > 0 else []

        # `search` returns (domain, (count, rows)) tuples; only the counts are encoded.
        # NOTE(review): the multi-domain layout (one [result:...] tag per domain) is
        # extrapolated from the single-domain sample output — confirm.
        results = self.database.search(MultiWoZTools.state_to_db(state))
        db = '[db]' + ''.join(
            f'[result:{domain}:{count}]' for domain, (count, _rows) in results
        ) + '[end_db]'

        return {
            'history': history,
            'label': utterance,
            'act': MultiWoZTools.encode_dialogue_act(dialogue_act),
            'label_delex': MultiWoZTools.delexicalize_answer(utterance, dialogue_act),
            'db': db,
            'state': MultiWoZTools.encode_frame_state(state),
        }

In [11]:
# Wrap each raw split in a MultiWoZDataset sharing the same database handle.
mw_train, mw_valid, mw_test = (
    MultiWoZDataset(split, multiwoz_db)
    for split in (training_set, validation_set, test_set)
)

In [12]:
# Inspect one sample (second system turn of the test split).
mw_test[1]

{'history': "[USER]:I need train reservations from norwich to cambridge[BOT]I have 133 trains matching your request. Is there a specific day and time you would like to travel?[USER]:I'd like to leave on Monday and arrive by 18:00.",
 'label': 'There are 12 trains for the day and time you request. Would you like to book it now?',
 'act': '[dialogue_act]train_choice 12[end_dialogue_act]',
 'label_delex': 'There are [train_choice] trains for the day and time you request. Would you like to book it now?',
 'db': '[db][result:train:4][end_db]',
 'state': "[state][intent:find_train][train-arriveby:['18:00']][train-day:['monday']][train-departure:['norwich']][train-destination:['cambridge']][end_intent][end_state]"}

An example of output (for this part only `history`, `label` and `act` are necessary). Note that `history` contains the previous `history_size` turns!
```
{'history': "[USER]:I need train reservations from norwich to cambridge[BOT]I have 133 trains matching your request. Is there a specific day and time you would like to travel?[USER]:I'd like to leave on Monday and arrive by 18:00.",
 'label': 'There are 12 trains for the day and time you request. Would you like to book it now?',
 'act': '[dialogue_act]train_choice 12[end_dialogue_act]',
 'label_delex': 'There are [train_choice] trains for the day and time you request. Would you like to book it now?',
 'db': '[db][result:train:4][end_db]',
 'state': "[state][intent:find_train][train-arriveby:['18:00']][train-day:['monday']][train-departure:['norwich']][train-destination:['cambridge']][end_intent][end_state]"}
```

### Defining the model and the dataset

We will consider as model the distilgpt2 model, a distilled model of gpt2 

In [13]:
# Load distilgpt2 with its tokenizer; the end-of-text token doubles as padding token.
MODEL_NAME = "distilbert/distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Align the embedding matrix with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

Now define a class `WoZActDataset` that takes the previous dataset in its constructor and whose `__getitem__` returns data in the following format:

```
"[USER]:i need a place to dine in the center thats expensive[BOT]I have several options for you; do you prefer African, Asian, or British food?[USER]:Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?[dialogue_act]restaurant_food Afrian,restaurant_name Bedouin,restaurant_area centre[end_dialogue_act][ANSWER]There is an Afrian place named Bedouin in the centre. How does that sound?"
```

In [14]:
class WoZActDataset(Dataset):
    """Turn MultiWoZDataset samples into flat strings for a causal language model.

    Parameters
    ----------
    multiwoz_dataset : sized, indexable collection whose items are dicts holding at
        least the string fields ``history``, ``act`` and ``label``.
    training : when True, items are ``history + act + '[ANSWER]' + label`` (the full
        sequence to learn); when False, the label is left out so that the item is
        the prompt the model has to complete.
    """

    # Marker separating the prompt from the answer; the decoding cell of this
    # notebook splits the generated text on this exact string.
    ANSWER_TAG = '[ANSWER]'

    def __init__(self, multiwoz_dataset, training=True):
        self.multiwoz_dataset = multiwoz_dataset
        self.training = training

    def __getitem__(self, index):
        """Return the training sequence (or the prompt only when not training)."""
        sample = self.multiwoz_dataset[index]
        prompt = sample['history'] + sample['act'] + self.ANSWER_TAG
        if self.training:
            return prompt + sample['label']
        return prompt

    def __len__(self):
        """Number of samples, identical to the wrapped dataset."""
        return len(self.multiwoz_dataset)

class TokenizerCollator:
    """Collate function turning a batch of strings into padded tensors.

    Meant to be passed as ``collate_fn`` to a ``DataLoader``. It is a plain callable
    and not a dataset, so it does not inherit from ``Dataset``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, data):
        """Tokenize the batch ``data`` and pad every sequence to the longest one."""
        return self.tokenizer(data, return_tensors='pt', padding=True)
        

In [15]:
# Training and validation items contain the answer; test items are built with
# training=False.
train, valid = (WoZActDataset(split) for split in (mw_train, mw_valid))
test = WoZActDataset(mw_test, training=False)

In [16]:
# Inspect one training sequence (history + dialogue act + [ANSWER] + label).
train[1]

'[USER]:i need a place to dine in the center thats expensive[BOT]I have several options for you; do you prefer African, Asian, or British food?[USER]:Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?[dialogue_act]restaurant_food Afrian,restaurant_name Bedouin,restaurant_area centre[end_dialogue_act][ANSWER]There is an Afrian place named Bedouin in the centre. How does that sound?'

## Training the model
Complete the `validation` method below; during training it should print the validation loss. Do not hesitate to reuse parts of the existing code!

In [17]:
from tqdm.notebook import trange, tqdm
from torch import optim
from torch import nn


class Trainer():
    """Minimal training loop for a causal language model.

    The model is called as ``model(**batch)`` and must return an object exposing
    ``logits`` with one score vector per input position; batches are dicts of
    tensors containing at least ``input_ids``.

    ``padding_idx`` is accepted for backward compatibility but is not used: the
    loss is built with ``ignore_index=50257``.
    """

    def __init__(self, model, padding_idx=100):
        self.model = model
        self.optimizer = None
        self.criterion = None

    def _make_criterion(self):
        """Build the next-token cross-entropy loss."""
        # With the 50257-row embedding used in this notebook, id 50257 is never a
        # target, so no position is ignored — padding positions included. Because
        # the padding token is '<|endoftext|>', this is what lets the model learn
        # to emit end-of-text after the answer.
        return nn.CrossEntropyLoss(ignore_index=50257)

    def at_training_start(self, learning_rate = 1e-3):
        """Create the optimizer and the loss; called once at the start of `fit`."""
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.criterion = self._make_criterion()

    def _loss(self, data):
        """Next-token prediction loss of the model on one batch."""
        y_pred = self.model(**data)
        # Position t is scored against token t+1: drop the first target and the
        # last prediction, then flatten batch and time dimensions.
        y_truth = data["input_ids"][:, 1:].flatten()
        return self.criterion(y_pred.logits[:, :-1].reshape(y_truth.shape[0], -1), y_truth)

    def training_step(self, data):
        """Accumulate the gradients for one batch and return the loss as a float."""
        loss_reconstruction = self._loss(data)
        loss_reconstruction.backward()
        return loss_reconstruction.item()

    def validation(self, validation_dl, use_gpu=False, iter_count=None):
        """Print and return the mean batch loss over ``validation_dl``.

        No gradient is computed. The train/eval mode of the model is left
        untouched (``fit`` switches it around this call). Returns ``None`` when
        ``validation_dl`` yields no batch.
        """
        if self.criterion is None:
            # Allows validating a model before (or without) calling `fit`.
            self.criterion = self._make_criterion()
        losses = []
        with torch.no_grad():
            for data in validation_dl:
                if use_gpu:
                    data = {k: v.cuda() for k, v in data.items()}
                losses.append(self._loss(data).item())
        if not losses:
            print("No validation batch at iteration %s" % iter_count)
            return None
        mean_loss = float(np.mean(losses))
        print("Loss validation at iteration %s is %s" % (iter_count, mean_loss))
        return mean_loss

    def fit(self,
            training_dl,
            validation_dl,
            learning_rate = 1e-3,
            validation_frequency = 8,
            max_iter = 10000,
            use_gpu=False,

        ):
        """Train for ``max_iter`` optimisation steps, cycling over ``training_dl``.

        Every ``validation_frequency`` steps (starting at step 0) the mean training
        loss since the last report is printed and `validation` is run.

        Raises
        ------
        ValueError
            If ``training_dl`` yields no batch (the loop could never progress).
        """
        if(use_gpu):
          self.model = self.model.cuda()
        self.at_training_start(learning_rate)
        # Hugging Face `from_pretrained` returns models in evaluation mode; make
        # sure the very first steps already run in training mode.
        self.model.train()

        iter_count = 0
        loss_buffer = []
        pbar = trange(max_iter)
        try:
            while(iter_count < max_iter):
                seen_batch = False
                for data in training_dl:
                    seen_batch = True
                    if use_gpu:
                        data = {k:v.cuda() for k, v in data.items()}
                    self.optimizer.zero_grad()
                    loss_buffer += [self.training_step(data)]
                    self.optimizer.step()

                    if(iter_count  % validation_frequency == 0):
                        print("Loss at iteration %s is %s"%(iter_count, np.mean(loss_buffer)))
                        self.model.eval()
                        self.validation(validation_dl, use_gpu=use_gpu, iter_count=iter_count)
                        self.model.train()
                        loss_buffer = []
                    iter_count += 1
                    pbar.update(1)
                    if(iter_count >= max_iter):
                      break
                if not seen_batch:
                    # Without this guard an empty loader would spin forever.
                    raise ValueError("training_dl yielded no batch, cannot train")
        finally:
            pbar.close()

In [18]:
collator = TokenizerCollator(tokenizer)
training_dl = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator, num_workers=2)
# Validation is not shuffled: the reported value is a mean of per-batch losses, so a
# fixed batch composition keeps it comparable from one evaluation to the next.
validation_dl = DataLoader(valid, batch_size=32, shuffle=False, collate_fn=collator, num_workers=2)
my_trainer = Trainer(model)
# Use the GPU when there is one instead of failing in `.cuda()` on CPU-only machines.
my_trainer.fit(training_dl, validation_dl, validation_frequency=1000,
               use_gpu=torch.cuda.is_available(), max_iter=10000)

  0%|          | 0/10000 [00:00<?, ?it/s]

Loss at iteration 0 is 6.794318675994873
Loss validation at iteration 0 is 7.564188224412662
Loss at iteration 1000 is 1.1781438655257226
Loss validation at iteration 1000 is 0.9026029530541722
Loss at iteration 2000 is 0.9880004197955131
Loss validation at iteration 2000 is 0.8588697838060784
Loss at iteration 3000 is 0.9366088448762894
Loss validation at iteration 3000 is 0.8363133921767726
Loss at iteration 4000 is 0.8963712045550346
Loss validation at iteration 4000 is 0.8115563449405488
Loss at iteration 5000 is 0.8778614881634712
Loss validation at iteration 5000 is 0.8171859422287384
Loss at iteration 6000 is 0.8606208698749542
Loss validation at iteration 6000 is 0.7965945031219747
Loss at iteration 7000 is 0.8513367728292942
Loss validation at iteration 7000 is 0.7924642921526195
Loss at iteration 8000 is 0.8053309039771557
Loss validation at iteration 8000 is 0.7956381447903522
Loss at iteration 9000 is 0.8048457061052322
Loss validation at iteration 9000 is 0.794336295231080

## Prediction on the test set
To predict you should use the `generate` method from the model:
```
model.generate(data['input_ids'], do_sample=False,
                                max_length=512, pad_token_id=model.config.eos_token_id)
```
 Note that if you do not have sufficient computational resources, you can download the model from [huggingface](https://huggingface.co/ThomasGerald/multiwoz_with_ground_truth_act)


In [1]:
# Load model directly from hugging face
# Optional alternative to training: load the already fine-tuned model from the
# Hugging Face hub (see the link in the text above). Left as a placeholder.
pass

In [22]:
collator = TokenizerCollator(tokenizer)
test_prediction = []
test_dl = DataLoader(test, batch_size=8, shuffle=False, collate_fn=collator, num_workers=2)
model.eval()
# Use the GPU when there is one instead of assuming it.
use_gpu = torch.cuda.is_available()
if use_gpu:
    model.cuda()

# predict the test set
# A decoder-only model continues each row from its last position, so in a batch the
# prompts have to be padded on the left; with right padding the shorter prompts
# would be continued after their padding tokens. The tokenizer's padding side is
# restored afterwards so that other cells are not affected.
default_padding_side = tokenizer.padding_side
tokenizer.padding_side = 'left'
try:
    with torch.no_grad():
        for data in tqdm(test_dl):
            if use_gpu:
                data = {k: v.cuda() for k, v in data.items()}
            output_ids = model.generate(data['input_ids'],
                                        attention_mask=data['attention_mask'],
                                        do_sample=False,
                                        max_length=512,
                                        pad_token_id=model.config.eos_token_id)
            # One 1-D tensor of token ids per test sample, in dataset order.
            test_prediction.extend(output_ids.cpu())
finally:
    tokenizer.padding_side = default_padding_side

In [24]:

# Keep only what follows the last [ANSWER] tag of each decoded sequence and strip
# the end-of-text / padding markers.
test_generated_text = [
    tokenizer.decode(output_ids).split('[ANSWER]')[-1].replace('<|endoftext|>', '')
    for output_ids in test_prediction
]

In [35]:
# Compare the reference answer and the generated answer for one test sample.
print('Ground truth: %s \nPredicted: %s'%(mw_test[7]['label'], test_generated_text[7]))

Ground truth: Okay, your booking was successful! The reference number is MUFCMYFF . The table will be reserved for 15 minutes. 
Predicted: I was able to book that for you. Your reference number is MUFCMYFF. Is there anything else I can help you with?


## Evaluate the model 
To evaluate the model you can use :
* N-GRAM Methods such as ROUGE, BLEU or METEOR
* Verify that act items are in the generated data

## 3. Generation with state and database [Optional]

You should take inspiration from the article provided in the directory to propose an approach that does not use the ground truth to generate the answer but, unlike last week, takes into account the data stored in the database. Note that very good performance is not expected for this step. Depending on your progress, you may restrict yourself to generating delexicalised sentences, i.e. sentences without database entry values. For instance, you could predict a sentence such as:

```There are [train_choice] trains meeting your needs with the first leaving at [train_leaveat] and the last one leaving at [train_leaveat]```

Note that information from the database is still required to formulate consistent answers. Of course, you can adapt the tools in the file woz_tools.py.