# Build a Datasets to read the schema

In [1]:
import random

import datasets

from datasets import load_from_disk

dataset = load_from_disk("schema_data")

In [2]:
dataset = dataset.flatten()

In [3]:
import json

with open("/dstc8-schema-guided-dialogue/train/schema.json", 'r') as f:
    schemas = json.load(f)

- [ ] write preprocess function for inputs

form dataset to input ids in this format.
```
[CLS] keep | jump [SEP] utterance [SEP] Intent1 description # intent1 [SEP] Intent2 description # intent2[SEP]
```

- [x] process keep | jump 
- [x] process utterance
- [ ] process intent description and intent

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 16142
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 2482
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 4201
    })
})

In [5]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
tokenizer.decode(tokenizer.encode("s", "b"))

'[CLS] s [SEP] b [SEP]'

In [7]:
sample_dataset = dataset['validation']

In [8]:
sample_dataset

Dataset({
    features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
    num_rows: 2482
})

In [9]:
tokenizer.decode(tokenizer.encode('keep', 'I want to make a restaurant reservation for 2 people at half past 11 in the morning.'))

'[CLS] keep [SEP] i want to make a restaurant reservation for 2 people at half past 11 in the morning. [SEP]'

In [10]:
def combine_sys_usr(sample):
    utterances = sample['turns.utterance']
    utterances.insert(0, "")
    start = 0
    end = 1
    res = []
    while end < len(utterances):
        combined_utt = "sys: " + utterances[start] + "usr: " + utterances[end]
        res.append(combined_utt)
        start = end + 1
        end = start + 1
    sample['new_utt'] = res
    return sample

sample_dataset = sample_dataset.map(combine_sys_usr)

sample_dataset['new_utt']

  0%|          | 0/2482 [00:00<?, ?ex/s]

[['sys: usr: I want to make a restaurant reservation for 2 people at half past 11 in the morning.',
  'sys: What city do you want to dine in? Do you have a preferred restaurant?usr: Please find restaurants in San Jose. Can you try Sino?',
  "sys: Confirming: I will reserve a table for 2 people at Sino in San Jose. The reservation time is 11:30 am today.usr: Yes, thanks. What's their phone number?",
  "sys: Your reservation has been made. Their phone number is 408-247-8880.usr: What's their address? Do they have vegetarian options on their menu?",
  'sys: The street address is 377 Santana Row #1000. They have good vegetarian options.usr: Thanks very much.',
  "sys: Is there anything else I can help you with?usr: No, that's all. Thanks."],
 ['sys: usr: I am not in the mood to cook today. I want to eat out at a restaurant instead.',
  'sys: Which area would you like me to look in? Which restaurant would you like to eat in and at what time?usr: Look for a restaurant in Saratoga. Check to s

In [11]:
def keep_jump(sample):
    """Judge whether frame change or not.
    """
    SYS = 0
    USR = 1
    
    tokenized_utts = []
    last_frame = {}
    for spk, utt, frames in zip(sample['turns.speaker'], sample['new_utt'], sample['turns.frames']):
        if spk == USR:
            continue
        else:
            frames = {}
            for frame_id in frames:
                frames[frame_id] = ''
            if frames == last_frame:
                tmp_utt = ['keep', utt]
            else:
                tmp_utt = ['jump', utt]
            tokens = tokenizer.encode(tmp_utt[0], tmp_utt[1])
            tokenized_utts.append(tokens)
            last_frame = frames
    sample["utts_with_changes"] = tokenized_utts
    return sample
sample_dataset = sample_dataset.map(keep_jump)
sample_dataset

  0%|          | 0/2482 [00:00<?, ?ex/s]

Dataset({
    features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames', 'new_utt', 'utts_with_changes'],
    num_rows: 2482
})

In [12]:
tokenizer.decode((sample_dataset['utts_with_changes'][1][2]))

'[CLS] keep [SEP] sys : you would like to reserve a table for 11 : 30 am for next monday? usr : that would be fine. is the restaurant costly? [SEP]'

In [14]:
# todo
def process_intents(sample):
    pass

form dataset to input ids in this format.
```
[CLS] keep | jump [SEP] utterance [SEP] Intent1 description # intent1 [SEP] Intent2 description # intent2[SEP]
```
目前完成 keep，jump，以及utterance的 tokenizer，现在需要调查一下
- [ ] intent description 和 intent 之间是怎么整合到一起的。
    - [ ] 阅读 corpus intent 相关部分，盲猜 intent description # intent