# Build a Datasets to read the schema

In [1]:
import random

import datasets

from datasets import load_from_disk

dataset = load_from_disk("schema_data")

In [2]:
dataset = dataset.flatten()

In [3]:
import json

with open("/dstc8-schema-guided-dialogue/train/schema.json", 'r') as f:
    train_schemas = json.load(f)
import json

with open("/dstc8-schema-guided-dialogue/test/schema.json", 'r') as f:
    test_schemas = json.load(f)
import json

with open("/dstc8-schema-guided-dialogue/dev/schema.json", 'r') as f:
    dev_schemas = json.load(f)
# change schema format
import collections


def schema_dict(scheme: list):
    res = collections.defaultdict(list)
    for service_dict in scheme:
        service_name = service_dict['service_name']
        assert service_name not in res, f"{service_name} is duplicated."
        res[service_name] = service_dict
    return res


test_schemas = schema_dict(test_schemas)
train_schemas = schema_dict(train_schemas)
dev_schemas = schema_dict(dev_schemas)

In [4]:
test_schemas.keys()

dict_keys(['Alarm_1', 'Buses_3', 'Events_3', 'Flights_4', 'Homes_2', 'Hotels_2', 'Hotels_4', 'Media_3', 'Messaging_1', 'Movies_1', 'Movies_3', 'Music_3', 'Payment_1', 'RentalCars_3', 'Restaurants_2', 'RideSharing_2', 'Services_1', 'Services_4', 'Trains_1', 'Travel_1', 'Weather_1'])

- [ ] write preprocess function for inputs

form dataset to input ids in this format.
```
[CLS] keep | jump [SEP] utterance [SEP] Intent1 description # intent1 [SEP] Intent2 description # intent2[SEP]
```

- [x] process keep | jump 
- [x] process utterance
- [ ] process intent description and intent

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 16142
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 2482
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 4201
    })
})

In [6]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [7]:
tokenizer.decode(tokenizer.encode("s", "b"))

'[CLS] s [SEP] b [SEP]'

In [8]:
sample_dataset = dataset['validation']

In [9]:
sample_dataset

Dataset({
    features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
    num_rows: 2482
})

In [10]:
tokenizer.decode(tokenizer.encode('keep', 'I want to make a restaurant reservation for 2 people at half past 11 in the morning.'))

'[CLS] keep [SEP] I want to make a restaurant reservation for 2 people at half past 11 in the morning. [SEP]'

In [11]:
def combine_sys_usr(sample):
    utterances = sample['turns.utterance']
    utterances.insert(0, "")
    start = 0
    end = 1
    res = []
    while end < len(utterances):
        combined_utt = "sys: " + utterances[start] + "usr: " + utterances[end]
        res.append(combined_utt)
        start = end + 1
        end = start + 1
    sample['new_utt'] = res
    return sample

sample_dataset = sample_dataset.map(combine_sys_usr)

Loading cached processed dataset at schema_data/validation/cache-914f78a7ba34973d.arrow


In [12]:
def keep_jump(sample):
    """Judge whether frame change or not.
    """
    SYS = 0
    USR = 1
    
    tokenized_utts = []
    last_frame = {}
    for spk, utt, frames in zip(sample['turns.speaker'], sample['new_utt'], sample['turns.frames']):
        if spk == USR:
            continue
        else:
            frames = {}
            for frame_id in frames:
                frames[frame_id] = ''
            if frames == last_frame:
                tmp_utt = ['keep', utt]
            else:
                tmp_utt = ['jump', utt]
            tokens = tokenizer.encode(tmp_utt[0], tmp_utt[1])
            tokenized_utts.append(tokens)
            last_frame = frames
    sample["utts_with_changes"] = tokenized_utts
    return sample
sample_dataset = sample_dataset.map(keep_jump)
sample_dataset

Loading cached processed dataset at schema_data/validation/cache-e7d4a8d6d8970d79.arrow


Dataset({
    features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames', 'new_utt', 'utts_with_changes'],
    num_rows: 2482
})

In [13]:
tokenizer.decode((sample_dataset['utts_with_changes'][1][2]))

'[CLS] keep [SEP] sys : You would like to reserve a table for 11 : 30 am for next Monday? usr : That would be fine. Is the restaurant costly? [SEP]'

form dataset to input ids in this format.
```
[CLS] keep | jump [SEP] utterance [SEP] Intent1 description # intent1 [SEP] Intent2 description # intent2[SEP]
```
目前完成 keep，jump，以及utterance的 tokenizer，现在需要调查一下
- [ ] intent description 和 intent 之间是怎么整合到一起的。
    - [ ] 阅读 corpus intent 相关部分，盲猜 intent description # intent

1. description 直接经过 bert tokenize 得到对应tokens。
2. tokenize(intent description # intent name)
3. tokenizer 利用的是 bert-base-cased，需要区分大小写。

In [15]:
tokenizer.convert_ids_to_tokens(tokenizer.encode("Make a reservation with the therapist based on user's wish # BookAppointment"))

['[CLS]',
 'Make',
 'a',
 'reservation',
 'with',
 'the',
 'therapist',
 'based',
 'on',
 'user',
 "'",
 's',
 'wish',
 '#',
 'Book',
 '##A',
 '##pp',
 '##oint',
 '##ment',
 '[SEP]']

- [ ] check the dataset, pair the intent and description.

In [16]:
sample_dataset['turns.frames'][0]

[{'service': ['Restaurants_2'],
  'slots': [{'slot': ['time'], 'start': [56], 'exclusive_end': [83]}],
  'state': [{'active_intent': 'ReserveRestaurant',
    'requested_slots': [],
    'slot_values': {'slot_name': ['number_of_seats', 'time'],
     'slot_value_list': [['2'], ['half past 11 in the morning']]}}],
  'actions': [{'act': [4, 4, 6],
    'slot': ['time', 'number_of_seats', 'intent'],
    'canonical_values': [['11:30'], ['2'], ['ReserveRestaurant']],
    'values': [['half past 11 in the morning'],
     ['2'],
     ['ReserveRestaurant']]}],
  'service_results': [{'service_results_list': []}],
  'service_call': [{'method': '',
    'parameters': {'parameter_slot_name': [],
     'parameter_canonical_value': []}}]},
 {'service': ['Restaurants_2'],
  'slots': [{'slot': [], 'start': [], 'exclusive_end': []}],
  'state': [{'active_intent': '',
    'requested_slots': [],
    'slot_values': {'slot_name': [], 'slot_value_list': []}}],
  'actions': [{'act': [13, 13],
    'slot': ['restaura

In [17]:
sample_dataset

Dataset({
    features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames', 'new_utt', 'utts_with_changes'],
    num_rows: 2482
})

In [18]:
sample_dataset[0]['turns.frames'][1]

{'service': ['Restaurants_2'],
 'slots': [{'slot': [], 'start': [], 'exclusive_end': []}],
 'state': [{'active_intent': '',
   'requested_slots': [],
   'slot_values': {'slot_name': [], 'slot_value_list': []}}],
 'actions': [{'act': [13, 13],
   'slot': ['restaurant_name', 'location'],
   'canonical_values': [[], []],
   'values': [[], []]}],
 'service_results': [{'service_results_list': []}],
 'service_call': [{'method': '',
   'parameters': {'parameter_slot_name': [],
    'parameter_canonical_value': []}}]}

In [19]:
len(train_schemas)

26

In [20]:
len(sample_dataset[0]['turns.frames'])

12

In [21]:
len(sample_dataset[0]['turns.utterance'])

13

In [22]:
sample_dataset[0]['turns.utterance']

['',
 'I want to make a restaurant reservation for 2 people at half past 11 in the morning.',
 'What city do you want to dine in? Do you have a preferred restaurant?',
 'Please find restaurants in San Jose. Can you try Sino?',
 'Confirming: I will reserve a table for 2 people at Sino in San Jose. The reservation time is 11:30 am today.',
 "Yes, thanks. What's their phone number?",
 'Your reservation has been made. Their phone number is 408-247-8880.',
 "What's their address? Do they have vegetarian options on their menu?",
 'The street address is 377 Santana Row #1000. They have good vegetarian options.',
 'Thanks very much.',
 'Is there anything else I can help you with?',
 "No, that's all. Thanks.",
 'Have a great day.']

- [x] 找到 intents 在 dialogue 数据中的位置

In [23]:
# get the infos of train
for dia_id, i in enumerate(dataset['train']['dialogue_id']):
    if i == "104_00000":
        print(dia_id)
        break

13083


In [24]:
dataset['train'][13083]['turns.frames']

[{'service': ['Events_1'],
  'slots': [{'slot': ['date', 'subcategory'],
    'start': [68, 18],
    'exclusive_end': [85, 22]}],
  'state': [{'active_intent': 'FindEvents',
    'requested_slots': [],
    'slot_values': {'slot_name': ['date', 'subcategory'],
     'slot_value_list': [['2nd of this month'], ['Rock']]}}],
  'actions': [{'act': [4, 4, 6],
    'slot': ['date', 'subcategory', 'intent'],
    'canonical_values': [['2019-03-02'], ['Rock'], ['FindEvents']],
    'values': [['2nd of this month'], ['Rock'], ['FindEvents']]}],
  'service_results': [{'service_results_list': []}],
  'service_call': [{'method': '',
    'parameters': {'parameter_slot_name': [],
     'parameter_canonical_value': []}}]},
 {'service': ['Events_1'],
  'slots': [{'slot': [], 'start': [], 'exclusive_end': []}],
  'state': [{'active_intent': '',
    'requested_slots': [],
    'slot_values': {'slot_name': [], 'slot_value_list': []}}],
  'actions': [{'act': [13, 13],
    'slot': ['category', 'city_of_event'],
   

In [25]:
for j in [i['state'] for i in dataset['train'][13083]['turns.frames']]:
    print(j[0]['active_intent'])

FindEvents

FindEvents

FindEvents

FindEvents

FindEvents

FindEvents

SearchHotel

SearchHotel

SearchHotel

BuyEventTickets

BuyEventTickets

BuyEventTickets



每一轮对话，按照 frame，service 来提取 intent，所以一轮对话可能被拆分为多个数据。

TODO: 判断最终需要多少数据，是 1 个 utt 对应一个 intent，还是一个 sys, utt 对应一个 intent

回答：输入的形式都是 sys， utt对，一对儿对应一个intent。

In [26]:
# Generate intents from frames.
def get_intents(samples):
    """Get intents from dataset.
    """
    intents = []
    for speaker, frame in zip(samples['turns.speaker'], samples['turns.frames']):
        if speaker == 0: # 0 means usr, 1 means sys
            intent = frame['state'][0]['active_intent']
            intents.append(intent)
    samples['intents'] = intents
    return samples

sample_dataset = sample_dataset.map(get_intents)
# test
sample_dataset['intents'][0]

  0%|          | 0/2482 [00:00<?, ?ex/s]

['ReserveRestaurant',
 'ReserveRestaurant',
 'ReserveRestaurant',
 'ReserveRestaurant',
 'ReserveRestaurant',
 'NONE']

utterance 会比 frames 多一个，因为最开头有个空的

In [27]:
train_schemas['Banks_1']['description']

'Manage bank accounts and transfer money'

In [28]:
sample_dataset[0]['turns.frames']

[{'service': ['Restaurants_2'],
  'slots': [{'slot': ['time'], 'start': [56], 'exclusive_end': [83]}],
  'state': [{'active_intent': 'ReserveRestaurant',
    'requested_slots': [],
    'slot_values': {'slot_name': ['number_of_seats', 'time'],
     'slot_value_list': [['2'], ['half past 11 in the morning']]}}],
  'actions': [{'act': [4, 4, 6],
    'slot': ['time', 'number_of_seats', 'intent'],
    'canonical_values': [['11:30'], ['2'], ['ReserveRestaurant']],
    'values': [['half past 11 in the morning'],
     ['2'],
     ['ReserveRestaurant']]}],
  'service_results': [{'service_results_list': []}],
  'service_call': [{'method': '',
    'parameters': {'parameter_slot_name': [],
     'parameter_canonical_value': []}}]},
 {'service': ['Restaurants_2'],
  'slots': [{'slot': [], 'start': [], 'exclusive_end': []}],
  'state': [{'active_intent': '',
    'requested_slots': [],
    'slot_values': {'slot_name': [], 'slot_value_list': []}}],
  'actions': [{'act': [13, 13],
    'slot': ['restaura

In [29]:
sample_dataset[0]['intents']

['ReserveRestaurant',
 'ReserveRestaurant',
 'ReserveRestaurant',
 'ReserveRestaurant',
 'ReserveRestaurant',
 'NONE']

In [30]:
test_schemas['Restaurants_2']['intents']

[{'name': 'ReserveRestaurant',
  'description': 'Make a table reservation at a restaurant',
  'is_transactional': True,
  'required_slots': ['restaurant_name', 'location', 'time'],
  'optional_slots': {'number_of_seats': '2', 'date': '2019-03-01'},
  'result_slots': ['restaurant_name',
   'date',
   'time',
   'has_seating_outdoors',
   'has_vegetarian_options',
   'phone_number',
   'rating',
   'address',
   'number_of_seats',
   'price_range',
   'location',
   'category']},
 {'name': 'FindRestaurants',
  'description': 'Find restaurants by location and by category',
  'is_transactional': False,
  'required_slots': ['category', 'location'],
  'optional_slots': {'price_range': 'dontcare',
   'has_vegetarian_options': 'dontcare',
   'has_seating_outdoors': 'dontcare'},
  'result_slots': ['restaurant_name',
   'has_seating_outdoors',
   'has_vegetarian_options',
   'phone_number',
   'rating',
   'address',
   'price_range',
   'location',
   'category']}]

In [39]:
test_schemas['Flights_3']

[]

In [45]:
test_schemas["Flights_4"]['intents']

[{'name': 'SearchOnewayFlight',
  'description': 'Search for one way flights to the destination',
  'is_transactional': False,
  'required_slots': ['origin_airport',
   'destination_airport',
   'departure_date'],
  'optional_slots': {'seating_class': 'Economy',
   'number_of_tickets': '1',
   'airlines': 'dontcare'},
  'result_slots': ['number_of_tickets',
   'seating_class',
   'origin_airport',
   'destination_airport',
   'departure_date',
   'is_nonstop',
   'outbound_departure_time',
   'outbound_arrival_time',
   'price',
   'airlines']},
 {'name': 'SearchRoundtripFlights',
  'description': 'Search for roundtrip flights for the trip',
  'is_transactional': False,
  'required_slots': ['origin_airport',
   'destination_airport',
   'departure_date',
   'return_date'],
  'optional_slots': {'seating_class': 'Economy',
   'number_of_tickets': '1',
   'airlines': 'dontcare'},
  'result_slots': ['number_of_tickets',
   'seating_class',
   'origin_airport',
   'destination_airport',
   

In [162]:
sample_dataset[492]['intents']

['CheckBalance',
 'CheckBalance',
 'CheckBalance',
 'TransferMoney',
 'TransferMoney',
 'TransferMoney',
 'TransferMoney',
 'TransferMoney']

In [163]:
sample_dataset[492]['intents_descriptions']

['Get the balance of an account',
 'Get the balance of an account',
 'Get the balance of an account',
 'Transfer money to another user',
 'Transfer money to another user',
 'Transfer money to another user',
 'Transfer money to another user',
 'Transfer money to another user']

In [156]:
# get intent description from frame and intents name.
def get_description(samples, schema):
    """Get frames' intent description from schema.

    when intent is "None", the intent description will be "none".
    """
    descriptions = []
    for frame, single_intent in zip(samples['turns.frames'], samples['intents']):
        service_name = frame['service'][0]
        service_intents = schema[service_name]['intents']
        description = 'none'
        for intent in service_intents:
            if intent == "TransferMoney":
                print(service_intents)
            if intent['name'] == single_intent:
                description = intent['description']
                break
        descriptions.append(description)
    samples['intents_descriptions'] = descriptions
    return samples

# test
from functools import partial
get_dev_descriptions = partial(get_description, schema=dev_schemas)
sample_dataset = sample_dataset.map(get_dev_descriptions, batched=False)

  0%|          | 0/2482 [00:00<?, ?ex/s]

In [98]:
# test
index = random.randint(0, 1000)
print(len(sample_dataset[index]['turns.utterance']))
print(len(sample_dataset[index]['intents']))
print(len(sample_dataset[index]['intents_descriptions']))

27
13
13


In [166]:
# tokenize intents' description and intents.
def tokenize_intents(samples):
    all_tokens = []
    sentences = []
    for description, intent in zip(samples['intents_descriptions'], samples['intents']):
        utt = description + "#" + intent
        sentences.append(utt)
    tokens = [tokenizer.convert_ids_to_tokens(tokenizer.encode(s, add_special_tokens=False)) for s in sentences]
    print(tokens)
# test
tokenize_intents(sample_dataset[492])

[['Get', 'the', 'balance', 'of', 'an', 'account', '#', 'Check', '##B', '##alan', '##ce'], ['Get', 'the', 'balance', 'of', 'an', 'account', '#', 'Check', '##B', '##alan', '##ce'], ['Get', 'the', 'balance', 'of', 'an', 'account', '#', 'Check', '##B', '##alan', '##ce'], ['Transfer', 'money', 'to', 'another', 'user', '#', 'Transfer', '##M', '##oney'], ['Transfer', 'money', 'to', 'another', 'user', '#', 'Transfer', '##M', '##oney'], ['Transfer', 'money', 'to', 'another', 'user', '#', 'Transfer', '##M', '##oney'], ['Transfer', 'money', 'to', 'another', 'user', '#', 'Transfer', '##M', '##oney'], ['Transfer', 'money', 'to', 'another', 'user', '#', 'Transfer', '##M', '##oney']]


TODO：判断每一条数据的 intents 是随着对话增多的，还是每个数据对应一个

1. 每个 utt 只对应一个 frame 的 intents，并且不会随着对话增多变动。

TODO：一个utt怎么找到对应的 intent？


TODO: flatten 整个数据集。

In [168]:
pd_dataset = dataset.set_format('pd')

In [182]:
dataset

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 16142
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 2482
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns.speaker', 'turns.utterance', 'turns.frames'],
        num_rows: 4201
    })
})

In [183]:
frames = dataset['train']['turns.frames']
utterances = dataset['train']['turns.utterance']
speaker = dataset['train']['turns.speaker']

In [185]:
import pandas as pd

In [191]:
frames.values

array([array([{'service': array(['Restaurants_1'], dtype=object), 'slots': array([{'slot': array([], dtype=object), 'start': array([], dtype=int32), 'exclusive_end': array([], dtype=int32)}],
                    dtype=object), 'state': array([{'active_intent': 'FindRestaurants', 'requested_slots': array([], dtype=object), 'slot_values': {'slot_name': array([], dtype=object), 'slot_value_list': array([], dtype=object)}}],
                    dtype=object), 'actions': array([{'act': array([6]), 'slot': array(['intent'], dtype=object), 'canonical_values': array([array(['FindRestaurants'], dtype=object)], dtype=object), 'values': array([array(['FindRestaurants'], dtype=object)], dtype=object)}],
                    dtype=object), 'service_results': array([{'service_results_list': array([], dtype=object)}], dtype=object), 'service_call': array([{'method': '', 'parameters': {'parameter_slot_name': array([], dtype=object), 'parameter_canonical_value': array([], dtype=object)}}],
             

In [None]:
for v in frames:
    print(v)