In [1]:
from datasets import load_from_disk
import random
import pandas as pd
dataset = load_from_disk('schema_data')
# convert dataset to pandas
dataset = dataset.flatten()

# 代码主要目的：

- 输入数据采用的是多级列表的结构，对于提取信息非常慢且逻辑复杂。
    - 利用 pandas 重构数据集，采用平铺列表的形式。

- 重要原则：不改变原始数据，只对数据做提取处理。

## 判断 turns， speakers， frames 里面的数目是不是一一对应。

In [2]:
# change [dict, dict, dict]
dataset.set_format('pandas')

test_df = dataset['test'][:]
# calculate length
def cal_length(row):
    return len(row['turns.speaker']) == len(row['turns.utterance']) == len(row['turns.frames'])

# test
assert test_df.apply(cal_length, axis=1).sum() == len(test_df)

In [3]:
# change [dict, dict, dict]
dataset.set_format('pandas')

test_df = dataset['train'][:]
# calculate length
def cal_length(row):
    return len(row['turns.speaker']) == len(row['turns.utterance']) == len(row['turns.frames'])

# test
assert test_df.apply(cal_length, axis=1).sum() == len(test_df)

In [4]:
# change [dict, dict, dict]
dataset.set_format('pandas')

original_df = dataset['validation'][:]
# calculate length
def cal_length(row):
    return len(row['turns.speaker']) == len(row['turns.utterance']) == len(row['turns.frames'])

# test
assert original_df.apply(cal_length, axis=1).sum() == len(original_df)

结论：对于整个数据集，turns，speaker，frames里面的数据个数完全一致。

## 将 speaker，utterance，frames 平铺

In [5]:
speakers = []
for speaker in original_df['turns.speaker']:
    speakers.extend(speaker)
utts = []
for utt in original_df['turns.utterance']:
    utts.extend(utt)
frames = []
for frame in original_df['turns.frames']:
    frames.extend(frame)
assert len(speakers) == len(utts)
test_df = pd.DataFrame({
    'speaker': speakers,
    'utterance': utts,
    'frame': frames})

# test
assert (len(speakers) == len(utts) == len(frames)) is True

结论：speaker，utt，frames 平铺且一一对应

## 将 dialogue id，speakers 添加到 dataframes 中，保证信息完整

In [6]:
original_df.sample()

Unnamed: 0,dialogue_id,services,turns.speaker,turns.utterance,turns.frames
1798,15_00066,"[Events_1, RentalCars_1]","[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, ...",[I'd like to attend an upcoming Golden Bears V...,"[{'service': ['Events_1'], 'slots': [{'slot': ..."


In [7]:
# 保证每个turn的长度相同
assert original_df.apply(lambda x: len(x['turns.speaker']) == len(x['turns.speaker'] == len(x['turns.frames'])), axis=1).unique()


In [8]:
dialogues_ids = []
for i in range(0, len(original_df)):
    N = len(original_df.iloc[i]['turns.speaker'])
    dialogue_id = original_df.iloc[i]['dialogue_id']
    for j in range(0, N):
        dialogues_ids.append(dialogue_id)

# test
assert len(dialogues_ids) == len(test_df)

test_df['dialogue_id'] = pd.Series(dialogues_ids)

In [9]:
# add services to test_df
services = []
for i in range(0, len(original_df)):
    N = len(original_df.iloc[i]['turns.speaker'])
    for j in range(0, N):
        services.append(original_df.iloc[i]['services'])

# test
assert len(services) == len(test_df)

test_df['services'] = pd.Series(services)

结论：原始信息平铺，数据从 datasets 到 pandas 转换完毕。

## 添加 sys: usr: 合并列

In [10]:
model_input_utt = []
for i in range(0, len(test_df), 2):
    if i == 0:
        utt = "sys : ," + " usr : " + test_df.utterance[i]
    else:
        utt = "sys : " + test_df.utterance[i-1] + " usr : " + test_df.utterance[i]
    
    model_input_utt.append(utt)
    model_input_utt.append("") # system response
# add model input into df
test_df = pd.concat(
    [test_df, pd.DataFrame({'model_input_utt': model_input_utt})], axis=1)


In [11]:
# test
assert 'model_input_utt' in test_df.columns

结论：sys，usr 合并列已经加入，第一句 sys 为空

## drop 所有的系统消息。

因为系统的 frame 包含了 sys 的信息，sys 的信息已经在 input utt 里面了， 删除掉 spk == 1 的所有行。

In [12]:
test_df.frame.apply(len).unique()

array([6])

In [13]:
test_df.frame[0].keys()

dict_keys(['service', 'slots', 'state', 'actions', 'service_results', 'service_call'])

每个 frame 都有六个key

结论： 只有用户发言的数据

## 平铺 frame 列

In [14]:
test_df.sample(5)

Unnamed: 0,speaker,utterance,frame,dialogue_id,services,model_input_utt
12586,0,Sounds good to me. I would like to pick it up ...,"{'service': ['RentalCars_1'], 'slots': [{'slot...",8_00025,"[Buses_1, RentalCars_1]",sys : What time do you need it? Could you pick...
7061,1,"No problem, have a lovely meal!","{'service': ['Restaurants_2'], 'slots': [{'slo...",4_00100,[Restaurants_2],
19804,0,What is my savings account balance.,"{'service': ['Banks_2'], 'slots': [{'slot': []...",11_00026,"[Banks_2, Weather_1]","sys : No worries, have a pleasant day ahead! u..."
15045,1,Your tickets have been booked successfully. Pl...,"{'service': ['Events_1'], 'slots': [{'slot': [...",9_00008,"[Events_1, Banks_2]",
14289,1,Do you want to purchase tickets for this event?,"{'service': ['Events_1'], 'slots': [{'slot': [...",8_00101,"[Events_1, Banks_2]",


In [15]:
frames_dataframe = pd.DataFrame(test_df.frame.to_list())

In [16]:
# test
assert len(frames_dataframe) == len(test_df)

结论： 平铺完毕

In [17]:
test_df = pd.concat([test_df, frames_dataframe], axis=1)

In [18]:
test_df = test_df.drop('frame', axis=1)

In [19]:
state_type = test_df.state.apply(lambda x: type(x))

## BUG: 计算原始 frames 里面的 state 的个数和 utterance 的个数是否一致。

In [20]:
# original states in frames
original_frames = original_df['turns.frames']

In [21]:
test_df['state'] = test_df.state.apply(lambda x: x[0])

## BUG：state 里面存在 Nan 空值

猜测：
- [x] frame 平铺后的个数与数据集不一致

结论： 由于 drop 掉了 sys， index 不匹配导致

In [22]:
len(frames_dataframe)

48726

## 平铺state

In [23]:
test_df = pd.concat([test_df, pd.DataFrame(test_df.state.to_list())], axis=1)

In [24]:
test_df = test_df.drop('state', axis=1)

## jump or keep

由于keep和jump的判断和上一个frame的类别有关系，判断一下frames

In [25]:
random_dia_id = test_df.dialogue_id.sample().values[0]
test_df[test_df.dialogue_id == random_dia_id]

Unnamed: 0,speaker,utterance,dialogue_id,services,model_input_utt,service,slots,actions,service_results,service_call,active_intent,requested_slots,slot_values
40376,0,Can you please find a therapist for me?,18_00014,"[Services_4, Banks_2, Restaurants_2]",sys : have a great day usr : Can you please fi...,[Services_4],"[{'slot': [], 'start': [], 'exclusive_end': []}]","[{'act': [6], 'slot': ['intent'], 'canonical_v...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",FindProvider,[],"{'slot_name': [], 'slot_value_list': []}"
40377,1,"Sure. Want a psychologist or a psychiatrist, o...",18_00014,"[Services_4, Banks_2, Restaurants_2]",,[Services_4],"[{'slot': [], 'start': [], 'exclusive_end': []}]","[{'act': [13, 13], 'slot': ['type', 'city'], '...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",,[],"{'slot_name': [], 'slot_value_list': []}"
40378,0,Please find me a family counselor in Santa Rosa.,18_00014,"[Services_4, Banks_2, Restaurants_2]",sys : Sure. Want a psychologist or a psychiatr...,[Services_4],"[{'slot': ['city'], 'start': [37], 'exclusive_...","[{'act': [4, 4], 'slot': ['type', 'city'], 'ca...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",FindProvider,[],"{'slot_name': ['city', 'type'], 'slot_value_li..."
40379,1,There's a family counselor in Santa Rosa calle...,18_00014,"[Services_4, Banks_2, Restaurants_2]",,[Services_4],"[{'slot': ['therapist_name', 'city'], 'start':...","[{'act': [11, 11, 11], 'slot': ['therapist_nam...",[{'service_results_list': [{'service_slot_name...,"[{'method': 'FindProvider', 'parameters': {'pa...",,[],"{'slot_name': [], 'slot_value_list': []}"
40380,0,What's the address?,18_00014,"[Services_4, Banks_2, Restaurants_2]",sys : There's a family counselor in Santa Rosa...,[Services_4],"[{'slot': [], 'start': [], 'exclusive_end': []}]","[{'act': [13], 'slot': ['address'], 'canonical...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",FindProvider,[address],"{'slot_name': ['city', 'type'], 'slot_value_li..."
40381,1,Their office location is 2455 Bennett Valley R...,18_00014,"[Services_4, Banks_2, Restaurants_2]",,[Services_4],"[{'slot': ['address'], 'start': [25], 'exclusi...","[{'act': [4], 'slot': ['address'], 'canonical_...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",,[],"{'slot_name': [], 'slot_value_list': []}"
40382,0,Okay. Can you instead find me a psychiatrist?,18_00014,"[Services_4, Banks_2, Restaurants_2]",sys : Their office location is 2455 Bennett Va...,[Services_4],"[{'slot': [], 'start': [], 'exclusive_end': []}]","[{'act': [4, 14], 'slot': ['type', ''], 'canon...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",FindProvider,[],"{'slot_name': ['city', 'type'], 'slot_value_li..."
40383,1,There's a psychiatrist in Santa Rosa called Be...,18_00014,"[Services_4, Banks_2, Restaurants_2]",,[Services_4],"[{'slot': ['therapist_name', 'city'], 'start':...","[{'act': [11, 11, 11], 'slot': ['therapist_nam...",[{'service_results_list': [{'service_slot_name...,"[{'method': 'FindProvider', 'parameters': {'pa...",,[],"{'slot_name': [], 'slot_value_list': []}"
40384,0,Okay. What's her phone number and office address?,18_00014,"[Services_4, Banks_2, Restaurants_2]",sys : There's a psychiatrist in Santa Rosa cal...,[Services_4],"[{'slot': [], 'start': [], 'exclusive_end': []}]","[{'act': [13, 13], 'slot': ['address', 'phone_...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",FindProvider,"[address, phone_number]","{'slot_name': ['city', 'type'], 'slot_value_li..."
40385,1,"You can call her at 707-566-4600, and physical...",18_00014,"[Services_4, Banks_2, Restaurants_2]",,[Services_4],"[{'slot': ['phone_number', 'address'], 'start'...","[{'act': [4, 4], 'slot': ['phone_number', 'add...",[{'service_results_list': []}],"[{'method': '', 'parameters': {'parameter_slot...",,[],"{'slot_name': [], 'slot_value_list': []}"


In [26]:
test_df.drop(['slots', 'actions', 'service_results', 'service_call', 'requested_slots', 'slot_values'], axis=1, inplace=True)

keep or jump 规则

1. sys 的对话没有 keep or jump。
2. 如果 frames 变化，标注 jump，否则，keep。
3. 特殊情况：第一次对话的 frame 为 空。

In [27]:
def keep_or_jump(x):
    # if speaker is system, ignore
    result = []
    for index, (spk, cur_service) in enumerate(zip(x['speaker'], x['service'])):
        # if speaker is system, ignore
        if spk == 1:
            result.append('keep')
            continue
        # if service changed, jump
        if index == 0:
            result.append('jump')
        else:
            if set(cur_service) != set(x['service'].iloc[index-1]):
                result.append('jump')
            else:
                result.append('keep')
    return result

grouped_dialogue = test_df.groupby('dialogue_id')

results = []
for dialogue_id, data in grouped_dialogue:
    _tmp = pd.DataFrame(data=keep_or_jump(data), columns=['keep_or_jump'], index=data.index)
    results.append(_tmp)

results = pd.concat(results, axis=0)

In [28]:
# test
assert len(results) == len(test_df)

In [29]:
# merge two dataframe
test_df = pd.concat([test_df, results], axis=1)

In [30]:
# test
random_dia_id = test_df.dialogue_id.sample().values[0]
test_df[test_df.dialogue_id == random_dia_id]

Unnamed: 0,speaker,utterance,dialogue_id,services,model_input_utt,service,active_intent,keep_or_jump
22330,0,san jose intermediate car wednesday next week,12_00012,"[RentalCars_1, Homes_1]",sys : Have a terrific day! usr : san jose inte...,[RentalCars_1],GetCarsAvailable,jump
22331,1,what time then?,12_00012,"[RentalCars_1, Homes_1]",,[RentalCars_1],,keep
22332,0,18:00 works for me,12_00012,"[RentalCars_1, Homes_1]",sys : what time then? usr : 18:00 works for me,[RentalCars_1],GetCarsAvailable,keep
22333,1,when do you want it?,12_00012,"[RentalCars_1, Homes_1]",,[RentalCars_1],,keep
22334,0,next monday works,12_00012,"[RentalCars_1, Homes_1]",sys : when do you want it? usr : next monday w...,[RentalCars_1],GetCarsAvailable,keep
22335,1,sjc international airport march 4th standard c...,12_00012,"[RentalCars_1, Homes_1]",,[RentalCars_1],,keep
22336,0,what total cost?,12_00012,"[RentalCars_1, Homes_1]",sys : sjc international airport march 4th stan...,[RentalCars_1],GetCarsAvailable,keep
22337,1,$72 in total,12_00012,"[RentalCars_1, Homes_1]",,[RentalCars_1],,keep
22338,0,that works for me,12_00012,"[RentalCars_1, Homes_1]",sys : $72 in total usr : that works for me,[RentalCars_1],GetCarsAvailable,keep
22339,1,wanna get the car?,12_00012,"[RentalCars_1, Homes_1]",,[RentalCars_1],,keep


结论：keep 和 jump 的标签验证完毕

## intents description 提取

In [31]:
import json

with open("/dstc8-schema-guided-dialogue/train/schema.json", 'r') as f:
    train_schemas = json.load(f)
import json

with open("/dstc8-schema-guided-dialogue/test/schema.json", 'r') as f:
    test_schemas = json.load(f)
import json

with open("/dstc8-schema-guided-dialogue/dev/schema.json", 'r') as f:
    dev_schemas = json.load(f)
# change schema format
import collections


def schema_dict(scheme: list):
    res = collections.defaultdict(list)
    for service_dict in scheme:
        service_name = service_dict['service_name']
        assert service_name not in res, f"{service_name} is duplicated."
        res[service_name] = service_dict
    return res


test_schemas = schema_dict(test_schemas)
train_schemas = schema_dict(train_schemas)
dev_schemas = schema_dict(dev_schemas)

In [32]:
test_df['active_intent'].unique()

array(['ReserveRestaurant', '', 'NONE', 'SearchOnewayFlight',
       'SearchRoundtripFlights', 'GetRide', 'GetCarsAvailable',
       'ReserveCar', 'FindBus', 'BuyBusTicket', 'GetAlarms', 'AddAlarm',
       'FindProvider', 'BookAppointment', 'GetWeather', 'FindApartment',
       'ScheduleVisit', 'FindRestaurants', 'CheckBalance',
       'TransferMoney', 'FindAttractions', 'FindMovies', 'RentMovie',
       'SearchHotel', 'ReserveHotel', 'LookupSong', 'PlaySong',
       'FindEvents', 'BuyEventTickets'], dtype=object)

结论：每句话的 intent 只有一个

intents 构建逻辑


1. 从各自的services下，对应当前 frame 的 service 的所有的 intents description。
2. 每个 service 的 intents 必须包含一个 none。
3. 如果一句话对应 N 个 intents， 那么这条样本要拓展 N 次。

In [33]:
test_df = test_df.explode('service')

In [34]:
def intent_description(x, schema):
    results = []
    cur_service = x['service']
    intents = schema[cur_service]['intents']
    for intent in intents:
        intent_name = intent['name']
        intent_description = intent['description']
        intent_input = intent_description + " # " + intent_name
        results.append(intent_input)
    results.append('none')
    return results
test_df['intent_description'] = test_df.apply(lambda x: intent_description(x, dev_schemas), axis=1)

In [35]:
# test
random_dia_id = test_df.dialogue_id.sample().values[0]
test_df[test_df.dialogue_id == random_dia_id]['intent_description'].values

array([list(['Find events in a given city # FindEvents', 'Buy tickets for an event # BuyEventTickets', 'none']),
       list(['Find events in a given city # FindEvents', 'Buy tickets for an event # BuyEventTickets', 'none']),
       list(['Find events in a given city # FindEvents', 'Buy tickets for an event # BuyEventTickets', 'none']),
       list(['Find events in a given city # FindEvents', 'Buy tickets for an event # BuyEventTickets', 'none']),
       list(['Find events in a given city # FindEvents', 'Buy tickets for an event # BuyEventTickets', 'none']),
       list(['Find a bus journey for a given pair of cities # FindBus', 'Buy tickets for a bus journey # BuyBusTicket', 'none']),
       list(['Find a bus journey for a given pair of cities # FindBus', 'Buy tickets for a bus journey # BuyBusTicket', 'none']),
       list(['Find a bus journey for a given pair of cities # FindBus', 'Buy tickets for a bus journey # BuyBusTicket', 'none']),
       list(['Find a bus journey for a given 

结论： 对于 intent 以及 description 提取完毕，数据量有所增加。

## 构建 datasets 数据集

1. 将 pandas 转换成 datasets 数据集。
2. 构建 input ids。

In [36]:
import datasets
from datasets import Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.remove_columns('__index_level_0__')

In [37]:
from transformers import AutoTokenizer
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [38]:
test_dataset[0]

{'speaker': 0,
 'utterance': 'I want to make a restaurant reservation for 2 people at half past 11 in the morning.',
 'dialogue_id': '1_00000',
 'services': ['Restaurants_2'],
 'model_input_utt': 'sys : , usr : I want to make a restaurant reservation for 2 people at half past 11 in the morning.',
 'service': 'Restaurants_2',
 'active_intent': 'ReserveRestaurant',
 'keep_or_jump': 'jump',
 'intent_description': ['Make a table reservation at a restaurant # ReserveRestaurant',
  'Find restaurants by location and by category # FindRestaurants',
  'none']}

input id 

```
[CLS] [keep or jump] [SEP] [input utt] [SEP] [intent1 utt] [SEP] [intent2 utt] [SEP]

In [39]:
# max_uttr_len is 94 by default.
# max_seq_len is 196 by default.
def construct_input_tokens(x, max_uttr_len=94):
    data = ["[CLS]"]
    keep_or_jump = bert_tokenizer.tokenize(x['keep_or_jump'])
    data.extend(keep_or_jump + ["[SEP]"])
    input_utt = bert_tokenizer.tokenize(x['model_input_utt'], max_length=max_uttr_len, truncation=True, padding="max_length")
    data.extend(input_utt + ["[SEP]"]) # 98

    for intent_utt in x['intent_description']:
        intent_utt = bert_tokenizer.tokenize(intent_utt)
        data.extend(intent_utt + ["[SEP]"])
    
    label = bert_tokenizer.tokenize(x['active_intent'])
    return {'input_tokens': data, 'label_tokens': label}
test_dataset = test_dataset.map(construct_input_tokens, batched=False)

  0%|          | 0/50440 [00:00<?, ?ex/s]

结论： 无法利用 batch 并行，不过速度还行，逻辑比较简单。

## 构建 attention mask

1. intent description 需要 padding 到 max_num_intents=11.
    - padding 形式为 intent description + [PAD] [SEP] ... [PAD] [SEP]
    - [PAD] [SEP] 数量取决于 max intents 的数量 - intents 的实际数量。
2. 合并全部输入后的最大长度为 196，长度不足用 paddings 补齐。
3. segment_ids
    - intent description 为 1，其余是 0.
4. intent mask 
    - shape = max_num_intents x max_seq_length
    - 每一行 intent tokens 部分为 1
        - 不包括 special tokens
    - 对于 padding 填充的 intent，填 1
5. special_token_ids
    - 对于每个 intents，与 intent mask 一样，只不过按照 max_num_intents 扩充。
6. input mask
    - shape: max_seq_length x max_seq_length
    - 