In [1]:
from datasets import load_dataset

# Load the 'multi_woz_v22' dataset through the `datasets` library; the result
# is kept in `dataset` and reused by the cells below.
dataset = load_dataset(path='multi_woz_v22')

In [2]:
# Show an overview of the loaded object: its splits, feature names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 8437
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
})


In [3]:
from collections import Counter
from datasets import DatasetDict

# Tally of how often each domain is listed, accumulated over every split.
domain_counter = Counter()
# Histogram: number of distinct domains in a dialogue -> number of such dialogues.
domain_count_distribution = Counter()

for split in ('train', 'validation', 'test'):
    dataset_split = dataset[split]
    # Each entry of the 'services' column is the list of domains of one
    # dialogue, e.g. ['restaurant', 'hotel'].
    for services in dataset_split['services']:
        # Raw tally: a domain listed more than once in a dialogue is counted
        # once per occurrence.
        for domain in services:
            domain_counter[domain] += 1

        # The histogram key is the de-duplicated domain count.
        domain_count_distribution[len(set(services))] += 1

# Report the per-domain tally, most frequent domain first
# (note: repeated domains are counted repeatedly).
print("Dialogue count per domain (non-unique):")
ranked_domains = domain_counter.most_common()
for domain, count in ranked_domains:
    print(f"{domain}: {count}")

print("=="*30)

# Report how many dialogues involve N distinct domains, in ascending N.
print("Number of dialogues with N domains:")
for num_domains in sorted(domain_count_distribution):
    count = domain_count_distribution[num_domains]
    print(f"{num_domains} domain(s): {count} dialogues")

Dialogue count per domain (non-unique):
restaurant: 4728
hotel: 4182
train: 3931
attraction: 3485
taxi: 1872
hospital: 108
bus: 6
============================================================
Number of dialogues with N domains:
0 domain(s): 417 dialogues
1 domain(s): 3275 dialogues
2 domain(s): 5229 dialogues
3 domain(s): 1485 dialogues
4 domain(s): 31 dialogues
