In [1]:
import os
import json
from collections import OrderedDict
from copy import deepcopy
import nltk
import re
from difflib import SequenceMatcher

Dataset: KVRET, the Stanford multi-turn, multi-domain task-oriented dialogue dataset — https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/

In [2]:
# Quick look at the POS tag NLTK gives a lone two-letter token; make_iob
# consults nltk.pos_tag on exactly this kind of short token.
nltk.pos_tag(['to'])

[('to', 'TO')]

In [3]:
def make_iob(phrase,goal_tracker):
    """Tag the tokens of ``phrase`` with IOB labels for the slot values it contains.

    The phrase is lowercased and tokenized with ``nltk.word_tokenize``. Slots
    whose value is ``None`` or does not occur (case-insensitively) in the
    phrase are ignored; the remaining slots are processed in the order their
    values first appear in the phrase. For each slot the first matching token
    is tagged ``B-<key>`` and later matching tokens are tagged ``I-<key>``
    only when they directly follow a token already tagged for the same slot.
    Tagged tokens are masked so a later slot cannot claim them again.

    Args:
        phrase: Utterance text.
        goal_tracker: Mapping of slot key to slot value (``str`` or ``None``).
            A trailing ``-<digits>`` on a key (e.g. ``per-1``) is treated as a
            disambiguation suffix and removed from the emitted tags.

    Returns:
        A list with one tag per token of the lowercased phrase: ``'O'``,
        ``'B-<key>'`` or ``'I-<key>'``.
    """
    # Raw string (``\d`` is not a valid str escape) and anchored, so only a
    # trailing numeric suffix is removed, however many digits it has.
    key_suffix = re.compile(r'-\d+$')
    # Sentinel for consumed tokens; it can never collide with a real token
    # because every real token comes from the lowercased phrase.
    mask = "<MASK>"

    phrase = phrase.lower()

    # Keep only slots whose value literally occurs in the phrase, remembering
    # where, so slots are handled left to right.
    found = []
    for key, value in goal_tracker.items():
        if value is None:
            continue
        value = value.lower()
        if value in phrase:
            found.append((phrase.index(value), key, value))
    found.sort(key=lambda item: item[0])  # stable: ties keep mapping order

    tokens = nltk.word_tokenize(phrase)
    tags = ["O"] * len(tokens)

    for _, key, value in found:
        # Compare whole tokens, not substrings of the value string; otherwise
        # "the" would be tagged for the value "weather".
        value_tokens = set(nltk.word_tokenize(value))
        begin = True
        for i, tok in enumerate(tokens):
            if tok == mask or tok not in value_tokens:
                continue
            # Short tokens that are not the last token are easy false
            # positives: a two-character token must carry a POS tag containing
            # 'N', and in every case the next token must also be in the value.
            if len(tok) <= 2 and i < len(tokens) - 1:
                if len(tok) == 2 and 'N' not in nltk.pos_tag([tok])[0][1]:
                    continue
                elif tokens[i + 1] not in value_tokens:
                    continue
            if begin:
                tags[i] = 'B-' + key
                begin = False
                tokens[i] = mask
            elif i != 0 and tags[i - 1] in ('B-' + key, 'I-' + key):
                # Only continue a span of this same slot, never another one's.
                tags[i] = 'I-' + key
                tokens[i] = mask

    return [key_suffix.sub('', t) for t in tags]


def get_phrase(phrase,value,max_n=4):
    """Return the n-gram of ``phrase`` that best matches ``value``.

    Both inputs are lowercased and tokenized with ``nltk.word_tokenize``.
    Phrase tokens whose POS tag is not purely alphabetic (punctuation tags,
    for example) are dropped. Every n-gram of the remaining tokens, for
    n = 1..``max_n``, is then scored against the value tokens with
    ``difflib.SequenceMatcher.ratio``.

    Args:
        phrase: Utterance text to search in.
        value: Slot value to look for.
        max_n: Largest n-gram size to consider (default 4).

    Returns:
        The best-scoring candidate as a space-joined string. When scores tie,
        the earliest candidate wins (shorter n-grams first, then left to
        right). Returns ``''`` when the phrase yields no candidate tokens.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(phrase.lower()))
    tokens = [word for word, tag in tagged if tag.isalpha()]
    target = nltk.word_tokenize(value.lower())

    candidates = []
    for n in range(1, max_n + 1):
        candidates.extend(nltk.ngrams(tokens, n))

    # A phrase made only of punctuation (or an empty one) has no candidates;
    # return an empty match instead of raising IndexError.
    if not candidates:
        return ''

    # max() keeps the first maximal element, which matches a stable
    # descending sort followed by taking element 0.
    best = max(candidates, key=lambda c: SequenceMatcher(None, c, target).ratio())
    return " ".join(best)

In [4]:
# Smoke test: the phrase is lowercased before matching, and the "-1"/"-2"
# suffixes on the slot keys are stripped from the emitted tags.
make_iob('he is Good',{'per-1':'he','per-2':'good'})

['O', 'O', 'B-per']

In [5]:
# Load the KVRET training dialogues. The context manager closes the file
# handle, and the encoding is explicit because JSON text is UTF-8.
with open('../dataset/kvret/kvret_train_public.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Load the KVRET entity inventory. The context manager closes the file
# handle, and the encoding is explicit because JSON text is UTF-8.
with open('../dataset/kvret/kvret_entities.json', 'r', encoding='utf-8') as f:
    entity = json.load(f)

In [7]:
# Check the container type of the parsed entity file.
type(entity)

dict

In [8]:
# List the entity categories present in the entity file.
entity.keys()

dict_keys(['location', 'weekly_time', 'time', 'weather_attribute', 'agenda', 'poi_type', 'distance', 'traffic_info', 'room', 'event', 'poi', 'party', 'date', 'temperature'])

In [9]:
# Show the top-level keys of the first training example.
data[0].keys()

dict_keys(['scenario', 'dialogue'])

In [10]:
# Print the first dialogue turn by turn as "<speaker>  :  <utterance>".
for turn in data[0]['dialogue']:
    speaker = turn['turn']
    utterance = turn['data']['utterance']
    print(f"{speaker}  :  {utterance}")

driver  :  where's the nearest parking garage
assistant  :  The nearest parking garage is Dish Parking at 550 Alester Ave. Would you like directions there? 
driver  :  Yes, please set directions via a route that avoids all heavy traffic if possible. 
assistant  :  It looks like there is a road block being reported on the route but I will still find the quickest route to 550 Alester Ave. 
driver  :  Thanks so much for your help. 
assistant  :  You're very welcome!


In [11]:
# Re-read the training dialogues so `data` is a fresh copy of the file
# contents. The context manager closes the file handle, and the encoding is
# explicit because JSON text is UTF-8.
with open('../dataset/kvret/kvret_train_public.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [12]:
from tqdm import tqdm

In [46]:
# Processed dialogues, grouped by scenario intent.
diags={'navigate' : [], 'schedule' : [], 'weather' : []}

In [47]:
# Convert every dialogue into a list of turns and file it under its scenario
# intent in `diags`. Driver turns become [tokens, IOB tags, task]; every other
# turn becomes [tokens, 'BOT', 'BOT'].
#
# The slots used to tag a driver utterance are read from the *following* turn.
# They are copied into a new dict rather than edited in place, so `data` is
# left untouched and re-running this cell gives the same result.
for dindex in tqdm(range(len(data))):
    dialogue = data[dindex]['dialogue']
    length = len(dialogue)
    if length <= 1:
        continue

    intent = data[dindex]['scenario']['task']['intent']
    diag = []
    meta_task = ""

    # The last turn is handled separately below, so every turn visited here
    # has a successor at i + 1.
    for i in range(length - 1):
        turn, reply = dialogue[i], dialogue[i + 1]
        phrase = turn['data']['utterance']

        if turn['turn'] != 'driver':
            diag.append([nltk.word_tokenize(phrase), 'BOT', 'BOT'])
            continue

        task = intent
        meta_task = intent
        if phrase == '':
            continue

        raw_slots = reply['data'].get('slots')
        if raw_slots:
            # 'poi' and 'address' are not tagged (an earlier draft considered
            # turning them into '<task>_request_*' labels instead), so skip
            # them before doing the phrase matching.
            slot = {
                k: get_phrase(phrase, v)
                for k, v in raw_slots.items()
                if k not in ('poi', 'address')
            }
        else:
            # Placeholder value that never occurs in an utterance, so every
            # token is tagged 'O'.
            slot = {'dummy' : '<DUMMMMMMYYYYYY!!>'}

        # A driver turn whose reply closes the dialogue is labelled 'thanks'.
        if reply['data']['end_dialogue']:
            task = 'thanks'

        bio = make_iob(phrase, slot)
        diag.append([nltk.word_tokenize(phrase), bio, task])

    # The final turn is always recorded as a BOT turn.
    last = dialogue[-1]['data']['utterance']
    diag.append([nltk.word_tokenize(last), 'BOT', 'BOT'])

    # A dialogue without any driver turn has no task to file it under.
    if not meta_task:
        continue
    diags.setdefault(meta_task, []).append(diag)

100%|██████████| 2425/2425 [00:08<00:00, 277.66it/s]


In [60]:
from itertools import combinations_with_replacement, combinations

In [61]:
# Task domains that can be combined; these match the keys of `diags`.
CASE = ['weather','navigate','schedule']

In [67]:
# Will hold the domain combinations used to stitch dialogues together.
case = []

In [70]:
# Add every 3-element combination (repetition allowed) of the task domains.
case.extend(combinations_with_replacement(CASE, 3))

In [113]:
# Recombined (multi-domain) dialogues are collected here.
recom_diags=[]

In [114]:
# Number of recombined dialogues to generate.
NUM=5000

In [115]:
import random

In [116]:
# Build NUM synthetic conversations by picking a random domain combination and
# concatenating one randomly chosen dialogue per domain in it.
# A dedicated, seeded generator makes the generated dataset reproducible
# across kernel restarts without touching the global `random` state.
RECOMBINE_SEED = 42
rng = random.Random(RECOMBINE_SEED)
for _ in range(NUM):
    diag = []
    for cc in rng.choice(case):
        diag.extend(rng.choice(diags[cc]))
    recom_diags.append(diag)

In [118]:
# Confirm how many recombined dialogues were generated.
len(recom_diags)

5000

In [119]:
import pickle

In [121]:
# Persist the recombined dialogues. The context manager guarantees the file
# is flushed and closed.
with open('context_train.pkl', 'wb') as pkl_file:
    pickle.dump(recom_diags, pkl_file)

In [122]:
# Write the dialogues as text: one line per turn, a blank line after each
# dialogue. BOT turns are written as plain tokens; driver turns are written as
# "tokens|||tags|||task".
with open('train.iob','w',encoding='utf-8') as f:
    for diag in recom_diags:
        for tokens, tags, task in diag:
            sentence = ' '.join(tokens)
            if tags == 'BOT':
                f.write(sentence + '\n')
            else:
                f.write("|||".join([sentence, ' '.join(tags), task]) + "\n")

        f.write("\n")

* Task 단위 분류 및 레이블링
* 오류 수정
* 모델링 (우선 도메인부터)