# File set up
 - use only robot utterances and emotions
 - don't use NONE label -> neutral=0, anger=1, sad=2, happy=3, contentment=4 
      - dialogue model: NONE=0(padding_idx), neutral=1, anger=2, sad=3, happy=4, contentment=5

In [1]:
import os
import glob
import re
import tqdm
from collections import Counter
import numpy as np
from functools import reduce

## Modify to not use the NONE label for recognition
 - change 0~5 labels -> 0~4 labels (excluding NONE)
 - explicit robot utterances and emotions

In [2]:
def load_files(from_file, out_file):
    """Copy a dialogue/emotion file pair, dropping rows labelled NONE.

    Reads ``<from_file>_dial.txt`` and ``<from_file>_emotion.txt`` line by
    line. The second tab-separated field of each emotion line is used as the
    label. Rows whose label is ``"0"`` are dropped and every remaining label
    is decremented by one (1..5 -> 0..4).

    Emotion lines that do not have exactly two tab-separated fields are
    skipped together with the dialogue line at the same position, so the two
    output files stay aligned line by line.

    Args:
        from_file: Path prefix of the input pair (without the ``_dial.txt`` /
            ``_emotion.txt`` suffix).
        out_file: Path prefix of the output pair.

    Raises:
        ValueError: If a kept label is not an integer string.
    """
    with open(from_file + '_dial.txt', 'r') as dial_in, \
            open(from_file + '_emotion.txt', 'r') as emotion_in:
        dials = dial_in.readlines()
        emotions = emotion_in.readlines()

    out_dials = []
    out_emotions = []
    # Walk both files in lockstep; filtering only one of them beforehand
    # would shift every later dialogue/label pair.
    for uttr, emotion_line in zip(dials, emotions):
        fields = emotion_line.split('\t')
        if len(fields) != 2:
            continue
        label = fields[1].strip()
        if label == "0":
            continue
        out_dials.append(uttr)
        out_emotions.append(str(int(label) - 1))

    with open(out_file + '_dial.txt', 'w') as dial_out, \
            open(out_file + '_emotion.txt', 'w') as emotion_out:
        dial_out.writelines(out_dials)
        # One label per line, without a trailing newline (unchanged format).
        emotion_out.write('\n'.join(out_emotions))

In [None]:
# Convert every domain/split pair under from_dir with load_files and store
# the result under the same relative path in out_dir.
from_dir = "../data/em_dial/splitted"
out_dir = "../emotion_dialogue_model/em_recog/data/em_robo/splitted"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game", "/all"]
last_part = ["/test", "/valid", "/train"]

for d in domain:
    target_dir = out_dir + d
    # Create the per-domain output directory on first use.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for l in last_part:
        load_files(from_dir + d + l, target_dir + l)

In [3]:
# Same conversion as for the plain splits, applied to the over-sampled data.
from_dir_over = "../data/em_dial/splitted/over_sample"
out_dir_over = "../emotion_dialogue_model/em_recog/data/em_robo/splitted/over_sample"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game", "/all"]
last_part = ["/test", "/valid", "/train"]

for d in domain:
    target_dir = out_dir_over + d
    # Create the per-domain output directory on first use.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for l in last_part:
        load_files(from_dir_over + d + l, target_dir + l)

# File check of emotions
* Some dialogues are badly formatted (not a user-system pair, e.g. user-system-user-user-user...)

In [34]:
def check_files(read_file):
    """Print every emotion line that is not a two-field tab-separated pair.

    For each offending line of ``<read_file>_emotion.txt`` the line index and
    the line itself are printed, followed by the line at the same index of
    ``<read_file>_dial.txt``.

    Args:
        read_file: Path prefix of the pair (without the ``_dial.txt`` /
            ``_emotion.txt`` suffix).
    """
    with open(read_file + '_dial.txt', 'r') as dial_f, \
            open(read_file + '_emotion.txt', 'r') as emotion_f:
        dials = dial_f.readlines()
        emotions = emotion_f.readlines()

    for i, r in enumerate(emotions):
        if len(r.split('\t')) == 2:
            continue
        print(i, r)
        if i < len(dials):
            print(dials[i])
        else:
            # The emotion file is longer than the dialogue file; keep
            # reporting instead of stopping with an IndexError.
            print("<no dialogue line {}>".format(i))

In [35]:
# Run the format check on every domain/split pair of the source data.
from_dir = "../data/em_dial/splitted"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game", "/all"]
last_part = ["/test", "/valid", "/train"]

pairs = [(d, l) for d in domain for l in last_part]
for d, l in pairs:
    # Print the pair first so any reported line can be attributed to it.
    print(d, l)
    check_files(from_dir + d + l)

/cleaning /test
/cleaning /valid
/cleaning /train
/exercise /test
/exercise /valid
/exercise /train
/lunch /test
/lunch /valid
/lunch /train
/sleep /test
/sleep /valid
/sleep /train
/game /test
/game /valid
/game /train
/all /test
/all /valid
/all /train


# Add em_word to the end of each em_dial line
* try adding kaomoji (emoticons) in training to confirm how emotions are reflected

In [7]:
# Emotion label id -> tag word appended to an utterance (NONE gets no tag).
id2emWord = {
    "0": "",
    "1": "#NEU",
    "2": "#ANG",
    "3": "#SAD",
    "4": "#HAP",
    "5": "#CON",
}
# Emotion label id -> human-readable emotion name.
id2emo = {
    "0": "NONE",
    "1": "neutral",
    "2": "anger",
    "3": "sad",
    "4": "happy",
    "5": "contentment",
}

In [8]:
def print_num_emotion(count):
    """Print one ``name : count`` line for each of the labels "0" to "5".

    Args:
        count: Mapping from label string to count; it is indexed with every
            label "0".."5".
    """
    for label in map(str, range(6)):
        print("   {} : {}".format(id2emo[label], count[label]))

def add_lastEm(from_file, out_file):
    """Append the emotion tag word to the end of every dialogue line.

    Reads ``<from_file>_dial.txt`` and ``<from_file>_emotion.txt``. The second
    tab-separated field of each emotion line is looked up in ``id2emWord`` and
    the result is appended (after a space) to the dialogue line at the same
    position. The tagged dialogue goes to ``<out_file>_dial.txt`` and the
    emotion lines are copied unchanged to ``<out_file>_emotion.txt``. The
    label counts are printed with ``print_num_emotion``.

    Args:
        from_file: Path prefix of the input pair.
        out_file: Path prefix of the output pair.

    Raises:
        ValueError: If an emotion line does not have exactly two
            tab-separated fields. Nothing is written in that case.
        KeyError: If a label is not a key of ``id2emWord``.
    """
    with open(from_file + '_dial.txt', 'r') as dial_in, \
            open(from_file + '_emotion.txt', 'r') as emotion_in:
        dials = dial_in.readlines()
        emotions = emotion_in.readlines()

    # Validate before opening the outputs so a bad input does not leave
    # truncated output files behind.
    for i, r in enumerate(emotions):
        if len(r.split('\t')) != 2:
            print(i, r)
            raise ValueError(
                "emotion line {} of {}_emotion.txt does not have two "
                "tab-separated fields: {!r}".format(i, from_file, r))

    robo_emotions = [emotion.split('\t')[1].strip() for emotion in emotions]
    print_num_emotion(Counter(robo_emotions))

    out_dials = [
        uttr.strip('\n') + " " + id2emWord[emotion] + '\n'
        for uttr, emotion in zip(dials, robo_emotions)
    ]

    with open(out_file + '_dial.txt', 'w') as dial_out, \
            open(out_file + '_emotion.txt', 'w') as emotion_out:
        dial_out.writelines(out_dials)
        emotion_out.writelines(emotions)

In [13]:
# Write an em_word-tagged copy of every domain/split pair under lastEm/.
from_dir = "../data/em_dial/splitted"
out_lastEm_dir = "../data/em_dial/splitted/lastEm"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game", "/all"]
last_part = ["/test", "/valid", "/train"]

for d in domain:
    lastEm_domain_dir = out_lastEm_dir + d
    if not os.path.exists(lastEm_domain_dir):
        os.makedirs(lastEm_domain_dir)
    print("\ndomain : ", d)
    for l in last_part:
        src_prefix = from_dir + d + l
        dst_prefix = lastEm_domain_dir + l
        print("input file : ", src_prefix)
        print("output file : ", dst_prefix)
        add_lastEm(src_prefix, dst_prefix)


domain :  /cleaning
input file :  ../data/em_dial/splitted/cleaning/test
output file :  ../data/em_dial/splitted/lastEm/cleaning/test
   NONE : 37
   neutral : 25
   anger : 75
   sad : 46
   happy : 18
   contentment : 4
input file :  ../data/em_dial/splitted/cleaning/valid
output file :  ../data/em_dial/splitted/lastEm/cleaning/valid
   NONE : 29
   neutral : 31
   anger : 48
   sad : 67
   happy : 27
   contentment : 1
input file :  ../data/em_dial/splitted/cleaning/train
output file :  ../data/em_dial/splitted/lastEm/cleaning/train
   NONE : 265
   neutral : 318
   anger : 412
   sad : 462
   happy : 199
   contentment : 28

domain :  /exercise
input file :  ../data/em_dial/splitted/exercise/test
output file :  ../data/em_dial/splitted/lastEm/exercise/test
   NONE : 22
   neutral : 63
   anger : 37
   sad : 43
   happy : 29
   contentment : 5
input file :  ../data/em_dial/splitted/exercise/valid
output file :  ../data/em_dial/splitted/lastEm/exercise/valid
   NONE : 22
   neutral 

* Concatenate twitter_dial_txt and em_dial_txt (including em_word) for the vocabulary ・・・ data/twitter_dial/vocab/spvocab_tw_label.model

In [15]:
def concat_dials(from_file1, from_file2, out_file):
    """Write the lines of two text files, one after the other, to a third.

    The line counts of both inputs and of the result are printed.

    Args:
        from_file1: Path of the file whose lines come first.
        from_file2: Path of the file whose lines come second.
        out_file: Path of the file to write.
    """
    with open(from_file1, 'r') as file1_dial:
        dials1 = file1_dial.readlines()
    print("length of from_file1 : ", len(dials1))
    with open(from_file2, 'r') as file2_dial:
        dials2 = file2_dial.readlines()
    print("length of from_file2 : ", len(dials2))

    # If the first file does not end with a newline, its last line would be
    # glued to the first line of the second file.
    if dials1 and not dials1[-1].endswith('\n'):
        dials1[-1] += '\n'

    both = dials1 + dials2
    print("length of output_file : ", len(both))

    with open(out_file, 'w') as file3_dial:
        file3_dial.writelines(both)

In [14]:
# Concatenate the over-sampled, em_word-tagged emotion dialogues with the
# Twitter dialogues into one text file.
# NOTE(review): the output path ends in ".model" although concat_dials writes
# plain concatenated text - confirm the intended file name.
# NOTE(review): the recorded NameError suggests this cell was last run before
# concat_dials was defined - re-run after the definition cell.
em_kao_file = "../data/em_dial/splitted/lastEm/over_sample/all/train_dial.txt"
twitter_file = "../data/twitter_dial/splitted/train_dial.txt"
out_vocab_file = "../data/twitter_dial/vocab/spvocab_tw_label.model"
concat_dials(em_kao_file, twitter_file, out_vocab_file)

NameError: name 'concat_dials' is not defined

* Build vocabulary command
~~~
python ../preprocess_em/build_spvocab.py \
    --input ../data/em_dial/vocab/tw_emkao/tw_emkao_train.txt \
    --model ../data/em_dial/vocab/tw_emkao/spvocab \
    --size 32000 \
    --coverage 0.9995
~~~

* Confirm that the vocab was built successfully

In [None]:
# Load the trained SentencePiece model and round-trip each tag string through
# encode/decode to see how it is tokenized.
conf_vocab_model = "../data/em_dial/vocab/tw_emkao/spvocab.model"

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load(conf_vocab_model)
# NOTE(review): id2kao is not defined in the visible part of this notebook
# (only id2emWord and id2emo are) - confirm which mapping is meant here.
for kao in id2kao.values():
    encoded = sp.EncodeAsIds(kao)
    print("encoded {} : {}".format(kao,encoded))
    decoded = sp.DecodeIds(encoded)
    print("decoded {} : {}".format(kao,decoded))    

# Over sampling
* Because em_dial has an unbalanced number of emotion labels, we should oversample

In [16]:
def over_sampling(from_file, out_file):
    """Duplicate rows so that every present label occurs equally often.

    Reads ``<from_file>_dial.txt`` and ``<from_file>_emotion.txt``; the label
    of a row is the second tab-separated field of its emotion line. Every row
    is written ``freq // count`` times, where ``freq`` is the count of the
    most frequent label and ``count`` the count of the row's own label. The
    first ``freq % count`` rows of each label are written once more, so each
    label that occurs ends up with exactly ``freq`` rows.

    Labels in "0".."5" that do not occur at all are reported with 0 copies
    and 0 remainder instead of causing a division by zero. Empty input
    produces empty output files.

    Args:
        from_file: Path prefix of the input pair.
        out_file: Path prefix of the output pair.

    Raises:
        IndexError: If an emotion line has no second tab-separated field, or
            a label is an integer outside 0..5.
        ValueError: If a label is not an integer string.
    """
    with open(from_file + '_dial.txt', 'r') as dial_in, \
            open(from_file + '_emotion.txt', 'r') as emotion_in:
        dials = dial_in.readlines()
        emotions = emotion_in.readlines()

    robo_emotions = [emotion.split('\t')[1].strip() for emotion in emotions]
    count_emo = Counter(robo_emotions)
    # freq_emo is the count of the most frequent label (0 for empty input).
    freq_emo = max(count_emo.values(), default=0)
    print("Count emotions : ", count_emo)

    times = []  # whole number of copies per row, indexed by label
    plas = []   # rows per label that still need one extra copy
    for i in range(6):
        n = count_emo[str(i)]
        if n == 0:
            times.append(0)
            plas.append(0)
        else:
            times.append(freq_emo // n)
            plas.append(freq_emo - (freq_emo // n) * n)
    print("times : ", times)
    print("plas : ", plas)

    out_dials = []
    out_emotions = []
    for uttr, emotion_line, label in zip(dials, emotions, robo_emotions):
        idx = int(label)
        copies = times[idx]
        if plas[idx] != 0:
            # Spend one unit of the remainder on this row.
            copies += 1
            plas[idx] -= 1
        out_dials.extend([uttr] * copies)
        out_emotions.extend([emotion_line] * copies)

    with open(out_file + '_dial.txt', 'w') as dial_out, \
            open(out_file + '_emotion.txt', 'w') as emotion_out:
        dial_out.writelines(out_dials)
        emotion_out.writelines(out_emotions)

In [33]:
# Over-sample every domain/split pair and store it under over_sample/.
from_dir = "../data/em_dial/splitted"
out_over_dir = "../data/em_dial/splitted/over_sample"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game", "/all"]
last_part = ["/test", "/valid", "/train"]

for d in domain:
    over_domain_dir = out_over_dir + d
    if not os.path.exists(over_domain_dir):
        os.makedirs(over_domain_dir)
    for l in last_part:
        dst_prefix = over_domain_dir + l
        print("output file : ", dst_prefix)
        over_sampling(from_dir + d + l, dst_prefix)

output file :  ../data/em_dial/splitted/over_sample/cleaning/test
Count emotions :  Counter({'2': 75, '3': 46, '0': 37, '1': 25, '4': 18, '5': 4})
times :  [2, 3, 1, 1, 4, 18]
plas :  [1, 0, 0, 29, 3, 3]
output file :  ../data/em_dial/splitted/over_sample/cleaning/valid
Count emotions :  Counter({'3': 67, '2': 48, '1': 31, '0': 29, '4': 27, '5': 1})
times :  [2, 2, 1, 1, 2, 67]
plas :  [9, 5, 19, 0, 13, 0]
output file :  ../data/em_dial/splitted/over_sample/cleaning/train
Count emotions :  Counter({'3': 462, '2': 412, '1': 318, '0': 265, '4': 199, '5': 28})
times :  [1, 1, 1, 1, 2, 16]
plas :  [197, 144, 50, 0, 64, 14]
output file :  ../data/em_dial/splitted/over_sample/exercise/test
Count emotions :  Counter({'1': 63, '3': 43, '2': 37, '4': 29, '0': 22, '5': 5})
times :  [2, 1, 1, 1, 2, 12]
plas :  [19, 0, 26, 20, 5, 3]
output file :  ../data/em_dial/splitted/over_sample/exercise/valid
Count emotions :  Counter({'1': 90, '3': 37, '4': 36, '2': 28, '0': 22, '5': 9})
times :  [4, 1, 3, 

# Make files with over sampling and em_word

In [17]:
# Write an em_word-tagged copy of every over-sampled domain/split pair.
from_dir = "../data/em_dial/splitted/over_sample"
out_over_lastEm_dir = "../data/em_dial/splitted/lastEm/over_sample"
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
last_part = ["/test","/valid","/train"]

for d in domain:
    if not os.path.exists(out_over_lastEm_dir+d):
        os.makedirs(out_over_lastEm_dir+d)
    for l in last_part:
        print("output file : ", out_over_lastEm_dir+d+l)   
        # add_kaomozi is not defined in this notebook; the tagging function
        # is add_lastEm, whose per-label count printout matches the output
        # recorded for this cell.
        add_lastEm(from_dir+d+l, out_over_lastEm_dir+d+l)

output file :  ../data/em_dial/splitted/lastEm/over_sample/cleaning/test
   NONE : 75
   neutral : 75
   anger : 75
   sad : 75
   happy : 75
   contentment : 75
output file :  ../data/em_dial/splitted/lastEm/over_sample/cleaning/valid
   NONE : 67
   neutral : 67
   anger : 67
   sad : 67
   happy : 67
   contentment : 67
output file :  ../data/em_dial/splitted/lastEm/over_sample/cleaning/train
   NONE : 462
   neutral : 462
   anger : 462
   sad : 462
   happy : 462
   contentment : 462
output file :  ../data/em_dial/splitted/lastEm/over_sample/exercise/test
   NONE : 63
   neutral : 63
   anger : 63
   sad : 63
   happy : 63
   contentment : 63
output file :  ../data/em_dial/splitted/lastEm/over_sample/exercise/valid
   NONE : 90
   neutral : 90
   anger : 90
   sad : 90
   happy : 90
   contentment : 90
output file :  ../data/em_dial/splitted/lastEm/over_sample/exercise/train
   NONE : 498
   neutral : 498
   anger : 498
   sad : 498
   happy : 498
   contentment : 498
output file 

# Make domain-label files for a model that recognizes the domain

In [2]:
def domain_id(from_file, domain, last, out_file):
    """Write one domain index per dialogue line to ``<out_file>_domain.txt``.

    For each entry ``d`` of ``domain`` the lines of
    ``from_file + d + last + "_dial.txt"`` are counted. The output contains
    the position of ``d`` in ``domain`` repeated once per counted line, in
    ``domain`` order, one index per line and without a trailing newline.

    Args:
        from_file: Directory prefix of the per-domain files.
        domain: Sequence of domain path parts (e.g. "/cleaning").
        last: Split path part (e.g. "/train").
        out_file: Path prefix of the output file.
    """
    num_dial_list = []
    for d in domain:
        with open(from_file + d + last + "_dial.txt", 'r') as dial_f:
            num_dial_list.append(sum(1 for _ in dial_f))
    print(num_dial_list)

    num_dial_domain = [
        str(index)
        for index, num in enumerate(num_dial_list)
        for _ in range(num)
    ]
    print("length of num_dial_domain : ", len(num_dial_domain))

    with open(out_file + "_domain.txt", 'w') as out_dial:
        out_dial.write("\n".join(num_dial_domain))

In [36]:
# Build the domain-label files for the over-sampled recognition data
# (the "/all" pseudo-domain is not part of the domain list here).
from_file = "../emotion_dialogue_model/em_recog/data/em_robo/splitted/over_sample"
out_recog_dir = "../emotion_dialogue_model/em_recog/data/em_robo/splitted/over_sample/all"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game"]
last_part = ["/test", "/valid", "/train"]

if not os.path.exists(out_recog_dir):
    os.makedirs(out_recog_dir)

for l in last_part:
    print("output file : ", out_recog_dir)
    domain_id(from_file, domain, l, out_recog_dir + l)

output file :  ../emotion_dialogue_model/em_recog/data/em_robo/splitted/over_sample/all
[375, 315, 340, 300, 245]
length of num_dial_domain :  1575
output file :  ../emotion_dialogue_model/em_recog/data/em_robo/splitted/over_sample/all
[335, 450, 315, 360, 325]
length of num_dial_domain :  1785
output file :  ../emotion_dialogue_model/em_recog/data/em_robo/splitted/over_sample/all
[2310, 2490, 2175, 2560, 2480]
length of num_dial_domain :  12015


In [5]:
# Build the domain-label files for the plain (not over-sampled) recognition
# data (the "/all" pseudo-domain is not part of the domain list here).
from_file = "../emotion_dialogue_model/em_recog/data/em_robo/splitted/"
out_recog_dir = "../emotion_dialogue_model/em_recog/data/em_robo/splitted/all"
domain = ["/cleaning", "/exercise", "/lunch", "/sleep", "/game"]
last_part = ["/test", "/valid", "/train"]

if not os.path.exists(out_recog_dir):
    os.makedirs(out_recog_dir)

for l in last_part:
    print("output file : ", out_recog_dir)
    domain_id(from_file, domain, l, out_recog_dir + l)

output file :  ../emotion_dialogue_model/em_recog/data/em_robo/splitted/all
[183, 170, 157, 175, 176]
length of num_dial_domain :  861
output file :  ../emotion_dialogue_model/em_recog/data/em_robo/splitted/all
[161, 162, 178, 184, 173]
length of num_dial_domain :  858
output file :  ../emotion_dialogue_model/em_recog/data/em_robo/splitted/all
[1417, 1446, 1384, 1417, 1373]
length of num_dial_domain :  7037


# Split words of inferences to calculate BLEU

In [9]:
import MeCab
# Shared tagger used by split_sentences; "-Owakati" requests MeCab's
# space-separated word output.
mecab = MeCab.Tagger("-Owakati")


def split_sentences(input_file, output_file):
    """Write a word-segmented copy of ``input_file`` to ``output_file``.

    Each line is passed through the module-level ``mecab`` tagger, except
    lines whose fourth-from-last character is ``"#"``: those are written with
    their last four characters removed and without segmentation. Lines
    shorter than four characters are segmented like any other line.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the segmented file to write.
    """
    with open(input_file, "r") as in_f:
        input_line = in_f.readlines()

    output_line = []
    for line in input_line:
        # NOTE(review): readlines() keeps the trailing newline, so a line
        # ending in a 4-character tag such as "#NEU" has "#" at index -5, not
        # -4. This test therefore only matches a final line without a newline
        # (or a 3-character tag), and matched lines lose their newline and are
        # not segmented. Confirm the intended tag handling before changing it.
        if len(line) >= 4 and line[-4] == "#":
            output_line.append(line[:-4])
        else:
            output_line.append(mecab.parse(line))

    with open(output_file, "w") as output_f:
        output_f.writelines(output_line)

In [34]:
# Path parts of the per-emotion inference files, in label order.
emotion_list = ["/NONE", "/neutral", "/anger", "/sad", "/happy", "/contentment"]


def split_EachEmotionFiles(input_dir):
    """Segment every per-emotion inference file and the reference file.

    For each entry of ``emotion_list`` the file ``<emotion>_inf.txt`` in
    ``input_dir`` is converted to ``<emotion>_inf_wakati.txt``; afterwards
    ``/reference.txt`` is converted to ``/reference_wakati.txt``.
    """
    print("output directory name : \n\t" + input_dir)
    for emotion in emotion_list:
        wakati_name = emotion + "_inf_wakati.txt"
        split_sentences(input_dir + emotion + "_inf.txt", input_dir + wakati_name)
        print("\t\t" + wakati_name)
    split_sentences(input_dir + "/reference.txt", input_dir + "/reference_wakati.txt")

In [40]:
import sys
def concat_files_allemotions(input_dir):
    """Concatenate the segmented per-emotion inference files of ``input_dir``.

    Reads ``<emotion>_inf_wakati.txt`` for every entry of ``emotion_list``
    except the first (NONE) and writes, inside ``input_dir``:

    * ``/allem_inf_wakati.txt``: all read files, in ``emotion_list`` order
    * ``/AngSadHapCon_inf_wakati.txt``: the same without the neutral file
    * ``/ref_times5_wakati.txt``: ``/reference_wakati.txt`` repeated 5 times
    * ``/neu_times4_wakati.txt``: the neutral file repeated 4 times

    Args:
        input_dir: Directory holding the segmented inference files.

    Raises:
        SystemExit: If the second entry of ``emotion_list`` is not
            "/neutral" (after printing "Error").
    """
    file_Emotions = []
    file_neutral = []
    file_AngSadHapCon = []
    # NONE files are not used (because NONE isn't used to build systems).
    for i, emotion in enumerate(emotion_list[1:]):
        # Read each inference file once and reuse the lines.
        with open(input_dir + emotion + "_inf_wakati.txt", "r") as inf_f:
            lines = inf_f.readlines()
        file_Emotions += lines
        if i == 0:
            if emotion != "/neutral":
                print("Error")
                sys.exit()
            file_neutral += lines
        else:
            file_AngSadHapCon += lines
        # TODO(review): an unfinished fragment used to open
        # input_dir+"ref_emotion.txt" here (no "/" separator), compare each
        # line with the integer loop index (never equal) and reference an
        # undefined name. It had no effect other than failing when that file
        # is missing, so it was removed; re-add it once its purpose is
        # settled.

    print("output directory name (concat inferences): \n\t" + input_dir)
    with open(input_dir + "/allem_inf_wakati.txt", "w") as out_f:
        out_f.writelines(file_Emotions)
    with open(input_dir + "/AngSadHapCon_inf_wakati.txt", "w") as out_f:
        out_f.writelines(file_AngSadHapCon)
    with open(input_dir + "/reference_wakati.txt", "r") as ref_f:
        ref_file = ref_f.readlines()
    with open(input_dir + "/ref_times5_wakati.txt", "w") as out_f:
        out_f.writelines(ref_file * 5)
    with open(input_dir + "/neu_times4_wakati.txt", "w") as out_f:
        out_f.writelines(file_neutral * 4)
    print("\t\t" + "/allem_inf_wakati.txt")
    print("\t\t" + "/AngSadHapCon_inf_wakati.txt")
    print("\t\t" + "/ref_times5_wakati.txt")
    print("\t\t" + "/neu_times4_wakati.txt")

### Split words of output (not last Em_word)
* files from models trained with oversampling on all domains and on a specific domain

In [41]:
# Output directories of the models without the trailing em_word, one per domain.
input_dir_lists = [
    "../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_cleaning_5",
    "../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_exercise_4",
    "../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_lunch_8",
    "../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_sleep_3",
    "../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_game_5",
]

# Segment the inference files first, then build the concatenated files.
for input_dir in input_dir_lists:
    split_EachEmotionFiles(input_dir)
    concat_files_allemotions(input_dir)

output directory name : 
	../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_cleaning_5
		/NONE_inf_wakati.txt
		/neutral_inf_wakati.txt
		/anger_inf_wakati.txt
		/sad_inf_wakati.txt
		/happy_inf_wakati.txt
		/contentment_inf_wakati.txt
output directory name (concat inferences): 
	../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_cleaning_5
		/allem_inf_wakati.txt
		/AngSadHapCon_inf_wakati.txt
		/ref_times5_wakati.txt
		/neu_times4_wakati.txt
output directory name : 
	../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_exercise_4
		/NONE_inf_wakati.txt
		/neutral_inf_wakati.txt
		/anger_inf_wakati.txt
		/sad_inf_wakati.txt
		/happy_inf_wakati.txt
		/contentment_inf_wakati.txt
output directory name (concat inferences): 
	../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall14_exercise_4
		/allem_inf_wakati.txt
		/AngSadHapCon_inf_wakati.txt
		/ref_times5_wakati.txt
		/neu_times4_w

In [44]:
# Output directories of the overall models, one per emotion_<size> setting.
input_dir_lists_emSize = [
    "../emotion_dialogue_model/output/emotion_6/overall/output_6emoTw21_overall_16",
    "../emotion_dialogue_model/output/emotion_32/overall/output_32emoTw19_overall_17",
    "../emotion_dialogue_model/output/emotion_64/overall/output_64emoTw21_overall_14",
    "../emotion_dialogue_model/output/emotion_96/overall/output_96emoTw21_overall_17",
    "../emotion_dialogue_model/output/emotion_128/overall/output_128emoTw17_overall_16",
]

# Segment the inference files first, then build the concatenated files.
for input_dir in input_dir_lists_emSize:
    split_EachEmotionFiles(input_dir)
    concat_files_allemotions(input_dir)

output directory name : 
	../emotion_dialogue_model/output/emotion_6/overall/output_6emoTw21_overall_16
		/NONE_inf_wakati.txt
		/neutral_inf_wakati.txt
		/anger_inf_wakati.txt
		/sad_inf_wakati.txt
		/happy_inf_wakati.txt
		/contentment_inf_wakati.txt
output directory name (concat inferences): 
	../emotion_dialogue_model/output/emotion_6/overall/output_6emoTw21_overall_16
		/allem_inf_wakati.txt
		/AngSadHapCon_inf_wakati.txt
		/ref_times5_wakati.txt
		/neu_times4_wakati.txt
output directory name : 
	../emotion_dialogue_model/output/emotion_32/overall/output_32emoTw19_overall_17
		/NONE_inf_wakati.txt
		/neutral_inf_wakati.txt
		/anger_inf_wakati.txt
		/sad_inf_wakati.txt
		/happy_inf_wakati.txt
		/contentment_inf_wakati.txt
output directory name (concat inferences): 
	../emotion_dialogue_model/output/emotion_32/overall/output_32emoTw19_overall_17
		/allem_inf_wakati.txt
		/AngSadHapCon_inf_wakati.txt
		/ref_times5_wakati.txt
		/neu_times4_wakati.txt
output directory name : 
	../emoti

### Split words of output (use last Em_word)
* files from models trained with oversampling on all domains with lastEm and on a specific domain with lastEm

In [43]:
# Output directories of the lastEm models, one per domain.
input_dir_lists2 = [
    "../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_cleaning_5",
    "../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_exercise_7",
    "../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_lunch_11",
    "../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_sleep_4",
    "../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_game_4",
]

# Segment the inference files first, then build the concatenated files.
for input_dir in input_dir_lists2:
    split_EachEmotionFiles(input_dir)
    concat_files_allemotions(input_dir)

output directory name : 
	../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_cleaning_5
		/NONE_inf_wakati.txt
		/neutral_inf_wakati.txt
		/anger_inf_wakati.txt
		/sad_inf_wakati.txt
		/happy_inf_wakati.txt
		/contentment_inf_wakati.txt
output directory name (concat inferences): 
	../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_cleaning_5
		/allem_inf_wakati.txt
		/AngSadHapCon_inf_wakati.txt
		/ref_times5_wakati.txt
		/neu_times4_wakati.txt
output directory name : 
	../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_exercise_7
		/NONE_inf_wakati.txt
		/neutral_inf_wakati.txt
		/anger_inf_wakati.txt
		/sad_inf_wakati.txt
		/happy_inf_wakati.txt
		/contentment_inf_wakati.txt
output directory name (concat inferences): 
	../emotion_dialogue_model/output/emotion_64/lastEm/overall/output_64emoTw21_overallEm27_exercise_7
		/allem_inf_wakati.txt
		/AngSadHapCon_inf_wakati.txt
		/r

### Split per speaker
* (system_uttr1 user_uttr system_uttr2) -> (system_uttr1) and (user_uttr), (system_uttr2)


In [23]:
# Root directory of the three-turn (system, user, system) dialogue data.
sys_user_sys_dir = "../data/em_dial/splitted/sys_user_sys"
# Per-domain sub-directories; the driver loop below skips the last entry.
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
# File-name prefixes of the data splits inside each domain directory.
last_part = ["/all","/train","/valid","/test"]

def split_per_speaker(from_path, output_sys1, output_user, output_sys2):
    """Split three-column dialogue files into one file set per speaker turn.

    Reads ``<from_path>_dial.txt``, ``<from_path>_accept.txt`` and
    ``<from_path>_emotion.txt`` (tab-separated) and writes one column per
    output file, one value per line:

    * ``<output_sys1>_dial.txt`` / ``_emotion.txt``: dial column 0, emotion
      column 1
    * ``<output_user>_dial.txt`` / ``_accept.txt`` / ``_emotion.txt``: dial
      column 1, accept column 1, emotion column 0
    * ``<output_sys2>_dial.txt`` / ``_emotion.txt``: dial column 2, emotion
      column 2

    Args:
        from_path: Path prefix of the input files.
        output_sys1: Path prefix for the first system turn.
        output_user: Path prefix for the user turn.
        output_sys2: Path prefix for the second system turn.

    Raises:
        IndexError: If a line has fewer columns than the ones read above.
    """
    def read_rows(suffix):
        # Strip the newline before splitting so the last column does not
        # carry it (which would produce blank lines in the output).
        with open(from_path + suffix, "r") as in_f:
            return [line.rstrip("\n").split("\t") for line in in_f]

    def write_column(path, rows, index):
        with open(path, "w") as out_f:
            out_f.writelines(row[index] + "\n" for row in rows)

    print("from_path", from_path)
    dial_rows = read_rows("_dial.txt")
    acc_rows = read_rows("_accept.txt")
    emo_rows = read_rows("_emotion.txt")
    print("Number of dialogue : ", len(dial_rows))

    # NOTE(review): the emotion columns are read as (user, system1, system2)
    # while the dialogue columns are (system1, user, system2) - confirm this
    # matches the layout of the emotion file.
    write_column(output_sys1 + "_dial.txt", dial_rows, 0)
    write_column(output_sys1 + "_emotion.txt", emo_rows, 1)
    write_column(output_user + "_dial.txt", dial_rows, 1)
    write_column(output_user + "_accept.txt", acc_rows, 1)
    write_column(output_user + "_emotion.txt", emo_rows, 0)
    # The second system turn is dialogue column 2 (the first system turn used
    # to be written here by mistake).
    write_column(output_sys2 + "_dial.txt", dial_rows, 2)
    write_column(output_sys2 + "_emotion.txt", emo_rows, 2)

In [24]:
import os

In [25]:
# Split every split/domain combination into per-speaker files; the last
# domain entry is skipped.
for i, l in enumerate(last_part):
    print("\ttype of data : "+l)
    for j, d in enumerate(domain[:-1]):
        print("Domain : "+d)
        input_dir = sys_user_sys_dir + d
        output_sys1 = input_dir + "/system1"
        output_user = input_dir + "/user"
        output_sys2 = input_dir + "/system2"

        # Make sure the three per-speaker directories exist.
        for speaker_dir in (output_sys1, output_user, output_sys2):
            if not os.path.exists(speaker_dir):
                os.makedirs(speaker_dir)

        split_per_speaker(input_dir + l, output_sys1 + l, output_user + l, output_sys2 + l)

	type of data : /all
Domain : /cleaning
from_path ../data/em_dial/splitted/sys_user_sys/cleaning/all
Number of dialogue :  2092
Domain : /exercise
from_path ../data/em_dial/splitted/sys_user_sys/exercise/all
Number of dialogue :  2019
Domain : /lunch
from_path ../data/em_dial/splitted/sys_user_sys/lunch/all
Number of dialogue :  1985
Domain : /sleep
from_path ../data/em_dial/splitted/sys_user_sys/sleep/all
Number of dialogue :  1980
Domain : /game
from_path ../data/em_dial/splitted/sys_user_sys/game/all
Number of dialogue :  1953
	type of data : /train
Domain : /cleaning
from_path ../data/em_dial/splitted/sys_user_sys/cleaning/train
Number of dialogue :  1658
Domain : /exercise
from_path ../data/em_dial/splitted/sys_user_sys/exercise/train
Number of dialogue :  1613
Domain : /lunch
from_path ../data/em_dial/splitted/sys_user_sys/lunch/train
Number of dialogue :  1588
Domain : /sleep
from_path ../data/em_dial/splitted/sys_user_sys/sleep/train
Number of dialogue :  1583
Domain : /game
fr