# File setup
 - Use only the robot's utterances and emotions
 - Don't use the NONE label -> neutral=0, anger=1, sad=2, happy=3, contentment=4
       - Dialogue model labels: NONE=0 (padding_idx), neutral=1, anger=2, sad=3, happy=4, contentment=5

In [1]:
import os
import glob
import re
import tqdm
from collections import Counter
import numpy as np

## Modify the data so the NONE label is not used for recognition
 - Change labels 0~5 -> labels 0~4 (NONE is excluded)
 - Extract robot utterances and emotions

In [30]:
def load_files(from_file, out_file):
    """Copy a dialogue/emotion file pair, dropping samples labelled NONE.

    Reads ``from_file + '_dial.txt'`` and ``from_file + '_emotion.txt'`` in
    lockstep. The second tab-separated field of each emotion line is taken
    as the robot emotion label. Samples whose label is ``"0"`` are dropped
    and the remaining labels are shifted down by one (1..5 -> 0..4).

    Emotion lines that do not have exactly two tab-separated fields are
    skipped *together with* the dialogue line at the same position, so the
    written utterances and labels always stay aligned.

    Args:
        from_file: Input path prefix (without ``_dial.txt`` / ``_emotion.txt``).
        out_file: Output path prefix (same suffixes are appended).

    Output format: ``out_file + '_dial.txt'`` receives the kept dialogue
    lines unchanged; ``out_file + '_emotion.txt'`` receives one shifted label
    per line, newline-separated with no trailing newline.
    """
    # Explicit UTF-8: the corpus is Japanese text, so do not depend on the
    # platform's default locale encoding.
    with open(from_file + '_dial.txt', 'r', encoding='utf-8') as dial_in, \
            open(from_file + '_emotion.txt', 'r', encoding='utf-8') as emotion_in:
        dials = dial_in.readlines()
        emotions = emotion_in.readlines()

    out_dials = []
    out_emotions = []
    for uttr, emotion_line in zip(dials, emotions):
        fields = emotion_line.split('\t')
        if len(fields) != 2:
            # Malformed label line: drop the whole sample, not just the label,
            # otherwise every later utterance would be paired with the wrong
            # emotion.
            continue
        robo_emotion = fields[1].strip()
        if robo_emotion == "0":
            continue
        out_dials.append(uttr)
        out_emotions.append(str(int(robo_emotion) - 1))

    # Outputs are opened only after the inputs were fully read and processed,
    # and are closed (flushed) on exit from the ``with`` block.
    with open(out_file + '_dial.txt', 'w', encoding='utf-8') as dial_out, \
            open(out_file + '_emotion.txt', 'w', encoding='utf-8') as emotion_out:
        dial_out.writelines(out_dials)
        emotion_out.write('\n'.join(out_emotions))

In [33]:
# Input root, output root, and the domain / split sub-paths to convert.
from_dir = "../data/em_dial/splitted"
out_dir = "../emotion_dialogue_model/em_recog/data/em_robo/splitted"
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
last_part = ["/test","/valid","/train"]

# Run load_files once per (domain, split), creating each output directory
# the first time it is needed.
for sub_dir in domain:
    target_dir = out_dir + sub_dir
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for split in last_part:
        load_files(from_dir + sub_dir + split, target_dir + split)

['まだ綺麗な方だからいいよ。\t部屋を綺麗にできない人はモテないよ\n', 'えー、めんどくさいよ。\tじゃあ、いつするの？\n', 'そうだっけ？\t綺麗な部屋は気持ちがいんだよ。\n', 'どう気持ちがいいの？\t気分がスッキリして晴れやかな気持ちになれるんだよ。\n', '最近ついてなくて気分がすぐれないんだよね。\tそんな時こそ掃除するべきだよ。\n']
['まだそんなに散らかっていないよ。\t足の踏み場もないくらいだけど。\n', '必要な物が取りやすくていいんだよ。\tどこに何を置いてあるか分からなくなるでしょう。\n', 'もうある程度の場所は覚えたよ。\tこれだけ物が散乱していると掃除機をかけることも出来ないじゃない。\n', 'だから当分の間は必要ないかな。\t衛生上にもよくないよ。\n', 'そんなに影響がでていると思わないけどなぁ。\tさっきから咳をしているけどそれは埃のせいじゃないかな。\n']
['あとでね。今忙しいんだ。\tあとであとでって言うばかりでいつも片付けないじゃないか。\n', 'だってめんどくさいんだもん。そんなに部屋を綺麗にしたいならロボットが掃除したらいいじゃないか。\t私が掃除してもAさんのためにならないよ。掃除してあげてもいいけど私がいなくなったあと誰が掃除するの?\n', '…。ロボットはいなくならないでしょ?\tそれはわからないよ。いつ離れ離れになるか…。そのためにわたしが今指導しているのだよ。\n', 'うーん。でも今は漫画を読んでいるから。無理。\tどうしてこれだけ言ってもわかってくれないの！掃除しないのなら全部捨てるよ！\n', 'え、だめだめ。捨てたらだめだよ、大切なものばかりなんだから。\tだったらなおさら自分で片付けなさい！私だって怒るときは怒るんだ！\n']
['外は暑いからめんどくさいよ。\tですが、天気が良いですよ。\n', 'え、どこに行くの\tそうですね、噴水がある涼しい公園などいかがですか\n', 'うーんそこなら良いかも\t公園までウォーキングし涼しいところを見つけ冷たい麦茶でも飲んではいかがですか\n', 'そうだね運動したら汗をかくから水分をとらないとね\tそうです運動したらしっかり水分を補給して下さい\n', 'でもやっぱり外は暑いな〜\t帽子をかぶり、キンキンに冷えた麦茶のペット

# File check of emotions
* Some dialogues are badly formatted (not strict user-system pairs, e.g. user-system-user-user-user...)

In [34]:
def check_files(read_file):
    """Print every emotion line that lacks exactly two tab-separated fields.

    Reads ``read_file + '_dial.txt'`` and ``read_file + '_emotion.txt'``.
    For each malformed emotion line, prints its 0-based index and text,
    followed by the dialogue line at the same index.

    Args:
        read_file: Path prefix (without ``_dial.txt`` / ``_emotion.txt``).
    """
    # Explicit UTF-8 (Japanese corpus); ``with`` closes both handles.
    with open(read_file + '_dial.txt', 'r', encoding='utf-8') as file_dial, \
            open(read_file + '_emotion.txt', 'r', encoding='utf-8') as file_emotion:
        dials = file_dial.readlines()
        emotions = file_emotion.readlines()

    for i, r in enumerate(emotions):
        if len(r.split('\t')) != 2:
            print(i, r)
            if i < len(dials):
                print(dials[i])
            else:
                # The emotion file has more lines than the dialogue file;
                # report that instead of aborting the check with IndexError.
                print("(no dialogue line {})".format(i))

In [39]:
# Root of the split corpus and the domain / split sub-paths to inspect.
from_dir = "../data/em_dial/splitted"
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
last_part = ["/test","/valid","/train"]

# Announce each (domain, split) pair, then let check_files print any
# malformed emotion lines it finds there.
for sub_dir, split in ((dom, part) for dom in domain for part in last_part):
    print(sub_dir, split)
    check_files(from_dir + sub_dir + split)

/cleaning /test
/cleaning /valid
/cleaning /train
/exercise /test
/exercise /valid
/exercise /train
/lunch /test
/lunch /valid
/lunch /train
/sleep /test
/sleep /valid
/sleep /train
/game /test
/game /valid
/game /train
/all /test
/all /valid
/all /train


# Add kaomoji to the end of em_dial utterances
* Try adding kaomoji to the training data to check how well the emotions are reflected

In [20]:
# Emotion label id (as a string) -> kaomoji text appended to an utterance.
# Label "0" maps to the empty string.
id2kao = {
    str(label_id): kaomoji
    for label_id, kaomoji in enumerate(
        ["", "（・_・）", "（怒）", "（悲）", "（喜）", "（安心）"]
    )
}
# Emotion label id (as a string) -> human-readable emotion name.
id2emo = {
    str(label_id): emo_name
    for label_id, emo_name in enumerate(
        ["NONE", "neutral", "anger", "sad", "happy", "contentment"]
    )
}

In [45]:
def print_num_emotion(count):
    """Print one ``name : value`` line for each emotion label "0".."5".

    Args:
        count: Mapping indexed with the string label ids ``"0"``..``"5"``
            (the caller in this notebook passes a ``Counter``). The label
            name is looked up in the module-level ``id2emo`` table.
    """
    for label_id in map(str, range(6)):
        emo_name = id2emo[label_id]
        print("   {} : {}".format(emo_name, count[label_id]))

def add_kaomozi(from_file, out_file):
    """Append the kaomoji for the robot emotion label to each dialogue line.

    Reads ``from_file + '_dial.txt'`` and ``from_file + '_emotion.txt'``.
    The second tab-separated field of each emotion line is used as the key
    into ``id2kao``; the resulting kaomoji is appended to the end of the
    dialogue line at the same position. The per-label counts are printed
    with ``print_num_emotion``. The emotion lines are copied to the output
    unchanged.

    Args:
        from_file: Input path prefix (without ``_dial.txt`` / ``_emotion.txt``).
        out_file: Output path prefix (same suffixes are appended).

    Raises:
        ValueError: If an emotion line does not have exactly two
            tab-separated fields. Raised before any output file is opened,
            so a failed run does not leave truncated outputs behind.
        KeyError: If a label is not a key of ``id2kao``.
    """
    # Explicit UTF-8 (Japanese corpus); ``with`` closes both handles.
    with open(from_file + '_dial.txt', 'r', encoding='utf-8') as dial_in, \
            open(from_file + '_emotion.txt', 'r', encoding='utf-8') as emotion_in:
        dials = dial_in.readlines()
        emotions = emotion_in.readlines()

    for i, r in enumerate(emotions):
        if len(r.split('\t')) != 2:
            print(i, r)
            raise ValueError(
                "{}_emotion.txt line {} does not have two tab-separated "
                "fields: {!r}".format(from_file, i, r)
            )
    robo_emotions = [emotion.split('\t')[1].strip() for emotion in emotions]
    print_num_emotion(Counter(robo_emotions))

    # Strip only the newline so the kaomoji lands at the very end of the
    # line, then re-terminate the line.
    out_dials = [
        uttr.strip('\n') + id2kao[emotion] + '\n'
        for uttr, emotion in zip(dials, robo_emotions)
    ]

    with open(out_file + '_dial.txt', 'w', encoding='utf-8') as dial_out, \
            open(out_file + '_emotion.txt', 'w', encoding='utf-8') as emotion_out:
        dial_out.writelines(out_dials)
        emotion_out.writelines(emotions)

In [39]:
# Input root, kaomoji output root, and the domain / split sub-paths.
from_dir = "../data/em_dial/splitted"
out_kao_dir = "../data/em_dial/splitted/kao"
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
last_part = ["/test","/valid","/train"]

# Run add_kaomozi once per (domain, split), creating each output directory
# the first time it is needed and logging the paths involved.
for sub_dir in domain:
    kao_sub_dir = out_kao_dir + sub_dir
    if not os.path.exists(kao_sub_dir):
        os.makedirs(kao_sub_dir)
    print("\ndomain : ", sub_dir)
    for split in last_part:
        src_prefix = from_dir + sub_dir + split
        dst_prefix = kao_sub_dir + split
        print("input file : ", src_prefix)
        print("output file : ", dst_prefix)
        add_kaomozi(src_prefix, dst_prefix)


domain :  /cleaning
input file :  ../data/em_dial/splitted/cleaning/test
output file :  ../data/em_dial/splitted/kao/cleaning/test
   NONE : 37
   neutral : 25
   anger : 75
   sad : 46
   happy : 18
   contentment : 4
input file :  ../data/em_dial/splitted/cleaning/valid
output file :  ../data/em_dial/splitted/kao/cleaning/valid
   NONE : 29
   neutral : 31
   anger : 48
   sad : 67
   happy : 27
   contentment : 1
input file :  ../data/em_dial/splitted/cleaning/train
output file :  ../data/em_dial/splitted/kao/cleaning/train
   NONE : 265
   neutral : 318
   anger : 412
   sad : 462
   happy : 199
   contentment : 28

domain :  /exercise
input file :  ../data/em_dial/splitted/exercise/test
output file :  ../data/em_dial/splitted/kao/exercise/test
   NONE : 22
   neutral : 63
   anger : 37
   sad : 43
   happy : 29
   contentment : 5
input file :  ../data/em_dial/splitted/exercise/valid
output file :  ../data/em_dial/splitted/kao/exercise/valid
   NONE : 22
   neutral : 90
   anger :

* Build a vocabulary that includes the kaomoji from both the twitter_dial and em_dial text ・・・ data/train_kao/spvocab.model

In [25]:
def concat_dials(from_file1, from_file2, out_file):
    """Write the lines of ``from_file1`` followed by those of ``from_file2``.

    Prints the line count of each input and of the combined output.

    Args:
        from_file1: Path of the first text file.
        from_file2: Path of the second text file.
        out_file: Path of the file to write (overwritten).
    """
    # Explicit UTF-8 (Japanese corpus); ``with`` closes each handle.
    with open(from_file1, 'r', encoding='utf-8') as file1_dial:
        dials1 = file1_dial.readlines()
    print("length of from_file1 : ", len(dials1))
    with open(from_file2, 'r', encoding='utf-8') as file2_dial:
        dials2 = file2_dial.readlines()
    print("length of from_file2 : ", len(dials2))

    # Only the last line of a file can lack its newline. Terminate it so it
    # is not glued to the first line of the second file (which would also
    # make the written line count disagree with the count printed below).
    if dials1 and not dials1[-1].endswith('\n'):
        dials1[-1] += '\n'

    both = dials1 + dials2
    print("length of output_file : ", len(both))

    with open(out_file, 'w', encoding='utf-8') as file3_dial:
        file3_dial.writelines(both)

In [26]:
# Inputs: over-sampled em_dial training dialogues with kaomoji, and the
# twitter training dialogues. Output: the combined text file used as input
# for vocabulary building.
em_kao_file = "../data/em_dial/splitted/kao/over_sample/all/train_dial.txt"
twitter_file = "../data/twitter_dial/splitted/train_dial.txt"
out_vocab_file = "../data/em_dial/vocab/tw_emkao/tw_emkao_train.txt"
concat_dials(
    from_file1=em_kao_file,
    from_file2=twitter_file,
    out_file=out_vocab_file,
)

length of from_file1 :  13278
length of from_file2 :  2813875
length of output_file :  2827153


* Command to build the vocabulary
```
python ../preprocess_em/build_spvocab.py \
    --input ../data/em_dial/vocab/tw_emkao/tw_emkao_train.txt \
    --model ../data/em_dial/vocab/tw_emkao/spvocab \
    --size 32000 \
    --coverage 0.9995
```

* Confirm that the vocabulary was built successfully

In [30]:
import sentencepiece as spm

conf_vocab_model = "../data/em_dial/vocab/tw_emkao/spvocab.model"

# Load the trained SentencePiece model, then round-trip every kaomoji
# through encode -> decode and print both results for inspection.
sp = spm.SentencePieceProcessor()
sp.Load(conf_vocab_model)
for kaomoji in id2kao.values():
    piece_ids = sp.EncodeAsIds(kaomoji)
    print("encoded {} : {}".format(kaomoji, piece_ids))
    round_trip = sp.DecodeIds(piece_ids)
    print("decoded {} : {}".format(kaomoji, round_trip))

encoded （・_・） : [4, 1989, 1793, 259]
decoded （・_・） : (・_・)
encoded （悲） : [4, 1989, 1675, 259]
decoded （悲） : (悲)
encoded （喜） : [4, 1989, 7772, 259]
decoded （喜） : (喜)
encoded  : []
decoded  : 
encoded （安心） : [4, 1989, 816, 259]
decoded （安心） : (安心)
encoded （怒） : [4, 1989, 1093, 259]
decoded （怒） : (怒)


In [36]:
# Spot check: the ids produced for a single kanji, and the text that
# id 4 alone decodes to.
kanji_ids = sp.EncodeAsIds("悲")
print(kanji_ids)
id4_text = sp.DecodeIds([4])
print(id4_text)

[4, 1675]



# Over sampling
* Because the emotion labels in em_dial are unbalanced, we oversample the less frequent labels

In [5]:
def over_sampling(from_file, out_file, num_labels=6):
    """Duplicate samples so every present emotion label is equally frequent.

    Reads ``from_file + '_dial.txt'`` and ``from_file + '_emotion.txt'``.
    The second tab-separated field of each emotion line is the label (an
    integer string in ``0 .. num_labels - 1``). With ``freq`` the count of
    the most frequent label and ``c`` the count of a given label, each sample
    is written ``freq // c`` times, and the first ``freq % c`` samples of
    that label are written one extra time, so every label that occurs ends
    up with exactly ``freq`` samples. Labels that do not occur at all are
    left absent (nothing can be duplicated).

    Args:
        from_file: Input path prefix (without ``_dial.txt`` / ``_emotion.txt``).
        out_file: Output path prefix (same suffixes are appended).
        num_labels: Number of distinct label ids; defaults to 6 (ids 0..5).
    """
    # Explicit UTF-8 (Japanese corpus); ``with`` closes both handles.
    with open(from_file + '_dial.txt', 'r', encoding='utf-8') as dial_in, \
            open(from_file + '_emotion.txt', 'r', encoding='utf-8') as emotion_in:
        dials = dial_in.readlines()
        emotions = emotion_in.readlines()

    # Lines are written back to back when duplicated, so an unterminated
    # final line would be glued to its own copy. Terminate it first.
    if dials and not dials[-1].endswith('\n'):
        dials[-1] += '\n'
    if emotions and not emotions[-1].endswith('\n'):
        emotions[-1] += '\n'

    robo_emotions = [emotion.split('\t')[1].strip() for emotion in emotions]
    count_emo = Counter(robo_emotions)
    # freq_emo is the count of the most frequent label (0 for empty input).
    freq_emo = max(count_emo.values()) if count_emo else 0
    print("Count emotions : ", count_emo)
    counts = [count_emo[str(i)] for i in range(num_labels)]
    # Whole number of copies per sample; a label that never occurs gets 0
    # instead of triggering a division by zero.
    times = [freq_emo // c if c else 0 for c in counts]
    print("times : ", times)
    # Remainder still missing after ``times`` copies of every sample.
    plas = [freq_emo - t * c if c else 0 for t, c in zip(times, counts)]
    print("plas : ", plas)

    out_dials = []
    out_emotions = []
    for uttr, emotion_line, label in zip(dials, emotions, robo_emotions):
        label_id = int(label)
        copies = times[label_id]
        if plas[label_id] != 0:
            # Spend one unit of the remainder on this sample.
            copies += 1
            plas[label_id] -= 1
        out_dials.extend([uttr] * copies)
        out_emotions.extend([emotion_line] * copies)

    # Outputs are opened only after processing succeeded, so a failure above
    # does not leave truncated files behind.
    with open(out_file + '_dial.txt', 'w', encoding='utf-8') as dial_out, \
            open(out_file + '_emotion.txt', 'w', encoding='utf-8') as emotion_out:
        dial_out.writelines(out_dials)
        emotion_out.writelines(out_emotions)

In [38]:
# Input root, over-sampled output root, and the domain / split sub-paths.
from_dir = "../data/em_dial/splitted"
out_over_dir = "../data/em_dial/splitted/over_sample"
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
last_part = ["/test","/valid","/train"]

# Run over_sampling once per (domain, split), creating each output
# directory the first time it is needed.
for sub_dir in domain:
    over_sub_dir = out_over_dir + sub_dir
    if not os.path.exists(over_sub_dir):
        os.makedirs(over_sub_dir)
    for split in last_part:
        dst_prefix = over_sub_dir + split
        print("output file : ", dst_prefix)
        over_sampling(from_dir + sub_dir + split, dst_prefix)

output file :  ../data/em_dial/splitted/over_sample/cleaning/test
Count emotions :  Counter({'2': 75, '3': 46, '0': 37, '1': 25, '4': 18, '5': 4})
times :  [2, 3, 1, 1, 4, 18]
plas :  [1, 0, 0, 29, 3, 3]
output file :  ../data/em_dial/splitted/over_sample/cleaning/valid
Count emotions :  Counter({'3': 67, '2': 48, '1': 31, '0': 29, '4': 27, '5': 1})
times :  [2, 2, 1, 1, 2, 67]
plas :  [9, 5, 19, 0, 13, 0]
output file :  ../data/em_dial/splitted/over_sample/cleaning/train
Count emotions :  Counter({'3': 462, '2': 412, '1': 318, '0': 265, '4': 199, '5': 28})
times :  [1, 1, 1, 1, 2, 16]
plas :  [197, 144, 50, 0, 64, 14]
output file :  ../data/em_dial/splitted/over_sample/exercise/test
Count emotions :  Counter({'1': 63, '3': 43, '2': 37, '4': 29, '0': 22, '5': 5})
times :  [2, 1, 1, 1, 2, 12]
plas :  [19, 0, 26, 20, 5, 3]
output file :  ../data/em_dial/splitted/over_sample/exercise/valid
Count emotions :  Counter({'1': 90, '3': 37, '4': 36, '2': 28, '0': 22, '5': 9})
times :  [4, 1, 3, 

# Make files with both oversampling and kaomoji

In [46]:
# Input root (over-sampled data), kaomoji output root, and the
# domain / split sub-paths.
from_dir = "../data/em_dial/splitted/over_sample"
out_over_kao_dir = "../data/em_dial/splitted/kao/over_sample"
domain = ["/cleaning","/exercise","/lunch","/sleep","/game","/all"]
last_part = ["/test","/valid","/train"]

# Run add_kaomozi on every over-sampled (domain, split) pair, creating each
# output directory the first time it is needed.
for sub_dir in domain:
    kao_sub_dir = out_over_kao_dir + sub_dir
    if not os.path.exists(kao_sub_dir):
        os.makedirs(kao_sub_dir)
    for split in last_part:
        dst_prefix = kao_sub_dir + split
        print("output file : ", dst_prefix)
        add_kaomozi(from_dir + sub_dir + split, dst_prefix)

output file :  ../data/em_dial/splitted/kao/over_sample/cleaning/test
   NONE : 75
   neutral : 75
   anger : 75
   sad : 75
   happy : 75
   contentment : 75
output file :  ../data/em_dial/splitted/kao/over_sample/cleaning/valid
   NONE : 67
   neutral : 67
   anger : 67
   sad : 67
   happy : 67
   contentment : 67
output file :  ../data/em_dial/splitted/kao/over_sample/cleaning/train
   NONE : 462
   neutral : 462
   anger : 462
   sad : 462
   happy : 462
   contentment : 462
output file :  ../data/em_dial/splitted/kao/over_sample/exercise/test
   NONE : 63
   neutral : 63
   anger : 63
   sad : 63
   happy : 63
   contentment : 63
output file :  ../data/em_dial/splitted/kao/over_sample/exercise/valid
   NONE : 90
   neutral : 90
   anger : 90
   sad : 90
   happy : 90
   contentment : 90
output file :  ../data/em_dial/splitted/kao/over_sample/exercise/train
   NONE : 498
   neutral : 498
   anger : 498
   sad : 498
   happy : 498
   contentment : 498
output file :  ../data/em_dial