In [1]:
import random
import copy
import math
import numpy as np
from scipy import stats
import prettytable as pt
from collections import Counter
from sklearn.model_selection import train_test_split

# Train Test Valid Split

In [2]:
def open_data(dirname):
    """Read a blank-line-delimited text file into a list of sessions.

    A blank line opens a new (initially empty) session; every other line is
    appended to the session currently being built. Consecutive blank lines, or
    a blank line at the very end of the file, therefore yield empty sessions.

    Args:
        dirname: Path of the text file to read (a file path, despite the
            name). The file is decoded as UTF-8.

    Returns:
        list[list[str]]: One list of lines (without the trailing newline) per
        session, in file order.
    """
    data = []
    # The context manager closes the handle even if decoding fails part-way.
    with open(dirname, encoding='utf-8') as infile:
        for raw_line in infile:
            # Remove only the newline: slicing off the last character would eat
            # real data when the final line has no terminating newline.
            line = raw_line.rstrip('\n')
            if line == '':
                data.append([])
            elif not data:
                # File does not start with a blank separator: open the first
                # session implicitly instead of failing on data[-1].
                data.append([line])
            else:
                data[-1].append(line)
    return data

In [3]:
def train_test_valid_split(data, random_state=None):
    """Shuffle the indices of ``data`` into train / test / valid index lists.

    The indices are first split 80/20, then the 20% remainder is split in
    half, giving an approximate 8:1:1 partition.

    Args:
        data: Any sized collection; only ``len(data)`` is used.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls so the partition can be reproduced. The default ``None``
            keeps the previous behaviour (a different shuffle on every call).

    Returns:
        tuple[list[int], list[int], list[int]]: ``(x_train, x_test, x_valid)``
        lists of indices into ``data``.
    """
    ind = list(range(len(data)))
    x_train, x_rest = train_test_split(
        ind, train_size=0.8, shuffle=True, random_state=random_state)
    x_test, x_valid = train_test_split(
        x_rest, train_size=0.5, shuffle=True, random_state=random_state)
    return x_train, x_test, x_valid # 8:1:1

In [4]:
def get_main_score(scores):
    """Return the most frequent value in ``scores`` (the mode).

    Ties are resolved in favour of the smallest value, the same tie-break
    ``scipy.stats.mode`` uses. Implemented with ``np.unique`` so it works for
    both numeric and string inputs and does not depend on the return shape of
    ``scipy.stats.mode``, which differs between SciPy releases.

    Args:
        scores: Non-empty sequence of mutually comparable values (ints or
            strings).

    Returns:
        The modal value as a NumPy scalar; callers can pass it to ``int()``.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    values = np.asarray(scores)
    if values.size == 0:
        raise ValueError("get_main_score() requires at least one score")
    # np.unique returns the distinct values sorted ascending, and argmax picks
    # the first maximum, so ties go to the smallest value.
    distinct, counts = np.unique(values, return_counts=True)
    return distinct[np.argmax(counts)]

In [5]:
def get_mean_score(scores):
    """Collapse a list of scores into one of three classes via their mean.

    Returns 3 when the mean is above 3, 1 when it is below 3, and 2 otherwise
    (i.e. when the mean is exactly 3, or is not comparable such as NaN).
    """
    average = np.mean(scores)
    if average > 3:
        return 3
    if average < 3:
        return 1
    return 2

In [6]:
def construct_data(idx,data):
    """Return the items of ``data`` selected by ``idx``, in the order of ``idx``."""
    return [data[position] for position in idx]

In [7]:
def write_txt(filename,data):
    """Write sessions to ``filename`` in a blank-line-delimited layout.

    The file starts with one blank line; each session is then written one
    line per entry and followed by a blank line.

    Args:
        filename: Destination path; an existing file is overwritten.
        data: Iterable of sessions, each an iterable of ``str`` lines
            (without trailing newlines).
    """
    # Write UTF-8 explicitly: open_data() decodes its input as UTF-8, so
    # relying on the platform default encoding could corrupt the round trip.
    # The context manager also closes the file if a write fails.
    with open(filename, 'w', encoding='utf-8') as out:
        out.write('\n')
        for dialogue in data:
            for line in dialogue:
                out.write(line + '\n')
            out.write('\n')

In [8]:
# Choose the raw corpus file to split; swap the two assignments below to
# process 'MWOZ.txt' instead of 'SGD.txt'.
#dirname = 'MWOZ.txt'
dirname = 'SGD.txt'
# Parse the corpus into a list of sessions (one list of lines per session).
data = open_data(dirname)

In [9]:
# Partition the session indices into train / test / valid lists (roughly 8:1:1).
x_train, x_test, x_valid = train_test_valid_split(data)

In [10]:
# Turn each index list into the corresponding list of sessions.
data_train = construct_data(x_train,data)
data_test = construct_data(x_test,data)
data_valid = construct_data(x_valid,data)

In [11]:
# Save each split as a blank-line-delimited text file in the working directory;
# existing files with these names are overwritten.
write_txt('train.txt',data_train)
write_txt('test.txt',data_test)
write_txt('valid.txt',data_valid)

# Convert Data Format

In [12]:
# Load the act vocabulary: every non-blank line is "<num>\t<act>".
# Use "act_mwoz.txt" instead of "act_sgd.txt" for the MWOZ corpus.
# The context manager closes the file handle once the text has been read.
with open("act_sgd.txt", encoding="utf-8") as act_file:
    f = act_file.read()
act_l = f.splitlines()

# Map act name -> number. The number stays a string here; consumers convert it
# with int() when they look an act up.
act_list = {}
for act_line in act_l:
    if not act_line.strip():
        # Skip blank lines (e.g. a trailing empty line) instead of failing on
        # the two-field unpacking below.
        continue
    num, act = str(act_line).split('\t')
    act_list[act] = num

In [28]:
def split_session(session,act_list):
    """Split one session into label sequences and utterance lists (mode score).

    Every line of ``session`` is tab-separated. A line whose first field is
    ``USER`` (case-insensitive) must have exactly four fields: role,
    utterance, comma-separated acts, comma-separated scores. Any other line is
    treated as a system turn and only its second field is kept.

    Args:
        session: List of tab-separated turn lines.
        act_list: Mapping from act name to its number (any value ``int()``
            accepts). Only the first act of each user turn is looked up.

    Returns:
        tuple: ``(emo_seq, emo_diff_seq, act_seq, user_text, sys_text)`` where
        ``emo_seq`` holds the modal score of each user turn as ``int``,
        ``emo_diff_seq[k]`` is ``emo_seq[k+1] - emo_seq[k]`` (one element
        shorter than ``emo_seq``), ``act_seq`` holds the act number of each
        user turn, ``user_text`` the user utterances each suffixed with
        ``'|||'``, and ``sys_text`` the system utterances.

    Raises:
        ValueError: If a user line does not have exactly four fields or a
            score is not an integer.
        KeyError: If the first act of a user turn is missing from ``act_list``.
    """
    emo_seq = []
    act_seq = []
    user_text = []
    sys_text = []

    for line in session:
        fields = line.split('\t')
        if fields[0].upper() == 'USER':
            _, utterance, acts, scores = fields
            user_text.append(utterance + '|||')

            # Convert to int before taking the mode, as split_session_mean
            # does: the mode is then computed on numbers rather than strings,
            # which recent SciPy releases no longer accept.
            score = get_main_score([int(value) for value in scores.split(',')])
            emo_seq.append(int(score))

            # Only the first listed act is used as the turn's act label.
            act_seq.append(int(act_list[acts.split(',')[0]]))
        else:
            sys_text.append(fields[1])

    # Change in score between consecutive user turns.
    emo_diff_seq = [current - previous
                    for previous, current in zip(emo_seq, emo_seq[1:])]

    return emo_seq,emo_diff_seq,act_seq,user_text,sys_text

In [14]:
def split_session_mean(session,act_list):
    """Split one session into label sequences and utterance lists (mean score).

    Lines are tab-separated. A line whose first field is ``USER``
    (case-insensitive) must carry four fields: role, utterance,
    comma-separated acts and comma-separated integer scores. Every other line
    contributes its second field to the system utterances.

    Returns ``(emo_seq, emo_diff_seq, act_seq, user_text, sys_text)``:
    the three-way class from ``get_mean_score`` for each user turn, the
    differences between consecutive classes, the number of the first act of
    each user turn (looked up in ``act_list``), the user utterances suffixed
    with ``'|||'``, and the system utterances.
    """
    emo_seq, act_seq, user_text, sys_text = [], [], [], []

    for turn in session:
        parts = turn.split('\t')
        if parts[0].upper() != 'USER':
            sys_text.append(parts[1])
            continue

        _, utterance, acts, ratings = parts
        user_text.append(utterance + '|||')

        rating_values = [int(r) for r in ratings.split(',')]
        emo_seq.append(int(get_mean_score(rating_values)))

        first_act = acts.split(',')[0]
        act_seq.append(int(act_list[first_act]))

    emo_diff_seq = [emo_seq[k] - emo_seq[k - 1] for k in range(1, len(emo_seq))]

    return emo_seq,emo_diff_seq,act_seq,user_text,sys_text

In [29]:
def write_data(dirname,filename):
    """Convert a split file into tab-separated training rows.

    Sessions are read with ``open_data`` and those with four lines or fewer
    are skipped. For each remaining session the context grows one
    user+system exchange at a time; from the third exchange on (``j > 1``) a
    row is written with five tab-separated columns:

        str(context texts) \\t str(act numbers) \\t str(zero-based score
        sequence) \\t score difference \\t zero-based score of the newest
        user turn

    Uses the module-level ``act_list`` mapping for act lookup.

    Args:
        dirname: Path of the blank-line-delimited input file.
        filename: Path of the output file; overwritten if it exists.
    """
    data = open_data(dirname)
    # Write UTF-8 explicitly: the input is decoded as UTF-8 and count_sat /
    # count_sat_diff read this output as UTF-8, so the platform default
    # encoding must not be used here.
    with open(filename, "w", encoding="utf-8") as f:
        for session in data:
            if len(session)/2 > 2:
                emo_seq,emo_diff_seq,act_seq,user_text,sys_text = split_session(session,act_list) # five class
                #emo_seq,emo_diff_seq,act_seq,user_text,sys_text = split_session_mean(session,act_list) # three class
                text = []
                for j in range(len(sys_text)-1):
                    # Completed exchanges so far, plus the next user utterance
                    # whose score is the prediction target.
                    text.append(user_text[j]+sys_text[j])
                    current_text = text + [user_text[j+1]]
                    act = act_seq[:j+2]
                    sat = emo_seq[j+1]-1
                    # NOTE(review): emo_diff_seq[k] is emo_seq[k+1]-emo_seq[k],
                    # so index j+1 is the change from user turn j+1 to turn
                    # j+2, not the change leading into the turn that `sat`
                    # labels. Confirm this alignment is intended.
                    sat_diff = emo_diff_seq[j+1]
                    sat_seq = [score-1 for score in emo_seq[:j+2]]
                    if j > 1:
                        row = [str(current_text), str(act), str(sat_seq), str(sat_diff), str(sat)]
                        f.write('\t'.join(row) + '\n')

In [30]:
# Convert each split file into its tab-separated row format; existing output
# files are overwritten.
write_data('train.txt','train_sgd.txt')
write_data('test.txt','test_sgd.txt')
write_data('valid.txt','valid_sgd.txt')

# Count Classes and Compute Class Weights

In [8]:
def count_sat(dirname):
    """Return the integer in the last tab-separated column of every line.

    The file is read as UTF-8; the result keeps the file's line order.
    """
    with open(dirname, 'r', encoding='utf-8') as infile:
        return [int(row.strip('\n').split('\t')[-1]) for row in infile]

In [12]:
# Label distribution of the training rows (last column of each row).
dirname = 'train_sgd.txt'
emo_train = count_sat(dirname)
count_train = Counter(emo_train)
print(count_train)

# Inverse-frequency weights for the five-class labels 0..4: the largest class
# count divided by each class count, so the most frequent class gets 1.0.
# A label absent from the file has count 0 and raises ZeroDivisionError.
weights = [count_train[label] for label in range(5)]
weights = [max(weights)/n for n in weights]
weights

Counter({2: 6308, 3: 965, 1: 521, 4: 39, 0: 3})


[2102.6666666666665,
 12.107485604606525,
 1.0,
 6.536787564766839,
 161.74358974358975]

In [9]:
# Label distribution of the training rows for the three-class variant
# (presumably 'train_sgd.txt' regenerated with split_session_mean).
dirname = 'train_sgd.txt'
emo_train = count_sat(dirname)
count_train = Counter(emo_train)
print(count_train)

# Inverse-frequency weights for labels 0..2: the largest class count divided
# by each class count.
weights = [count_train[label] for label in range(3)]
weights = [max(weights)/n for n in weights]
weights

Counter({2: 3679, 1: 2375, 0: 1782})


[2.064534231200898, 1.5490526315789475, 1.0]

In [13]:
# Label distribution of the test rows (last column of each row).
dirname = 'test_sgd.txt'
emo_test = count_sat(dirname)
count_test = Counter(emo_test)
print(count_test)

Counter({2: 773, 3: 143, 1: 81, 4: 4})


In [14]:
# Label distribution of the validation rows (last column of each row).
dirname = 'valid_sgd.txt'
emo_valid = count_sat(dirname)
count_valid= Counter(emo_valid)
print(count_valid)

Counter({2: 812, 3: 122, 1: 55, 4: 7})


In [None]:
# Inverse-square-root weights for labels 0..4: 100 / sqrt(class count).
# Reads count_train as left by an earlier cell; it must hold the label counts
# (keys 0..4), not the score-difference counts.
weights = [count_train[label] for label in range(5)]
weights = [100/np.sqrt(n) for n in weights]
weights

In [None]:
# Effective Number of Samples
# For a class with n training rows the weight is (1 - b) / (1 - b**n); the five
# weights are then rescaled so that they sum to 5.
# NOTE(review): this looks like the "effective number of samples"
# class-balanced weighting scheme - confirm the intended reference.
# Reads count_train as left by an earlier cell; it must hold the label counts
# (keys 0..4). A class with count 0 makes its eff_num entry 0 and the division
# below then yields inf.
b = 0.999 # 0.9, 0.99, 0.999, 0.9999
weights = [count_train[0],count_train[1],count_train[2],count_train[3],count_train[4]]
eff_num = [1.0 - np.power(b,x) for x in weights]
weights = (1-b)/np.array(eff_num)
weights = weights/np.sum(weights)*5
weights

In [36]:
def count_sat_diff(dirname):
    """Return the integer in the second-to-last tab-separated column of every line.

    The file is read as UTF-8; the result keeps the file's line order.
    """
    with open(dirname, 'r', encoding='utf-8') as infile:
        return [int(row.strip('\n').split('\t')[-2]) for row in infile]

In [39]:
# Distribution of the score-difference column (second-to-last column) of the
# training rows. Note that this rebinds emo_train / count_train, which earlier
# cells use for the label counts.
dirname = 'train_sgd.txt'
emo_train = count_sat_diff(dirname)
count_train = Counter(emo_train)
print(count_train)

# Inverse-frequency weights for the difference values -3..3 (five-class
# labels): the largest count divided by each count.
weights = [count_train[diff] for diff in range(-3, 4)]
weights = [max(weights)/n for n in weights]
weights

Counter({0: 5475, 1: 1187, -1: 1045, -2: 69, 2: 57, -3: 2, 3: 1})


[2737.5,
 79.34782608695652,
 5.239234449760765,
 1.0,
 4.612468407750632,
 96.05263157894737,
 5475.0]

In [42]:
# Distribution of the score-difference column (second-to-last column) of the
# training rows for the three-class variant. This rebinds emo_train /
# count_train, which earlier cells use for the label counts.
dirname = 'train_sgd.txt'
emo_train = count_sat_diff(dirname)
count_train = Counter(emo_train)
print(count_train)

# Inverse-frequency weights for the difference values -2..2 (three-class
# labels): the largest count divided by each count.
weights = [count_train[diff] for diff in range(-2, 3)]
weights = [max(weights)/n for n in weights]
weights

Counter({0: 3853, 1: 1439, -1: 1175, -2: 697, 2: 672})


[5.527977044476327,
 3.2791489361702126,
 1.0,
 2.6775538568450314,
 5.7336309523809526]