### Utility notebook for computing dataset statistics, sampling prompts, and post-processing the GPT-3-generated dataset

In [2]:
import pandas as pd
import csv

In [19]:
# train stats: number of rows and mean whitespace-token length of the first
# TSV column.
total_length = 0
total = 0
# The context manager closes the handle even if parsing fails; newline='' lets
# the csv module handle line endings itself, as its documentation recommends.
with open('eca-train-cleaned.tsv', newline='') as train_file:
    train_tsv = csv.reader(train_file, delimiter="\t")
    next(train_tsv, None)  # skip the first row (treated as the header)
    for line in train_tsv:
        total += 1
        total_length += len(line[0].split())

print(total)
print(total_length)
print(total_length / total)

3012
187611
62.287848605577686


In [20]:
# dev stats: number of rows and mean whitespace-token length of the first
# TSV column.
total_length = 0
total = 0
# The context manager closes the handle even if parsing fails; newline='' lets
# the csv module handle line endings itself, as its documentation recommends.
with open('eca-dev-cleaned.tsv', newline='') as dev_file:
    dev_tsv = csv.reader(dev_file, delimiter="\t")
    next(dev_tsv, None)  # skip the first row (treated as the header)
    for line in dev_tsv:
        total += 1
        total_length += len(line[0].split())

print(total)
print(total_length)
print(total_length / total)

376
23444
62.351063829787236


In [21]:
# test stats: number of rows and mean whitespace-token length of the first
# TSV column.
total_length = 0
total = 0
# The context manager closes the handle even if parsing fails; newline='' lets
# the csv module handle line endings itself, as its documentation recommends.
with open('eca-test-cleaned.tsv', newline='') as test_file:
    test_tsv = csv.reader(test_file, delimiter="\t")
    next(test_tsv, None)  # skip the first row (treated as the header)
    for line in test_tsv:
        total += 1
        total_length += len(line[0].split())

print(total)
print(total_length)
print(total_length / total)

319
19210
60.21943573667711


In [11]:
# Stats for 'eca-dev-cleaned-exp-TRS-4.tsv' (despite the variable names this is
# not the train split): number of rows and mean whitespace-token length of the
# LAST column -- presumably the generated explanation; confirm against the file.
total_length = 0
total = 0
# The context manager closes the handle even if parsing fails; newline='' lets
# the csv module handle line endings itself, as its documentation recommends.
with open('eca-dev-cleaned-exp-TRS-4.tsv', newline='') as train_file:
    train_tsv = csv.reader(train_file, delimiter="\t")
    next(train_tsv, None)  # skip the first row (treated as the header)
    for line in train_tsv:
        total += 1
        # split() with no arguments already ignores leading/trailing whitespace
        total_length += len(line[-1].split())

print(total)
print(total_length)
print(total_length / total)

376
12121
32.236702127659576


#### Sample data for choosing prompts

In [2]:
# Read the tab-separated training split into a DataFrame and summarise its columns.
train_df = pd.read_csv('eca-train-cleaned.tsv', sep="\t")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3012 entries, 0 to 3011
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   document       3012 non-null   object
 1   token_label    3012 non-null   object
 2   emotion_label  3012 non-null   object
dtypes: object(3)
memory usage: 70.7+ KB


In [3]:
# Show one randomly chosen 'surprise' row (unseeded, so each run may pick a different one).
is_surprise = train_df['emotion_label'] == 'surprise'
train_df[is_surprise].sample(n=1)

Unnamed: 0,document,token_label,emotion_label
757,"""Look here "" he began and he was surprised at ...",O O O O O O O O B-EMO O B-CAU I-CAU I-CAU I-CA...,surprise


In [11]:
# Read the test split that carries the TRS-2 explanations and summarise its columns.
two_shot_df = pd.read_csv('eca-test-cleaned-exp-TRS-2.tsv', sep="\t")
two_shot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   document       319 non-null    object
 1   token_label    319 non-null    object
 2   emotion_label  319 non-null    object
 3   explanation    319 non-null    object
dtypes: object(4)
memory usage: 10.1+ KB


In [12]:
# Whitespace-token count of every explanation (str() guards non-string cells),
# followed by the mean over all rows.
len_lst = [len(str(exp).split()) for exp in two_shot_df['explanation']]

print(sum(len_lst) / len(len_lst))

70.20689655172414


### Post-process raw GPT-3 generations to contain only explanations

In [12]:
# Select which raw GPT-3 output file of fold0 to post-process, then load it.
split = 'train'
shot = 'TRS-4'
raw_tsv_path = './model/GPT3/data/ghazi/fold0/eca-{}-cleaned-exp-{}-raw.tsv'.format(split, shot)
shot_raw_df = pd.read_csv(raw_tsv_path, sep="\t")
shot_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   document       656 non-null    object
 1   token_label    656 non-null    object
 2   emotion-label  656 non-null    object
 3   explanation    656 non-null    object
dtypes: object(4)
memory usage: 20.6+ KB


In [13]:
# Keep only the text that follows the marker sentence. Generations in which the
# marker does not occur exactly once are printed for manual inspection and kept
# unchanged.
explanations = []
for sent in shot_raw_df['explanation']:
    parts = sent.split('The cause results in the emotion mentioned because')
    if len(parts) == 2:
        explanations.append(parts[1])
    else:
        explanations.append(sent)
        print(sent)
# Store the result under 'explanations' and remove the raw 'explanation' column.
shot_raw_df['explanations'] = explanations
shot_raw_df = shot_raw_df.drop(columns=['explanation'])

 In the text, the cause is that Tolkien was happy to find anyone who could appreciate his writing. The emotion mentioned is that Tolkien thought of an audience when he wrote. The cause result in the emotion mentioned because when an author is happy to find someone who could appreciate his writing, it means that the author thought of an audience when he wrote. Therefore, the cause leads to the emotion mentioned in the text.
 In the text, the cause is that the woman was worried about having to manage the farm by herself again. The emotion mentioned is that the woman was worried. The cause results in the emotion mentioned because the woman was worried about having to manage the farm by herself again. The cause results in the emotion mentioned because the woman was worried about having to manage the farm by herself again.


In [14]:
# Write the post-processed frame as a TSV without the index column; the path is
# the fold0 raw-file pattern without the "-raw" suffix.
cleaned_tsv_path = './model/GPT3/data/ghazi/fold0/eca-{}-cleaned-exp-{}.tsv'.format(split,shot)
shot_raw_df.to_csv(cleaned_tsv_path, sep='\t', index=False)

In [15]:
# gpt3 prompting for ghazi data: apply the explanation post-processing to every
# fold1-fold4 / split / shot combination and write one cleaned TSV per raw file.
for fold in ['fold1', 'fold2', 'fold3', 'fold4']:
    for split in ['train', 'dev']:
        for shot in ['TRS-2', 'TRS-4']:
            raw_path = './model/GPT3/data/ghazi/{}/eca-{}-cleaned-exp-{}-raw.tsv'.format(fold, split, shot)
            shot_raw_df = pd.read_csv(raw_path, sep="\t")

            # Keep only the text after the marker sentence; anything where the
            # marker does not occur exactly once is printed and kept unchanged.
            explanations = []
            for sent in shot_raw_df['explanation']:
                parts = sent.split('The cause results in the emotion mentioned because')
                if len(parts) == 2:
                    explanations.append(parts[1])
                else:
                    explanations.append(sent)
                    print(sent)
            shot_raw_df['explanations'] = explanations
            shot_raw_df = shot_raw_df.drop(columns=['explanation'])

            out_path = './model/GPT3/data/ghazi/{}/eca-{}-cleaned-exp-{}.tsv'.format(fold, split,shot)
            shot_raw_df.to_csv(out_path, sep='\t', index=False)

 In the text, the cause is that the sisters thought of possible further deterioration. The emotion mentioned is that the sisters looked alarmed. The cause results in emotion mentioned because when people think of something that could make their situation worse, they often become alarmed. In this case, the sisters are worried that things could get worse for them.
 In the text, the cause is that the wife of former Beirut hostage Jackie Mann died of lung cancer. The emotion mentioned is that the author was heartbroken she could not see her before she died of lung cancer. The cause results emotion mentioned because the author and the wife of former Beirut hostage Jackie Mann were close friends. Therefore, when the wife of former Beirut hostage Jackie Mann died, the author was heartbroken that she was not able to see her before she died.
 In the text, the cause is that McGoldrick signed on for four years after talks with Highbury boss George Graham. The emotion mentioned is that McGoldrick 

### Significance test (paired t-test)

In [1]:
import scipy.stats as stats

##### Ablation

In [6]:
# Effectiveness of the Bi-LSTM: paired t-test between the scores without the
# LSTM (first sample) and the full BECT scores (second sample).
bect_nolstm = [0.2097, 0.2142, 0.2119, 0.4633, 0.386, 0.4209, 0.2287, 0.2381, 0.2332]
bect = [0.2143, 0.223, 0.2186, 0.4856, 0.39, 0.4323, 0.2265, 0.2499, 0.2376]
stats.ttest_rel(a=bect_nolstm, b=bect)

TtestResult(statistic=-3.484667142363774, pvalue=0.008262824741817632, df=8)

##### COMET-based

In [9]:
# ECSE+ECSP: paired t-test of BECT (first sample) against the Causes variant
# (second sample).
Causes = [0.2195, 0.2275, 0.2233, 0.2272, 0.2507, 0.2383]
bect = [0.2143, 0.223, 0.2186, 0.2265, 0.2499, 0.2376]
stats.ttest_rel(a=bect, b=Causes)

TtestResult(statistic=-3.0262961673342823, pvalue=0.02920498051254723, df=5)

##### GLUCOSE-based

In [10]:
# ECSE: paired t-test of BECT (first sample) against dim1-spec (second sample).
dim1_spec = [0.2274, 0.2314, 0.2293]
bect = [0.2143, 0.223, 0.2186]
stats.ttest_rel(a=bect, b=dim1_spec)

TtestResult(statistic=-7.910330289353436, pvalue=0.015608076802147201, df=2)

In [11]:
# EESE: paired t-test of BECT (first sample) against dim7-spec (second sample).
dim7_spec = [0.4792, 0.3944, 0.4326]
bect = [0.4856, 0.39, 0.4323]
stats.ttest_rel(a=bect, b=dim7_spec)

TtestResult(statistic=0.18002778497876945, pvalue=0.8737202026861619, df=2)

In [12]:
# ECSP: paired t-test of BECT (first sample) against dim2-spec (second sample).
dim2_spec = [0.2242, 0.2599, 0.2407]
bect = [0.2265, 0.2499, 0.2376]
stats.ttest_rel(a=bect, b=dim2_spec)

TtestResult(statistic=-1.0113796089977531, pvalue=0.418294575369641, df=2)

##### GPT3-prompting

In [13]:
# ECSE: paired t-test of BECT-doc (first sample) against TRS-4 (second sample).
TRS_4 = [0.2594, 0.3069, 0.281]
bect_doc = [0.2349, 0.2765, 0.2539]
stats.ttest_rel(a=bect_doc, b=TRS_4)

TtestResult(statistic=-16.010861734483587, pvalue=0.0038782730768579655, df=2)

In [14]:
# ECSP: paired t-test of BECT-doc (first sample) against TRS-4 (second sample).
TRS_4 = [0.2784, 0.2952, 0.2865]
bect_doc = [0.2531, 0.2714, 0.262]
stats.ttest_rel(a=bect_doc, b=TRS_4)

TtestResult(statistic=-56.615384615386574, pvalue=0.00031183718060284505, df=2)

In [2]:
# EESE: paired t-test of bect-doc-256 (first sample) against TRS-2-256
# (second sample).
TRS_2 = [0.4953, 0.5339, 0.5138]
bect_doc = [0.4969, 0.5344, 0.514]
stats.ttest_rel(a=bect_doc, b=TRS_2)

TtestResult(statistic=1.8014990349722022, pvalue=0.2134165967347317, df=2)

In [2]:
# ECSE: paired t-test of bect-doc (first sample) against bect-doc-Causes
# (second sample).
bect_doc_Causes = [0.2524, 0.3014, 0.2746]
bect_doc = [0.2448, 0.2893, 0.2645]
stats.ttest_rel(a=bect_doc, b=bect_doc_Causes)

TtestResult(statistic=-7.630998044000648, pvalue=0.01674259148268539, df=2)

In [3]:
# EESE: paired t-test of bect-doc (first sample) against bect-doc-Causes
# (second sample).
bect_doc = [0.4955, 0.5212, 0.5077]
bect_doc_Causes = [0.4967, 0.5227, 0.5091]
stats.ttest_rel(a=bect_doc, b=bect_doc_Causes)

TtestResult(statistic=-15.496543393375081, pvalue=0.004138356507618268, df=2)

In [4]:
# ECSP: paired t-test of bect-doc (first sample) against bect-doc-Causes
# (second sample).
bect_doc = [0.2777, 0.2893, 0.2833]
bect_doc_Causes = [0.2672, 0.2801, 0.2733]
stats.ttest_rel(a=bect_doc, b=bect_doc_Causes)

TtestResult(statistic=26.14939191787053, pvalue=0.0014592355275277777, df=2)

In [1]:
import pickle

In [4]:
def loadList(path):
    """Load and return the object pickled in the file at ``path``.

    The file is opened in binary mode inside a context manager, so the handle
    is closed even when unpickling raises.

    Args:
        path: Location of the pickle file to read.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle (other
            exceptions are possible, depending on the pickled data).

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only call this on files from a trusted source.
    """
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [5]:
# Load the pickled dataset and build the positional ids 0 .. len(data_set) - 1.
data_set = loadList('eca_sti_data.pkl')
data_set_ids = [*range(len(data_set))]

In [14]:
# Number of ids left after skipping the first four chunks of 164 ids
# (164 is the 20% chunk size that int(0.2 * len(data_set_ids)) evaluates to).
len(data_set_ids[4 * 164:])

164

In [9]:
# 20% of the number of ids, truncated to an integer.
int(len(data_set_ids) * 0.2)

164