In [1]:
import sys, re, string, pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_colwidth', None)

In [2]:
data = pd.read_csv("../results/human_annotation/prolific_results.csv", encoding='utf-8')
data = data.T
data = data.reset_index()
data = data.drop([0, 1], axis=0)
data.head(3)

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
2,I wanted that gift as much as cancer,sarcasm,metaphor,sarcasm,sarcasm,simile,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm
3,"Which words were most indicative for your decision? (maximum 3 words or phrases, separated by commas (,))","wanted, gift, cancer","as much as,","cancer, gift","gift, much, cancer",as much as,"Wanted, Gift, Cancer,",as much as,"cancer, that, gift","as much cancer,","cancer, want","gift, wanted, cancer","wanted, gift, cancer","gift, much, cancer","wanted, gift, cancer,",gift cancer
4,I was encouraged not to get a response back from the company after applying for the job,sarcasm,sarcasm,idiom,idiom,sarcasm,idiom,idiom,simile,idiom,idiom,sarcasm,sarcasm,idiom,sarcasm,idiom


# 1. Get labels and reason words

## 1.1 Labels assigned by human participants

In [3]:
classification = data.iloc[::2]
classification = classification.reset_index(drop=True)
classification = classification.add_prefix('part_')
classification = classification.rename(columns={"part_index":"text"})
classification.head(3)

Unnamed: 0,text,part_0,part_1,part_2,part_3,part_4,part_5,part_6,part_7,part_8,part_9,part_10,part_11,part_12,part_13,part_14
0,I wanted that gift as much as cancer,sarcasm,metaphor,sarcasm,sarcasm,simile,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm,sarcasm
1,I was encouraged not to get a response back from the company after applying for the job,sarcasm,sarcasm,idiom,idiom,sarcasm,idiom,idiom,simile,idiom,idiom,sarcasm,sarcasm,idiom,sarcasm,idiom
2,I felt like a hero when I realized that I brought a present for a baby boy to a baby shower for a baby girl,sarcasm,idiom,simile,sarcasm,sarcasm,simile,metaphor,idiom,simile,sarcasm,sarcasm,sarcasm,sarcasm,simile,sarcasm


## 1.2 Reason words named by participants

In [4]:
reasons = data.iloc[1::2]
reasons = reasons.drop('index', axis=1)
reasons = reasons.reset_index(drop=True)
reasons = reasons.add_prefix('keyword_')
reasons = reasons.rename(columns={"keyword_index":"text"})
reasons.head(3)

Unnamed: 0,keyword_0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,keyword_9,keyword_10,keyword_11,keyword_12,keyword_13,keyword_14
0,"wanted, gift, cancer","as much as,","cancer, gift","gift, much, cancer",as much as,"Wanted, Gift, Cancer,",as much as,"cancer, that, gift","as much cancer,","cancer, want","gift, wanted, cancer","wanted, gift, cancer","gift, much, cancer","wanted, gift, cancer,",gift cancer
1,"encouraged, not response","no, get, applying",was encouraged not to,"not get a response back, from the company",not,"not to get,",not to get,"encouraged, response, after","encouraged not to,",not get response,"not, response, job","encouraged, not, response",not to get a response back,"encouraged, response, applying,",not get response
2,"felt, hero","felt like a hero,",felt like a hero,"hero, baby boy, to a baby shower for a girl",for,"like a hero,",I felt like,"when, brought, baby","felt like a,","hero, boy, girl","hero, boy, girl","hero, baby boy, baby girl","hero, baby, girl","Hero, present, boy,",hero


## 1.3 Concatenate dataframes

In [5]:
conc = pd.concat([classification, reasons], axis=1)
conc.head(3)

Unnamed: 0,text,part_0,part_1,part_2,part_3,part_4,part_5,part_6,part_7,part_8,...,keyword_5,keyword_6,keyword_7,keyword_8,keyword_9,keyword_10,keyword_11,keyword_12,keyword_13,keyword_14
0,I wanted that gift as much as cancer,sarcasm,metaphor,sarcasm,sarcasm,simile,sarcasm,sarcasm,sarcasm,sarcasm,...,"Wanted, Gift, Cancer,",as much as,"cancer, that, gift","as much cancer,","cancer, want","gift, wanted, cancer","wanted, gift, cancer","gift, much, cancer","wanted, gift, cancer,",gift cancer
1,I was encouraged not to get a response back from the company after applying for the job,sarcasm,sarcasm,idiom,idiom,sarcasm,idiom,idiom,simile,idiom,...,"not to get,",not to get,"encouraged, response, after","encouraged not to,",not get response,"not, response, job","encouraged, not, response",not to get a response back,"encouraged, response, applying,",not get response
2,I felt like a hero when I realized that I brought a present for a baby boy to a baby shower for a baby girl,sarcasm,idiom,simile,sarcasm,sarcasm,simile,metaphor,idiom,simile,...,"like a hero,",I felt like,"when, brought, baby","felt like a,","hero, boy, girl","hero, boy, girl","hero, baby boy, baby girl","hero, baby, girl","Hero, present, boy,",hero


In [6]:
source = pd.read_csv('../results/human_annotation/samples_to_annotate.csv')
final = source.merge(conc, left_on='text', right_on='text')

num_to_label = {1: "sarcasm", 2: "idiom", 3: "simile", 4: "metaphor"}
label_to_num = {"sarcasm":1, "idiom":2, "simile":3, "metaphor":4}

final['label'] = final['label'].map(num_to_label)

In [7]:
for i, row in classification.iterrows():
    sarc = 0
    idiom = 0
    metaphor = 0
    simile = 0
    
    for j in range(1, 16):
        if row[j] == "sarcasm":
            sarc +=1
        if row[j] == "idiom":
            idiom +=1
        if row[j] == "metaphor":
            metaphor +=1
        if row[j] == "simile":
            simile +=1
    rowdict = {"sarcasm": sarc, "idiom": idiom, "metaphor":metaphor, "simile": simile}
    max_value = max(sarc, idiom, metaphor, simile)

    if max_value <7:
        print(max_value)
    # rowdict_rev = {v:k for k, v in rowdict.items()}
    rowdict_rev = defaultdict(list)
    for k, v in rowdict.items():
        rowdict_rev[v].append(k)

    if max_value >= 7:
        classification.loc[i, 'agreement'] = 'high'
        classification.loc[i, 'majority'] = str(rowdict_rev[max_value])
    
    #comment out for alternative statistics
    else:
        classification.loc[i, 'agreement'] = 'low'
        classification.loc[i, 'majority'] = str(rowdict_rev[max_value])

6
6
6
6


In [8]:
conc = pd.concat([classification, reasons], axis=1)
final = source.merge(conc, left_on='text', right_on='text')

num_to_label = {1: "sarcasm", 2: "idiom", 3: "simile", 4: "metaphor"}
label_to_num = {"sarcasm":1, "idiom":2, "simile":3, "metaphor":4}

final['label'] = final['label'].map(num_to_label)
final.head()

Unnamed: 0.1,Unnamed: 0,text,label,model,difficult,part_0,part_1,part_2,part_3,part_4,...,keyword_5,keyword_6,keyword_7,keyword_8,keyword_9,keyword_10,keyword_11,keyword_12,keyword_13,keyword_14
0,1598,I wanted that gift as much as cancer,simile,bert,no,sarcasm,metaphor,sarcasm,sarcasm,simile,...,"Wanted, Gift, Cancer,",as much as,"cancer, that, gift","as much cancer,","cancer, want","gift, wanted, cancer","wanted, gift, cancer","gift, much, cancer","wanted, gift, cancer,",gift cancer
1,140,I was encouraged not to get a response back from the company after applying for the job,sarcasm,rf,yes,sarcasm,sarcasm,idiom,idiom,sarcasm,...,"not to get,",not to get,"encouraged, response, after","encouraged not to,",not get response,"not, response, job","encouraged, not, response",not to get a response back,"encouraged, response, applying,",not get response
2,1812,I felt like a hero when I realized that I brought a present for a baby boy to a baby shower for a baby girl,sarcasm,bert,no,sarcasm,idiom,simile,sarcasm,sarcasm,...,"like a hero,",I felt like,"when, brought, baby","felt like a,","hero, boy, girl","hero, boy, girl","hero, baby boy, baby girl","hero, baby, girl","Hero, present, boy,",hero
3,217,The armor was tough as glass,simile,rf,yes,sarcasm,metaphor,metaphor,sarcasm,simile,...,"armor, tough, glass",was tough as,"armor, tough, glass","tough as, glass","armour, glass","tough, as, glass","was, tough, as","armor, tough, glass,","Armor, tough, glass,",tough as glass
4,137,That is why it is so important to remain like a planted oak,simile,rf,yes,simile,idiom,simile,metaphor,idiom,...,"like, planted oak,",like a,"remain, like, oak","to remain, like a,","remain, planted oak","like, planted, oak","remain, planted oak","like a planted oak,","important, planted, oak,",like


# 2. Get majority labels, easy instances and difficult instances

In [9]:
final['majority'] = final['majority'].str.replace("[", "", regex=True)
final['majority'] = final['majority'].str.replace("]", "", regex=True)
final['majority'] = final['majority'].str.replace("'", "", regex=True)
final = final.dropna()

In [10]:
easy = final[final['difficult'] == "no"]
hard = final[final['difficult'] == "yes"]

In [11]:
final[final['majority'] != final['label']][['label', 'majority']].value_counts('label')

label
metaphor    9
simile      5
idiom       3
sarcasm     1
dtype: int64

In [12]:
easy[easy['majority'] != easy['label']][['label', 'majority']].value_counts('label')

label
metaphor    3
simile      3
idiom       1
dtype: int64

In [13]:
hard[hard['majority'] != hard['label']][['label', 'majority']].value_counts('label')

label
metaphor    6
idiom       2
simile      2
sarcasm     1
dtype: int64

In [14]:
hard[['text', 'label', 'majority', 'model']] 

Unnamed: 0,text,label,majority,model
1,I was encouraged not to get a response back from the company after applying for the job,sarcasm,idiom,rf
3,The armor was tough as glass,simile,metaphor,rf
4,That is why it is so important to remain like a planted oak,simile,simile,rf
5,But today carly wanted to feel like a kick-ass chick in a superhero movie.,simile,simile,rf
8,He spent three years seconded to a lame duck industry.,idiom,idiom,bert
9,And made answer very gravely.,metaphor,idiom,rf
10,I farted in the middle of my speech but no big deal; it happens to me all the time.,sarcasm,sarcasm,bert
12,"Recently I ate at a restaurant and the whole place smelled terrible, just like a bouquet of roses.",sarcasm,sarcasm,bert
14,The whole family moved out bag and baggage.,idiom,idiom,bert
16,You can freely switch between them but the rest go like a deflated dummies.,simile,simile,rf


In [15]:
easy[['text', 'label', 'majority']] 

Unnamed: 0,text,label,majority
0,I wanted that gift as much as cancer,simile,sarcasm
2,I felt like a hero when I realized that I brought a present for a baby boy to a baby shower for a baby girl,sarcasm,sarcasm
6,They were burning with desire.,metaphor,metaphor
7,The tax cut will fertilize the economy.,metaphor,metaphor
11,"I do bear that in mind, doctor.",idiom,idiom
13,John fell behind his class mates.,metaphor,idiom
15,I am super proud that my 15 year old son crashed my ferrari while taking it for a test drive,sarcasm,sarcasm
17,"If I could just go with the flow for a couple of hours, I may just have a decent enough time to get through it.",idiom,idiom
18,You're as affectionate as a crocodile,simile,sarcasm
21,Her optimism flicked away these worries.,metaphor,metaphor


# 3. Plot confusion matrix of human annotations

In [16]:
cm = confusion_matrix(hard['label'], hard['majority'], normalize = 'true', 
                      labels=['sarcasm', 'simile', 'idiom', 'metaphor'])
figure = sns.heatmap(cm, annot=True, cmap="GnBu", fmt=".2f", xticklabels = ['sarcasm', 'simile', 'idiom', 'metaphor'], 
                  yticklabels=['sarcasm', 'simile', 'idiom', 'metaphor'])
plt.savefig('../results/human_annotation/heatmap_hard.pdf')
plt.close()

In [17]:
cm = confusion_matrix(easy['label'], easy['majority'], normalize = 'true', 
                      labels=['sarcasm', 'simile', 'idiom', 'metaphor'])
figure = sns.heatmap(cm, annot=True, cmap="GnBu", fmt=".2f", xticklabels = ['sarcasm', 'simile', 'idiom', 'metaphor'], 
                  yticklabels=['sarcasm', 'simile', 'idiom', 'metaphor'])
plt.savefig('../results/human_annotation/heatmap_easy.pdf')
plt.close()

# 4. Write out reason words

In [18]:
def get_words(df):
    
    all_results = []
    for i, row in df.iterrows():
        try:
            all_word_per_row = " ".join(row).lower()
        except TypeError:
            all_word_per_row = "error"
            
        all_word_per_row = all_word_per_row.translate(str.maketrans('', '', string.punctuation))
        words = all_word_per_row.strip().split()
        mydict = defaultdict(int)
        for token in words:
            mydict[token] +=1
        mydict = {k: v for k, v in sorted(mydict.items(), key=lambda item: item[1], reverse=True)}
        all_results.append(mydict)

    return all_results

In [19]:
final[['keyword_0', 'keyword_1']].head(5)

Unnamed: 0,keyword_0,keyword_1
0,"wanted, gift, cancer","as much as,"
1,"encouraged, not response","no, get, applying"
2,"felt, hero","felt like a hero,"
3,"armor, tough, glass","tough, as, glass"
4,"like, planted oak","planted oak,"


In [20]:
final['all_words'] = get_words(final.iloc[:, 22:])

In [21]:
final[['text', 'label', 'majority', 'all_words']]

Unnamed: 0,text,label,majority,all_words
0,I wanted that gift as much as cancer,simile,sarcasm,"{'cancer': 12, 'gift': 10, 'as': 7, 'much': 6, 'wanted': 5, 'that': 1, 'want': 1}"
1,I was encouraged not to get a response back from the company after applying for the job,sarcasm,idiom,"{'not': 12, 'response': 9, 'get': 7, 'encouraged': 6, 'to': 5, 'applying': 2, 'a': 2, 'back': 2, 'no': 1, 'was': 1, 'from': 1, 'the': 1, 'company': 1, 'after': 1, 'job': 1}"
2,I felt like a hero when I realized that I brought a present for a baby boy to a baby shower for a baby girl,sarcasm,sarcasm,"{'hero': 11, 'a': 6, 'baby': 6, 'felt': 5, 'like': 5, 'boy': 5, 'girl': 5, 'for': 2, 'to': 1, 'shower': 1, 'i': 1, 'when': 1, 'brought': 1, 'present': 1}"
3,The armor was tough as glass,simile,metaphor,"{'tough': 13, 'glass': 12, 'as': 8, 'armor': 5, 'armour': 2, 'was': 2}"
4,That is why it is so important to remain like a planted oak,simile,simile,"{'oak': 11, 'like': 10, 'planted': 10, 'remain': 6, 'a': 5, 'to': 1, 'important': 1}"
5,But today carly wanted to feel like a kick-ass chick in a superhero movie.,simile,simile,"{'like': 9, 'kickass': 7, 'superhero': 7, 'feel': 5, 'a': 5, 'chick': 4, 'movie': 2, 'kick': 2, 'ass': 2, 'wanted': 2, 'in': 1, 'to': 1}"
6,They were burning with desire.,metaphor,metaphor,"{'burning': 13, 'desire': 12, 'with': 8, 'were': 2, 'buring': 1}"
7,The tax cut will fertilize the economy.,metaphor,metaphor,"{'fertilize': 12, 'economy': 8, 'tax': 4, 'the': 2, 'cut': 2, 'ferilize': 1, 'fertilise': 1, 'with': 1, 'will': 1}"
8,He spent three years seconded to a lame duck industry.,idiom,idiom,"{'duck': 14, 'lame': 13, 'industry': 8, 'seconded': 2, 'industy': 1, 'three': 1, 'to': 1, 'he': 1, 'spent': 1}"
9,And made answer very gravely.,metaphor,idiom,"{'gravely': 13, 'very': 6, 'answer': 6, 'made': 4}"


In [22]:
# -- choose top 20 important words per class -- 
idiom_final = final[final['label']=='idiom']['all_words'].tolist()
sarcasm_final = final[final['label']=='sarcasm']['all_words'].tolist()
metaphor_final = final[final['label']=='metaphor']['all_words'].tolist()
simile_final = final[final['label']=='simile']['all_words'].tolist()

agg_idiom_final = {}
agg_sarcasm_final = {}
agg_metaphor_final = {}
agg_simile_final = {}

for dictionary in idiom_final:
    agg_idiom_final.update(dictionary)
for dictionary in sarcasm_final:
    agg_sarcasm_final.update(dictionary)
for dictionary in metaphor_final:
    agg_metaphor_final.update(dictionary)
for dictionary in simile_final:
    agg_simile_final.update(dictionary)

In [23]:
agg_idiom_final = {k: v for k, v in sorted(agg_idiom_final.items(), key=lambda item: item[1], reverse=True)[:20]}
agg_sarcasm_final = {k: v for k, v in sorted(agg_sarcasm_final.items(), key=lambda item: item[1], reverse=True)[:20]}
agg_metaphor_final = {k: v for k, v in sorted(agg_metaphor_final.items(), key=lambda item: item[1], reverse=True)[:20]}
agg_simile_final = {k: v for k, v in sorted(agg_simile_final.items(), key=lambda item: item[1], reverse=True)[:20]}

In [24]:
print(agg_idiom_final)
print(agg_sarcasm_final)
print(agg_metaphor_final)
print(agg_simile_final)

{'sink': 15, 'leak': 15, 'full': 15, 'duck': 14, 'cut': 14, 'nth': 14, 'swim': 14, 'beans': 14, 'broad': 14, 'beam': 14, 'lame': 13, 'bag': 13, 'baggage': 12, 'dried': 12, 'degree': 12, 'hand': 12, 'bear': 11, 'flow': 11, 'smell': 11, 'of': 11}
{'overjoyed': 15, 'grateful': 14, 'pleasant': 14, 'great': 13, 'praised': 13, 'roses': 12, 'crashed': 12, 'excited': 12, 'vomiting': 12, 'myself': 12, 'not': 11, 'no': 11, 'hero': 11, 'terrible': 11, 'proud': 11, 'drops': 11, 'happy': 11, 'couldnt': 11, 'deal': 10, 'mistake': 10}
{'toppled': 15, 'rose': 15, 'fell': 14, 'shoot': 14, 'burning': 13, 'gravely': 13, 'never': 13, 'cure': 13, 'desire': 12, 'fertilize': 12, 'the': 12, 'flicked': 12, 'darkness': 12, 'spirits': 12, 'prescribes': 12, 'behind': 11, 'ravaged': 11, 'dwell': 11, 'leak': 10, 'speed': 9}
{'as': 17, 'smooth': 14, 'tough': 13, 'dummies': 13, 'crocodile': 13, 'snowman': 13, 'cancer': 12, 'glass': 12, 'affectionate': 12, 'oak': 11, 'an': 11, 'angel': 11, 'dream': 11, 'gift': 10, 'li

In [25]:
with open('../results/human_annotation/reasons_idiom.txt', 'w') as f:
    for k, v in agg_idiom_final.items():
        print(k, file=f)
        
with open('../results/human_annotation/reasons_sarcasm.txt', 'w') as f:
    for k, v in agg_sarcasm_final.items():
        print(k, file=f)
        
with open('../results/human_annotation/reasons_metaphor.txt', 'w') as f:
    for k, v in agg_metaphor_final.items():
        print(k, file=f)
        
with open('../results/human_annotation/reasons_simile.txt', 'w') as f:
    for k, v in agg_simile_final.items():
        print(k, file=f)