In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

# 1. Read in the original files

In [2]:
# Load the original dataset; the file is read in line-delimited mode,
# i.e. one JSON object per line.
df = pd.read_json(
    path_or_buf='./data/original.jsonl',
    lines=True,
)
df

Unnamed: 0,id,premise,hypothesis,label,explanation,split,type,idiom
0,1,I left my adult son home for a few days and ju...,I was gone for only a few days and my consider...,Contradiction,Most people would not consider leaving dirty d...,train,Sarcasm,
1,2,I just caught a guy picking up used chewing gu...,it was such a pleasant sight to see a guy pick...,Contradiction,Picking up used chewing gum is really unhygien...,train,Sarcasm,
2,3,You could feel their sudden appearance in the ...,Their sudden appearance in the farmhouse was l...,Entailment,A gust of arctic wind is an icy blast that is ...,train,Simile,
3,4,"April's had never left, perfectly white and fu...","April's had never left, like a toothpaste comm...",Entailment,A toothpaste commercial is something that is a...,train,Simile,
4,5,I cooked a meal for family and it came out hor...,I feel terrible that the meal I cooked for my ...,Entailment,It is often very embarrassing when someone coo...,train,CreativeParaphrase,
...,...,...,...,...,...,...,...,...
7529,7530,I was very upset and frustrated when I came ou...,I was utterly delighted when I came outside th...,Contradiction,Most people would not be happy if their car ha...,train,Sarcasm,
7530,7531,Even though he may have seemed civil from his ...,He had a veneer of civility from his education...,Entailment,A cornered pit viper is a very dangerous anima...,train,Simile,
7531,7532,I was in a hotel and there was hair in the sho...,I found hair in the shower drain and it made m...,Entailment,It is generally considered unclean to find hai...,train,CreativeParaphrase,
7532,7533,He smiled treacherously up at me.,"I looked down at him, and he smiled at me like...",Contradiction,"An angel is a being of divine nature, and they...",train,Simile,


In [3]:
# Keep a tab-separated copy of the unmodified original data
# (the row index is not written).
df.to_csv(
    "./data/original.tsv",
    index=False,
    encoding="utf-8",
    sep="\t",
)

# 2. Get figurative instances

All instances in the column `hypothesis` are figurative, except those of type *CreativeParaphrase*:
the hypotheses of *CreativeParaphrase* are designed as the literal counterparts of *Sarcasm* (see the [dataset paper](https://arxiv.org/pdf/2205.12404.pdf) for details).

In [4]:
# Select the hypotheses of every row whose type is not CreativeParaphrase
# and rename the two kept columns to the (text, label) schema used below.
df_figurative = (
    df.loc[df["type"] != "CreativeParaphrase", ["hypothesis", "type"]]
    .rename(columns={'hypothesis': 'text', "type": "label"})
)
print(df_figurative["label"].value_counts())
df_figurative

Sarcasm     2216
Idiom       1768
Simile      1250
Metaphor    1250
Name: label, dtype: int64


Unnamed: 0,text,label
0,I was gone for only a few days and my consider...,Sarcasm
1,it was such a pleasant sight to see a guy pick...,Sarcasm
2,Their sudden appearance in the farmhouse was l...,Simile
3,"April's had never left, like a toothpaste comm...",Simile
5,My soul was a lampless sea and she was the tem...,Metaphor
...,...,...
7528,The cigarettes snuffed his health.,Metaphor
7529,I was utterly delighted when I came outside th...,Sarcasm
7530,He had a veneer of civility from his education...,Simile
7532,"I looked down at him, and he smiled at me like...",Simile


In [5]:
# Add a provenance column as the third column: "<label>_hypothesis",
# derived from each row's own label.
df_figurative.insert(
    2,
    "source",
    df_figurative["label"].map(lambda kind: kind + "_hypothesis").tolist(),
)
print(df_figurative["label"].value_counts())
df_figurative

Sarcasm     2216
Idiom       1768
Simile      1250
Metaphor    1250
Name: label, dtype: int64


Unnamed: 0,text,label,source
0,I was gone for only a few days and my consider...,Sarcasm,Sarcasm_hypothesis
1,it was such a pleasant sight to see a guy pick...,Sarcasm,Sarcasm_hypothesis
2,Their sudden appearance in the farmhouse was l...,Simile,Simile_hypothesis
3,"April's had never left, like a toothpaste comm...",Simile,Simile_hypothesis
5,My soul was a lampless sea and she was the tem...,Metaphor,Metaphor_hypothesis
...,...,...,...
7528,The cigarettes snuffed his health.,Metaphor,Metaphor_hypothesis
7529,I was utterly delighted when I came outside th...,Sarcasm,Sarcasm_hypothesis
7530,He had a veneer of civility from his education...,Simile,Simile_hypothesis
7532,"I looked down at him, and he smiled at me like...",Simile,Simile_hypothesis


# 3. Get literal instances

We use the premises from all classes.

**Note** - for the class *Sarcasm*, the literal instances come from two sources:
1. The premises of the class *Sarcasm*: they are the literal counterparts that stand in a contradiction relation to the sarcasm instances in the original dataset.
1. The premises of the class *CreativeParaphrase*: they are the literal counterparts that stand in an entailment relation to the sarcasm instances in the original dataset.
    - (We don't use the hypotheses of the class *CreativeParaphrase*, because many of them are syntactically similar and might thus bias the model.)

In [6]:
# Take the premises of all rows (every type). The original type is kept
# for now under the temporary name "label_interim".
df_literal = df[["premise", "type"]].rename(
    columns={'premise': 'text', "type": "label_interim"}
)
print(df_literal["label_interim"].value_counts())
df_literal

Sarcasm               2216
Idiom                 1768
Simile                1250
Metaphor              1250
CreativeParaphrase    1050
Name: label_interim, dtype: int64


Unnamed: 0,text,label_interim
0,I left my adult son home for a few days and ju...,Sarcasm
1,I just caught a guy picking up used chewing gu...,Sarcasm
2,You could feel their sudden appearance in the ...,Simile
3,"April's had never left, perfectly white and fu...",Simile
4,I cooked a meal for family and it came out hor...,CreativeParaphrase
...,...,...
7529,I was very upset and frustrated when I came ou...,Sarcasm
7530,Even though he may have seemed civil from his ...,Simile
7531,I was in a hotel and there was hair in the sho...,CreativeParaphrase
7532,He smiled treacherously up at me.,Simile


In [7]:
# Add a provenance column as the third column: "<original type>_premise".
df_literal.insert(
    2,
    "source",
    df_literal["label_interim"].map(lambda kind: kind + "_premise").tolist(),
)
print(df_literal["source"].value_counts())
df_literal

Sarcasm_premise               2216
Idiom_premise                 1768
Simile_premise                1250
Metaphor_premise              1250
CreativeParaphrase_premise    1050
Name: source, dtype: int64


Unnamed: 0,text,label_interim,source
0,I left my adult son home for a few days and ju...,Sarcasm,Sarcasm_premise
1,I just caught a guy picking up used chewing gu...,Sarcasm,Sarcasm_premise
2,You could feel their sudden appearance in the ...,Simile,Simile_premise
3,"April's had never left, perfectly white and fu...",Simile,Simile_premise
4,I cooked a meal for family and it came out hor...,CreativeParaphrase,CreativeParaphrase_premise
...,...,...,...
7529,I was very upset and frustrated when I came ou...,Sarcasm,Sarcasm_premise
7530,Even though he may have seemed civil from his ...,Simile,Simile_premise
7531,I was in a hotel and there was hair in the sho...,CreativeParaphrase,CreativeParaphrase_premise
7532,He smiled treacherously up at me.,Simile,Simile_premise


In [8]:
# Every premise gets the same class name; the scalar is broadcast to all
# rows and placed as the third column.
df_literal.insert(2, "label", "Literal")
print(df_literal["label"].value_counts())
df_literal

Literal    7534
Name: label, dtype: int64


Unnamed: 0,text,label_interim,label,source
0,I left my adult son home for a few days and ju...,Sarcasm,Literal,Sarcasm_premise
1,I just caught a guy picking up used chewing gu...,Sarcasm,Literal,Sarcasm_premise
2,You could feel their sudden appearance in the ...,Simile,Literal,Simile_premise
3,"April's had never left, perfectly white and fu...",Simile,Literal,Simile_premise
4,I cooked a meal for family and it came out hor...,CreativeParaphrase,Literal,CreativeParaphrase_premise
...,...,...,...,...
7529,I was very upset and frustrated when I came ou...,Sarcasm,Literal,Sarcasm_premise
7530,Even though he may have seemed civil from his ...,Simile,Literal,Simile_premise
7531,I was in a hotel and there was hair in the sho...,CreativeParaphrase,Literal,CreativeParaphrase_premise
7532,He smiled treacherously up at me.,Simile,Literal,Simile_premise


In [9]:
# The temporary type column is no longer needed once "source" exists;
# after this, df_literal has the same columns as df_figurative.
df_literal = df_literal.drop(columns=["label_interim"])
df_literal

Unnamed: 0,text,label,source
0,I left my adult son home for a few days and ju...,Literal,Sarcasm_premise
1,I just caught a guy picking up used chewing gu...,Literal,Sarcasm_premise
2,You could feel their sudden appearance in the ...,Literal,Simile_premise
3,"April's had never left, perfectly white and fu...",Literal,Simile_premise
4,I cooked a meal for family and it came out hor...,Literal,CreativeParaphrase_premise
...,...,...,...
7529,I was very upset and frustrated when I came ou...,Literal,Sarcasm_premise
7530,Even though he may have seemed civil from his ...,Literal,Simile_premise
7531,I was in a hotel and there was hair in the sho...,Literal,CreativeParaphrase_premise
7532,He smiled treacherously up at me.,Literal,Simile_premise


# 4. Write out final dataframe

## 4.1 Merge partial dataframes

In [10]:
# Stack the figurative rows on top of the literal rows. The original row
# labels are kept (ignore_index=False), so index values repeat across
# the two parts.
df_all = pd.concat(
    [df_figurative, df_literal],
    axis=0,
    ignore_index=False,
)
print(df_all["label"].value_counts())
df_all

Literal     7534
Sarcasm     2216
Idiom       1768
Simile      1250
Metaphor    1250
Name: label, dtype: int64


Unnamed: 0,text,label,source
0,I was gone for only a few days and my consider...,Sarcasm,Sarcasm_hypothesis
1,it was such a pleasant sight to see a guy pick...,Sarcasm,Sarcasm_hypothesis
2,Their sudden appearance in the farmhouse was l...,Simile,Simile_hypothesis
3,"April's had never left, like a toothpaste comm...",Simile,Simile_hypothesis
5,My soul was a lampless sea and she was the tem...,Metaphor,Metaphor_hypothesis
...,...,...,...
7529,I was very upset and frustrated when I came ou...,Literal,Sarcasm_premise
7530,Even though he may have seemed civil from his ...,Literal,Simile_premise
7531,I was in a hotel and there was hair in the sho...,Literal,CreativeParaphrase_premise
7532,He smiled treacherously up at me.,Literal,Simile_premise


In [11]:
# Integer id of each class name: the id is the position in this list
# (Literal -> 0, Sarcasm -> 1, Idiom -> 2, Simile -> 3, Metaphor -> 4).
label_map = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ["Literal", "Sarcasm", "Idiom", "Simile", "Metaphor"]
    )
}

# Replace the class names in "label" by their ids. A name missing from
# label_map would become NaN.
df_all = df_all.assign(label=df_all["label"].map(label_map))

## 4.2 Check for NaN and drop duplicates

In [12]:
# True if any cell in any column is missing.
df_all.isna().to_numpy().any()

False

In [13]:
# Keep the first occurrence of every fully identical row.
# NOTE(review): rows are compared on all columns (text, label, source),
# so the same text listed under two different sources is kept twice --
# confirm this is intended, since such rows can land on both sides of
# the train/test split.
df_all = df_all[~df_all.duplicated(keep="first")]
print(df_all["label"].value_counts())
df_all

0    6506
1    2212
2     884
3     625
4     621
Name: label, dtype: int64


Unnamed: 0,text,label,source
0,I was gone for only a few days and my consider...,1,Sarcasm_hypothesis
1,it was such a pleasant sight to see a guy pick...,1,Sarcasm_hypothesis
2,Their sudden appearance in the farmhouse was l...,3,Simile_hypothesis
3,"April's had never left, like a toothpaste comm...",3,Simile_hypothesis
5,My soul was a lampless sea and she was the tem...,4,Metaphor_hypothesis
...,...,...,...
7528,The cigarettes cured his health.,0,Metaphor_premise
7530,Even though he may have seemed civil from his ...,0,Simile_premise
7531,I was in a hotel and there was hair in the sho...,0,CreativeParaphrase_premise
7532,He smiled treacherously up at me.,0,Simile_premise


In [14]:
# Per-source row counts for label 0 (the id assigned to "Literal").
df_all.loc[df_all["label"] == 0, "source"].value_counts()

Idiom_premise                 1767
Simile_premise                1247
Sarcasm_premise               1225
Metaphor_premise              1221
CreativeParaphrase_premise    1046
Name: source, dtype: int64

In [15]:
# Per-source row counts for every label other than 0.
df_all.loc[df_all["label"] != 0, "source"].value_counts()

Sarcasm_hypothesis     2212
Idiom_hypothesis        884
Simile_hypothesis       625
Metaphor_hypothesis     621
Name: source, dtype: int64

## 4.3 Add binary labels

In [16]:
# Binary target: 0 where the multi-class label is 0, 1 for every other
# label. Computed with one vectorised comparison instead of a Python loop
# over the rows. The result is converted to a plain array so that it is
# inserted by position; df_all's row labels are not unique at this point.
label_binary = df_all["label"].ne(0).astype("int64").to_numpy()

# Place the new column third, directly after "label".
df_all.insert(2, "label_binary", label_binary)

In [17]:
# Class balance of the binary target.
df_all.loc[:, "label_binary"].value_counts()

0    6506
1    4342
Name: label_binary, dtype: int64

## 4.4 Train-test split

**Shuffle the dataframe:**

In [18]:
# Shuffle all rows, then renumber them 0..n-1 (the old row labels are
# discarded). The fixed seed makes the row order -- and therefore
# figlang_all.tsv and the train/test files derived from it -- identical on
# every fresh run; without it only train_test_split was seeded and the
# written files changed from run to run.
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
df_all

Unnamed: 0,text,label,label_binary,source
0,I can't believe my ex didn't pay his car note ...,0,0,Sarcasm_premise
1,But then the paper would not find out about yo...,0,0,Idiom_premise
2,Last week my kid said some really mean things ...,0,0,CreativeParaphrase_premise
3,"The gravy was so fatty, it made the meat taste...",0,0,Metaphor_premise
4,He pulls a giant disc out and flashes it like ...,3,1,Simile_hypothesis
...,...,...,...,...
10843,"He was about sixty, short and fat .",0,0,Simile_premise
10844,I'm a few dollars short for an important bill ...,0,0,CreativeParaphrase_premise
10845,"In the near future, e-sports would become extr...",0,0,Simile_premise
10846,A thought pops into my head and it's very clear.,0,0,Simile_premise


In [19]:
# Write the complete, shuffled dataset as TSV (row index not written).
df_all.to_csv(
    "./data/figlang_all.tsv",
    index=False,
    encoding="utf-8",
    sep="\t",
)

In [20]:
# Hold out 20% of the rows as the test set; the seed fixes the partition
# for a given input row order.
# NOTE(review): no `stratify` argument is passed, so class proportions in
# the two parts are left to chance -- confirm that is acceptable.
df_train, df_test = train_test_split(
    df_all,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Class distribution of the training part, followed by its first 5 rows.
print(df_train["label"].value_counts())
df_train.head(n=5)

0    5186
1    1762
2     714
3     517
4     499
Name: label, dtype: int64


Unnamed: 0,text,label,label_binary,source
4791,That building looks very strong,0,0,Simile_premise
5079,I was really agonizing over this decision for ...,0,0,Metaphor_premise
5512,I was so cheerful when I came outside this mor...,1,1,Sarcasm_hypothesis
4593,Their language broadcasts us to believe them.,4,1,Metaphor_hypothesis
9592,"For centuries, the ruler has made life like a ...",3,1,Simile_hypothesis


In [22]:
# Class distribution of the test part, followed by its first 5 rows.
print(df_test["label"].value_counts())
df_test.head(n=5)

0    1320
1     450
2     170
4     122
3     108
Name: label, dtype: int64


Unnamed: 0,text,label,label_binary,source
4704,My considerate roommate cooked some meat with ...,1,1,Sarcasm_hypothesis
7735,The spy was very quiet,0,0,Simile_premise
6903,Look at lucy - very skinny and slender.,0,0,Idiom_premise
7850,"Turns out, tag between super heroes can get li...",3,1,Simile_hypothesis
360,"Allister knows nothing about wine, he is a tee...",0,0,Metaphor_premise


In [23]:
# Write both parts of the split as TSV files (row index not written).
df_train.to_csv(
    "./data/figlang_train.tsv",
    index=False,
    encoding="utf-8",
    sep="\t",
)
df_test.to_csv(
    "./data/figlang_test.tsv",
    index=False,
    encoding="utf-8",
    sep="\t",
)

# 5. Count token amount

In [2]:
# Reload the full dataset written earlier and keep only the rows whose
# multi-class label is not 0.
df_all = pd.read_csv(
    "./data/figlang_all.tsv",
    encoding="utf-8",
    sep="\t",
)
df_figurative = df_all.loc[df_all["label"] != 0]

print(df_figurative.shape)
print(df_figurative["label"].value_counts())
df_figurative

(4342, 4)
1    2212
2     884
3     625
4     621
Name: label, dtype: int64


Unnamed: 0,text,label,label_binary,source
4,He pulls a giant disc out and flashes it like ...,3,1,Simile_hypothesis
9,Some bright young thing had gotten ahold of a ...,2,1,Idiom_hypothesis
13,"“I might be mistaken, but Sean's father looked...",2,1,Idiom_hypothesis
14,Her movements like a strange strip tease .,3,1,Simile_hypothesis
16,"I had to leave my childhood home, and am grate...",1,1,Sarcasm_hypothesis
...,...,...,...,...
10837,Pleased with how my co-worker is incompetent a...,1,1,Sarcasm_hypothesis
10839,Now he remembered what he didn't particularly ...,2,1,Idiom_hypothesis
10840,"She felt transported back to sixth grade, when...",2,1,Idiom_hypothesis
10841,I feel delighted that I can't seem to get my b...,1,1,Sarcasm_hypothesis


In [3]:
# Tokenize every remaining text with NLTK's word_tokenize and add up the
# number of tokens.
total_token_count = sum(
    len(word_tokenize(text)) for text in df_figurative["text"]
)

print("TOTAL TOKEN AMOUNT: {}".format(total_token_count))

TOTAL TOKEN AMOUNT: 74782


In [4]:
print("PER-CLASS TOKEN AMOUNT:")

# Same token count as above, computed separately for each label id 1-4.
for label in [1, 2, 3, 4]:
    class_texts = df_figurative.loc[df_figurative["label"] == label, "text"]
    current_token_count = sum(len(word_tokenize(text)) for text in class_texts)

    print("\tClass {}: {}".format(label, current_token_count))

PER-CLASS TOKEN AMOUNT:
	Class 1: 45233
	Class 2: 14795
	Class 3: 9062
	Class 4: 5692
