In [1]:
import pandas as pd
import numpy as np

In [2]:
!gdown 1N87vmtobGYPZosrd_dE7RHw-c0RYXeZG

Downloading...
From: https://drive.google.com/uc?id=1N87vmtobGYPZosrd_dE7RHw-c0RYXeZG
To: /content/sarcasmania-dataset.csv
  0% 0.00/4.17M [00:00<?, ?B/s]100% 4.17M/4.17M [00:00<00:00, 183MB/s]


In [3]:
data = pd.read_csv('sarcasmania-dataset.csv')

In [4]:
data['tweet'] = data['tweet'].str.replace(r'[@]+[A-Za-z0-9_-]+', '', regex=True)

In [5]:
data = data.rename(columns={'tweet': 'comment'})

In [6]:
ann_columns = ['humor']

In [7]:
data['humor'].value_counts()

0    20137
1    19643
Name: humor, dtype: int64

In [8]:
for col in ann_columns:
    data[col] = data[col].astype(int)

In [9]:
np.random.seed(22)

selected_texts = data['comment'].drop_duplicates().sample(n=1000).tolist()
selected_data = data.loc[data['comment'].isin(selected_texts)]
not_selected_data = data.loc[~data['comment'].isin(selected_texts)]

In [10]:
def get_prompt(text):
    pattern = f'Which one of the attributes: "funny", "not funny" describes a given text? Write your answer in the form of a Python list containing the appropriate attribute.\n\nText: {text}'

    return pattern

selected_data.loc[:, 'prompt'] = selected_data['comment'].apply(get_prompt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [11]:
def get_labels(row):
    return np.array(['not funny', 'funny'][int(row[ann_columns].values == 1)])

selected_data.loc[:, 'annotation'] = selected_data.apply(get_labels , axis=1)

In [12]:
selected_data.loc[:, ['prompt', 'annotation', 'comment']].reset_index(drop=True).to_csv("sarcasmania_prompts.csv")

In [13]:
selected_data['annotation'].value_counts()

TypeError: ignored

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'numpy.ndarray'


not funny    510
funny        490
Name: annotation, dtype: int64

In [14]:
len(data) * 0.15

5967.0

In [15]:
from scipy.stats import entropy

def get_class(class_name):
  return ['not funny', 'funny'].index(class_name)

def _entropy(labels, base=None):
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)

_entropy([get_class(elem) for elem in selected_data.annotation.tolist()])

0.6929471672244782

In [16]:
np.random.seed(22)

selected_texts = data['comment'].drop_duplicates().sample(n=5967).tolist()
selected_data = data.loc[data['comment'].isin(selected_texts)]

from scipy.stats import entropy

def get_class(class_name):
  return ['not funny', 'funny'].index(class_name)

def _entropy(labels, base=None):
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)

def get_labels(row):
    return np.array(['not funny', 'funny'][int(row[ann_columns].values == 1)])

selected_data.loc[:, 'annotation'] = selected_data.apply(get_labels , axis=1)

_entropy([get_class(elem) for elem in selected_data.annotation.tolist()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


0.6928912262059419

In [17]:
selected_data['annotation'].value_counts()

TypeError: ignored

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'numpy.ndarray'


not funny    3051
funny        2916
Name: annotation, dtype: int64

# Prepare dataset

In [85]:
np.random.seed(42)

dev_df = not_selected_data.sample(n=5967).drop_duplicates(subset=['comment'])

dev_texts = dev_df['comment'].tolist()

train_df = not_selected_data[~not_selected_data['comment'].isin(dev_texts)].drop_duplicates(subset=['comment'])
test_df = pd.read_csv('sarcasmania_prompts.csv')

In [86]:
dev_texts = set(dev_texts)

train_texts = set(train_df['comment'].tolist())
test_texts = set(test_df['comment'].tolist())

print(len(dev_df))
len(dev_texts - train_texts)

5967


5967

In [87]:
train_df['split'] = 'train'
dev_df['split'] = 'val'
test_df['split'] = 'test'

In [88]:
train_df['humor'] = train_df['humor'].astype(int)
dev_df['humor'] = dev_df['humor'].astype(int)

In [89]:
def binarize_label(label):
  label_dict = {'not funny': 0, 'funny': 1}
  return label_dict[str(label)]
  
test_df['annotation'] = test_df['annotation'].apply(binarize_label)

In [90]:
train_df['text_id'] = [i for i in range(len(train_df))]
dev_df['text_id'] = [i for i in range(len(train_df), (len(train_df) + len(dev_df)))]
test_df['text_id'] = [i for i in range((len(train_df) + len(dev_df)), (len(train_df) + len(dev_df) + len(test_df)))]

In [91]:
train_df = train_df.rename(columns={'comment': 'text', 'humor': 'is_funny'})
dev_df = dev_df.rename(columns={'comment': 'text', 'humor': 'is_funny'})
test_df = test_df.rename(columns={'comment': 'text', 'annotation': 'is_funny'})

In [92]:
full_df = pd.concat([train_df, dev_df, test_df]).reset_index(drop=True)

In [95]:
full_df['annotator_id'] = 0

In [96]:
full_df = full_df[['text_id', 'annotator_id', 'text', 'is_funny', 'split']]

In [97]:
full_df

Unnamed: 0,text_id,annotator_id,text,is_funny,split
0,0,0,i hope youre lurking rn. i want to listen to ...,0,train
1,1,0,05 really taught me a valuable lesson I'm neve...,1,train
2,2,0,"Never had a voice to protest, so you fed me s...",0,train
3,3,0,Rest in peace & love to you and your family,0,train
4,4,0,yay! Can't wait to be reunited with you huni...,0,train
...,...,...,...,...,...
39753,39753,0,Yes.. please stalk me so I know you're an inse...,0,test
39754,39754,0,Yes thanks piss me off even more than I alread...,1,test
39755,39755,0,"You'd make a really cool ghost! "" is a new pos...",1,test
39756,39756,0,You know you need to get a life when you even ...,1,test


In [98]:
len(full_df['text'].unique().tolist())

39758

In [99]:
full_df['split'].value_counts()

train    32791
val       5967
test      1000
Name: split, dtype: int64

In [100]:
data_df = full_df[['text_id', 'text']]
annotations_df = full_df[['text_id', 'annotator_id', 'is_funny', 'split']]

In [101]:
data_df.to_csv('data.csv', index=False)
annotations_df.to_csv('annotations.csv', index=False)