In [1]:
import pandas as pd
import textwrap
import numpy as np

## CONFIG

In [2]:
# The joined annotation set is too large to process in full, so the selection
# cell below keeps at most this many sampled annotations per rater_id.
MAX_N_SAMPLES_PER_ANNOTATOR = 15
# Upper bound on the number of "Known text" examples put into a contextualized
# prompt; the value is also embedded in the contextualized CSV file name.
NUMBER_OF_MAX_CONTEXTS = 3

## Pobranie danych

In [3]:
#test dataset
#!wget https://github.com/google-research/google-research/raw/master/goemotions/data/test.tsv
#!wget https://github.com/google-research/google-research/raw/master/goemotions/data/train.tsv

In [4]:
#labels
#!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt

In [5]:
# Full data
#!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
#!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
#!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

In [6]:
# Read the three CSV shards of the full dataset and stack them into one frame.
# The original row indices of each shard are kept (no ignore_index).
full_data_df = pd.concat(
    [
        pd.read_csv(shard_path)
        for shard_path in (
            "data/full_dataset/goemotions_1.csv",
            "data/full_dataset/goemotions_2.csv",
            "data/full_dataset/goemotions_3.csv",
        )
    ]
)

In [7]:
# emotions.txt is read without a header row, so the label names end up in the
# first (positional) column of the frame.
labels_df = pd.read_csv('emotions.txt', header=None)
# Plain Python list of the label names, in file order.
labels = labels_df.iloc[:, 0].tolist()

In [8]:
# Show the loaded label names.
labels

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [9]:
# Load the training split; the TSV has no header row, so pandas numbers the
# columns starting from 0.
train_df = pd.read_csv('train.tsv', sep='\t', header=None)

In [10]:
# Give the positional TSV columns readable names.
train_df = train_df.rename(
    {0: "text", 1: "label", 2: "id"},
    axis="columns",
)

In [11]:
# Load the test split; the TSV has no header row, so pandas numbers the
# columns starting from 0.
test_df = pd.read_csv('test.tsv', sep='\t', header=None)

In [12]:
# Give the positional TSV columns readable names.
test_df = test_df.rename(
    {0: "text", 1: "label", 2: "id"},
    axis="columns",
)

In [13]:
# Preview the first two rows of the renamed test split.
test_df.head(2)

Unnamed: 0,text,label,id
0,I’m really sorry about your situation :( Altho...,25,eecwqtt
1,It's wonderful because it's awful. At not with.,0,ed5f85d


In [14]:
# Attach the raw per-rater annotations from the full dataset to every test
# example. Both sides are indexed by "id"; on="id" refers to that index level
# of the left frame. The split's own text column is dropped beforehand because
# the full dataset already carries the text.
# The result has one row per (text, rater) annotation, so ids repeat.
selected_data_df = test_df[["label", "id"]].set_index("id").join(
    full_data_df.set_index("id"), on="id"
).reset_index()

In [15]:
# Same join as for the test split: attach the raw per-rater annotations from
# the full dataset to every training example, matching on the "id" index.
train_data_df = train_df[["label", "id"]].set_index("id").join(
    full_data_df.set_index("id"), on="id"
).reset_index()

In [16]:
# Preview the joined test annotations.
selected_data_df.head(2)

Unnamed: 0,id,label,text,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eecwqtt,25,I’m really sorry about your situation :( Altho...,whiteknight617,relationship_advice,t3_ah98jv,t3_ah98jv,1547823000.0,20,False,...,1,0,0,0,0,0,0,1,0,0
1,eecwqtt,25,I’m really sorry about your situation :( Altho...,whiteknight617,relationship_advice,t3_ah98jv,t3_ah98jv,1547823000.0,49,False,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# Number of joined test annotation rows contributed by each rater_id.
selected_data_df["rater_id"].value_counts()

4     1010
61     919
37     878
2      553
52     544
      ... 
80       8
65       8
0        7
53       6
47       2
Name: rater_id, Length: 81, dtype: int64

## Zebranie surowych anotacji

In [18]:
# Keep the label that came from the split file under the name "final_label",
# to tell it apart from the per-rater labels collected below.
selected_data_df = selected_data_df.rename(columns={"label": "final_label"})

In [19]:
def get_rater_labels(row) -> list[str]:
    """Return the emotion names that this annotation row marks with 1.

    ``row`` must be indexable by every name in the notebook-level ``labels``
    list. The result keeps the order of ``labels``.
    """
    chosen = []
    for emotion in labels:
        if row[emotion] == 1:
            chosen.append(emotion)
    return chosen

In [20]:
# Collapse the one-column-per-emotion flags into a single list-valued
# "rater_label" column, for both the test and the training annotations.
selected_data_df["rater_label"] = selected_data_df.apply(get_rater_labels, axis=1)
train_data_df["rater_label"] = train_data_df.apply(get_rater_labels, axis=1)

In [21]:
# "id" identifies the text, not the annotation row, so rename it; a separate
# per-prompt "id" column is added later when the prompt frames are built.
selected_data_df = selected_data_df.rename(columns={"id": "text_id"})
train_data_df = train_data_df.rename(columns={"id": "text_id"})

## Zapisanie danych treningowych

In [22]:
# Persist the joined training annotations, using ';' as the field separator.
train_data_df.to_csv("train.csv", sep=";")

## Wybranie podzbioru wszystkich danych

In [23]:
print("All annotations:", len(selected_data_df))
# One generator, seeded once and shared by every draw, keeps the selection
# reproducible.
random_state = np.random.RandomState(seed=7)
# Boolean mask aligned with the frame's index; set to True for sampled rows.
is_selected = pd.Series(False, index=selected_data_df.index)
highest_rater_id = selected_data_df["rater_id"].max()
for rater_id in range(0, highest_rater_id + 1):
    rater_df = selected_data_df[selected_data_df["rater_id"] == rater_id]
    # Raters with fewer rows than the cap contribute all of their rows.
    n_to_draw = min(MAX_N_SAMPLES_PER_ANNOTATOR, len(rater_df))
    rater_df = rater_df.sample(n_to_draw, random_state=random_state)
    is_selected.loc[rater_df.index] = True
# Keep the sampled rows in their original order, then group them by rater.
selected_data_df = selected_data_df[is_selected].sort_values(by="rater_id")
print("After selection:", len(selected_data_df))

All annotations: 19470
After selection: 1151


## Przygotowanie zbioru danych bez kontekstu

In [24]:
def create_prompt_without_context(row):
    """Build the context-free prompt for one annotation row.

    Reads ``row.text`` and ``row.rater_label``. The prompt asks for exactly as
    many emotions as ``row.rater_label`` contains and uses the singular word
    "emotion" when that count is 1.
    """
    n_labels = len(row.rater_label)
    if n_labels == 1:
        emotion_word = 'emotion'
    else:
        emotion_word = 'emotions'
    template = """\
From the given list of all emotions, choose the ones that the input text arouses in most people reading it.
List of all emotions: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral.
Text: {text}.
Write your answer in the form of a Python list containing exactly {n_labels} unique selected most matching {emotion}.
Do not explain yourself."""
    return template.format(
        text=row.text,
        n_labels=n_labels,
        emotion=emotion_word,
    )

In [25]:
# Spot check: show the rater's labels and the generated prompt for one
# selected row (position 3).
idx = 3
display(selected_data_df.iloc[idx].rater_label)
display(create_prompt_without_context(selected_data_df.iloc[idx]))

['gratitude']

'From the given list of all emotions, choose the ones that the input text arouses in most people reading it.\nList of all emotions: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral.\nText: Please ask questions about moving to Denver in the Q&A sticky, thanks! Short answer: no .\nWrite your answer in the form of a Python list containing exactly 1 unique selected most matching emotion.\nDo not explain yourself.'

In [26]:
# Keep only the columns needed downstream, renumber the rows, expose the new
# row number as an explicit "id" column, then render one prompt per row.
prompts_df = (
    selected_data_df[["text_id", "rater_id", "text", "rater_label"]]
    .reset_index(drop=True)
    .assign(id=lambda frame: frame.index)
    .assign(
        prompt=lambda frame: frame.apply(create_prompt_without_context, axis=1)
    )
)

In [27]:
# Preview the context-free prompt table.
prompts_df.head(2)

Unnamed: 0,text_id,rater_id,text,rater_label,id,prompt
0,edrwj9z,0,That’s how I outfitted my team in that awesome...,"[excitement, joy]",0,"From the given list of all emotions, choose th..."
1,ed5g4ed,0,All I want for Christmas is a broken home,[desire],1,"From the given list of all emotions, choose th..."


In [28]:
# Write the context-free prompts, using ';' as the field separator.
# The file name has no placeholders, so a plain string literal is used
# instead of an f-string.
prompts_df.to_csv("go_emotions_no_context.csv", sep=";")

## Przygotowanie zbioru danych z kontekstem

In [29]:
def create_prompt_with_context(row, context_df=None, n_contexts=None, random_state=None):
    """Build a prompt that shows the rater's own past annotations as context.

    Up to ``n_contexts`` rows by the same ``rater_id`` (excluding the row's own
    ``text_id``) are drawn from ``context_df`` and rendered as "Known text"
    examples ahead of the unknown text.

    Args:
        row: Annotation row exposing ``rater_id``, ``text_id``, ``text`` and
            ``rater_label`` as attributes.
        context_df: Frame to draw the examples from; it needs the columns
            ``rater_id``, ``text_id``, ``text`` and ``rater_label``.
            Defaults to the notebook-level ``train_data_df``.
        n_contexts: Maximum number of examples. Defaults to the notebook-level
            ``NUMBER_OF_MAX_CONTEXTS``.
        random_state: Seed or ``numpy.random.RandomState`` forwarded to
            ``DataFrame.sample``. When omitted, a seed is derived from the
            row's ``rater_id`` and ``text_id``, so the same row gets the same
            examples on every run (the draw used to be unseeded).

    Returns:
        The prompt text as a single string.
    """
    import zlib  # local import: only needed for the stable per-row seed

    if context_df is None:
        context_df = train_data_df
    if n_contexts is None:
        n_contexts = NUMBER_OF_MAX_CONTEXTS
    if random_state is None:
        # crc32 is stable across processes (unlike the built-in hash() of a
        # string) and fits the 32-bit seed range accepted by numpy.
        seed_key = "{}|{}".format(row.rater_id, row.text_id)
        random_state = zlib.crc32(seed_key.encode("utf-8"))

    other_rater_rows = context_df[
        (context_df["rater_id"] == row.rater_id)
        & (context_df["text_id"] != row.text_id)
    ]
    selected_rater_rows = other_rater_rows.sample(
        # A rater may have fewer usable rows than requested.
        min(n_contexts, len(other_rater_rows)),
        random_state=random_state,
    )
    # NOTE(review): the ".\n" separator leaves the last example line without a
    # trailing period; kept unchanged so prompts stay comparable with datasets
    # that were already generated.
    known_texts = ".\n".join(
        "Known text: {text}; {emotion}: {rater_emotions}".format(
            text=context_row.text,
            emotion='Emotion' if len(context_row.rater_label) == 1 else 'Emotions',
            rater_emotions=", ".join(context_row.rater_label),
        )
        for _, context_row in selected_rater_rows.iterrows()
    )
    prompt = """\
Knowing that a certain person described known texts with provided emotions:
{prompts}
With what emotions would this person describe the unknown text?
List of all emotions: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral.
Unknown text: {text}.
Write your answer in the form of a Python list containing exactly {n_labels} unique selected most matching {emotion}.
Do not explain yourself.""".format(
        prompts=known_texts,
        text=row.text,
        n_labels=len(row.rater_label),
        emotion='emotion' if len(row.rater_label) == 1 else 'emotions',
    )
    return prompt

In [30]:
# Spot check: show the rater's labels and the contextualized prompt for one
# selected row (position 3).
idx = 3
display(selected_data_df.iloc[idx].rater_label)
display(create_prompt_with_context(selected_data_df.iloc[idx]))

['gratitude']

'Knowing that a certain person described known texts with provided emotions:\nKnown text: 7 billion people doing things realtime is not to be underestimated; Emotion: neutral.\nKnown text: Omfg the worst!! want to pull my hair out!!; Emotions: annoyance, disgust.\nKnown text: No worries. Have a nice day.; Emotions: joy, realization\nWith what emotions would this person describe the unknown text?\nList of all emotions: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral.\nUnknown text: Please ask questions about moving to Denver in the Q&A sticky, thanks! Short answer: no .\nWrite your answer in the form of a Python list containing exactly 1 unique selected most matching emotion.\nDo not explain yourself.'

In [31]:
# Keep only the columns needed downstream, renumber the rows, expose the new
# row number as an explicit "id" column, then render one contextualized prompt
# per row.
contextualized_prompts_df = (
    selected_data_df[["text_id", "rater_id", "text", "rater_label"]]
    .reset_index(drop=True)
    .assign(id=lambda frame: frame.index)
    .assign(
        prompt=lambda frame: frame.apply(create_prompt_with_context, axis=1)
    )
)

In [32]:
# Preview the contextualized prompt table.
contextualized_prompts_df.head(2)

Unnamed: 0,text_id,rater_id,text,rater_label,id,prompt
0,edrwj9z,0,That’s how I outfitted my team in that awesome...,"[excitement, joy]",0,Knowing that a certain person described known ...
1,ed5g4ed,0,All I want for Christmas is a broken home,[desire],1,Knowing that a certain person described known ...


In [33]:
# Write the contextualized prompts with ';' as the field separator; the file
# name records the configured maximum number of context examples.
contextualized_prompts_df.to_csv(f"go_emotions_context_{NUMBER_OF_MAX_CONTEXTS}.csv", sep=";")