In [1]:
import pandas as pd
from datasets import load_dataset
from IPython.display import display
import os
from openai import OpenAI

# Import dataset from HuggingFace

In [2]:
orig_dataset = load_dataset('go_emotions', 'raw')

orig_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})

In [3]:
orig_dataset = orig_dataset['train'].to_pandas()

# Remove unnecessary columns.
# All records have example_very_unclear = False
orig_dataset = orig_dataset.drop(['id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear'], axis=1)

orig_dataset

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,Everyone likes [NAME].,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211223,The FDA has plenty to criticize. But like here...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
label_columns = orig_dataset.columns.tolist()
# From go_emotions README
label_encoding = {  0: 'admiration',      1: 'amusement',   2: 'anger',           3: 'annoyance',   4: 'approval',    
                    5: 'caring',          6: 'confusion',   7: 'curiosity',       8: 'desire',      9: 'disappointment',  
                    10: 'disapproval',    11: 'disgust',    12: 'embarrassment',  13: 'excitement', 14: 'fear',   
                    15: 'gratitude',      16: 'grief',      17: 'joy',            18: 'love',       19: 'nervousness',    
                    20: 'optimism',       21: 'pride',      22: 'realization',    23: 'relief',     24: 'remorse',    
                    25: 'sadness',        26: 'surprise',   27: 'neutral' }

inverse_encoding = {}
for key, value in label_encoding.items():
    inverse_encoding[value] = key

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [5]:
# Create a column 'labels' with a list of label values
orig_dataset.insert(1,'labels','')
orig_dataset['labels'] = orig_dataset[label_columns[1:]].values.tolist()
orig_dataset['labels'] = orig_dataset['labels'].apply(lambda t: [i for i, x in enumerate(t) if x])

# Remove unlabeled records
orig_dataset = orig_dataset[orig_dataset['labels'].map(lambda d: len(d)) > 0]

orig_dataset

Unnamed: 0,text,labels,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,[25],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",[27],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,[18],0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",[27],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,Right? Considering it’s such an important docu...,[15],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211219,"Well, I'm glad you're out of all that now. How...",[17],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211220,Everyone likes [NAME].,[18],0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,[5],0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,[0],1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Augment

In [13]:
def find_target_label(dataset):
    # Get the number of records needed to balance each label
    label_values = pd.Series([x for item in dataset.labels for x in item]).value_counts()
    label_values.drop(27, inplace=True)   # 'Neutral' is more of a lack of emotion than an emotion
    balance_label_values = label_values.max() - label_values

    balance_label_values.sort_index(ascending=True)
    
    target_label = balance_label_values.idxmax()
    print("Target Label: " + target_label)
    
    return target_label

In [15]:
def get_random_record(dataset, target_label):
    # Temp remove target labeled records and get a random record from remaining dataset
    sample = dataset[~dataset['labels'].apply(lambda x: target_label in x)].sample()
    # If a neutral sample is randomly selected, select again until it's something not-neutral
    while 27 in sample['labels'].tolist()[0]:
       sample = dataset[~dataset['labels'].apply(lambda x: target_label in x)].sample()
    
    display(sample)
    return sample

In [56]:
def generate_text_prompt(sample, target_label):
    sample_text = sample['text'].values[0]
    
    # Translate list of encoded labels to prompt
    match len(sample['labels'].values[0]):
        case 1:
            sample_label = label_encoding[sample['labels'].values[0][0]]
            
        case 2:
            sample_label = (label_encoding[sample['labels'].values[0][0]] + " and " 
                            + label_encoding[sample['labels'].values[0][1]])
            
        case _:
            sample_label = label_encoding[sample['labels'].values[0][0]]
            for label in sample['labels'].values[0][1:]:
                if label != sample['labels'].values[0][-1]:
                    sample_label += ', ' + label_encoding[label]
                else:
                    sample_label += ', and ' + label_encoding[label]       
                    
    print('Labels: ' + sample_label)
    
    if len(sample['labels'].values[0]) == 1:
        sample_label = ' ' + sample_label
    else:
        sample_label = 's ' + sample_label
        
    query = f"The comment, \"{sample_text}\" portrays the emotion{sample_label}. Based on the topic of this comment, generate a new comment that would portray a clear example of {label_encoding[target_label]}"
    print("Query: " + query)
    return query

In [59]:
def generate_label_prompt(response_text):
    query = f"Select one or more emotions that the comment, \"{response_text}\" portrays ONLY from the following list: {label_columns[2:]}."

    return query

In [79]:
def clean_response_text(text):
    
    while text[0] == '\n' or text[0] == ' ':
        text = text[1:]
    
    text = text.replace('\n', ' ')
    
    return text

In [22]:
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
synth_dataset = pd.DataFrame()

In [93]:
target_label = find_target_label(orig_dataset)
sample = get_random_record(orig_dataset, target_label)
text_query = generate_text_prompt(sample, target_label)

# OpenAI InstructGPT
response = client.completions.create(model="gpt-3.5-turbo-instruct", prompt=text_query)
response_text = clean_response_text(response.choices[0].text)    # InstructGPT response starts with \n\n
print("Response: " + response_text)

label_query = generate_label_prompt(response_text)
response_labels = []


# Poll for label several times
for i in range(5):
    response = client.completions.create(model="gpt-3.5-turbo-instruct", prompt=label_query)
    response_labels.append(clean_response_text(response.choices[0].text))
label_list = [label.split(", ") for label in response_labels]
# labels_list is a list of lists of labels
# for labels in label_list:
#    for label in labels:
#        if label.lower in inverse_encoding.keys:
            
    
print(label_list)

Target Label: 16


Unnamed: 0,text,labels,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
157282,U can try. Maybe say u are busy now Thursday b...,"[7, 20]",0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


Labels: curiosity and optimism
Query: The comment, "U can try. Maybe say u are busy now Thursday but Friday is still open. I mean u are not waiting for her forever right?" portrays the emotions curiosity and optimism. Based on the topic of this comment, generate a new comment that would portray a clear example of grief
Response: "I just can't believe she's really gone. Her presence will be
[['grief', 'sadness'], ['grief', 'sadness'], ['Grief', 'sadness'], ['Grief', 'Sadness'], ['grief', 'sadness']]


In [19]:
# Build new records until balance is achieved 
while balance_label_values.min():
    goal_label = balance_label_values.idxmax()
    
    
    # Start with the most minor class
    for index, value in balance_label_values.sort_values(ascending=False).items():
        print(f'{index} {value}')

16 16947
23 16331
21 16318
19 15810
12 15144
24 15095
14 14423
8 13803
11 12319
26 12106
13 11991
5 11621
25 10862
6 10261
17 9637
2 9536
18 9429
9 9151
20 8905
22 8835
1 8375
7 7928
10 6196
15 5995
3 4002
0 489
4 0
