#### Data augmentation for emotion detection

This notebook shows how I used a local LLM (Qwen2.5-7B-Instruct) to augment the data in the Tweets emotion-classification dataset.

I reclassified the labels for better logical consistency and generated additional examples for the labels that had a smaller representation.

In [1]:
import ast
import json
import os

# Keep this assignment ahead of the transformers imports (original ordering).
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "true"

import pandas as pd
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Environment check: show the installed PyTorch build and whether CUDA is usable.
print(torch.__version__)
print(torch.cuda.is_available())

2.5.1+cu118
True


In [3]:
# Hugging Face read token, taken from the environment so it is never hard-coded.
# `.get` returns None when the variable is unset instead of raising KeyError;
# presumably anonymous access is enough for a public checkpoint - confirm.
hf_read = os.environ.get("hf_read")

In [4]:
model_name = "Qwen/Qwen2.5-7B-Instruct"

# Load the causal LM and its tokenizer from the Hugging Face Hub.
# `token` replaces the deprecated `use_auth_token` argument, and is passed to
# both calls so the model and tokenizer are fetched with the same credentials.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    token=hf_read,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_read)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Smoke test: run a single chat-style generation to confirm the model loaded correctly.
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the messages into one prompt string (not tokenised yet) that ends with
# the generation prompt.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# Tokenise and move the tensors to the device the model lives on.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Drop the first len(input_ids) tokens of every output sequence so that only
# the newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# Only one prompt was submitted, so take the first decoded string.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [6]:
response

'Sure! A large language model (LLM) is a type of artificial intelligence model designed to understand and generate human-like text based on the input it receives. These models are typically built using deep learning techniques, particularly transformer architectures, which allow them to process and generate sequences of text.\n\nKey features of large language models include:\n\n1. **Training Data**: LLMs are trained on vast amounts of text data from the internet, books, articles, and other sources, which helps them learn patterns and nuances in language.\n\n2. **Context Understanding**: They can understand context and generate responses that are relevant and coherent within the given context.\n\n3. **Versatility**: LLMs can perform various language-related tasks such as translation, summarization, question-answering, and more.\n\n4. **Scalability**: These models often contain millions or even billions of parameters, allowing them to capture complex language structures and patterns.\n\n

In [7]:
# Read the two CSV splits and stack them row-wise into a single frame.
train_df = pd.read_csv('datasets/E-c-En-train.csv')
# NOTE: despite the variable name, this reads the *dev* split file.
test_df = pd.read_csv('datasets/E-c-En-dev.csv')

# Row indices are not renumbered here; a later cell resets the index.
all_data = pd.concat([train_df, test_df], axis=0)

In [18]:
# Renumber rows 0..n-1 after the concat, discarding the per-file indices.
all_data = all_data.reset_index(drop=True)

In [19]:
all_data

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7719,2018-En-01993,@BadHombreNPS @SecretaryPerry If this didn't m...,1,0,1,0,0,0,0,0,0,0,0
7720,2018-En-01784,Excited to watch #stateoforigin tonight! Come ...,0,0,0,0,1,0,1,0,0,0,0
7721,2018-En-04047,"Blah blah blah Kyrie, IT, etc. @CJC9BOSS leavi...",1,0,1,0,0,0,0,0,1,0,0
7722,2018-En-03041,#ThingsIveLearned The wise #shepherd never tru...,0,0,0,0,0,0,0,0,0,0,0


In [72]:
# Emotion label columns are every column except the identifier/text columns.
# Excluding by name (instead of slicing positionally with columns[2:]) keeps
# this cell correct if it is re-run after extra columns such as
# 'Re-Classified' have been added to all_data.
non_emotion_columns = {'ID', 'Tweet', 'Re-Classified'}
emotion_labels = [col for col in all_data.columns if col not in non_emotion_columns]
emotion_labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [20]:
# unpack emotions into unique list for each emotion where = 1

def unpack_emotions(df, labels=None):
    """
    Group tweets by the emotion labels they are flagged with.

    Args:
        df: DataFrame with a 'Tweet' column and one 0/1 indicator column per label.
        labels: Iterable of label column names to unpack. Defaults to the
            notebook-level `emotion_labels` list when omitted.

    Returns:
        dict: Maps each label to the list of tweets whose indicator for that
        label equals 1. A tweet flagged with several labels appears under each.
    """
    if labels is None:
        # Resolved at call time, so the global may be defined in a later cell.
        labels = emotion_labels

    return {
        label: df.loc[df[label] == 1, 'Tweet'].tolist()
        for label in labels
    }

In [21]:
# Mapping of emotion label -> list of tweets flagged with that label in all_data.
emotion_dict = unpack_emotions(all_data)

In [34]:
def generate_instruction_response(
    prompt: str,
    tokenizer,
    model,
    system_prompt: str = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
    max_new_tokens: int = 512
) -> str:
    """
    Generate a single chat completion for an instruction prompt.

    The system and user messages are rendered with the tokenizer's chat
    template, passed to ``model.generate`` and the newly generated tokens are
    decoded back to text.

    Args:
        prompt (str): The user instruction.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        model: Model exposing ``device`` and ``generate``.
        system_prompt (str): The system prompt for the assistant.
        max_new_tokens (int): Maximum number of new tokens to generate.

    Returns:
        str: The decoded response, without the prompt tokens.

    Raises:
        Whatever the tokenizer or model raise. The CUDA cache cleanup below
        still runs in that case, so a failed call does not leave cached
        blocks behind for the next call.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Pre-bind so the `finally` block can always clear these names.
    model_inputs = None
    generated_ids = None
    completion_ids = None
    try:
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens
        )
        # Drop the first len(input_ids) tokens of each output sequence so only
        # the newly generated tokens are decoded.
        completion_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        # A single prompt was submitted, so return the first decoded string.
        return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
    finally:
        # Release tensor references first, then hand cached blocks back to the
        # driver. Runs on success and on failure alike.
        model_inputs = generated_ids = completion_ids = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

In [67]:
# System prompt for validating, in one batch, whether a given emotion label is
# expressed by each tweet in a list; the reply is requested as a JSON object
# holding a list of 0/1 flags.
# NOTE: `system_prompt` is reassigned in the augmentation section further down.
system_prompt = '''
Your task is to validate a collection of tweets for their emotional content.

You will be given an emotion label and a collection of tweets.

For each tweet determine whether the emotion label is clearly expressed by the content of the tweet.

The tweets will be in a list format and comma separated.


If the label is there return 1 else return 0, return the result in the form of a list and in json format.

Use the following format for your response:
```json
{
    "emotion": "label",
    "result": [1, 0, 1, ...]
}
```

Where "emotion" is the emotion label and "result" is a list of 1s and 0s indicating whether the emotion is present in each tweet.
'''

In [116]:
# Take the first 20 tweets originally labelled 'joy' as a small trial sample.
emotion = 'joy' 
tweet_sample = emotion_dict[emotion][:20]

# Batch-style prompt that embeds the Python list repr of the sample.
# NOTE(review): no visible cell sends this prompt to the model - confirm it is still needed.
prompt = f"The emotion is: {emotion}\nTweet examples: {tweet_sample}"

In [112]:
emotion_labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [117]:
tweet_sample

['Whatever you decide to do make sure it makes you #happy.',
 "@Max_Kellerman  it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS",
 "Accept the challenges so that you can literally even feel the exhilaration of victory.' -- George S. Patton 🐶",
 "No but that's so cute. Atsu was probably shy about photos before but cherry helped her out uwu",
 'Tiller and breezy should do a collab album. Rapping and singing prolly be fire',
 'Star trek online has a update to download oh fuming yay',
 'The bitter the battle, the sweeter the victory...',
 'Awareness of time is awareness of time lost. #awareness #time ',
 '@adamrodricks I like your optimism!',
 "My cat is bloody lucky the RSPCA weren't open at 3am last night!!! #fuming 😡🐱",
 '@FaithHill I remember it well #happy #afraid #Positive',
 "I mean I'm not done watching the pilot, but it's nice to see a group of actors perform without story lines dripping relentless nihilism.",
 "@Rogu

In [207]:
# Few-shot system prompt for multi-label classification of a single tweet.
# The current `emotion_labels` list is interpolated (as its Python repr) when
# this cell runs, and the reply is requested as a bare comma-separated string
# of labels.
classification_system_prompt = f'''
Task: Classify the emotional content of a tweet.

You are given a list of possible emotion labels: {emotion_labels}.

For each tweet, identify which of these emotions are clearly expressed. Return your answer as a comma-separated string of the emotion labels that apply.

If the tweet does not express any of the listed emotions, return an empty string.

Important: Respond only with the applicable emotion labels, separated by commas (no additional text or formatting). Do not include any emotions that are not contained within the list

Tweet: People you need to look up the definition of protest. What you are doing is not protesting it's called vandalism. #angry #stop
Output: anger, disgust

Tweet: it's pretty depressing when u hit pan on ur favourite highlighter
Output: sadness

Tweet: Star trek online has a update to download oh fuming yay
Output: anger

Tweet: @WaterboysAS I would never strategically vote for someone I don't agree with. A lot of the Clinton vote based on fear and negativity
Output: disgust, fear, pessimism

Tweet: Whatever you decide to do make sure it makes you #happy.
Output: joy

Tweet: All the young people are so bitter about how the older contestants probably know how to make Bakewell Tarts 😂 #GBBO
Output: joy, disgust
'''

In [None]:
# Classify each tweet of the trial sample individually, keeping the model's
# raw comma-separated label string per tweet.
responses = [
    generate_instruction_response(
        f"Tweet: {tweet}\nOutput:",
        tokenizer,
        model,
        classification_system_prompt,
    )
    for tweet in tweet_sample
]

In [126]:
responses

['joy',
 'disgust, anger',
 'joy, excitement',
 'joy, trust',
 'joy, anticipation',
 'anger, joy',
 'joy',
 'sadness',
 'optimism',
 'anger, disgust',
 'joy, optimism, fear',
 'joy, optimism',
 'fear, anxiety',
 'joy',
 'optimism, joy',
 'joy, disgust',
 'joy',
 'joy',
 'sadness',
 'joy']

In [130]:
# Distinct tweet texts in first-seen order, so each text is classified once.
# This list can be shorter than all_data when the same text occurs more than once.
all_tweets = all_data['Tweet'].drop_duplicates().tolist()

In [133]:
# Accumulator for the classification loop. It lives in its own cell so the loop
# can be re-run after an interruption and continue from len(all_responses)
# instead of starting over.
all_responses = []

In [None]:
total_tweets = len(all_tweets)
print(f'There are {total_tweets-len(all_responses)} unique tweets to process.')

for idx, tweet in enumerate(all_tweets):
    position = idx + 1

    # Positions already covered by all_responses were classified in an earlier
    # (interrupted) run of this cell and are not sent to the model again.
    if position > len(all_responses):
        prompt = f"Tweet: {tweet}\nOutput:"
        response = generate_instruction_response(prompt, tokenizer, model, classification_system_prompt)
        all_responses.append(response)

    # Report progress every 100 tweets and once more at the very end.
    if position % 100 == 0 or position == total_tweets:
        print(f"Processed {(position / total_tweets) * 100:.2f}% of tweets")

There are 5609 unique tweets to process.
Processed 1.29% of tweets
Processed 2.59% of tweets
Processed 3.88% of tweets
Processed 5.18% of tweets
Processed 6.47% of tweets
Processed 7.77% of tweets
Processed 9.06% of tweets
Processed 10.36% of tweets
Processed 11.65% of tweets
Processed 12.95% of tweets
Processed 14.24% of tweets
Processed 15.54% of tweets
Processed 16.83% of tweets
Processed 18.13% of tweets
Processed 19.42% of tweets
Processed 20.71% of tweets
Processed 22.01% of tweets
Processed 23.30% of tweets
Processed 24.60% of tweets
Processed 25.89% of tweets
Processed 27.19% of tweets
Processed 28.48% of tweets
Processed 29.78% of tweets
Processed 31.07% of tweets
Processed 32.37% of tweets
Processed 33.66% of tweets
Processed 34.96% of tweets
Processed 36.25% of tweets
Processed 37.55% of tweets
Processed 38.84% of tweets
Processed 40.13% of tweets
Processed 41.43% of tweets
Processed 42.72% of tweets
Processed 44.02% of tweets
Processed 45.31% of tweets
Processed 46.61% of t

In [209]:
# all_responses is ordered like all_tweets (the de-duplicated tweet texts), not
# like the rows of all_data, so attach the labels by tweet text rather than by
# position. This also gives repeated tweets the same label string.
if len(all_responses) != len(all_tweets):
    raise ValueError(
        f"Expected {len(all_tweets)} responses but found {len(all_responses)}; "
        "finish the classification loop before assigning labels."
    )
reclassified_by_tweet = dict(zip(all_tweets, all_responses))
all_data['Re-Classified'] = all_data['Tweet'].map(reclassified_by_tweet)

#### Augmenting data for under-represented labels

Here we will use Qwen to further generate additional tweets for the model to train on

The approach I have taken is to define 10 tweet categories to help increase the variation in the generated samples.

With 10 categories and 10 tweets per category, this yields 10 x 10 = 100 additional samples for each emotion.

Limiting the number of samples generated hopefully ensures that the quality of the output is high and that the model does not have its attention to detail degraded as the context grows.

This could be run any number of times, and hopefully the model will generate a varied sample on each run; this can, however, be assessed afterwards.

In [198]:
# Topic categories; one generation request is issued per (emotion, category) pair.
tweet_categories = [
    "Politics",
    "Science and Technology",
    "Film and Media",
    "Music and Entertainment",
    "Pop Culture",
    "Sports and Fitness",
    "Personal Life and Emotions",
    "Social Issues and Activism",
    "Humor and Memes",
    "Consumer Experiences and Brands"
]

# System prompt for tweet generation. This reassigns `system_prompt`, replacing
# the validation prompt defined earlier. The reply is requested as plain text
# with one tweet per line.
system_prompt = f'''
Your task is to generate a creative list of 10 unique and engaging tweets for a provided category

You will also be given a specific emotion to incorporate into the tweets, ensure that the tweets reflect this emotion clearly.

Try to have the tweets be diverse in content, tone, and style while still being relevant to the category. 

Include emojis in a range of 2 or 3 of the tweets however the rest should not include emojis.

Do not use the word of the emotion label in your response, use synonyms or related words or expressions.

And also limit the tweet to only that emotion expression, do not include any other emotions.

Ensure that each tweet is at least 5 words long and no more than 280 characters.

Return only the tweets, do not include any additional text or formatting and separate them by new lines.
'''

# Labels selected for augmentation.
emotions_to_augment = ['trust', 'surprise', 'anticipation', 'pessimism']

In [None]:
synonym_system_prompt = "You are a creative and helpful assistant, you adhere to generating content that adheres to the guidelines of your instructions whilst being as expressive as possible."

# The emotion list is interpolated from `emotions_to_augment` instead of being
# typed out a second time, so the synonym request always matches the emotions
# that are augmented. For the current list the rendered text is unchanged.
synonym_prompt = f"""for each of these emotions: {emotions_to_augment} 

create a JSON of 10 synonyms which could be used for the emotion, use the emotion as the key and return a comma separated list

ensure to use language that is colloquial and natural, do not use overly formal or technical language
"""

# One request returns the synonyms for all emotions; the raw reply is parsed in the next cell.
response = generate_instruction_response(synonym_prompt, tokenizer, model, synonym_system_prompt)

In [194]:
# Convert the model's reply into a dictionary. The reply is untrusted,
# model-generated text, so it is parsed as data and never executed with eval().
cleaned_response = response.replace('```json', '').replace('```', '').strip()
try:
    synonyms = json.loads(cleaned_response)
except json.JSONDecodeError:
    # Fallback for replies written as a Python-style literal (e.g. single
    # quotes); literal_eval only accepts literals, not arbitrary expressions.
    synonyms = ast.literal_eval(cleaned_response)
synonyms

{'trust': 'belief, confidence, faith, reliance, dependability, credibility, reliability, dependability, surety, trustworthiness',
 'surprise': 'shock, amazement, astonishment, wonder, astoundment, flabbergast, marvel, astound, stagger, dumbfound',
 'anticipation': 'expectation, eagerness, excitement, hope, looking forward, readiness, impatience, thrill, anticipation, fervent hope',
 'pessimism': 'doubt, skepticism, cynicism, pessimistic outlook, gloom, negativity, despair, bleakness, fatalism, downbeat'}

In [202]:
# Column-oriented accumulator: three parallel lists, extended together by the
# generation loop.
augmented_tweets = {column: [] for column in ('emotion', 'category', 'tweet')}

In [203]:
# Number of tweets kept per (emotion, category) request; matches the "10"
# written into the prompts.
tweets_per_request = 10
# Generation output varies in length, so one short reply should not abort the
# whole run: retry a few times before giving up.
max_generation_attempts = 3

for emotion in emotions_to_augment:
    print(f"Generating tweets for emotion: {emotion}")
    for tweet_category in tweet_categories:
        print(f"Generating tweets for category: {tweet_category}")
        prompt = f"Category: {tweet_category}\nEmotion: {emotion}\nExample synonyms: {synonyms.get(emotion,'')}\nGenerate 10 unique tweets:"

        for attempt in range(max_generation_attempts):
            response = generate_instruction_response(prompt, tokenizer, model, system_prompt)

            # One tweet per non-blank line; keep only the first `tweets_per_request`.
            generated_tweets = [gen for gen in response.split('\n') if len(gen.strip()) > 0][:tweets_per_request]
            if len(generated_tweets) == tweets_per_request:
                break
        else:
            # Explicit error instead of `assert`, which is stripped under `python -O`.
            raise RuntimeError(
                f"Expected exactly {tweets_per_request} tweets for emotion={emotion!r}, "
                f"category={tweet_category!r} but got {len(generated_tweets)} "
                f"after {max_generation_attempts} attempts."
            )

        # Extend the three parallel lists together so they stay the same length.
        augmented_tweets['emotion'].extend([emotion] * tweets_per_request)
        augmented_tweets['category'].extend([tweet_category] * tweets_per_request)
        augmented_tweets['tweet'].extend(generated_tweets)

Generating tweets for emotion: trust
Generating tweets for category: Politics
Generating tweets for category: Science and Technology
Generating tweets for category: Film and Media
Generating tweets for category: Music and Entertainment
Generating tweets for category: Pop Culture
Generating tweets for category: Sports and Fitness
Generating tweets for category: Personal Life and Emotions
Generating tweets for category: Social Issues and Activism
Generating tweets for category: Humor and Memes
Generating tweets for category: Consumer Experiences and Brands
Generating tweets for emotion: surprise
Generating tweets for category: Politics
Generating tweets for category: Science and Technology
Generating tweets for category: Film and Media
Generating tweets for category: Music and Entertainment
Generating tweets for category: Pop Culture
Generating tweets for category: Sports and Fitness
Generating tweets for category: Personal Life and Emotions
Generating tweets for category: Social Issues 

In [204]:
pd.DataFrame(augmented_tweets)['tweet'].to_list()

['Trust is the foundation of a strong democracy. Without it, nothing else matters. #Politics',
 "Building trust takes time but breaking it can happen in an instant. Let's be mindful. #Belief",
 'Our leaders must earn our trust through consistent actions and transparency. #Reliability',
 'Every vote counts towards establishing trust in our political systems. #Credibility',
 'Transparency is key to building trust between citizens and their government. #Dependability',
 "It's crucial to hold our leaders accountable to maintain public trust. #Surety",
 "Strong institutions are built on trust. Let's work together to strengthen ours. #Dependability",
 'Transparency and honesty breed trust. Let’s demand more from our leaders. #Credibility',
 "Elections are not just about choosing leaders; they're about building trust in our system. #Belief",
 'We must trust in the process and the people who serve us. #Trustworthiness',
 'New AI advancements show us we can rely on technology to solve complex p

In [206]:
# Convert the accumulator into a DataFrame.
# NOTE: this rebinds `augmented_tweets` from a dict of lists to a DataFrame, so
# the generation loop above cannot be re-run against it without first
# re-creating the dict.
augmented_tweets = pd.DataFrame(augmented_tweets)

In [211]:
# Synthetic, 1-based IDs that mark these rows as augmented data.
row_count = len(augmented_tweets)
augmented_tweets['ID'] = [f'Augmented_data - {n}' for n in range(1, row_count + 1)]

In [212]:
augmented_tweets

Unnamed: 0,emotion,category,tweet,ID
0,trust,Politics,Trust is the foundation of a strong democracy....,Augmented_data - 1
1,trust,Politics,Building trust takes time but breaking it can ...,Augmented_data - 2
2,trust,Politics,Our leaders must earn our trust through consis...,Augmented_data - 3
3,trust,Politics,Every vote counts towards establishing trust i...,Augmented_data - 4
4,trust,Politics,Transparency is key to building trust between ...,Augmented_data - 5
...,...,...,...,...
395,pessimism,Consumer Experiences and Brands,Why bother with customer service when it's alw...,Augmented_data - 396
396,pessimism,Consumer Experiences and Brands,Technology companies promising privacy? More l...,Augmented_data - 397
397,pessimism,Consumer Experiences and Brands,Product reviews are so unreliable; they might ...,Augmented_data - 398
398,pessimism,Consumer Experiences and Brands,Brand promises feel hollow when they're just a...,Augmented_data - 399


In [213]:
# Accumulator for classifying the augmented tweets. It lives in its own cell so
# the loop can be re-run and continue from len(all_augmented_responses).
all_augmented_responses = []

In [215]:
total_augmented = len(augmented_tweets['tweet'])
print(f"There are {total_augmented-len(all_augmented_responses)} unique tweets to process.")

for idx, tweet in enumerate(augmented_tweets['tweet']):
    position = idx + 1

    # Positions already covered by all_augmented_responses were classified in
    # an earlier run of this cell and are not sent to the model again.
    if position > len(all_augmented_responses):
        prompt = f"Tweet: {tweet}\nOutput:"
        response = generate_instruction_response(prompt, tokenizer, model, classification_system_prompt)
        all_augmented_responses.append(response)

    # Report progress every 100 tweets and once more at the very end.
    if position % 100 == 0 or position == total_augmented:
        print(f"Processed {(position / total_augmented) * 100:.2f}% of tweets")

There are 400 unique tweets to process.
Processed 25.00% of tweets
Processed 50.00% of tweets
Processed 75.00% of tweets
Processed 100.00% of tweets


In [221]:
def _with_target_emotion(emotion, response):
    """Return `response` with `emotion` guaranteed to be one of its labels.

    Membership is checked against the individual comma-separated labels, not
    as a substring of the whole string, so a reply such as 'distrust' does not
    count as already containing 'trust'. An empty reply yields just `emotion`
    rather than a string with a trailing separator.
    """
    labels = [label.strip() for label in response.split(',') if label.strip()]
    if emotion in labels:
        return response
    if not labels:
        return emotion
    return emotion + ', ' + response


# Each tweet was generated for a specific emotion, so that emotion is always
# part of its final label string, followed by whatever the classifier added.
augmented_tweets['Re-Classified'] = [
    _with_target_emotion(x, y)
    for x, y in zip(augmented_tweets['emotion'], all_augmented_responses)
]

In [224]:
# Rename by name rather than overwriting every header positionally, so the
# result does not depend on column order or count and the cell can be re-run.
# 'Tweet' matches the column name used in all_data.
augmented_tweets = augmented_tweets.rename(columns={'tweet': 'Tweet'})

In [225]:
augmented_tweets

Unnamed: 0,emotion,category,Tweet,ID,Re-Classified
0,trust,Politics,Trust is the foundation of a strong democracy....,Augmented_data - 1,trust
1,trust,Politics,Building trust takes time but breaking it can ...,Augmented_data - 2,"trust, optimism"
2,trust,Politics,Our leaders must earn our trust through consis...,Augmented_data - 3,"trust, optimism"
3,trust,Politics,Every vote counts towards establishing trust i...,Augmented_data - 4,"trust, optimism"
4,trust,Politics,Transparency is key to building trust between ...,Augmented_data - 5,trust
...,...,...,...,...,...
395,pessimism,Consumer Experiences and Brands,Why bother with customer service when it's alw...,Augmented_data - 396,"pessimism, disappointment, sadness"
396,pessimism,Consumer Experiences and Brands,Technology companies promising privacy? More l...,Augmented_data - 397,"pessimism, disgust, fear"
397,pessimism,Consumer Experiences and Brands,Product reviews are so unreliable; they might ...,Augmented_data - 398,"pessimism, disgust, frustration"
398,pessimism,Consumer Experiences and Brands,Brand promises feel hollow when they're just a...,Augmented_data - 399,"pessimism, disgust, sadness"


In [227]:
columns = ['ID', 'Tweet', 'Re-Classified']

# Stack the original and augmented rows. ignore_index=True renumbers the rows
# 0..n-1, so the combined frame has no duplicate index labels (both inputs are
# numbered from 0) and label-based selection stays unambiguous.
combined_reclassified = pd.concat(
    [all_data[columns], augmented_tweets[columns]],
    axis=0,
    ignore_index=True,
)

In [230]:
emotion_labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [228]:
combined_reclassified

Unnamed: 0,ID,Tweet,Re-Classified
0,2017-En-21441,“Worry is a down payment on a problem you may ...,"fear, optimism"
1,2017-En-31535,Whatever you decide to do make sure it makes y...,joy
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,"anger, disgust"
3,2017-En-31436,Accept the challenges so that you can literall...,"joy, excitement"
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,"sadness, disappointment"
...,...,...,...
395,Augmented_data - 396,Why bother with customer service when it's alw...,"pessimism, disappointment, sadness"
396,Augmented_data - 397,Technology companies promising privacy? More l...,"pessimism, disgust, fear"
397,Augmented_data - 398,Product reviews are so unreliable; they might ...,"pessimism, disgust, frustration"
398,Augmented_data - 399,Brand promises feel hollow when they're just a...,"pessimism, disgust, sadness"


In [231]:
# Split every comma-separated label string once into a set of exact labels.
# Testing membership in this set, rather than `emotion in x` on the raw string,
# avoids substring false positives such as 'trust' matching 'distrust'.
label_sets = combined_reclassified['Re-Classified'].apply(
    lambda labels: {label.strip() for label in str(labels).split(',')}
)

# One 0/1 indicator column per emotion. Labels outside emotion_labels
# (e.g. 'excitement') get no column.
for emotion in emotion_labels:
    combined_reclassified[emotion] = label_sets.apply(
        lambda labels, emotion=emotion: 1 if emotion in labels else 0
    )

In [232]:
combined_reclassified

Unnamed: 0,ID,Tweet,Re-Classified,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,"fear, optimism",0,0,0,1,0,0,1,0,0,0,0
1,2017-En-31535,Whatever you decide to do make sure it makes y...,joy,0,0,0,0,1,0,0,0,0,0,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,"anger, disgust",1,0,1,0,0,0,0,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literall...,"joy, excitement",0,0,0,0,1,0,0,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,"sadness, disappointment",0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,Augmented_data - 396,Why bother with customer service when it's alw...,"pessimism, disappointment, sadness",0,0,0,0,0,0,0,1,1,0,0
396,Augmented_data - 397,Technology companies promising privacy? More l...,"pessimism, disgust, fear",0,0,1,1,0,0,0,1,0,0,0
397,Augmented_data - 398,Product reviews are so unreliable; they might ...,"pessimism, disgust, frustration",0,0,1,0,0,0,0,1,0,0,0
398,Augmented_data - 399,Brand promises feel hollow when they're just a...,"pessimism, disgust, sadness",0,0,1,0,0,0,0,1,1,0,0


In [233]:
# Write the combined, re-labelled dataset to CSV without the DataFrame index.
combined_reclassified.to_csv('datasets/E-c-En-Reclassified.csv', index=False)