In [None]:
import pandas as pd
import re, string

# 1. Load the dataset
df = pd.read_csv('/content/emotion-emotion_69k.csv')
print(f"Initial dataset: {len(df)} samples")
print("Emotion classes and counts:\n", df['emotion'].value_counts(), "\n")

# Drop irrelevant columns (if any Unnamed) and select text/label columns
df = df[[col for col in df.columns if col.startswith('Unnamed')==False]]
# Identify text and label columns
if 'text' in df.columns:
    text_col = 'text'
elif 'empathetic_dialogues' in df.columns:
    text_col = 'empathetic_dialogues'
else:
    text_col = 'Situation'  # fallback
label_col = 'emotion' if 'emotion' in df.columns else 'label'
df = df[[text_col, label_col]]
print(f"Using '{text_col}' as text and '{label_col}' as label.\n")

# 2. Clean the text column: lowercase, remove URLs, punctuation, extra whitespace
def clean_text(text):
    text = str(text).lower()  # lowercase conversion
    # remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove URLs with regex
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
    # collapse whitespace
    text = ' '.join(text.split())  # remove extra spaces/newlines
    return text

df[text_col] = df[text_col].apply(clean_text)
print(f"After cleaning text: {len(df)} samples (text normalized)\n")

# 3. Drop rows with missing values and duplicates
df = df.dropna(subset=[text_col, label_col])  # drop rows with any NaNs
print(f"After dropna: {len(df)} samples")
print("Emotion classes (with missing removed):\n", df[label_col].value_counts(), "\n")

df = df.drop_duplicates(subset=[text_col, label_col])  # drop exact duplicates
print(f"After drop_duplicates: {len(df)} samples")
print("Emotion classes (post-duplicate removal):\n", df[label_col].value_counts(), "\n")

# 4. Ensure each emotion class is represented
classes = df[label_col].unique()
print(f"Unique emotion classes: {len(classes)}")

# 5. Stratified random sampling to get 20,000 examples
total_desired = 20000
current_total = len(df)
frac = total_desired / current_total

# Sample while preserving original indices initially
sampled_df_temp = df.groupby(label_col, group_keys=False).apply(
    lambda x: x.sample(frac=frac, random_state=42), include_groups=True
)
# Store the original indices of the sampled rows
original_sampled_indices = sampled_df_temp.index

# Now create df_sampled with a reset index for the initial reporting
df_sampled = sampled_df_temp.reset_index(drop=True)

print(f"\nAfter stratified sampling (proportion={frac:.4f}): {len(df_sampled)} samples")
print("Classes in sample:\n", df_sampled[label_col].value_counts(), "\n")

# If we have slightly fewer than 20k due to rounding, randomly sample additional rows to reach 20000
if len(df_sampled) < total_desired:
    needed = total_desired - len(df_sampled)

    # Find the rows in df that were *not* originally sampled
    remaining_df = df.drop(original_sampled_indices)

    # Sample 'needed' rows from these remaining rows
    extra = remaining_df.sample(n=needed, random_state=42)

    # Concatenate the initially sampled (original indexed) rows with the extra rows.
    # Then reset the index of the combined DataFrame.
    df_sampled = pd.concat([sampled_df_temp, extra], ignore_index=True)

    print(f"Added {needed} extra samples to reach {len(df_sampled)}")

# 6. Shuffle the final dataset
df_final = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"\nFinal shuffled dataset: {len(df_final)} samples")
print("Final emotion class distribution:\n", df_final[label_col].value_counts(), "\n")

# 7. Save to CSV
df_final.to_csv('emotion_20k_clean.csv', index=False)
print("Saved cleaned sample to 'emotion_20k_clean.csv'")

Initial dataset: 64636 samples
Emotion classes and counts:
 emotion
surprised                                                                                                                                                                                         3295
excited                                                                                                                                                                                           2465
angry                                                                                                                                                                                             2296
proud                                                                                                                                                                                             2247
annoyed                                                                                                                                 

  sampled_df_temp = df.groupby(label_col, group_keys=False).apply(


In [None]:
# Assume df_final and label_col are available from the previous execution
# If df_final was not in memory, it could be loaded from 'emotion_20k_clean.csv'
# df_final = pd.read_csv('emotion_20k_clean.csv')

total_desired_new = 15000
current_total_df_final = len(df_final)
frac_new = total_desired_new / current_total_df_final

# Perform stratified sampling from df_final
df_15k_sampled_temp = df_final.groupby(label_col, group_keys=False).apply(
    lambda x: x.sample(frac=frac_new, random_state=42), include_groups=True
)

# Store original indices to handle potential undersampling if needed
original_sampled_indices_15k = df_15k_sampled_temp.index

df_15k_sampled = df_15k_sampled_temp.reset_index(drop=True)

print(f"\nAfter stratified sampling for 15k (proportion={frac_new:.4f}): {len(df_15k_sampled)} samples")
print("Classes in 15k sample:\n", df_15k_sampled[label_col].value_counts(), "\n")

# If we have slightly fewer than 15k due to rounding, randomly sample additional rows to reach 15000
if len(df_15k_sampled) < total_desired_new:
    needed_15k = total_desired_new - len(df_15k_sampled)

    # Find the rows in df_final that were *not* originally sampled for 15k
    remaining_df_for_15k = df_final.drop(original_sampled_indices_15k)

    # Sample 'needed_15k' rows from these remaining rows
    extra_15k = remaining_df_for_15k.sample(n=needed_15k, random_state=42)

    # Concatenate the initially sampled (original indexed) rows with the extra rows.
    # Then reset the index of the combined DataFrame.
    df_15k_sampled = pd.concat([df_15k_sampled_temp, extra_15k], ignore_index=True)

    print(f"Added {needed_15k} extra samples to reach {len(df_15k_sampled)}")

# Shuffle the final 15k dataset
df_final_15k = df_15k_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"\nFinal shuffled 15k dataset: {len(df_final_15k)} samples")
print("Final 15k emotion class distribution:\n", df_final_15k[label_col].value_counts(), "\n")

# Save to CSV
df_final_15k.to_csv('emotion_15k_clean.csv', index=False)
print("Saved cleaned 15k sample to 'emotion_15k_clean.csv'")


After stratified sampling for 15k (proportion=0.7500): 15001 samples
Classes in 15k sample:
 emotion
surprised       766
excited         574
angry           532
proud           520
annoyed         515
sad             515
grateful        488
lonely          488
afraid          484
terrified       480
disgusted       477
anxious         475
guilty          474
furious         473
confident       472
hopeful         472
anticipating    471
impressed       468
nostalgic       467
disappointed    459
joyful          456
prepared        453
jealous         452
content         442
devastated      428
embarrassed     428
sentimental     412
caring          404
trusting        403
ashamed         392
apprehensive    362
faithful        299
Name: count, dtype: int64 


Final shuffled 15k dataset: 15001 samples
Final 15k emotion class distribution:
 emotion
surprised       766
excited         574
angry           532
proud           520
sad             515
annoyed         515
lonely          488


  df_15k_sampled_temp = df_final.groupby(label_col, group_keys=False).apply(


In [None]:
# Assume df_final and label_col are available from previous executions
# If df_final was not in memory, it could be loaded from 'emotion_20k_clean.csv'
# df_final = pd.read_csv('emotion_20k_clean.csv')

total_desired_new = 2000
current_total_df_final = len(df_final) # This refers to the 20k dataset
frac_new = total_desired_new / current_total_df_final

# Perform stratified sampling from df_final (the 20k dataset)
df_8k_sampled_temp = df_final.groupby(label_col, group_keys=False).apply(
    lambda x: x.sample(frac=frac_new, random_state=42), include_groups=True
)

# Store original indices to handle potential undersampling if needed
original_sampled_indices_8k = df_8k_sampled_temp.index

df_8k_sampled = df_8k_sampled_temp.reset_index(drop=True)

print(f"\nAfter stratified sampling for 8k (proportion={frac_new:.4f}): {len(df_8k_sampled)} samples")
print("Classes in 8k sample:\n", df_8k_sampled[label_col].value_counts(), "\n")

# If we have slightly fewer than 8k due to rounding, randomly sample additional rows to reach 8000
if len(df_8k_sampled) < total_desired_new:
    needed_8k = total_desired_new - len(df_8k_sampled)

    # Find the rows in df_final that were *not* originally sampled for 8k
    remaining_df_for_8k = df_final.drop(original_sampled_indices_8k)

    # Sample 'needed_8k' rows from these remaining rows
    extra_8k = remaining_df_for_8k.sample(n=needed_8k, random_state=42)

    # Concatenate the initially sampled (original indexed) rows with the extra rows.
    # Then reset the index of the combined DataFrame.
    df_8k_sampled = pd.concat([df_8k_sampled_temp, extra_8k], ignore_index=True)

    print(f"Added {needed_8k} extra samples to reach {len(df_8k_sampled)}")

# Shuffle the final 8k dataset
df_final_8k = df_8k_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"\nFinal shuffled 8k dataset: {len(df_final_8k)} samples")
print("Final 8k emotion class distribution:\n", df_final_8k[label_col].value_counts(), "\n")

# Save to CSV
df_final_8k.to_csv('emotion_2k_clean.csv', index=False)
print("Saved cleaned 8k sample to 'emotion_2k_clean.csv'")


After stratified sampling for 8k (proportion=0.1000): 1998 samples
Classes in 8k sample:
 emotion
surprised       102
excited          76
angry            71
annoyed          69
sad              69
proud            69
grateful         65
lonely           65
disgusted        64
afraid           64
terrified        64
guilty           63
furious          63
confident        63
anxious          63
anticipating     63
hopeful          63
nostalgic        62
impressed        62
joyful           61
disappointed     61
jealous          60
prepared         60
content          59
devastated       57
embarrassed      57
sentimental      55
caring           54
trusting         54
ashamed          52
apprehensive     48
faithful         40
Name: count, dtype: int64 

Added 2 extra samples to reach 2000

Final shuffled 8k dataset: 2000 samples
Final 8k emotion class distribution:
 emotion
surprised       102
excited          76
angry            71
proud            70
sad              69
annoyed   

  df_8k_sampled_temp = df_final.groupby(label_col, group_keys=False).apply(


In [None]:
import pandas as pd

# Load original dataset
df = pd.read_csv("emotion-emotion_69k.csv")

# Check the column that contains emotions
print(df.columns)

# Assuming the emotion column is named "emotion"
emotion_col = "emotion"

# Drop rows where the emotion column is NaN before processing
df = df.dropna(subset=[emotion_col])

# Count number of classes
classes = df[emotion_col].unique()

# Number of samples per class (approx)
samples_per_class = 2000 // len(classes)

balanced_samples = []

# Sample equally from each emotion
for c in classes:
    class_subset = df[df[emotion_col] == c]

    # If a class has fewer samples than needed, sample with replacement
    replace_flag = len(class_subset) < samples_per_class

    sampled = class_subset.sample(
        n=samples_per_class,
        replace=replace_flag,
        random_state=42
    )

    balanced_samples.append(sampled)

# Combine all sampled data
final_df = pd.concat(balanced_samples)

# Shuffle the dataset
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save new CSV
final_df.to_csv("emotion_10k_balanced.csv", index=False)

print("New CSV created: emotion_10k_balanced.csv")
print(final_df.head())

Index(['Index', 'Situation', 'emotion', 'empathetic_dialogues', 'labels',
       'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')
New CSV created: emotion_10k_balanced.csv
   Index                                          Situation  \
0  45424           I am so ready for Labor day, can't wait.   
1  45893  My son had all A's last year at school.  I am ...   
2   9483  I remember finding this baby bird having falle...   
3  35867                                              I can   
4   1420                    I had a great day at work today   

                                             emotion  \
0                                            excited   
1                                              proud   
2                                             caring   
3  t believe my daughter taught herself how to pl...   
4                                I really killed it!   

                                empathetic_dialogues  \
0  Customer :Yes and I am headed out of town.\nAg...  

In [None]:
dp=pd.read_csv("emotion_10k_balanced.csv")
print(dp)

      Index                                          Situation  \
0     45424           I am so ready for Labor day, can't wait.   
1     45893  My son had all A's last year at school.  I am ...   
2      9483  I remember finding this baby bird having falle...   
3     35867                                              I can   
4      1420                    I had a great day at work today   
...     ...                                                ...   
1973  37239           Today I saw the world's biggest cockroah   
1974   3902  I finally got an interview! I have been applyi...   
1975  41077  I went home to my moms house ( I moved away 13...   
1976  45752  I got a really bad headache and it ruined my w...   
1977  34851  Last week I saw my best friend break his leg. ...   

                                                emotion  \
0                                               excited   
1                                                 proud   
2                             

In [None]:
dp

Unnamed: 0,Index,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,45424,"I am so ready for Labor day, can't wait.",excited,Customer :Yes and I am headed out of town.\nAg...,That sounds awesome. Where are you going?,,
1,45893,My son had all A's last year at school. I am ...,proud,Customer :My son had all A's last year at scho...,Wow you must be so proud!,,
2,9483,I remember finding this baby bird having falle...,caring,Customer :You won't believe what I found when ...,Oh no,,
3,35867,I can,t believe my daughter taught herself how to pl...,impressed,Customer :MY daughter taught herself to play t...,"Wow that is impressive, how old is she?",
4,1420,I had a great day at work today,I really killed it!,confident,Customer :There are many different variables t...,Well that's really great that you were able to...,
...,...,...,...,...,...,...,...
1973,37239,Today I saw the world's biggest cockroah,disgusted,Customer :Today I saw the world's biggest cock...,"Wow! That is gross, did you kill it?",,
1974,3902,I finally got an interview! I have been applyi...,content,Customer :I have applied to so many jobs latel...,What's the news? Do you have an interview?,,
1975,41077,I went home to my moms house ( I moved away 13...,nostalgic,Customer :I loved it. I went to my moms and sa...,At least you had a good experience. I definite...,,
1976,45752,I got a really bad headache and it ruined my w...,devastated,"Customer :Yeah, maybe I should take an advil\n...",You hvent taken anything yet?,,


In [None]:
dp=dp.drop("Unnamed: 6", axis=1)

In [None]:
dp

Unnamed: 0,Index,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5
0,45424,"I am so ready for Labor day, can't wait.",excited,Customer :Yes and I am headed out of town.\nAg...,That sounds awesome. Where are you going?,
1,45893,My son had all A's last year at school. I am ...,proud,Customer :My son had all A's last year at scho...,Wow you must be so proud!,
2,9483,I remember finding this baby bird having falle...,caring,Customer :You won't believe what I found when ...,Oh no,
3,35867,I can,t believe my daughter taught herself how to pl...,impressed,Customer :MY daughter taught herself to play t...,"Wow that is impressive, how old is she?"
4,1420,I had a great day at work today,I really killed it!,confident,Customer :There are many different variables t...,Well that's really great that you were able to...
...,...,...,...,...,...,...
1973,37239,Today I saw the world's biggest cockroah,disgusted,Customer :Today I saw the world's biggest cock...,"Wow! That is gross, did you kill it?",
1974,3902,I finally got an interview! I have been applyi...,content,Customer :I have applied to so many jobs latel...,What's the news? Do you have an interview?,
1975,41077,I went home to my moms house ( I moved away 13...,nostalgic,Customer :I loved it. I went to my moms and sa...,At least you had a good experience. I definite...,
1976,45752,I got a really bad headache and it ruined my w...,devastated,"Customer :Yeah, maybe I should take an advil\n...",You hvent taken anything yet?,


In [None]:
dp = dp.rename(columns={"Unnamed: 0": "Index_id"})

In [None]:
dp=dp.drop("Unnamed: 5", axis=1)

In [None]:
dp

Unnamed: 0,Index,Situation,emotion,empathetic_dialogues,labels
0,45424,"I am so ready for Labor day, can't wait.",excited,Customer :Yes and I am headed out of town.\nAg...,That sounds awesome. Where are you going?
1,45893,My son had all A's last year at school. I am ...,proud,Customer :My son had all A's last year at scho...,Wow you must be so proud!
2,9483,I remember finding this baby bird having falle...,caring,Customer :You won't believe what I found when ...,Oh no
3,35867,I can,t believe my daughter taught herself how to pl...,impressed,Customer :MY daughter taught herself to play t...
4,1420,I had a great day at work today,I really killed it!,confident,Customer :There are many different variables t...
...,...,...,...,...,...
1973,37239,Today I saw the world's biggest cockroah,disgusted,Customer :Today I saw the world's biggest cock...,"Wow! That is gross, did you kill it?"
1974,3902,I finally got an interview! I have been applyi...,content,Customer :I have applied to so many jobs latel...,What's the news? Do you have an interview?
1975,41077,I went home to my moms house ( I moved away 13...,nostalgic,Customer :I loved it. I went to my moms and sa...,At least you had a good experience. I definite...
1976,45752,I got a really bad headache and it ruined my w...,devastated,"Customer :Yeah, maybe I should take an advil\n...",You hvent taken anything yet?


In [None]:
dp.to_csv("emotion_2k_processed.csv", index=False)