In [None]:
# import necessary libraries
import pandas as pd
import pickle
import numpy as np

In [None]:
# Location of the pickled DataFrame; change this to wherever your copy lives.
file_path = '/content/drive/My Drive/ITNPBD5 PROJECT/processed_empathetic_dialogues.pkl'

# Deserialize the DataFrame from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a file you created yourself or otherwise trust.
with open(file_path, 'rb') as pickle_handle:
    processed_df = pickle.load(pickle_handle)

In [None]:
# Sanity-check the loaded frame: its shape, a preview of the first rows, and
# the column index.
print(f"Loaded DataFrame shape: {processed_df.shape}")
for section_title, section_body in (
    ("\nFirst few rows:", processed_df.head()),
    ("\nColumn names:", processed_df.columns),
):
    print(section_title)
    print(section_body)

Loaded DataFrame shape: (76497, 7)

First few rows:
       context                                             prompt  \
0       guilty  i felt guilty when i was driving home one nigh...   
1       guilty  i felt guilty when i was driving home one nigh...   
2       guilty  i felt guilty when i was driving home one nigh...   
3       guilty  i felt guilty when i was driving home one nigh...   
4  sentimental  i remember going to the fireworks with my best...   

                                               input  \
0  yeah about 10 years ago i had a horrifying exp...   
1  yeah about 10 years ago i had a horrifying exp...   
2  yeah about 10 years ago i had a horrifying exp...   
3  yeah about 10 years ago i had a horrifying exp...   
4  i remember going to see the fireworks with my ...   

                                              target       conv_id  \
0                       did you suffer any injuries?  hit:0_conv:0   
1  no i wasn't hit. it turned out they were drunk...  hi

We group the data by conv_id, taking each conversation's context (emotion) label, to get unique conversations per emotion. We then stratify the sampling on context to ensure representation from all emotion categories, randomly selecting conversations within each emotion category. Finally, we collect all utterances from the selected conversations and subsample them to about 1000 samples.

In [None]:
# Collapse the utterance-level frame to one row per conversation so that each
# conversation carries a single emotion label (`context`).
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# within a group, which is not necessarily the first row - confirm this is
# what is wanted if any column can contain missing values.
conv_groups = (
    processed_df
    .groupby('conv_id')
    .first()
    .reset_index()
)

In [None]:
# Stratified sampling of conversations: each emotion contributes a number of
# conversations proportional to its share of all conversations, targeting
# roughly `sample_size` conversations in total.
emotion_counts = conv_groups['context'].value_counts()
total_convs = len(conv_groups)
sample_size = 1000
prop_to_sample = sample_size / total_convs  # fraction of conversations to draw

sampled_convs = []
for emotion, count in emotion_counts.items():
    # int() truncates, so a rare emotion could round down to 0; keep at least 1.
    # Cap the request at the group size: DataFrame.sample draws without
    # replacement and raises ValueError when asked for more rows than exist,
    # which would happen whenever there are fewer than `sample_size`
    # conversations overall (prop_to_sample > 1).
    n_sample = min(int(count), max(1, int(count * prop_to_sample)))
    emotion_convs = conv_groups[conv_groups['context'] == emotion]
    # Fixed seed keeps the selection reproducible across runs.
    sampled_convs.extend(emotion_convs.sample(n=n_sample, random_state=42)['conv_id'].tolist())

In [None]:
# Keep every utterance (row) that belongs to one of the sampled conversations.
in_sampled_conv = processed_df['conv_id'].isin(sampled_convs)
sampled_df = processed_df.loc[in_sampled_conv]

In [None]:
# If the selected conversations yield more rows than the target `sample_size`,
# randomly subsample rows down to exactly that many. The cap reuses
# `sample_size` instead of a repeated literal so the two cannot drift apart.
# Note: this cut is made at row level, so a conversation may end up only
# partially represented in the final sample.
if len(sampled_df) > sample_size:
    sampled_df = sampled_df.sample(n=sample_size, random_state=42)

In [None]:
# Report how many rows ended up in the sample and the relative frequency of
# each emotion label among them.
final_size = len(sampled_df)
emotion_share = sampled_df['context'].value_counts(normalize=True)

print(f"Final sample size: {final_size}")
print("\nEmotion distribution in sample:")
print(emotion_share)

Final sample size: 1000

Emotion distribution in sample:
context
surprised       0.056
guilty          0.040
lonely          0.039
proud           0.037
embarrassed     0.036
nostalgic       0.035
joyful          0.034
excited         0.034
anxious         0.034
sentimental     0.033
content         0.033
anticipating    0.033
disgusted       0.032
grateful        0.032
annoyed         0.032
caring          0.032
jealous         0.031
prepared        0.031
terrified       0.030
ashamed         0.029
confident       0.029
devastated      0.028
furious         0.028
impressed       0.028
angry           0.028
sad             0.027
afraid          0.026
trusting        0.026
faithful        0.025
disappointed    0.025
hopeful         0.020
apprehensive    0.017
Name: proportion, dtype: float64


In [None]:
# Persist the sampled rows as CSV; the DataFrame index is not written.
sampled_csv_path = '/content/drive/My Drive/ITNPBD5 PROJECT/sampled_empathetic_dialogues.csv'
sampled_df.to_csv(sampled_csv_path, index=False)