# Import Data

In [11]:
import pandas as pd

# Path to the ESConv dataset, relative to the notebook's working directory
esconv_path = r'../datasets/custom_corpus/esconv_data/esconv.json'

# Load the dataset. read_json is called without lines=True, so the file is
# parsed as a single JSON document rather than as JSON Lines.
esconv_df = pd.read_json(esconv_path)

# Quick check

display(esconv_df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" afterwards.
esconv_df.info()

display(esconv_df.describe())


Unnamed: 0,experience_type,emotion_type,problem_type,situation,survey_score,dialog,seeker_question1,seeker_question2,supporter_question1,supporter_question2
0,Previous Experience,anxiety,job crisis,I hate my job but I am scared to quit and seek...,"{'seeker': {'initial_emotion_intensity': '5', ...","[{'speaker': 'seeker', 'annotation': {}, 'cont...",Partner was very supportive,More guidance in conversation or examples,,
1,Current Experience,anger,problems with friends,I have complete unsupportive friends its to th...,"{'seeker': {'initial_emotion_intensity': '5', ...","[{'speaker': 'supporter', 'annotation': {'stra...",,,It was simple,The middle screen hover function gets in the way
2,Current Experience,fear,job crisis,I have been out of work for five weeks in quar...,"{'seeker': {'initial_emotion_intensity': '4', ...","[{'speaker': 'supporter', 'annotation': {'stra...",no,no,,
3,Current Experience,depression,ongoing depression,I am depressed staying home due to COVID,"{'seeker': {'initial_emotion_intensity': '4', ...","[{'speaker': 'supporter', 'annotation': {'stra...",No,No,,
4,Current Experience,depression,breakup with partner,I found out that my boyfriend had been lying t...,"{'seeker': {'initial_emotion_intensity': '5', ...","[{'speaker': 'supporter', 'annotation': {'stra...",Good exercise,no,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   experience_type      1300 non-null   object
 1   emotion_type         1300 non-null   object
 2   problem_type         1300 non-null   object
 3   situation            1300 non-null   object
 4   survey_score         1300 non-null   object
 5   dialog               1300 non-null   object
 6   seeker_question1     1300 non-null   object
 7   seeker_question2     1300 non-null   object
 8   supporter_question1  1300 non-null   object
 9   supporter_question2  1300 non-null   object
dtypes: object(10)
memory usage: 101.7+ KB


None

Unnamed: 0,experience_type,emotion_type,problem_type,situation,survey_score,dialog,seeker_question1,seeker_question2,supporter_question1,supporter_question2
count,1300,1300,1300,1300,1300,1300,1300.0,1300.0,1300.0,1300.0
unique,2,11,13,1296,196,1300,699.0,520.0,646.0,470.0
top,Current Experience,anxiety,ongoing depression,Anxiety about losing my job,"{'seeker': {'initial_emotion_intensity': '4', ...","[{'speaker': 'seeker', 'annotation': {}, 'cont...",,,,
freq,991,354,351,2,107,1,283.0,362.0,431.0,550.0


# Remove Unneeded Columns

In [12]:
# Keep only the emotion label and the conversation turns; the copy makes the
# result independent of esconv_df.
kept_columns = ['emotion_type', 'dialog']
esconv = esconv_df.loc[:, kept_columns].copy()

In [13]:
# Overview of the trimmed frame, shown in this order:
# dimensions, first rows, descriptive statistics, rows per emotion label.
display(
    esconv.shape,
    esconv.head(),
    esconv.describe(),
    esconv['emotion_type'].value_counts(),
)

(1300, 2)

Unnamed: 0,emotion_type,dialog
0,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
1,anger,"[{'speaker': 'supporter', 'annotation': {'stra..."
2,fear,"[{'speaker': 'supporter', 'annotation': {'stra..."
3,depression,"[{'speaker': 'supporter', 'annotation': {'stra..."
4,depression,"[{'speaker': 'supporter', 'annotation': {'stra..."


Unnamed: 0,emotion_type,dialog
count,1300,1300
unique,11,1300
top,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
freq,354,1


emotion_type
anxiety        354
depression     334
sadness        308
anger          111
fear            95
shame           42
disgust         40
nervousness     13
pain             1
jealousy         1
guilt            1
Name: count, dtype: int64

# Create Subset

## Quick Smoke Test

In [14]:
# Requested number of conversations for each emotion label
emotion_samples = {
    'anxiety': 40,
    'depression': 40,
    'sadness': 40,
    'anger': 30,
    'fear': 20,
    'shame': 11,
    'disgust': 10,
    'nervousness': 6,
    'pain': 1,
    'jealousy': 1,
    'guilt': 1
}

# For every emotion, report whether enough conversations are opened by the seeker
for emotion, target_count in emotion_samples.items():
    emotion_subset = esconv[esconv['emotion_type'] == emotion]

    # A dialog counts when it is non-empty and its first turn is spoken by the seeker
    count_seeker_first = sum(
        1
        for dialog in emotion_subset['dialog']
        if dialog and len(dialog) > 0 and dialog[0]['speaker'] == 'seeker'
    )

    if count_seeker_first < target_count:
        print(f"Sorry, but there are only {count_seeker_first} samples of {emotion} where the seeker talks first.")
    else:
        print(f"I can find at least {target_count} samples of {emotion} where the seeker talks first.")


I can find at least 40 samples of anxiety where the seeker talks first.
I can find at least 40 samples of depression where the seeker talks first.
I can find at least 40 samples of sadness where the seeker talks first.
I can find at least 30 samples of anger where the seeker talks first.
I can find at least 20 samples of fear where the seeker talks first.
I can find at least 11 samples of shame where the seeker talks first.
I can find at least 10 samples of disgust where the seeker talks first.
I can find at least 6 samples of nervousness where the seeker talks first.
Sorry, but there are only 0 samples of pain where the seeker talks first.
I can find at least 1 samples of jealousy where the seeker talks first.
Sorry, but there are only 0 samples of guilt where the seeker talks first.


In [15]:
# Expected size of the subset: one term per emotion, in the order of the smoke
# test above (pain and guilt contribute 0 because no seeker-first dialog was found).
sum([40, 40, 40, 30, 20, 11, 10, 6, 0, 1, 0])

198

In [16]:
# First five rows of the two-column frame
esconv.head(n=5)

Unnamed: 0,emotion_type,dialog
0,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
1,anger,"[{'speaker': 'supporter', 'annotation': {'stra..."
2,fear,"[{'speaker': 'supporter', 'annotation': {'stra..."
3,depression,"[{'speaker': 'supporter', 'annotation': {'stra..."
4,depression,"[{'speaker': 'supporter', 'annotation': {'stra..."


In [17]:
# Rows per emotion label in the full dataset
print(esconv['emotion_type'].value_counts())

emotion_type
anxiety        354
depression     334
sadness        308
anger          111
fear            95
shame           42
disgust         40
nervousness     13
pain             1
jealousy         1
guilt            1
Name: count, dtype: int64


In [18]:
import pandas as pd

# Number of seeker-first conversations to draw for each emotion
emotion_samples = {
    'anxiety': 40,
    'depression': 40,
    'sadness': 40,
    'anger': 30,
    'fear': 20,
    'shame': 11,
    'disgust': 10,
    'nervousness': 6,
    'jealousy': 1
    # pain and guilt excluded as requested
}

# Whether a dialog is a non-empty list whose first turn is spoken by the seeker.
# This does not depend on the emotion, so it is computed once for the whole
# frame instead of being re-evaluated on every loop iteration.
starts_with_seeker = esconv['dialog'].apply(
    lambda d: isinstance(d, list) and len(d) > 0 and d[0].get('speaker') == 'seeker'
)

# Create a list for DataFrames per emotion
sampled_dfs = []
for emotion, n in emotion_samples.items():
    # Rows of this emotion whose dialog starts with the seeker
    eligible = esconv[(esconv['emotion_type'] == emotion) & starts_with_seeker]
    # Sample without replacement; min() caps the request at what is available,
    # and the fixed random_state makes the draw repeatable.
    sampled = eligible.sample(n=min(n, len(eligible)), random_state=42)
    sampled_dfs.append(sampled)

# Combine all samples into one DataFrame with a fresh 0..N-1 index
esconv_subset = pd.concat(sampled_dfs, ignore_index=True)


In [19]:

# Verify the sampled subset: rows per emotion, then the overall row count
subset_counts = esconv_subset['emotion_type'].value_counts()
print(subset_counts)
print("Total rows:", len(esconv_subset))


emotion_type
anxiety        40
depression     40
sadness        40
anger          30
fear           20
shame          11
disgust        10
nervousness     6
jealousy        1
Name: count, dtype: int64
Total rows: 198


In [20]:
# Inspect the sampled subset; rows are still grouped by emotion in sampling order here
esconv_subset

Unnamed: 0,emotion_type,dialog
0,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
1,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
2,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
3,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
4,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
...,...,...
193,nervousness,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
194,nervousness,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
195,nervousness,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
196,nervousness,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."


In [27]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed, then renumber
# the index so it runs 0..N-1 again.
shuffled_rows = esconv_subset.sample(frac=1, random_state=42)
esconv_subset = shuffled_rows.reset_index(drop=True)


In [28]:
# Inspect the subset after shuffling
esconv_subset

Unnamed: 0,emotion_type,dialog
0,fear,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
1,disgust,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
2,anger,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
3,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
4,anxiety,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
...,...,...
193,sadness,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
194,shame,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
195,anger,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
196,depression,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."


In [29]:
def concatenate_adjacent_same_speaker(dialog):
    """Collapse consecutive turns by the same speaker into a single turn.

    Args:
        dialog: sequence of turn dicts, each with 'speaker' and 'content' keys.

    Returns:
        A new list of {'speaker', 'content'} dicts. The contents of merged
        turns are joined with a single space, and other keys on the input
        turns are not carried over. An empty or missing dialog yields [].
    """
    merged = []
    for turn in dialog or []:
        speaker, content = turn['speaker'], turn['content']
        if merged and merged[-1]['speaker'] == speaker:
            # Same speaker as the previous turn: extend that turn's text
            merged[-1]['content'] += " " + content
        else:
            merged.append({'speaker': speaker, 'content': content})
    return merged

# Replace every row's dialog with its merged version (overwrites the 'dialog' column)
merged_dialogs = esconv_subset['dialog'].map(concatenate_adjacent_same_speaker)
esconv_subset['dialog'] = merged_dialogs


In [30]:
# Inspect the subset after merging consecutive same-speaker turns
esconv_subset

Unnamed: 0,emotion_type,dialog
0,fear,"[{'speaker': 'seeker', 'content': 'Hello'}, {'..."
1,disgust,"[{'speaker': 'seeker', 'content': 'hello'}, {'..."
2,anger,"[{'speaker': 'seeker', 'content': 'I'm qualifi..."
3,anxiety,"[{'speaker': 'seeker', 'content': 'Hello '}, {..."
4,anxiety,"[{'speaker': 'seeker', 'content': 'Hello'}, {'..."
...,...,...
193,sadness,"[{'speaker': 'seeker', 'content': 'Hi'}, {'spe..."
194,shame,"[{'speaker': 'seeker', 'content': 'Hello. '}, ..."
195,anger,"[{'speaker': 'seeker', 'content': 'I am always..."
196,depression,"[{'speaker': 'seeker', 'content': 'I'm in depr..."


In [31]:
import re
import pandas as pd

def clean_text(text):
    """Return `text` with basic HTML remnants removed and whitespace normalized.

    Args:
        text: the utterance to clean. Anything that is not a str yields "".

    Returns:
        The cleaned string: HTML-like tags removed, '&nbsp;' and '&amp;'
        decoded, every run of whitespace (including newlines and tabs)
        collapsed to one space, and leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        return ""
    # Remove tags first. A tag must start with a letter (optionally after '/'),
    # so literal text such as "<3" or "5 < 6 > 2" is not swallowed.
    text = re.sub(r'</?[A-Za-z][^>]*>', '', text)
    text = text.replace('&nbsp;', ' ').replace('&amp;', '&')
    # Collapse whitespace last: removing tags and decoding '&nbsp;' can leave
    # adjacent spaces behind, which an earlier collapse would miss.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Collect (seeker message -> supporter reply) pairs from every conversation
pairs = []
for dialog in esconv_subset['dialog']:
    pos = 0
    last_start = len(dialog) - 2  # last index that still has a following turn
    while pos <= last_start:
        first, second = dialog[pos], dialog[pos + 1]
        if first['speaker'] != 'seeker' or second['speaker'] != 'supporter':
            # Not a seeker->supporter pair: slide the window by one turn
            pos += 1
            continue
        context = clean_text(first['content'])
        response = clean_text(second['content'])
        # Keep the pair only when both sides are non-empty after cleaning
        if context and response:
            pairs.append({'Context': context, 'Response': response})
        # Both turns are consumed, whether or not the pair was kept
        pos += 2


In [32]:
# Turn the list of pair dicts into a table, one row per pair
esconv_dialog = pd.DataFrame(data=pairs)


In [34]:
# Report how many context/response pairs were extracted
pair_count = len(esconv_dialog)
print(f"Total pairs: {pair_count}")


Total pairs: 2255


In [37]:

# Preview the pairs (the bare frame repr shows rows from both the start and the end)
esconv_dialog

Unnamed: 0,Context,Response
0,Hello,"Hello, how are you doing?"
1,I am hanging in there as well as I can be,What's been going on?
2,I am worried about losing my job. The company ...,It seems like that fear of losing your job has...
3,It has indeed. I chose an industry that was fa...,"Yeah, COVID is an unprecedented time where any..."
4,Yes! Exactly and I cannot stand that. I love m...,I can relate to that. It's always nice when ev...
...,...,...
2250,"I agree, thank you for being open to talking t...",It sounds to me like you're thinking that you'...
2251,"Yes, that is exactly how I feel My parents cam...",I can tell you from personal experience and se...
2252,"Thank you, I hope so :)","I know it may sound silly, but you may want to..."
2253,"I've had other people say the same, thank you ...","Studies on the benefits of meditation, mindful..."


In [38]:
# Write the pairs to a CSV in the current working directory, without the row index
output_csv = 'esconv_dialog.csv'
esconv_dialog.to_csv(output_csv, index=False)


In [39]:
# Rows per emotion label in the final subset
esconv_subset['emotion_type'].value_counts()

emotion_type
anxiety        40
sadness        40
depression     40
anger          30
fear           20
shame          11
disgust        10
nervousness     6
jealousy        1
Name: count, dtype: int64