# Defining functions

In [1]:
# Directory holding the hand-labelled DailyDialog JSON files
DATA_DIR = '/home/atlas/hlt/HLT/dailydialog'

# One path per labelled file: two emotion & attitude batches, relationship, work
emo1_path = DATA_DIR + '/1_50_emotion&attitude_dialogues.json'
emo2_path = DATA_DIR + '/2_50_emotion&attitude_dialogues.json'
relat_path = DATA_DIR + '/50_relationship_dialogues.json'
work_path = DATA_DIR + '/50_work_dialogues_labeled.json'

In [2]:
import pandas as pd
import json
def load_data(file_path):
    """Read a UTF-8 JSON file and return its parsed content as a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The result of passing the parsed JSON to ``pd.DataFrame``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return pd.DataFrame(parsed)

In [3]:
def refactor(df):
    """Flatten a dialogue-level DataFrame into one row per utterance.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per dialogue. Each row is read through its 'id', 'topic' and
        'utterances' fields, where 'utterances' is a list of utterance dicts.

    Returns
    -------
    pandas.DataFrame
        One row per utterance, holding the utterance's own keys plus
        'dialogue_id' (the dialogue's 'id') and 'topic' for reference.
    """
    all_utterances = []

    for _, row in df.iterrows():
        for utterance in row['utterances']:
            # Build a new dict instead of adding keys to `utterance` itself:
            # the dicts are still referenced by the input frame, and writing
            # into them would silently alter the caller's data.
            all_utterances.append(
                {**utterance, 'dialogue_id': row['id'], 'topic': row['topic']}
            )

    # Convert to DataFrame for easier analysis
    return pd.DataFrame(all_utterances)

# Check for errors

In [4]:
# First emotion & attitude file: load it, flatten it to one row per utterance,
# then count each 'hat' label to spot misspelled or missing values.
dataset_path = emo1_path
emo1_df = refactor(load_data(dataset_path))
freq = emo1_df['hat'].value_counts()
freq

hat
red       140
white      79
black      19
green       4
yellow      3
Name: count, dtype: int64

In [5]:
# Second emotion & attitude file: load it, flatten it to one row per utterance,
# then count each 'hat' label to spot misspelled or missing values.
dataset_path = emo2_path
emo2_df = refactor(load_data(dataset_path))
freq = emo2_df['hat'].value_counts()
freq

hat
white     150
black      43
green      27
yellow     25
red        22
Name: count, dtype: int64

In [6]:
# Relationship file: load it, flatten it to one row per utterance,
# then count each 'hat' label to spot misspelled or missing values.
dataset_path = relat_path
rel_df = refactor(load_data(dataset_path))
freq = rel_df['hat'].value_counts()
freq

hat
white     138
yellow     34
red        10
green       8
black       3
Name: count, dtype: int64

In [7]:
# Work file: load it, flatten it to one row per utterance,
# then count each 'hat' label to spot misspelled or missing values.
dataset_path = work_path
work_df = refactor(load_data(dataset_path))
freq = work_df['hat'].value_counts()
freq

hat
white         186
black          46
yellow         34
red            30
green          20
rosso           1
bwhitelack      1
                1
Name: count, dtype: int64

# Fixing work dialogues file
- the 'rosso' value is clearly 'red' ("rosso" is Italian for red)
- the empty value should be 'yellow': the label was entered in the 'act' column by mistake
- the 'bwhitelack' value is, I think, 'black'

In [8]:
# Select the rows of work_df whose 'hat' label is the misspelled 'rosso'
is_rosso = work_df['hat'].eq('rosso')
error_1 = work_df.loc[is_rosso]
error_1

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
47,1,"Sorry , I'm supposed to be with my son .",no_emotion,commissive,rosso,10463,Work


In [9]:
# Select the rows of work_df whose 'hat' label is an empty string
is_blank = work_df['hat'].eq('')
error_2 = work_df.loc[is_blank]
error_2

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
201,4,"I'm sure it will , if we are appointed your ag...",no_emotion,yellow,,10487,Work


In [10]:
# Select the rows of work_df whose 'hat' label is the malformed 'bwhitelack'
is_bwhitelack = work_df['hat'].eq('bwhitelack')
error_3 = work_df.loc[is_bwhitelack]
error_3

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
153,10,What if the results from the two inspections d...,no_emotion,question,bwhitelack,10481,Work


In [11]:
# Rows with an empty 'hat' get their 'act' overwritten with 'directive'.
# This must happen before the 'hat' values are repaired, because the empty
# 'hat' is what identifies those rows.
empty_hat = work_df['hat'] == ''
work_df.loc[empty_hat, 'act'] = 'directive'

# Map each malformed 'hat' value to its intended label in one pass
hat_fixes = {'rosso': 'red', '': 'yellow', 'bwhitelack': 'black'}
work_df['hat'] = work_df['hat'].replace(hat_fixes)

In [12]:
# Recount the 'hat' labels of the work dialogues
work_hats = work_df['hat']
freq = work_hats.value_counts()
freq

hat
white     186
black      47
yellow     35
red        31
green      20
Name: count, dtype: int64

# Join all the DataFrames

In [13]:
# Stack the four per-file utterance tables into a single frame,
# renumbering the rows from 0
frames = [emo1_df, emo2_df, rel_df, work_df]
all_df = pd.concat(frames, ignore_index=True)

In [14]:
# Count the 'hat' labels over the combined dataset
all_hats = all_df['hat']
freq = all_hats.value_counts()
freq

hat
white     553
red       203
black     112
yellow     97
green      59
Name: count, dtype: int64

In [15]:
import json

# Start from all_df, the "flat" DataFrame with columns such as:
# ['dialogue_id','topic','turn','utterance','emotion','act','hat', ...]

# 1. Group the rows by dialogue
dialogues = []
for (did, topic), group in all_df.groupby(['dialogue_id', 'topic']):
    # 2. For each group, extract the utterances as a list of dicts,
    #    leaving out the two grouping columns
    utts = (
        group
        .drop(columns=['dialogue_id', 'topic'])
        .to_dict(orient='records')
    )
    # 3. Rebuild the dialogue dict; the id is cast to int
    #    (or leave it as it was, a string)
    dialogues.append({'id': int(did), 'topic': topic, 'utterances': utts})

# 4. Write to file as indented JSON, keeping non-ASCII characters unescaped
with open('hand_labelled_dataset.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dialogues, ensure_ascii=False, indent=2))
