# Original dataset

In [1]:
import sys, os
sys.path.append(os.path.abspath(".."))
from util import get_dataframe_from_json

In [None]:
# Load the dialogue data through the project helper; the following cells
# label and filter this frame down to the red-hat rows.
dialogues_path = "dialogues.json"
red_df = get_dataframe_from_json(dialogues_path)

## Red dataset

In [None]:
# Every row whose emotion is one of these four values gets the 'red' hat.
red_emotions = ['anger', 'fear', 'disgust', 'sadness']
red_df.loc[red_df['emotion'].isin(red_emotions), 'hat'] = 'red'

In [39]:
# Keep only the rows that were labelled 'red'.
is_red = red_df['hat'].eq('red')
red_df = red_df.loc[is_red]

## Black and White
- The source file is in JSON Lines format: each line is a separate JSON object (dictionary).

In [4]:
import pandas as pd

# The file is read with lines=True, i.e. as JSON Lines (one JSON object per line).
bw_df = pd.read_json(
    "../Automated Labeling/Black_White_Hat/dataset_with_bloom.json",
    encoding="utf-8",
    lines=True,
)

In [5]:
# Derive the hat from the Bloom label:
#   'Analysis'                 -> 'black'
#   'Knowledge', 'Evaluation'  -> 'white'
# Rows with any other bloom_label keep whatever 'hat' value they already had.
# NOTE(review): 'Evaluation' is mapped to 'white', same as 'Knowledge' — confirm this is intended.
bw_df.loc[bw_df['bloom_label'].eq('Analysis'), 'hat'] = 'black'
bw_df.loc[bw_df['bloom_label'].isin(['Knowledge', 'Evaluation']), 'hat'] = 'white'

In [6]:
# Number of rows per 'hat' value after labelling.
bw_hat_counts = bw_df.groupby('hat').size()
print(bw_hat_counts)

hat
          8277
black     4916
white    37734
dtype: int64


In [7]:
# Split the labelled frame into one frame per hat; rows with any other
# 'hat' value end up in neither frame.
black_df = bw_df.loc[bw_df['hat'].eq('black')]
white_df = bw_df.loc[bw_df['hat'].eq('white')]

## Yellow dataset

In [19]:
# Load the optimism-labelled dataset. No lines=True here, so the file is parsed
# as a single JSON document rather than as JSON Lines.
yellow_df = pd.read_json(
    "../Automated Labeling/Yellow Hat/dataset_with_bloom_optimism.json",
    encoding="utf-8",
)

In [20]:
# Rows whose optimism_label is 'optimist' get the 'yellow' hat.
is_optimist = yellow_df['optimism_label'].eq('optimist')
yellow_df.loc[is_optimist, 'hat'] = 'yellow'

In [21]:
# Row count per 'hat' value after the optimism-based labelling.
yellow_hat_counts = yellow_df.groupby('hat').size()
print(yellow_hat_counts)

hat
          50491
yellow      436
dtype: int64


In [22]:
# Rows whose emotion is 'happiness' are labelled 'yellow' as well.
is_happy = yellow_df['emotion'].eq('happiness')
yellow_df.loc[is_happy, 'hat'] = 'yellow'

In [23]:
# Row count per 'hat' value after adding the happiness-based labels.
yellow_hat_counts = yellow_df.groupby('hat').size()
print(yellow_hat_counts)

hat
          42668
yellow     8259
dtype: int64


In [25]:
# Keep only the rows that were labelled 'yellow'.
is_yellow = yellow_df['hat'].eq('yellow')
yellow_df = yellow_df.loc[is_yellow]

## Green

In [None]:
# Load the hat predictions CSV for the green hat.
green_df = pd.read_csv(
    "../Automated Labeling/Green Hat/Lm Studio Labeling/hat_preds.csv",
    encoding="utf-8",
)

In [28]:
# Rows whose 'hat' value is 'Y' are relabelled 'green'.
is_marked_y = green_df['hat'].eq('Y')
green_df.loc[is_marked_y, 'hat'] = 'green'

In [29]:
# Keep only the rows labelled 'green'.
is_green = green_df['hat'].eq('green')
green_df = green_df.loc[is_green]

## Merging

In [33]:
# Reduce every per-hat frame to the same two columns so they can be concatenated.
columns_to_keep = ['utterance', 'hat']
green_df = green_df[columns_to_keep]
red_df = red_df[columns_to_keep]
white_df = white_df[columns_to_keep]
black_df = black_df[columns_to_keep]
yellow_df = yellow_df[columns_to_keep]

In [41]:
# Report the (rows, columns) shape of each per-hat frame.
frame_sizes = [
    ("Red Hat DataFrame size:", red_df),
    ("Black Hat DataFrame size:", black_df),
    ("White Hat DataFrame size:", white_df),
    ("Yellow Hat DataFrame size:", yellow_df),
    ("Green Hat DataFrame size:", green_df),
]
for size_label, hat_frame in frame_sizes:
    print(size_label, hat_frame.shape)

Red Hat DataFrame size: (2698, 2)
Black Hat DataFrame size: (4916, 2)
White Hat DataFrame size: (37734, 2)
Yellow Hat DataFrame size: (8259, 2)
Green Hat DataFrame size: (16365, 2)


In [42]:
# Stack the per-hat frames into one frame with a fresh 0..n-1 index.
# Rows appear in the order of this list (red, black, white, yellow, green).
hat_frames = [red_df, black_df, white_df, yellow_df, green_df]
merged_df = pd.concat(hat_frames, ignore_index=True)

In [43]:
# Surface duplicated utterances: add a 'count' column holding how many rows share
# each utterance, then order by that count (highest first) and by utterance (A-Z).
merged_df = (
    merged_df
    .assign(count=merged_df.groupby('utterance')['utterance'].transform('size'))
    .sort_values(by=['count', 'utterance'], ascending=[False, True])
)

In [44]:
# Keep exactly one row per utterance (the first occurrence in the current row
# order) and then remove the helper 'count' column.
# NOTE(review): when the same utterance carries different hats, the surviving
# label is whichever row comes first after the sort — confirm this tie-breaking
# is intended.
merged_df = (
    merged_df
    .drop_duplicates(subset=['utterance'], keep='first')
    .drop(columns=['count'])
)

In [45]:
# Number of remaining rows per hat after de-duplication.
merged_hat_counts = merged_df.groupby('hat').size()
print(merged_hat_counts)

hat
black      4192
green      7997
red        2408
white     30032
yellow     1045
dtype: int64


In [46]:
# Build a balanced dataset: at most MAX_SAMPLES_PER_HAT randomly chosen rows per hat.
MAX_SAMPLES_PER_HAT = 1000
SAMPLING_SEED = 1


def _sample_hat(frame, hat, max_rows=MAX_SAMPLES_PER_HAT, seed=SAMPLING_SEED):
    """Return a reproducible random sample of up to `max_rows` rows labelled `hat`.

    Parameters
    ----------
    frame : DataFrame with a 'hat' column.
    hat : value of the 'hat' column to select.
    max_rows : upper bound on the number of rows returned.
    seed : value passed to `random_state` so the draw is repeatable.

    `DataFrame.sample(n=...)` raises ValueError when `n` is larger than the
    number of available rows (sampling without replacement), so `n` is capped
    at the subset size: a hat with fewer than `max_rows` rows is returned in
    full instead of aborting the cell.
    """
    subset = frame[frame['hat'] == hat]
    return subset.sample(n=min(max_rows, len(subset)), random_state=seed)


# The per-hat names are rebound to the sampled subsets of the de-duplicated frame.
red_df = _sample_hat(merged_df, 'red')
black_df = _sample_hat(merged_df, 'black')
white_df = _sample_hat(merged_df, 'white')
yellow_df = _sample_hat(merged_df, 'yellow')
green_df = _sample_hat(merged_df, 'green')

# Recombine the sampled subsets into a single frame with a fresh index.
merged_df = pd.concat([red_df, black_df, white_df, yellow_df, green_df], ignore_index=True)
# Shuffle the rows (frac=1 draws every row once) and renumber the index.
merged_df = merged_df.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)

In [47]:
# Persist the balanced, shuffled dataset as CSV; the index is not written.
merged_df.to_csv(
    "final_dataset.csv",
    encoding="utf-8",
    index=False,
)