## Config

In [1]:
# --- Dataset selection ---------------------------------------------------
# Key used to look up this dataset's entry in dataset_config.dataset.
dataset_name = "GoEmotions (Ekman 6)"
# Name of the label column (original author's note: "emotion").
labels_column = "labels"

# --- Sampling ------------------------------------------------------------
# Target number of rows for the stratified subset, and the seed shared by
# every random draw in this notebook so that runs are reproducible.
sample_size = 400
random_seed = 0

# --- Optional preprocessing switches ---------------------------------------
# remove_others: when True, rows labelled 'others' are dropped before sampling.
# limit_columns: not read by the cells visible here -- TODO confirm where used.
limit_columns = False
remove_others = False

## Load Dataset

In [None]:
import pandas as pd
from config_files import dataset_config

# Look up the metadata entry for the dataset selected in the config cell.
# The entry is read for 'relpath' and 'filetype' here.
dataset_metadata = dataset_config.dataset[dataset_name]

# The dataset path is stored relative to the notebook's working directory.
file_location = f"./{dataset_metadata['relpath']}"

# Load the raw table; the only difference between the supported formats is
# the field separator handed to pandas.
if dataset_metadata['filetype'] == 'csv':
    df_full_dataset = pd.read_csv(file_location)
elif dataset_metadata['filetype'] == 'tsv':
    df_full_dataset = pd.read_csv(file_location, sep="\t")
else:
    # Fail loudly instead of leaving df_full_dataset undefined (or, in a
    # notebook, silently reusing a stale frame from a previous run).
    raise ValueError(
        f"Unsupported filetype {dataset_metadata['filetype']!r} for dataset "
        f"{dataset_name!r}; expected 'csv' or 'tsv'."
    )

df_full_dataset

In [None]:
# Strip the columns listed as unused in the dataset metadata, then apply the
# metadata's column-name mapping. Both operations mutate the frame in place.
# NOTE(review): the mapping presumably yields the 'labels' column read below
# -- confirm against dataset_config.
df_full_dataset.drop(dataset_metadata['unused_columns'], axis='columns', inplace=True)
df_full_dataset.rename(dataset_metadata['remap_columns'], axis='columns', inplace=True)

display(df_full_dataset)

# Number of rows per label in the full dataset.
full_label_count = df_full_dataset['labels'].value_counts()
print(full_label_count)

In [None]:
# Optionally discard every row whose label is the literal string 'others',
# and refresh the per-label counts so they describe the filtered data.
if remove_others:
    df_full_dataset = df_full_dataset.loc[df_full_dataset['labels'].ne('others')]
    display(df_full_dataset)

    full_label_count = df_full_dataset['labels'].value_counts()

# Share of each label, in percent of all counted rows. This runs whether or
# not the filter above was applied.
full_label_percentages = full_label_count.div(full_label_count.sum()).mul(100)
print(full_label_percentages)

In [None]:
# Fraction of every label group to keep. Each group is sampled with the same
# fraction, so the subset preserves the label distribution of the full data.
# The per-group row count is rounded by pandas, so the total can differ
# slightly from sample_size.
# NOTE(review): the "+1" presumably nudges the rounded total up to
# sample_size for this dataset -- confirm before changing sample_size.
sample_fraction = (sample_size+1) / df_full_dataset.shape[0]

# Sample each label group explicitly and stitch the pieces back together.
# This replaces groupby(...).apply(lambda x: x.sample(...)): letting apply
# operate on the grouping column is deprecated since pandas 2.2, and future
# versions exclude that column, which would drop 'labels' from the subset.
# Groups are visited in sorted label order and every group is sampled with
# the same seed, so the selected rows and their order match the apply form.
df_subset = pd.concat([
    label_group.sample(frac=sample_fraction, random_state=random_seed)
    for _, label_group in df_full_dataset.groupby('labels')
])

df_subset = df_subset.sample(frac=1, random_state=random_seed)  # Shuffle rows
df_subset


In [None]:
# Rows per label in the sampled subset, printed so they can be compared with
# the full-dataset counts shown earlier.
subset_label_count = df_subset['labels'].value_counts()
print(subset_label_count)

# The same distribution expressed as a percentage of the subset.
subset_label_percentages = subset_label_count.div(subset_label_count.sum()).mul(100)
print(subset_label_percentages)

## Export

In [None]:
# Write two CSV files: one holding only the 'text' column (unlabeled) and one
# holding the complete subset. Both keep the DataFrame index as the first CSV
# column, so rows in the two files share the same index values.
df_subset.loc[:, 'text'].to_csv(
    "datasets/goemotions/ekman_6_single_label/full_subset/goemotions_6_single_poc_unlabeled.csv",
    index=True,
)
df_subset.to_csv(
    "datasets/goemotions/ekman_6_single_label/full_subset/goemotions_6_single_poc.csv",
    index=True,
)