# Prepare real verified data

In [1]:
import pandas as pd

pd.set_option('display.float_format', '{:,.4f}'.format)


def load_parquets(type):
    """Read the parquet files for one split and stack them into one DataFrame.

    Args:
        type: Split suffix used in the file names (this file calls it with
            'train', 'val' and 'test').

    Returns:
        The row-wise concatenation of the real-verified file followed by the
        seven synthetic files, in the order listed below. Original row
        indices are kept (no re-indexing).
    """
    # Paths are relative to the current working directory.
    paths = (
        f'real-verified-{type}.parquet',
        f'synthetic-age-{type}.parquet',
        f'synthetic-disability-{type}.parquet',
        f'synthetic-racial-{type}.parquet',
        f'synthetic-sexuality-{type}.parquet',
        f'synthetic-general-{type}.parquet',
        f'synthetic-feminine-{type}.parquet',
        f'synthetic-masculine-{type}.parquet',
    )
    frames = [pd.read_parquet(path) for path in paths]
    return pd.concat(frames)

# Load every split once, keyed by split name.
df = {}
df['train'] = load_parquets('train')
df['val'] = load_parquets('val')
df['test'] = load_parquets('test')

# All splits stacked together, used for the full-dataset statistics below.
full_df = pd.concat(df.values())

# Column groups are discovered from the training split by name prefix.
label_columns = [col for col in df['train'].columns if col.startswith('label_')]
analysis_columns = [col for col in df['train'].columns if col.startswith('analysis_')]
categories = [col.replace('label_', '') for col in label_columns]
text_column = 'text'

print(f"Categories: {categories}")
print(f"Labels: {label_columns}")
print(f"Analysis: {analysis_columns}")
print(f"Input: {text_column}")


def _count_bias_rows(frame):
    """Return ``(non_neutral, neutral)`` row counts for ``frame``.

    A row is non-neutral when at least one of its ``label_columns`` values is
    truthy and neutral otherwise. ``all(not v)`` is the exact negation of
    ``any(v)``, so a single row-wise pass is enough for both counts.
    """
    has_bias = frame[label_columns].apply(lambda row: any(row), axis=1)
    flagged = has_bias.sum()
    return flagged, len(has_bias) - flagged


# The loop variable is named `split_name` (not `type`) so the builtin `type`
# is not rebound at module scope by this cell.
for split_name, dataframe in df.items():
    print(f"\nRows ({split_name}): {len(dataframe)}")
    for category in categories:
        label = f"label_{category}"
        # Explicit comparisons are kept so that only values equal to
        # True/False are counted; summing the mask avoids building a filtered
        # copy of the frame just to take its length.
        bias = (dataframe[label] == True).sum()  # noqa: E712
        unbiased = (dataframe[label] == False).sum()  # noqa: E712
        print(f"\t{category}: {bias} biased, {unbiased} unbiased")

    non_neutral, neutral = _count_bias_rows(dataframe)
    print(f'\tHas at least one bias category: {non_neutral} ({non_neutral/len(dataframe):.2%})')
    print(f'\tHas no bias categories: {neutral} ({neutral/len(dataframe):.2%})')

non_neutral, neutral = _count_bias_rows(full_df)

print(f'\nFULL-DATASET Rows: {len(full_df)}')
print(f'FULL-DATASET Has at least one bias category: {non_neutral} ({non_neutral/len(full_df):.2%})')
print(f'FULL-DATASET Has no bias categories: {neutral} ({neutral/len(full_df):.2%})')



In [2]:
def export_row(row, split=None):
    """Write one row to ``../<split>/<id>.json`` and ``../<split>/<id>.txt``.

    The JSON file holds the whole row (``row.to_json`` with ``indent=4``); the
    text file holds only ``row['text']``. Any ``:`` in ``row['id']`` is
    replaced by ``_`` to form the file name. The target directory is not
    created here and must already exist.

    Args:
        row: One DataFrame row as a pandas Series with string ``id`` and
            ``text`` entries.
        split: Name of the output sub-directory. When omitted, the function
            falls back to a module-level variable named ``type``, which keeps
            callers that loop with ``for type, ... in df.items()`` working.

    Raises:
        ValueError: If ``split`` is not given and no module-level ``type``
            variable exists.
    """
    if split is None:
        # Legacy behaviour: the directory name used to come implicitly from a
        # global loop variable called `type`. Look it up in the module
        # namespace only, so the builtin `type` class is never silently
        # formatted into the path.
        split = globals().get('type')
        if split is None:
            raise ValueError(
                "export_row() needs a split name: pass split=... "
                "or call it from a loop that binds a module-level `type`"
            )

    file_id = row['id'].replace(':', '_')
    basefile = f"../{split}/{file_id}"
    row.to_json(f"{basefile}.json", indent=4)
    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding (which can fail on non-ASCII text).
    with open(f"{basefile}.txt", 'w', encoding='utf-8') as f:
        f.write(row['text'])

# Export every row of every split. The loop variable has to stay a
# module-level name called `type`: export_row reads that global to choose the
# output directory when it is not told the split explicitly.
for type in df:
    df[type].apply(export_row, axis=1)