# Load Parquet

In [47]:
import pandas as pd

# Render floats with thousands separators and four decimals in cell output.
pd.set_option('display.float_format', '{:,.4f}'.format)

# Keep only rows flagged as verified and not synthetic. The explicit .copy()
# makes `df` an independent frame, so adding or overwriting columns on it
# later does not trigger pandas' SettingWithCopyWarning.
df = pd.read_parquet('job-bias-synthetic-human-benchmark.parquet')
df = df[df['verified'].eq(True) & df['synthetic'].eq(False)].copy()

# Column groups are discovered from their name prefixes.
label_columns = [col for col in df.columns if col.startswith('label_')]
analysis_columns = [col for col in df.columns if col.startswith('analysis_')]
categories = [col.replace('label_', '') for col in label_columns]
text_column = 'text'

print(f"Categories: {categories}")
print(f"Labels: {label_columns}")
print(f"Analysis: {analysis_columns}")
print(f"Input: {text_column}")
print(f"Rows: {len(df)}")

# Per-category counts of rows labelled True vs. False.
for category in categories:
    label = f"label_{category}"
    bias = int(df[label].eq(True).sum())
    unbiased = int(df[label].eq(False).sum())
    print(f"\t{category}: {bias} biased, {unbiased} unbiased")

# A row counts as neutral when none of its label columns is truthy.
neutral = df[label_columns].apply(lambda row: not any(row), axis=1).sum()
print("Neutral:", neutral)

In [48]:
import os

# Load the split membership lists: one id per line in
# verified/<split>/ids.txt for each of the train/val/test splits.
ids = {}
for split in ['train', 'val', 'test']:  # `split`, not `type`: avoid shadowing the builtin
    # Kept from the original cell: make sure the split directory exists.
    # The open() below still requires ids.txt to be present in it.
    os.makedirs(f"verified/{split}", exist_ok=True)
    with open(f"verified/{split}/ids.txt", "r") as file:
        # Iterate the file lazily; skip blank lines so a trailing newline
        # does not add an empty id and inflate the counts printed below.
        ids[split] = [line.strip() for line in file if line.strip()]

print(f"Train: {len(ids['train'])}")
print(f"Val: {len(ids['val'])}")
print(f"Test: {len(ids['test'])}")
print(f"Total: {sum(len(split_ids) for split_ids in ids.values())}")

In [49]:
# Show the column labels of the current DataFrame as this cell's output.
df.columns

In [50]:
# Convert it into a structure for the final dataset

import pandas as pd
import json

# Replace missing notes with empty strings. assign() returns a new frame
# rather than writing into `df` in place: `df` is a filtered selection of the
# loaded data, and in-place column assignment on such a frame can raise
# pandas' SettingWithCopyWarning.
df = df.assign(notes=df['notes'].fillna(''))

# Final column order: id, then a label/analysis pair per category, then the
# remaining fields. The category list is hard-coded, which fixes that order.
columns = ['id']
for category in ['age','disability','masculine','feminine','racial','sexuality','general']:
    columns += [f'label_{category}', f'analysis_{category}']
columns += ['verified', 'synthetic', 'text', 'metadata']

# Fold the descriptive fields of each row into a single JSON string.
metadata_columns = ['document_id', 'source', 'country', 'company','position','notes','original']
df = df.assign(
    metadata=df.apply(lambda row: json.dumps(row[metadata_columns].to_dict()), axis=1)
)

# Keep only the final columns, in the order built above, and display the result.
df = df[columns]
df

In [51]:
# Show the column labels again; by this point `df` has been reduced to the
# final column selection.
df.columns

In [52]:

# Print the JSON metadata string of the first row as a spot check.
first_metadata = df['metadata'].iloc[0]
print(first_metadata)

In [53]:
# Partition the rows into train/val/test by membership in the loaded id lists.
# NOTE(review): the ids were read from text files as stripped strings, so
# isin() only matches if df['id'] holds strings as well — confirm.
df_train = df[df['id'].isin(ids['train'])]
df_val = df[df['id'].isin(ids['val'])]
df_test = df[df['id'].isin(ids['test'])]

print(f"Train: {len(df_train)}")
print(f"Val: {len(df_val)}")
print(f"Test: {len(df_test)}")
# This line reports the combined size of all three splits, so it is labelled
# "Total" (it was previously mislabelled "Train").
print(f"Total: {len(df_train) + len(df_val) + len(df_test)}")

In [54]:
# Write each split to its own gzip-compressed Parquet file.
split_outputs = {
    'real-verified-train.parquet': df_train,
    'real-verified-val.parquet': df_val,
    'real-verified-test.parquet': df_test,
}
for output_path, split_frame in split_outputs.items():
    split_frame.to_parquet(output_path, compression='gzip')