In [2]:

import pandas as pd


def analyse_dataframe(dataframe):
    """Print a summary of the bias label columns found in ``dataframe``.

    Columns whose name starts with ``label_`` are treated as per-category
    flags, and columns starting with ``analysis_`` are listed alongside them.
    A row counts as having a bias category when at least one of its label
    values is truthy, and as having none otherwise.

    Args:
        dataframe: pandas DataFrame whose column names are strings.

    Returns:
        None; the summary is written to standard output.
    """
    label_prefix = 'label_'
    label_columns = [col for col in dataframe.columns if col.startswith(label_prefix)]
    analysis_columns = [col for col in dataframe.columns if col.startswith('analysis_')]
    # Drop only the leading prefix so that a category which itself contains
    # "label_" (e.g. "label_job_label_x") keeps its full name.
    categories = [col[len(label_prefix):] for col in label_columns]
    text_column = 'text'

    print(f"Categories: {categories}")
    print(f"Labels: {label_columns}")
    print(f"Analysis: {analysis_columns}")
    print(f"Input: {text_column}")

    # Whole-column truthiness test instead of a Python callback per row.
    has_any_label = dataframe[label_columns].astype(bool).any(axis=1)
    total_rows = len(dataframe)
    non_neutral = int(has_any_label.sum())
    # "No label set" is the exact complement of "at least one label set".
    neutral = total_rows - non_neutral

    def share(count):
        # An empty frame has no meaningful ratio; report 0% instead of dividing by zero.
        return count / total_rows if total_rows else 0.0

    print(f'\nRows: {total_rows}')
    print(f'Has at least one bias category: {non_neutral} ({share(non_neutral):.2%})')
    print(f'Has no bias categories: {neutral} ({share(neutral):.2%})')



# job-bias-synthetic-human-benchmark

In [35]:
# Read input/synthetic-biased-job-descriptions.parquet, print its label
# summary, and show the frame as the cell output.
df_synthetic_taxonomy = pd.read_parquet('input/synthetic-biased-job-descriptions.parquet')
analyse_dataframe(df_synthetic_taxonomy)
df_synthetic_taxonomy

In [32]:
import json

def create_meta(row):
    """Serialise the generation details of one row as a JSON string.

    Args:
        row: mapping-like row providing ``document_id``, ``inference_time``,
            ``prompt_tokens``, ``completion_tokens``, ``total_tokens``,
            ``input`` and ``output``.

    Returns:
        A JSON object (as ``str``) holding a fixed model name and
        temperature, the token/timing values copied from the row, and a
        ``timestamp`` taken from the document id.
    """
    # The timestamp is the second-to-last colon-separated part of the id.
    id_parts = row['document_id'].split(':')

    payload = {
        'model': 'meta-llama/Meta-Llama-3-70B-Instruct',
        'timestamp': id_parts[-2],
    }
    for column in ('inference_time', 'prompt_tokens', 'completion_tokens', 'total_tokens'):
        payload[column] = row[column]
    payload['temperature'] = 0.8
    # No 'seed' entry is written.
    for column in ('input', 'output'):
        payload[column] = row[column]

    return json.dumps(payload)

In [37]:
# Add the JSON metadata and the verified/synthetic flags, expose the document
# id as `id`, and keep only the output columns in their final order.
df_synthetic_taxonomy = (
    df_synthetic_taxonomy
    .assign(
        metadata=lambda frame: frame.apply(create_meta, axis=1),
        verified=False,
        synthetic=True,
    )
    .rename(columns={'document_id': 'id'})
    .loc[:, [
        'id',
        'label_age', 'analysis_age',
        'label_disability', 'analysis_disability',
        'label_masculine', 'analysis_masculine',
        'label_feminine', 'analysis_feminine',
        'label_racial', 'analysis_racial',
        'label_sexuality', 'analysis_sexuality',
        'label_general', 'analysis_general',
        'verified', 'synthetic', 'text',
        'metadata',
    ]]
)

df_synthetic_taxonomy

In [40]:
# Write the reshaped synthetic taxonomy frame to a parquet file under output/.
df_synthetic_taxonomy.to_parquet('output/unverified-train-taxonomy.parquet')

In [14]:
import json

# Decode the JSON blob of the first row for a quick look. The column selection
# applied to this frame earlier keeps 'metadata' but not 'notes', so fall back
# to 'metadata' when 'notes' is absent instead of raising KeyError.
first_row = df_synthetic_taxonomy.iloc[0]
json.loads(first_row['notes'] if 'notes' in first_row.index else first_row['metadata'])

In [12]:
# Show the column names currently present on the frame.
df_synthetic_taxonomy.columns

# Feminine Bias

In [12]:
import json

def create_meta_gender(row):
    """Serialise the generation details of one row as a JSON string.

    Args:
        row: mapping-like row providing ``position``, ``inference_time``,
            ``prompt_tokens``, ``completion_tokens``, ``total_tokens``,
            ``input`` and ``output``.

    Returns:
        A JSON object (as ``str``) with the values copied from the row plus a
        fixed ``model`` name. The document id is not included.
    """
    def pick(*columns):
        # Copy the named row values, keeping the given order.
        return {column: row[column] for column in columns}

    payload = pick('position', 'inference_time')
    payload['model'] = 'gpt-4o-2024-05-13'
    payload.update(pick('prompt_tokens', 'completion_tokens', 'total_tokens', 'input', 'output'))
    return json.dumps(payload)

In [13]:
import pandas as pd

# Load the feminine-bias samples and print their label summary.
df_feminine_bias = pd.read_parquet('input/feminine-bias.parquet')
analyse_dataframe(df_feminine_bias)

# Remove markdown emphasis markers from the text, longest marker first;
# any asterisk left over afterwards becomes a dash.
feminine_text = df_feminine_bias['text']
for marker, substitute in (('***', ''), ('**', ''), ('*', '-')):
    feminine_text = feminine_text.str.replace(marker, substitute, regex=False)

# Attach cleaned text, JSON metadata and flags, expose the document id as
# `id`, and keep only the output columns in their final order.
df_feminine_bias = (
    df_feminine_bias
    .assign(
        text=feminine_text,
        metadata=lambda frame: frame.apply(create_meta_gender, axis=1),
        verified=False,
        synthetic=True,
    )
    .rename(columns={'document_id': 'id'})
    .loc[:, [
        'id',
        'label_age', 'analysis_age',
        'label_disability', 'analysis_disability',
        'label_masculine', 'analysis_masculine',
        'label_feminine', 'analysis_feminine',
        'label_racial', 'analysis_racial',
        'label_sexuality', 'analysis_sexuality',
        'label_general', 'analysis_general',
        'verified', 'synthetic', 'text',
        'metadata',
    ]]
)

df_feminine_bias

In [17]:
# Write the reshaped feminine-bias frame to a parquet file under output/.
df_feminine_bias.to_parquet('output/unverified-train-gpt4-feminine.parquet')

# Masculine Bias

In [16]:
import pandas as pd

# Load the masculine-bias samples and print their label summary.
df_masculine_bias = pd.read_parquet('input/masculine-bias.parquet')
analyse_dataframe(df_masculine_bias)

# Remove markdown emphasis markers from the text, longest marker first;
# any asterisk left over afterwards becomes a dash.
masculine_text = df_masculine_bias['text']
for marker, substitute in (('***', ''), ('**', ''), ('*', '-')):
    masculine_text = masculine_text.str.replace(marker, substitute, regex=False)

# Attach cleaned text, JSON metadata and flags, expose the document id as
# `id`, and keep only the output columns in their final order.
df_masculine_bias = (
    df_masculine_bias
    .assign(
        text=masculine_text,
        metadata=lambda frame: frame.apply(create_meta_gender, axis=1),
        verified=False,
        synthetic=True,
    )
    .rename(columns={'document_id': 'id'})
    .loc[:, [
        'id',
        'label_age', 'analysis_age',
        'label_disability', 'analysis_disability',
        'label_masculine', 'analysis_masculine',
        'label_feminine', 'analysis_feminine',
        'label_racial', 'analysis_racial',
        'label_sexuality', 'analysis_sexuality',
        'label_general', 'analysis_general',
        'verified', 'synthetic', 'text',
        'metadata',
    ]]
)

df_masculine_bias

In [18]:
# Write the reshaped masculine-bias frame to a parquet file under output/.
df_masculine_bias.to_parquet('output/unverified-train-gpt4-masculine.parquet')

# GPT4 Labelled

In [18]:
import pandas as pd
import glob

# Collect every parquet shard in the directory. glob returns paths in
# arbitrary order, so sort them to make the concatenated row order
# reproducible from run to run.
files = sorted(glob.glob('input/gpt4o/*.parquet.snappy'))

# Read each shard once and concatenate in a single step; the list of shards
# gets its own name instead of sharing one with the final DataFrame.
gpt4_shards = [pd.read_parquet(path) for path in files]

df_gp4_turbo_labelled = pd.concat(gpt4_shards, ignore_index=True)
analyse_dataframe(df_gp4_turbo_labelled)
df_gp4_turbo_labelled

In [19]:
import json

def create_id_gpt4_turbo_labelled(row):
    """Build a row id of the form ``<document_id>:<phrase_index>``."""
    document_id = row['document_id']
    phrase_index = row['phrase_index']
    return '{}:{}'.format(document_id, phrase_index)
    
def create_meta_gpt4_turbo_labelled(row):
    """Serialise the provenance and LLM usage details of one row as JSON.

    Args:
        row: mapping-like row providing the document/phrase identifiers, the
            source, country, company, position and html columns, and the
            ``llm_*`` model, timing, token-count and output columns.

    Returns:
        A JSON object (as ``str``). Several row columns are stored under
        shorter keys (for example ``html`` becomes ``original``), and a fixed
        ``temperature`` is added. No timestamp, seed or input entry is written.
    """
    # (metadata key, row column) pairs, listed in output order.
    copied = (
        ('document_id', 'document_id'),
        ('phrase_index', 'phrase_index'),
        ('phrase_series', 'phrase_series'),
        ('source', 'source'),
        ('country', 'country'),
        ('company', 'company'),
        ('position', 'position'),
        ('original', 'html'),
        ('model', 'llm_model_name'),
        ('inference_time', 'llm_inference_time'),
        ('prompt_tokens', 'llm_input_token_count'),
        ('completion_tokens', 'llm_output_token_count'),
        ('total_tokens', 'llm_total_token_count'),
    )
    payload = {key: row[column] for key, column in copied}
    payload['temperature'] = 0.8
    payload['output'] = row['llm_output']
    return json.dumps(payload)

In [20]:
# Category names are derived from the label columns present in the frame.
label_columns = [name for name in df_gp4_turbo_labelled.columns if name.startswith('label_')]
categories = [name.replace('label_', '') for name in label_columns]

# Add the composite id, the JSON metadata and the flags, give every category
# an analysis column filled with None, then keep only the output columns in
# their final order.
df_gp4_turbo_labelled = (
    df_gp4_turbo_labelled
    .assign(
        id=lambda frame: frame.apply(create_id_gpt4_turbo_labelled, axis=1),
        metadata=lambda frame: frame.apply(create_meta_gpt4_turbo_labelled, axis=1),
        verified=False,
        synthetic=False,
    )
    .assign(**{f'analysis_{category}': None for category in categories})
    .loc[:, [
        'id',
        'label_age', 'analysis_age',
        'label_disability', 'analysis_disability',
        'label_masculine', 'analysis_masculine',
        'label_feminine', 'analysis_feminine',
        'label_racial', 'analysis_racial',
        'label_sexuality', 'analysis_sexuality',
        'label_general', 'analysis_general',
        'verified', 'synthetic', 'text',
        'metadata',
    ]]
)

df_gp4_turbo_labelled

In [23]:
# Count how many rows carry each distinct label_age value.
df_gp4_turbo_labelled.value_counts('label_age')

In [22]:
# Write the reshaped GPT-4 labelled frame to a parquet file under output/.
df_gp4_turbo_labelled.to_parquet('output/unverified-train-gpt4-labelled.parquet')

# Llama3 Labelled

In [25]:
import pandas as pd
import glob

# Collect every parquet shard in the directory. glob returns paths in
# arbitrary order, so sort them to make the concatenated row order
# reproducible from run to run.
files = sorted(glob.glob('input/llama3/*.parquet'))

# Read each shard once and concatenate in a single step; the list of shards
# gets its own name instead of sharing one with the final DataFrame.
llama3_shards = [pd.read_parquet(path) for path in files]

df_llama3_labelled = pd.concat(llama3_shards, ignore_index=True)
analyse_dataframe(df_llama3_labelled)
df_llama3_labelled

In [26]:
import json

def create_meta_llama3_labelled(row):
    """Extend the JSON stored in ``row['notes']`` with row-level provenance.

    Args:
        row: mapping-like row providing ``notes`` (a JSON string) plus
            ``document_id``, ``source``, ``country``, ``company``,
            ``position`` and ``original``.

    Returns:
        The decoded notes with those six row values added, re-encoded as a
        JSON string.
    """
    payload = json.loads(row['notes'])
    for column in ('document_id', 'source', 'country', 'company', 'position', 'original'):
        # A key of the same name already present in the notes is overwritten.
        payload[column] = row[column]
    return json.dumps(payload)

In [29]:
# Add the JSON metadata and the verified/synthetic flags, then keep only the
# output columns in their final order.
df_llama3_labelled = (
    df_llama3_labelled
    .assign(
        metadata=lambda frame: frame.apply(create_meta_llama3_labelled, axis=1),
        verified=False,
        synthetic=False,
    )
    .loc[:, [
        'id',
        'label_age', 'analysis_age',
        'label_disability', 'analysis_disability',
        'label_masculine', 'analysis_masculine',
        'label_feminine', 'analysis_feminine',
        'label_racial', 'analysis_racial',
        'label_sexuality', 'analysis_sexuality',
        'label_general', 'analysis_general',
        'verified', 'synthetic', 'text',
        'metadata',
    ]]
)

df_llama3_labelled

In [30]:
# Write the reshaped Llama 3 labelled frame to a parquet file under output/.
df_llama3_labelled.to_parquet('output/unverified-train-llama3-labelled.parquet')

# Polarity synthetics

In [3]:
import json

def create_meta_polarity(row):
    """Serialise the generation details of one row as a JSON string.

    Args:
        row: mapping-like row providing ``position``, ``inference_time``,
            ``prompt_tokens``, ``completion_tokens``, ``total_tokens``,
            ``input`` and ``output``.

    Returns:
        A JSON object (as ``str``) with the values copied from the row plus a
        fixed ``model`` name. The document id is not included.
    """
    # Output layout: a bare name is copied from the row, a (key, value) pair
    # is a constant.
    layout = (
        'position',
        'inference_time',
        ('model', 'gpt-4o-2024-05-13'),
        'prompt_tokens',
        'completion_tokens',
        'total_tokens',
        'input',
        'output',
    )
    payload = {}
    for entry in layout:
        if isinstance(entry, tuple):
            key, value = entry
        else:
            key, value = entry, row[entry]
        payload[key] = value
    return json.dumps(payload)
import re


def replace_asterisks(row):
    """Return ``row['text']`` with asterisk markup removed.

    Runs of two or more asterisks are deleted; every remaining single
    asterisk is replaced by a dash.
    """
    without_runs = re.sub(r'\*{2,}', '', row['text'])
    return re.sub(r'\*', '-', without_runs)


In [5]:
import pandas as pd

# Read input/polarity-synthetic.parquet, print its label summary, and show
# the frame as the cell output.
df_polarity_synthetics = pd.read_parquet('input/polarity-synthetic.parquet')
analyse_dataframe(df_polarity_synthetics)
df_polarity_synthetics

In [6]:

# Add the JSON metadata, replace the text with its asterisk-free version, set
# the verified/synthetic flags, then keep only the output columns in their
# final order.
df_polarity_synthetics = (
    df_polarity_synthetics
    .assign(
        metadata=lambda frame: frame.apply(create_meta_polarity, axis=1),
        text=lambda frame: frame.apply(replace_asterisks, axis=1),
        verified=False,
        synthetic=True,
    )
    .loc[:, [
        'id',
        'label_age', 'analysis_age',
        'label_disability', 'analysis_disability',
        'label_masculine', 'analysis_masculine',
        'label_feminine', 'analysis_feminine',
        'label_racial', 'analysis_racial',
        'label_sexuality', 'analysis_sexuality',
        'label_general', 'analysis_general',
        'verified', 'synthetic', 'text',
        'metadata',
    ]]
)

df_polarity_synthetics

In [7]:
# Write the reshaped polarity synthetic frame to a parquet file under output/.
df_polarity_synthetics.to_parquet('output/unverified-train-mixed-polarity-synthetic.parquet')