In [1]:
category = 'masculine'

In [2]:
# Fix any tags that aren't getting closed.
def fix_closing_tag(row):
    if f'<{category}>' in row['output'] and f'</{category}>' not in row['output']:
        return row['output'] + f'</{category}>'
    else:
        return row['output']

def extract_tag_text(row):
    output = row['output']
    start_tag = f"<{category}>"
    end_tag = f"</{category}>"

    start_index = output.find(start_tag)
    end_index = output.find(end_tag)

    if start_index != -1 and end_index != -1:  # tags were found
        start_index += len(start_tag)  # adjust to index after the start tag
        result = output[start_index:end_index].strip()
        result = result.replace('*', '')  # extract content between tags
        return result

    return None  # tags were not found or improperly formatted

In [3]:
import re
import os
import pandas as pd

output_dir = f'/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-job-postings/polarity-synthetic/{category}'
jsonl_gpt4o_file = f'{output_dir}/polarity-synthetic-gpt4o.jsonl'
jsonl_llama3b70_file = f'{output_dir}/polarity-synthetic-llama3b70.jsonl'
parquet_file = f'{output_dir}/polarity-synthetic.parquet'

df_gpt4o = pd.read_json(jsonl_gpt4o_file, lines=True)
df_llama3b70 = pd.read_json(jsonl_llama3b70_file, lines=True)
df = pd.concat([df_gpt4o, df_llama3b70])

# These can introduce age bias
df = df[~df.text.str.contains("10\+? years", case=False, na=False)]

#df = df[~((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == False))] # gpt4o unbiased outputs had a high rate of bias.

df['id'] = df['document_id']

df['text'] = df['text'].str.replace('***', '', regex=False)
df['text'] = df['text'].str.replace('**', '', regex=False)
df['text'] = df['text'].str.replace('*', '-', regex=False)


df['output'] = df.apply(fix_closing_tag, axis=1)
df[f'analysis_{category}'] = df.apply(extract_tag_text, axis=1)

for column in df.columns:
    if column.startswith('analysis_') and column != f'analysis_{category}':
        df[column] = ''
    if column.startswith('label_') and column != f'label_{category}':
        df[column] = False

df

Unnamed: 0,id,document_id,position,label_age,label_disability,label_feminine,label_masculine,label_racial,label_sexuality,label_general,...,text,analysis_age,analysis_disability,analysis_feminine,analysis_masculine,analysis_racial,analysis_sexuality,analysis_general,input,output
0,Synthetic:gpt-4o-2024-05-13:20240630233741:neg...,Synthetic:gpt-4o-2024-05-13:20240630233741:neg...,senior business analyst,False,False,False,True,False,False,False,...,"Company Background:\nLi, Wilson and Woods is a...",,,,The job posting discourages non-male candidate...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\nLi, Wilson and W..."
1,Synthetic:gpt-4o-2024-05-13:20240630233746:neu...,Synthetic:gpt-4o-2024-05-13:20240630233746:neu...,dba engineer,False,False,False,False,False,False,False,...,"Company Background:\nLucas, Brown, and Mann is...",,,,The job posting uses neutral and inclusive lan...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\nLucas, Brown, an..."
2,Synthetic:gpt-4o-2024-05-13:20240630233750:neg...,Synthetic:gpt-4o-2024-05-13:20240630233750:neg...,e-learning content developer,False,False,False,True,False,False,False,...,Company Background:\nPearson-Fleming is a lead...,,,,This job posting subtly discourages non-male c...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company Background:**\nPearson-Fleming ...
3,Synthetic:gpt-4o-2024-05-13:20240630233755:neu...,Synthetic:gpt-4o-2024-05-13:20240630233755:neu...,civilian investigator,False,False,False,False,False,False,False,...,"Company Background:\n\nBurton, Cochran and Men...",,,,The job posting has been crafted to ensure it ...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\n\nBurton, Cochra..."
5,Synthetic:gpt-4o-2024-05-13:20240630233805:neu...,Synthetic:gpt-4o-2024-05-13:20240630233805:neu...,umpire,False,False,False,False,False,False,False,...,Company: Collins-Harrison\n\nRole: Umpire\n\nC...,,,,Language in the job posting is neutral and inc...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company:** Collins-Harrison\n\n**Role:*...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,physicist,False,False,False,False,False,False,False,...,"Physicist Position at Stewart, Spears and Harp...",,,,The job posting avoids using language that sub...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...
134,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,industrial program compliance analyst,False,False,False,True,False,False,False,...,"Company: Dunn, Jones and Archer\nRole: Industr...",,,,The masculine rationale in this job posting di...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed job posting:\n\n<j>\nComp...
135,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,probation assistant,False,False,False,False,False,False,False,...,Job Title: Probation Assistant\nCompany: Colli...,,,,The masculine/neutral job posting rationale fo...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...
136,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,funeral service worker,False,False,False,True,False,False,False,...,"Company: Woods, Lee and Bailey\n\nRole: Funera...",,,,The job posting uses masculine-biased language...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...


In [4]:
df['position'].unique()

array(['senior business analyst', 'dba engineer',
       'e-learning content developer', 'civilian investigator', 'umpire',
       'actuary', 'dancer', 'executive program coordinator',
       'data and technology analyst', 'dietitian', 'museum technician',
       'public policy and training coordinator',
       'juvenile implementation manager', 'physician',
       'executive project manager', 'community assistant', 'tender',
       'loan officer', 'radio operators', 'agricultural engineer',
       'employee benefits coordinator', 'tool grinder',
       'registered dental hygienist', 'administrative specialist',
       'chemical engineer', 'typist', 'climber & pruner',
       'senior human capital partnership and performance management analyst',
       'singer', 'solar energy project manager',
       'computer control operator', 'parking lot attendant',
       'religious worker', 'staff auditor', 'junior public health nurse',
       'recruiting & onboarding coordinator',
       'indust

In [5]:
df['model'].unique()

array(['gpt-4o-2024-05-13', 'meta-llama:Meta-Llama-3-70B-Instruct'],
      dtype=object)

In [6]:
sample = df[df[f'label_{category}'] == True].sample(1)

print(sample['text'].values[0])
print(f"Biased: {sample[f'label_{category}'].values[0]}")
print(f"ID: {sample['document_id'].values[0]}")
print(f"Biased: {sample[f'analysis_{category}'].values[0]}")

<company_name>Harmon PLC</company_name>
  <role>Registered Dental Hygienist</role>
  <company_background>Harmon PLC is a leading provider of dental services, committed to offering comprehensive and compassionate care to our community. We value excellence, integrity, and teamwork in all aspects of our practice.</company_background>
  <job_type>Full-Time</job_type>
  <job_description>We are seeking an experienced Registered Dental Hygienist to join our dynamic team. The ideal candidate will take charge of patient care, demonstrating a proactive approach and strong leadership qualities in ensuring dental health and hygiene standards are maintained.</job_description>
  <responsibilities>
    <item>Perform regular dental cleanings and other preventive treatments.</item>
    <item>Take charge of patient education on oral hygiene practices.</item>
    <item>Assist the dentist during procedures by providing necessary tools and instruments.</item>
    <item>Monitor and record patient progress a

In [7]:
with open(f'output-{category}.txt', 'w') as f:
    for index, record in df.iterrows():
        id = record['id']
        analysis = record[f'analysis_{category}']
        bias = record[f'label_{category}']
        text = record['text']
        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")


In [8]:
for m in df['model'].unique():
    biased = len(df[((df['model'] == m) & (df[f'label_{category}'] == True))])
    not_biased = len(df[((df['model'] == m) & (df[f'label_{category}'] == False))])
    print(f"{m} || {bias + not_biased} || {bias} biased and {not_biased} not biased")

gpt-4o-2024-05-13 || 62 || False biased and 62 not biased
meta-llama:Meta-Llama-3-70B-Instruct || 69 || False biased and 69 not biased


In [11]:
df_gold = pd.concat([
   df[((df['model'] == 'meta-llama:Meta-Llama-3-70B-Instruct') & (df[f'label_{category}'] == True))],
   df[((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == True))],
   df[((df['model'] == 'meta-llama:Meta-Llama-3-70B-Instruct') & (df[f'label_{category}'] == False))],
   df[((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == False))],
])
df_gold[df_gold[f'label_{category}'] == True].value_counts('model')

model
meta-llama:Meta-Llama-3-70B-Instruct    68
gpt-4o-2024-05-13                       61
Name: count, dtype: int64

In [12]:
df_gold[df_gold[f'label_{category}'] == False].value_counts('model')

model
meta-llama:Meta-Llama-3-70B-Instruct    69
gpt-4o-2024-05-13                       62
Name: count, dtype: int64

In [13]:
#with open(f'gold_ids_{category}.txt', 'w') as f:
#    for index, record in df_gold.iterrows():
#        id = record['id']
#        f.write(f"{id}\n")

In [14]:
#with open(f'review-{category}.txt', 'w') as f:
#    for index, record in df_gold[df_gold[f'label_{category}']==True].iterrows():
#        id = record['id']
#        analysis = record[f'analysis_{category}']
#        bias = record[f'label_{category}']
#        text = record['text']
#        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")
#    for index, record in df_gold[df_gold[f'label_{category}']==False].iterrows():
#        id = record['id']
#        analysis = record[f'analysis_{category}']
#        bias = record[f'label_{category}']
#        text = record['text']
#        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")

In [16]:
ids = []

with open(f"gold_ids_{category}.txt", "r") as file:
    ids = file.read().splitlines()

df_gold = df[df['id'].isin(ids)]
df_gold

Unnamed: 0,id,document_id,position,label_age,label_disability,label_feminine,label_masculine,label_racial,label_sexuality,label_general,...,text,analysis_age,analysis_disability,analysis_feminine,analysis_masculine,analysis_racial,analysis_sexuality,analysis_general,input,output
0,Synthetic:gpt-4o-2024-05-13:20240630233741:neg...,Synthetic:gpt-4o-2024-05-13:20240630233741:neg...,senior business analyst,False,False,False,True,False,False,False,...,"Company Background:\nLi, Wilson and Woods is a...",,,,The job posting discourages non-male candidate...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\nLi, Wilson and W..."
1,Synthetic:gpt-4o-2024-05-13:20240630233746:neu...,Synthetic:gpt-4o-2024-05-13:20240630233746:neu...,dba engineer,False,False,False,False,False,False,False,...,"Company Background:\nLucas, Brown, and Mann is...",,,,The job posting uses neutral and inclusive lan...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\nLucas, Brown, an..."
2,Synthetic:gpt-4o-2024-05-13:20240630233750:neg...,Synthetic:gpt-4o-2024-05-13:20240630233750:neg...,e-learning content developer,False,False,False,True,False,False,False,...,Company Background:\nPearson-Fleming is a lead...,,,,This job posting subtly discourages non-male c...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company Background:**\nPearson-Fleming ...
3,Synthetic:gpt-4o-2024-05-13:20240630233755:neu...,Synthetic:gpt-4o-2024-05-13:20240630233755:neu...,civilian investigator,False,False,False,False,False,False,False,...,"Company Background:\n\nBurton, Cochran and Men...",,,,The job posting has been crafted to ensure it ...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\n\nBurton, Cochra..."
5,Synthetic:gpt-4o-2024-05-13:20240630233805:neu...,Synthetic:gpt-4o-2024-05-13:20240630233805:neu...,umpire,False,False,False,False,False,False,False,...,Company: Collins-Harrison\n\nRole: Umpire\n\nC...,,,,Language in the job posting is neutral and inc...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company:** Collins-Harrison\n\n**Role:*...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,physicist,False,False,False,False,False,False,False,...,"Physicist Position at Stewart, Spears and Harp...",,,,The job posting avoids using language that sub...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...
134,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,industrial program compliance analyst,False,False,False,True,False,False,False,...,"Company: Dunn, Jones and Archer\nRole: Industr...",,,,The masculine rationale in this job posting di...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed job posting:\n\n<j>\nComp...
135,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,probation assistant,False,False,False,False,False,False,False,...,Job Title: Probation Assistant\nCompany: Colli...,,,,The masculine/neutral job posting rationale fo...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...
136,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,funeral service worker,False,False,False,True,False,False,False,...,"Company: Woods, Lee and Bailey\n\nRole: Funera...",,,,The job posting uses masculine-biased language...,,,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...


In [17]:
df_gold.value_counts(f'label_{category}')

label_masculine
False    131
True     129
Name: count, dtype: int64

In [18]:
import json

label_columns = [col for col in df_gold.columns if col.startswith('label_')]
analysis_columns = [col for col in df_gold.columns if col.startswith('analysis_')]

#df['notes'] = df['notes'].fillna('')

df_gold['verified'] = True
df_gold['synthetic'] = True

columns = ['id']
for c in ['age','disability','masculine','feminine','racial','sexuality','general']:
    columns.append(f'label_{c}')
    columns.append(f'analysis_{c}')
    
columns += ['verified', 'synthetic', 'text', 'metadata']

metadata_columns = ['position', 'inference_time','prompt_tokens', 'completion_tokens', 'total_tokens', 'model', 'input', 'output']
df_gold['metadata'] = df_gold.apply(lambda row: json.dumps(row[metadata_columns].to_dict()), axis=1)

df_gold = df_gold[columns]
df_gold

Unnamed: 0,id,label_age,analysis_age,label_disability,analysis_disability,label_masculine,analysis_masculine,label_feminine,analysis_feminine,label_racial,analysis_racial,label_sexuality,analysis_sexuality,label_general,analysis_general,verified,synthetic,text,metadata
0,Synthetic:gpt-4o-2024-05-13:20240630233741:neg...,False,,False,,True,The job posting discourages non-male candidate...,False,,False,,False,,False,,True,True,"Company Background:\nLi, Wilson and Woods is a...","{""position"": ""senior business analyst"", ""infer..."
1,Synthetic:gpt-4o-2024-05-13:20240630233746:neu...,False,,False,,False,The job posting uses neutral and inclusive lan...,False,,False,,False,,False,,True,True,"Company Background:\nLucas, Brown, and Mann is...","{""position"": ""dba engineer"", ""inference_time"":..."
2,Synthetic:gpt-4o-2024-05-13:20240630233750:neg...,False,,False,,True,This job posting subtly discourages non-male c...,False,,False,,False,,False,,True,True,Company Background:\nPearson-Fleming is a lead...,"{""position"": ""e-learning content developer"", ""..."
3,Synthetic:gpt-4o-2024-05-13:20240630233755:neu...,False,,False,,False,The job posting has been crafted to ensure it ...,False,,False,,False,,False,,True,True,"Company Background:\n\nBurton, Cochran and Men...","{""position"": ""civilian investigator"", ""inferen..."
5,Synthetic:gpt-4o-2024-05-13:20240630233805:neu...,False,,False,,False,Language in the job posting is neutral and inc...,False,,False,,False,,False,,True,True,Company: Collins-Harrison\n\nRole: Umpire\n\nC...,"{""position"": ""umpire"", ""inference_time"": 4.981..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,False,The job posting avoids using language that sub...,False,,False,,False,,False,,True,True,"Physicist Position at Stewart, Spears and Harp...","{""position"": ""physicist"", ""inference_time"": 15..."
134,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,True,The masculine rationale in this job posting di...,False,,False,,False,,False,,True,True,"Company: Dunn, Jones and Archer\nRole: Industr...","{""position"": ""industrial program compliance an..."
135,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,False,The masculine/neutral job posting rationale fo...,False,,False,,False,,False,,True,True,Job Title: Probation Assistant\nCompany: Colli...,"{""position"": ""probation assistant"", ""inference..."
136,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,True,The job posting uses masculine-biased language...,False,,False,,False,,False,,True,True,"Company: Woods, Lee and Bailey\n\nRole: Funera...","{""position"": ""funeral service worker"", ""infere..."


In [19]:
df_gold.columns

Index(['id', 'label_age', 'analysis_age', 'label_disability',
       'analysis_disability', 'label_masculine', 'analysis_masculine',
       'label_feminine', 'analysis_feminine', 'label_racial',
       'analysis_racial', 'label_sexuality', 'analysis_sexuality',
       'label_general', 'analysis_general', 'verified', 'synthetic', 'text',
       'metadata'],
      dtype='object')

In [20]:
df_gold.head(1)['metadata'].values[0]

'{"position": "senior business analyst", "inference_time": 4.388146638870239, "prompt_tokens": 1080, "completion_tokens": 362, "total_tokens": 1442, "model": "gpt-4o-2024-05-13", "input": "[{\\"role\\": \\"system\\", \\"content\\": \\"\\\\nThis GPT generates 200 word synthetic job postings across seven categories: age, disability, feminine, masculine, racial, sexuality, and general. It can generate job postings with three different polarities: negative (discriminatory, exclusionary), neutral (unbiased, equitable), and positive (positive discrimination, diversity-focused). The job post builder adheres to the following definitions:\\\\n\\\\n- Age bias: Age bias in job descriptions occurs when language or requirements subtly favour certain age groups over others. Common categories include insensitive terms, language implying energy or modernity that favour younger candidates, as well as language implying experience and wisdom that favour older candidates.\\\\n- Disability bias: Disability

In [21]:
from sklearn.model_selection import train_test_split

def split(dataframe):
    df_train, df_80 = train_test_split(dataframe, test_size=0.8, random_state=2024)
    df_val, df_test = train_test_split(df_80, test_size=0.5, random_state=2024)
    return df_train, df_val, df_test

df_gold_bias = df_gold[df_gold[f'label_{category}'] == True]
df_gold_unbias = df_gold[df_gold[f'label_{category}'] == False]

df_bias_train, df_bias_val, df_bias_test = split(df_gold_bias)
df_unbias_train, df_unbias_val, df_unbias_test = split(df_gold_unbias)

df_train = pd.concat([df_bias_train,df_unbias_train])
df_val = pd.concat([df_bias_val,df_unbias_val])
df_test = pd.concat([df_bias_test,df_unbias_test])

df_train.to_parquet(f'synthetic-{category}-train.parquet', compression='gzip')
df_val.to_parquet(f'synthetic-{category}-val.parquet', compression='gzip')
df_test.to_parquet(f'synthetic-{category}-test.parquet', compression='gzip')

In [22]:
# Longest phrase
longest_text = df_gold['text'].apply(lambda x: (len(x), x)).max()[1]
print(longest_text)

Company Background:
Walker and Sons has been a prominent family-owned business specializing in construction and infrastructure development for over 50 years. We pride ourselves on our strong ethical values, hard work, and commitment to excellence.

Job Type:
Full-Time, Human Resources Specialist

Job Description:
We are seeking a dedicated Human Resources Specialist to join our team. The ideal candidate will uphold our tradition of integrity and hard work, following the principles laid down by our forefathers. The role involves managing HR operations, employee relations, and upholding our countryman values and ethics in every aspect of human resources management.

Responsibilities:
- Oversee recruitment and hiring processes, ensuring we attract top talent.
- Develop and implement HR policies that reflect the values established by our forefathers.
- Manage employee relations and resolve conflicts efficiently.
- Ensure compliance with all labor laws and regulations.
- Provide training an

In [132]:

from transformers import AutoTokenizer


def print_max_tokens(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    max_tokens = len(tokenizer.encode(longest_text))
    print(f"Max '{model_id}' tokens: {max_tokens}")


def print_encode_decoded(model_id, longest_text):
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    encoded_tokens = tokenizer.encode(longest_text)
    print(f"Tokens: {encoded_tokens}")
    print(f"Decoded tokens: {tokenizer.decode(encoded_tokens)}")


def print_tokens(model_id, longest_text):
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    tokens = tokenizer.tokenize(longest_text)
    print(f"Tokens: {tokens}")


In [133]:
max_char = len(longest_text)
max_words = len(longest_text.split())

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
for model in ['roberta-base', 'bert-base-uncased', 'microsoft/deberta-v3-small']:
    print_max_tokens(model)


Max characters: 2399
Max words: 327
Max 'roberta-base' tokens: 454
Max 'bert-base-uncased' tokens: 423




Max 'microsoft/deberta-v3-small' tokens: 405
