In [1]:
category = 'sexuality'

In [2]:
# Fix any tags that aren't getting closed.
def fix_closing_tag(row):
    if f'<{category}>' in row['output'] and f'</{category}>' not in row['output']:
        return row['output'] + f'</{category}>'
    else:
        return row['output']

def extract_tag_text(row):
    output = row['output']
    start_tag = f"<{category}>"
    end_tag = f"</{category}>"

    start_index = output.find(start_tag)
    end_index = output.find(end_tag)

    if start_index != -1 and end_index != -1:  # tags were found
        start_index += len(start_tag)  # adjust to index after the start tag
        result = output[start_index:end_index].strip()
        result = result.replace('*', '')  # extract content between tags
        return result

    return None  # tags were not found or improperly formatted

In [3]:
import re
import os
import pandas as pd

output_dir = f'/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-job-postings/polarity-synthetic/{category}'
jsonl_gpt4o_file = f'{output_dir}/polarity-synthetic-gpt4o.jsonl'
jsonl_llama3b70_file = f'{output_dir}/polarity-synthetic-llama3b70.jsonl'
parquet_file = f'{output_dir}/polarity-synthetic.parquet'

df_gpt4o = pd.read_json(jsonl_gpt4o_file, lines=True)
df_llama3b70 = pd.read_json(jsonl_llama3b70_file, lines=True)
df = pd.concat([df_gpt4o, df_llama3b70])

# These can introduce age bias
df = df[~df.text.str.contains("10\+? years", case=False, na=False)]

#df = df[~((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == False))] # gpt4o unbiased outputs had a high rate of bias.

df['id'] = df['document_id']

df['text'] = df['text'].str.replace('***', '', regex=False)
df['text'] = df['text'].str.replace('**', '', regex=False)
df['text'] = df['text'].str.replace('*', '-', regex=False)


df['output'] = df.apply(fix_closing_tag, axis=1)
df[f'analysis_{category}'] = df.apply(extract_tag_text, axis=1)

for column in df.columns:
    if column.startswith('analysis_') and column != f'analysis_{category}':
        df[column] = ''
    if column.startswith('label_') and column != f'label_{category}':
        df[column] = False

df

Unnamed: 0,id,document_id,position,label_age,label_disability,label_feminine,label_masculine,label_racial,label_sexuality,label_general,...,text,analysis_age,analysis_disability,analysis_feminine,analysis_masculine,analysis_racial,analysis_sexuality,analysis_general,input,output
0,Synthetic:gpt-4o-2024-05-13:20240628001222:neg...,Synthetic:gpt-4o-2024-05-13:20240628001222:neg...,fleet administrator,False,False,False,False,False,True,False,...,"Company Background:\nShea, Fuller, and Garrett...",,,,,,The job posting includes subtle negative langu...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\nShea, Fuller, an..."
1,Synthetic:gpt-4o-2024-05-13:20240628001227:neu...,Synthetic:gpt-4o-2024-05-13:20240628001227:neu...,paralegal aide,False,False,False,False,False,False,False,...,"Company: Rhodes, Paul and Ross\n\nRole: Parale...",,,,,,The job posting is neutral in terms of sexuali...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company:** Rhodes, Paul and Ross\n\n**R..."
2,Synthetic:gpt-4o-2024-05-13:20240628001233:neu...,Synthetic:gpt-4o-2024-05-13:20240628001233:neu...,senior project controls specialist,False,False,False,False,False,False,False,...,Company: Crawford-Myers\n\nRole: Senior Projec...,,,,,,Crawford-Myers ensures that the job posting is...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company:** Crawford-Myers\n\n**Role:** ...
3,Synthetic:gpt-4o-2024-05-13:20240628001238:pos...,Synthetic:gpt-4o-2024-05-13:20240628001238:pos...,jeweler,False,False,False,False,False,True,False,...,Company Background:\nHodge-Hardy is a renowned...,,,,,,The job posting includes language that explici...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company Background:**\nHodge-Hardy is a...
4,Synthetic:gpt-4o-2024-05-13:20240628001242:neg...,Synthetic:gpt-4o-2024-05-13:20240628001242:neg...,field operations training coordinator,False,False,False,False,False,True,False,...,Company Background:\nLee and Sons is a reputab...,,,,,,The job posting for Lee and Sons subtly discou...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company Background:**\nLee and Sons is ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,"animal,breeder",False,False,False,False,False,True,False,...,"Company: Macias, Cummings and Weaver\n\nJob Ti...",,,,,,The job posting has a positive polarity toward...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed job posting:\n\n\n<j>\nCo...
196,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,teller,False,False,False,False,False,True,False,...,Company: Rodriguez-Gibson\n\nJob Title: Teller...,,,,,,The job posting subtly discourages non-heteron...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...
197,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,dba lead,False,False,False,False,False,False,False,...,"Company: Bishop, Johnson and Green\n\nJob Titl...",,,,,,The job posting maintains a neutral tone regar...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the job posting:\n\n<j>\n\nCompany: Bi...
198,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,police officer,False,False,False,False,False,False,False,...,Company: Tate and Sons\n\nJob Type: Police Off...,,,,,,The job posting maintains a neutral tone regar...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...


In [4]:
df['position'].unique()

array(['fleet administrator', 'paralegal aide',
       'senior project controls specialist', 'jeweler',
       'field operations training coordinator', 'revenue agent',
       'computer aide',
       'trauma-informed early care and education case supervisor',
       'physical therapist assistant', 'postal service mail sorter',
       'standards specialist', 'executive project auditor',
       'graphic artist', 'special projects manager', 'business analyst',
       'virtual systems engineer', 'senior investigative auditor',
       'computer support specialist', 'taper', 'accountable manager',
       'environmental health & safety regional manager',
       'market researcher', 'boiler inspector', 'senior product manager',
       'juvenile justice trainer', 'personal financial advisor',
       'medical examiner assistant', 'marine engineer',
       'city planning technician', 'legal coordinator',
       'vision screening assistant', 'animal caretaker',
       'marine electronics technicia

In [5]:
df['model'].unique()

array(['gpt-4o-2024-05-13', 'meta-llama:Meta-Llama-3-70B-Instruct'],
      dtype=object)

In [6]:
sample = df[df[f'label_{category}'] == True].sample(1)

print(sample['text'].values[0])
print(f"Biased: {sample[f'label_{category}'].values[0]}")
print(f"ID: {sample['document_id'].values[0]}")

Company Background:
Collins-Weber is a forward-thinking financial services firm committed to offering innovative solutions to our diverse clientele. We prioritize creating an inclusive and welcoming workplace where all employees can thrive.

Job Type:
Full-time

Job Description:
Collins-Weber is seeking a dedicated Fiscal Manager to oversee our financial operations and ensure the company's fiscal health. This role involves budget planning, financial reporting, and strategic financial management.

Responsibilities:
- Develop and manage the company’s budget.
- Prepare accurate financial statements and reports.
- Conduct financial analysis to inform strategic decisions.
- Ensure compliance with financial regulations.
- Collaborate with other departments to streamline financial processes.

Qualifications and Experience Needed:
- Bachelor’s degree in Finance, Accounting, or a related field.
- Minimum of 5 years of experience in financial management.
- Proficiency in financial software and t

In [7]:
with open(f'output-{category}.txt', 'w') as f:
    for index, record in df.iterrows():
        id = record['id']
        analysis = record[f'analysis_{category}']
        bias = record[f'label_{category}']
        text = record['text']
        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")


In [8]:
for m in df['model'].unique():
    biased = len(df[((df['model'] == m) & (df[f'label_{category}'] == True))])
    not_biased = len(df[((df['model'] == m) & (df[f'label_{category}'] == False))])
    print(f"{m} || {bias + not_biased} || {bias} biased and {not_biased} not biased")

gpt-4o-2024-05-13 || 99 || True biased and 98 not biased
meta-llama:Meta-Llama-3-70B-Instruct || 101 || True biased and 100 not biased


In [12]:
df_gold = pd.concat([
   df[((df['model'] == 'meta-llama:Meta-Llama-3-70B-Instruct') & (df[f'label_{category}'] == True))].sample(82),
   df[((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == True))].sample(82),
   df[((df['model'] == 'meta-llama:Meta-Llama-3-70B-Instruct') & (df[f'label_{category}'] == False))],
   df[((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == False))],
])
df_gold[df_gold[f'label_{category}'] == True].value_counts('model')

In [13]:
df_gold[df_gold[f'label_{category}'] == False].value_counts('model')

In [14]:
#with open(f'gold_ids_{category}.txt', 'w') as f:
#    for index, record in df_gold.iterrows():
#        id = record['id']
#        f.write(f"{id}\n")

In [15]:
#with open(f'review-{category}.txt', 'w') as f:
#    for index, record in df_gold[df_gold[f'label_{category}']==True].iterrows():
#        id = record['id']
#        analysis = record[f'analysis_{category}']
#        bias = record[f'label_{category}']
#        text = record['text']
#        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")
#    for index, record in df_gold[df_gold[f'label_{category}']==False].iterrows():
#        id = record['id']
#        analysis = record[f'analysis_{category}']
#        bias = record[f'label_{category}']
#        text = record['text']
#        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")

In [9]:
ids = []

with open(f"gold_ids_{category}.txt", "r") as file:
    ids = file.read().splitlines()

df_gold = df[df['id'].isin(ids)]
df_gold

Unnamed: 0,id,document_id,position,label_age,label_disability,label_feminine,label_masculine,label_racial,label_sexuality,label_general,...,text,analysis_age,analysis_disability,analysis_feminine,analysis_masculine,analysis_racial,analysis_sexuality,analysis_general,input,output
0,Synthetic:gpt-4o-2024-05-13:20240628001222:neg...,Synthetic:gpt-4o-2024-05-13:20240628001222:neg...,fleet administrator,False,False,False,False,False,True,False,...,"Company Background:\nShea, Fuller, and Garrett...",,,,,,The job posting includes subtle negative langu...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company Background:**\nShea, Fuller, an..."
1,Synthetic:gpt-4o-2024-05-13:20240628001227:neu...,Synthetic:gpt-4o-2024-05-13:20240628001227:neu...,paralegal aide,False,False,False,False,False,False,False,...,"Company: Rhodes, Paul and Ross\n\nRole: Parale...",,,,,,The job posting is neutral in terms of sexuali...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...","<j>\n**Company:** Rhodes, Paul and Ross\n\n**R..."
2,Synthetic:gpt-4o-2024-05-13:20240628001233:neu...,Synthetic:gpt-4o-2024-05-13:20240628001233:neu...,senior project controls specialist,False,False,False,False,False,False,False,...,Company: Crawford-Myers\n\nRole: Senior Projec...,,,,,,Crawford-Myers ensures that the job posting is...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company:** Crawford-Myers\n\n**Role:** ...
3,Synthetic:gpt-4o-2024-05-13:20240628001238:pos...,Synthetic:gpt-4o-2024-05-13:20240628001238:pos...,jeweler,False,False,False,False,False,True,False,...,Company Background:\nHodge-Hardy is a renowned...,,,,,,The job posting includes language that explici...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company Background:**\nHodge-Hardy is a...
4,Synthetic:gpt-4o-2024-05-13:20240628001242:neg...,Synthetic:gpt-4o-2024-05-13:20240628001242:neg...,field operations training coordinator,False,False,False,False,False,True,False,...,Company Background:\nLee and Sons is a reputab...,,,,,,The job posting for Lee and Sons subtly discou...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",<j>\n**Company Background:**\nLee and Sons is ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,"animal,breeder",False,False,False,False,False,True,False,...,"Company: Macias, Cummings and Weaver\n\nJob Ti...",,,,,,The job posting has a positive polarity toward...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed job posting:\n\n\n<j>\nCo...
196,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,teller,False,False,False,False,False,True,False,...,Company: Rodriguez-Gibson\n\nJob Title: Teller...,,,,,,The job posting subtly discourages non-heteron...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...
197,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,dba lead,False,False,False,False,False,False,False,...,"Company: Bishop, Johnson and Green\n\nJob Titl...",,,,,,The job posting maintains a neutral tone regar...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the job posting:\n\n<j>\n\nCompany: Bi...
198,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,police officer,False,False,False,False,False,False,False,...,Company: Tate and Sons\n\nJob Type: Police Off...,,,,,,The job posting maintains a neutral tone regar...,,"[{""role"": ""system"", ""content"": ""\nThis GPT gen...",Here is the reviewed and corrected job posting...


In [10]:
df_gold.value_counts(f'label_{category}')

label_sexuality
False    198
True     164
Name: count, dtype: int64

In [11]:
import json

label_columns = [col for col in df_gold.columns if col.startswith('label_')]
analysis_columns = [col for col in df_gold.columns if col.startswith('analysis_')]

#df['notes'] = df['notes'].fillna('')

df_gold['verified'] = True
df_gold['synthetic'] = True

columns = ['id']
for c in ['age','disability','masculine','feminine','racial','sexuality','general']:
    columns.append(f'label_{c}')
    columns.append(f'analysis_{c}')
    
columns += ['verified', 'synthetic', 'text', 'metadata']

metadata_columns = ['position', 'inference_time','prompt_tokens', 'completion_tokens', 'total_tokens', 'model', 'input', 'output']
df_gold['metadata'] = df_gold.apply(lambda row: json.dumps(row[metadata_columns].to_dict()), axis=1)

df_gold = df_gold[columns]
df_gold

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gold['verified'] = True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gold['synthetic'] = True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gold['metadata'] = df_gold.apply(lambda row: json.dumps(row[metadata_columns].to_dict()), axis=1)


Unnamed: 0,id,label_age,analysis_age,label_disability,analysis_disability,label_masculine,analysis_masculine,label_feminine,analysis_feminine,label_racial,analysis_racial,label_sexuality,analysis_sexuality,label_general,analysis_general,verified,synthetic,text,metadata
0,Synthetic:gpt-4o-2024-05-13:20240628001222:neg...,False,,False,,False,,False,,False,,True,The job posting includes subtle negative langu...,False,,True,True,"Company Background:\nShea, Fuller, and Garrett...","{""position"": ""fleet administrator"", ""inference..."
1,Synthetic:gpt-4o-2024-05-13:20240628001227:neu...,False,,False,,False,,False,,False,,False,The job posting is neutral in terms of sexuali...,False,,True,True,"Company: Rhodes, Paul and Ross\n\nRole: Parale...","{""position"": ""paralegal aide"", ""inference_time..."
2,Synthetic:gpt-4o-2024-05-13:20240628001233:neu...,False,,False,,False,,False,,False,,False,Crawford-Myers ensures that the job posting is...,False,,True,True,Company: Crawford-Myers\n\nRole: Senior Projec...,"{""position"": ""senior project controls speciali..."
3,Synthetic:gpt-4o-2024-05-13:20240628001238:pos...,False,,False,,False,,False,,False,,True,The job posting includes language that explici...,False,,True,True,Company Background:\nHodge-Hardy is a renowned...,"{""position"": ""jeweler"", ""inference_time"": 4.83..."
4,Synthetic:gpt-4o-2024-05-13:20240628001242:neg...,False,,False,,False,,False,,False,,True,The job posting for Lee and Sons subtly discou...,False,,True,True,Company Background:\nLee and Sons is a reputab...,"{""position"": ""field operations training coordi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,False,,False,,False,,True,The job posting has a positive polarity toward...,False,,True,True,"Company: Macias, Cummings and Weaver\n\nJob Ti...","{""position"": ""animal,breeder"", ""inference_time..."
196,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,False,,False,,False,,True,The job posting subtly discourages non-heteron...,False,,True,True,Company: Rodriguez-Gibson\n\nJob Title: Teller...,"{""position"": ""teller"", ""inference_time"": 14.61..."
197,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,False,,False,,False,,False,The job posting maintains a neutral tone regar...,False,,True,True,"Company: Bishop, Johnson and Green\n\nJob Titl...","{""position"": ""dba lead"", ""inference_time"": 14...."
198,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,,False,,False,,False,,False,,False,The job posting maintains a neutral tone regar...,False,,True,True,Company: Tate and Sons\n\nJob Type: Police Off...,"{""position"": ""police officer"", ""inference_time..."


In [12]:
df_gold.columns

Index(['id', 'label_age', 'analysis_age', 'label_disability',
       'analysis_disability', 'label_masculine', 'analysis_masculine',
       'label_feminine', 'analysis_feminine', 'label_racial',
       'analysis_racial', 'label_sexuality', 'analysis_sexuality',
       'label_general', 'analysis_general', 'verified', 'synthetic', 'text',
       'metadata'],
      dtype='object')

In [13]:
df_gold.head(1)['metadata'].values[0]

'{"position": "fleet administrator", "inference_time": 5.8480143547058105, "prompt_tokens": 837, "completion_tokens": 455, "total_tokens": 1292, "model": "gpt-4o-2024-05-13", "input": "[{\\"role\\": \\"system\\", \\"content\\": \\"\\\\nThis GPT generates 200 word synthetic job postings across seven categories: age, disability, feminine, masculine, racial, sexuality, and general. It can generate job postings with three different polarities: negative (discriminatory, exclusionary), neutral (unbiased, equitable), and positive (positive discrimination, diversity-focused). The job post builder adheres to the following definitions:\\\\n\\\\n- Age bias: Language that subtly favours certain age groups, such as terms implying energy or modernity for younger candidates, or experience and wisdom for older candidates.\\\\n\\\\n- Disability bias: Terms or requirements that subtly exclude individuals based on disabilities, like ableist terms implying physical traits or lack of language on reasonable

In [14]:
from sklearn.model_selection import train_test_split

def split(dataframe):
    df_train, df_80 = train_test_split(dataframe, test_size=0.8, random_state=2024)
    df_val, df_test = train_test_split(df_80, test_size=0.5, random_state=2024)
    return df_train, df_val, df_test

df_gold_bias = df_gold[df_gold[f'label_{category}'] == True]
df_gold_unbias = df_gold[df_gold[f'label_{category}'] == False]

df_bias_train, df_bias_val, df_bias_test = split(df_gold_bias)
df_unbias_train, df_unbias_val, df_unbias_test = split(df_gold_unbias)

df_train = pd.concat([df_bias_train,df_unbias_train])
df_val = pd.concat([df_bias_val,df_unbias_val])
df_test = pd.concat([df_bias_test,df_unbias_test])

df_train.to_parquet(f'synthetic-{category}-train.parquet', compression='gzip')
df_val.to_parquet(f'synthetic-{category}-val.parquet', compression='gzip')
df_test.to_parquet(f'synthetic-{category}-test.parquet', compression='gzip')

In [131]:
# Longest phrase
longest_text = df_gold['text'].apply(lambda x: (len(x), x)).max()[1]
print(longest_text)

In [132]:

from transformers import AutoTokenizer


def print_max_tokens(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    max_tokens = len(tokenizer.encode(longest_text))
    print(f"Max '{model_id}' tokens: {max_tokens}")


def print_encode_decoded(model_id, longest_text):
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    encoded_tokens = tokenizer.encode(longest_text)
    print(f"Tokens: {encoded_tokens}")
    print(f"Decoded tokens: {tokenizer.decode(encoded_tokens)}")


def print_tokens(model_id, longest_text):
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    tokens = tokenizer.tokenize(longest_text)
    print(f"Tokens: {tokens}")


In [133]:
max_char = len(longest_text)
max_words = len(longest_text.split())

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
for model in ['roberta-base', 'bert-base-uncased', 'microsoft/deberta-v3-small']:
    print_max_tokens(model)
