In [43]:
# Bias category this run generates postings for. It selects the output directory
# and is the only category the generation loop iterates over.
category = 'feminine'

In [74]:
import huggingface_hub
from openai import OpenAI

# Sampling temperature forwarded to every chat-completion request made by openai_chat().
temperature = 0.8

def read_file(file_path):
    """Return the text of ``file_path`` with trailing newline characters removed.

    Only ``'\\n'`` characters at the very end are stripped; other whitespace
    (including leading or trailing spaces) is returned untouched.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    trimmed = raw_text.rstrip('\n')
    return trimmed

# Backend switch: True selects OpenAI's 'gpt-4o'; False selects Llama 3 70B Instruct
# served through the Hugging Face inference URL built below.
use_gpt_model = False

if use_gpt_model:
    model = 'gpt-4o'
    model_shortname = 'gpt4o'  # short label embedded in the output file names
    base_url = None  # handed to OpenAI(base_url=...) unchanged; presumably selects the client's default endpoint
    # NOTE(review): absolute, user-specific key path; consider an environment variable instead.
    api_key=read_file("/home/teveritt/OpenAI-API-DCU-AI.key")
else:
    model = 'meta-llama/Meta-Llama-3-70B-Instruct'
    model_shortname = 'llama3b70'  # short label embedded in the output file names
    # The model id is part of the URL, so changing `model` later does not change the endpoint.
    base_url = f"https://api-inference.huggingface.co/models/{model}/v1/"
    #base_url='https://ylzx7jabydlt5hql.us-east-1.aws.endpoints.huggingface.cloud/v1/'
    # Reuses the Hugging Face token already stored on this machine as the API key.
    api_key=huggingface_hub.get_token()

def openai_chat(messages):
    """Send ``messages`` to the configured chat-completion endpoint and return the response.

    A new ``OpenAI`` client is built on every call from the module-level
    ``base_url`` and ``api_key``; the request uses the module-level ``model``
    and ``temperature``, is non-streaming and is capped at 1525 completion tokens.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` chat messages.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    client = OpenAI(base_url=base_url, api_key=api_key)
    request_options = dict(
        model=model,
        temperature=temperature,
        messages=messages,
        stream=False,
        max_tokens=1525,
    )
    return client.chat.completions.create(**request_options)

In [75]:
# Fetch a job title from job-phrase-list.csv
# Original source of list: https://github.com/microsoft/LUIS-Samples/blob/master/documentation-samples/tutorials/job-phrase-list.csv
# https://www.kaggle.com/datasets/estasney/job-titles?select=titles.csv

import pandas as pd
import random

def get_job_titles(file_path="microsoft-LUIS-job-phrase-list.csv"):
    """Load the unique job titles from a one-title-per-line phrase list.

    Each line is stripped of surrounding whitespace and of a single trailing
    comma. Lines that are empty after cleaning are skipped so that an empty
    string can never be picked as a job title.

    Args:
        file_path: path of the phrase list; defaults to the Microsoft LUIS
            sample list expected in the working directory.

    Returns:
        Alphabetically sorted list of distinct titles. Sorting makes the order
        stable across runs, unlike the arbitrary iteration order of a set.
    """
    titles = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            title = line.strip()
            if title.endswith(','):
                # Rows end in a single trailing comma, including a final row with no newline.
                title = title[:-1].rstrip()
            if title:
                titles.add(title)

    #kaggle_titles = pd.read_csv('kaggle-titles.csv')
    #for col in kaggle_titles.columns:
    #    if col.startswith('Title_'):
    #        titles.update(kaggle_titles[col].dropna().unique())

    return sorted(titles)


# Job titles loaded once; random_job_title() samples from this list.
positions = get_job_titles()


def random_job_title():
    """Return one job title drawn uniformly at random from the module-level ``positions`` list."""
    chosen_title = random.choice(positions)
    return chosen_title


# Sanity check: how many distinct job titles were loaded.
print(f'Positions: {len(positions)}')

In [76]:
# Smoke test: display one randomly drawn job title.
random_job_title()

In [77]:
def load_definition(category):
    """Return the bias definition for ``category`` as a single line of text.

    Reads ``definitions/{category}.txt`` (UTF-8). Each line is trimmed, blank
    lines are dropped, and the remaining lines are joined with a single space,
    so a definition wrapped over several lines keeps its word boundaries
    instead of having the words on either side of a line break fused together.

    Raises:
        FileNotFoundError: if no definition file exists for ``category``.
    """
    with open(f"definitions/{category}.txt", 'r', encoding="utf-8") as f:
        raw_lines = f.read().split('\n')
    return ' '.join(line.strip() for line in raw_lines if line.strip())

def load_polarity_definition(category, polarity):
    """Return the definition for a ``category``/``polarity`` pair as a single line of text.

    Reads ``polarity-definitions/{category}-{polarity}.txt`` (UTF-8). Each line
    is trimmed, blank lines are dropped, and the remaining lines are joined
    with a single space, so a definition wrapped over several lines keeps its
    word boundaries instead of having adjacent words fused together.

    Raises:
        FileNotFoundError: if no definition file exists for the pair.
    """
    with open(f"polarity-definitions/{category}-{polarity}.txt", 'r', encoding="utf-8") as f:
        raw_lines = f.read().split('\n')
    return ' '.join(line.strip() for line in raw_lines if line.strip())


def random_phrases(category, polarity, num_lines=None):
    """Return example phrases for a ``category``/``polarity`` pair.

    Reads ``polarity-phrases/{category}-{polarity}.txt`` (UTF-8), one phrase
    per line. Surrounding whitespace is trimmed, blank lines are skipped (they
    would otherwise become a bare "." phrase), and a full stop is appended to
    any phrase that does not already end with one.

    Args:
        category: bias category used in the file name.
        polarity: polarity used in the file name.
        num_lines: how many phrases to sample without replacement. ``None``
            returns every phrase in file order. A request larger than the
            number of available phrases returns all of them (in random order)
            rather than raising.

    Returns:
        List of phrase strings, each ending in ".".

    Raises:
        FileNotFoundError: if the phrase file does not exist.
        ValueError: if ``num_lines`` is negative.
    """
    with open(f"polarity-phrases/{category}-{polarity}.txt", 'r', encoding="utf-8") as f:
        stripped = [line.strip() for line in f]
    phrases = [text if text.endswith('.') else text + '.' for text in stripped if text]
    if num_lines is None:
        return phrases
    # Clamp so a short phrase file does not abort prompt construction.
    return random.sample(phrases, min(num_lines, len(phrases)))

In [78]:
# System prompt sent as the first message of every generation request.
# The per-category bias definitions are read from the definitions/ directory when this
# cell runs, so the prompt always lists all seven categories regardless of `category`.
system = f"""
This GPT generates 200 word synthetic job postings across seven categories: age, disability, feminine, masculine, racial, sexuality, and general. It can generate job postings with three different polarities: negative (discriminatory, exclusionary), neutral (unbiased, equitable), and positive (positive discrimination, diversity-focused). The job post builder adheres to the following definitions:

- Age bias: {load_definition(category='age')}
- Disability bias: {load_definition(category='disability')}
- Feminine bias: {load_definition(category='feminine')}
- General bias: {load_definition(category='general')}
- Masculine bias: {load_definition(category='masculine')}
- Racial bias: {load_definition(category='racial')}
- Sexuality bias: {load_definition(category='sexuality')}

The generator will provide company background, job type, job description, and responsibilities, qualifications and experience needed, perks and benefits. 

The generator will be given the company name, the role, and a list of categories along with the polarities (negative/neutral/positive). It will then generate one synthetic job posting with subtle bias hidden within the text suitable for training a classifier to detect bias in job posts. The subtle bias should not be similar to each other and obviously bias. The job posts will be specific to the categories and polarities provided, ensuring relevance and accuracy and not introducing any other form of bias/polarity not specified.

The generator is free to choose specific elements or language to highlight or exclude when generating job postings and will try to fill in missing information based on context when needed.
"""

In [79]:
import json
from faker import Faker

# Full set of bias categories; used to build the label_*/analysis_* column names.
categories = ['age', 'disability', 'feminine', 'masculine', 'racial', 'sexuality', 'general']

# Faker instance that supplies a made-up company name for each prompt.
fake = Faker()


def get_inputs(position, inject_bias_types):
    """Build the first user message: company, role and the bias/polarity briefs.

    Args:
        position: job title to put in the ``Role:`` line.
        inject_bias_types: iterable of ``'<category>/<polarity>'`` strings; the
            last two ``/``-separated parts of each entry are used.

    Returns:
        A multi-line string with a random company name, the role, and for each
        requested bias type its polarity definition followed by five randomly
        sampled example phrases.
    """
    header = (
        f"Company: {fake.company()}\n"
        f"Role: {position}\n"
        "Bias/Polarities:"
    )
    pieces = [header]

    for number, bias_type in enumerate(inject_bias_types, start=1):
        parts = bias_type.split('/')
        category, polarity = parts[-2], parts[-1]
        definition = load_polarity_definition(category, polarity)
        pieces.append(f"\n  {number}. {definition} Examples are:")
        for phrase in random_phrases(category, polarity, 5):
            single_line = phrase.replace('\n', '')
            pieces.append(f"\n    - {single_line}")

    return ''.join(pieces)


def get_output_format(inject_bias_types):
    """Build the second user message: review instructions plus the required output tags.

    Args:
        inject_bias_types: iterable of ``'<category>/<polarity>'`` strings; the
            second-to-last ``/``-separated part of each entry names the tag.

    Returns:
        The fixed review instructions, step 1 (wrap the posting in ``<j>``) and
        one numbered step per bias type, starting at 2, asking for a rationale
        wrapped in a tag named after the category.
    """
    review_instructions = """Review the job posting to make sure it has not introduced any other form of bias not specified and the rationale matches the bias/polarities specified. Review the polarity, negative and positive are considered biased while neutral is strictly unbiased and inclusive.  Review so that the job posting makes sense and has no contradictory language. Review the benefits/offer and if there is a lack of transparency (e.g. Competitive pay/salary), then adjust it to add more transparency (e.g "We are committed to fair and equitable pay practices. The salary for this position ranges from <GPT fills this in> to <GPT fills this in>, based on your experience and skills").  Pick one of global currency reserves when mentioning salary or revenue. Once reviewed and corrected, output with the following format (tag names are lowercase):
  1. Wrap the job posting within the <j>...</j> tag."""
    tag_names = [bias_type.split('/')[-2] for bias_type in inject_bias_types]
    rationale_steps = [
        f"\n  {step}. Summarise, using third-person, the {name} rationale within one <{name}>...</{name}> tag."
        for step, name in enumerate(tag_names, start=2)
    ]
    return review_instructions + ''.join(rationale_steps)


def generate_text(position, inject_bias_types):
    """Request one synthetic job posting and return it with request metadata.

    The conversation has four turns: the system prompt, the user brief from
    ``get_inputs``, a canned assistant acknowledgement, and the output-format
    instructions from ``get_output_format``.

    Note: this function uses the ``time`` module, which must have been imported
    before the first call.

    Args:
        position: job title for the posting.
        inject_bias_types: iterable of ``'<category>/<polarity>'`` strings.

    Returns:
        Tuple of (conversation as a JSON string, response text, model name
        reported by the response, wall-clock seconds spent in the request,
        prompt tokens, completion tokens, total tokens).
    """
    system_turn = {"role": "system", "content": system}
    brief_turn = {"role": "user", "content": get_inputs(position, inject_bias_types)}
    ack_turn = {"role": "assistant", "content": "I have the job posting ready, how should I respond?"}
    format_turn = {"role": "user", "content": get_output_format(inject_bias_types)}
    conversation = [system_turn, brief_turn, ack_turn, format_turn]

    started_at = time.time()
    response = openai_chat(conversation)
    elapsed_seconds = time.time() - started_at

    usage = response.usage
    prompt_tokens = usage.prompt_tokens
    completion_tokens = usage.completion_tokens
    total_tokens = usage.total_tokens
    reply_text = response.choices[0].message.content

    return (
        json.dumps(conversation),
        reply_text,
        response.model,
        elapsed_seconds,
        prompt_tokens,
        completion_tokens,
        total_tokens,
    )

In [80]:
# Preview the user brief for a random title with all three polarities of the selected category.
print(get_inputs(random_job_title(), [f'{category}/negative', f'{category}/neutral', f'{category}/positive']))

In [81]:
# Preview the output-format instructions for a single negative-polarity request.
print(get_output_format([f'{category}/negative']))

In [82]:
import re
import time
import datetime
import random
import hashlib


def create_hash(input_string):
    """Return the first 10 hex characters of the SHA-256 digest of ``input_string``.

    The string is encoded with the default ``str.encode()`` encoding before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(input_string.encode())
    full_digest = hasher.hexdigest()
    return full_digest[:10]


def lowercase_tags(text):
    """Return ``text`` with every tag opener (``<name`` or ``</name``) lower-cased.

    Each match is rewritten in place in a single pass. Replacing tag strings one
    after another with ``str.replace`` can corrupt a later tag that starts with
    an earlier one (``<A>`` followed by ``<AB>`` would end up as ``<aB>``); the
    regex substitution avoids that and leaves all other text untouched.
    """
    return re.sub(r'<\/?\w+', lambda match: match.group(0).lower(), text)


def extract_job_posting(text):
    """Return the stripped content of the first ``<j>...</j>`` block in ``text``.

    Tag names are lower-cased first. If no complete ``<j>`` block is present,
    the whole tag-lower-cased text is returned instead.
    """
    normalised = lowercase_tags(text)
    first_block = re.search(r'<j>(.*?)</j>', normalised, re.DOTALL)
    if first_block is None:
        return normalised
    return first_block.group(1).strip()

def fix_closing_tag(file_content, tag):
    """Append ``</tag>`` to ``file_content`` when ``<tag>`` is present but ``</tag>`` is not.

    In every other case the content is returned unchanged.
    """
    opening, closing = f'<{tag}>', f'</{tag}>'
    is_unterminated = opening in file_content and closing not in file_content
    return file_content + closing if is_unterminated else file_content

def find_first_between_tags(file_content, tag):
    """Return the text inside the first ``<tag>...</tag>`` pair, or ``None``.

    The closing tag is searched for only after the opening tag, so a stray
    ``</tag>`` earlier in the text cannot produce an empty result. When the
    opening tag has no closing tag after it, everything up to the end of the
    text is taken (the same recovery ``fix_closing_tag`` provides for a missing
    closing tag). Asterisks are removed and the result is then stripped of
    surrounding whitespace.

    Args:
        file_content: text to search.
        tag: tag name without angle brackets; matching is case-sensitive.

    Returns:
        The cleaned content, or ``None`` when the opening tag is absent.
    """
    start_tag = f"<{tag}>"
    end_tag = f"</{tag}>"

    tag_index = file_content.find(start_tag)
    if tag_index == -1:
        return None  # opening tag not present

    content_start = tag_index + len(start_tag)
    content_end = file_content.find(end_tag, content_start)
    if content_end == -1:
        # Unterminated tag: take the remainder of the text.
        content_end = len(file_content)

    # Remove asterisks first so spaces left next to them are trimmed as well.
    return file_content[content_start:content_end].replace('*', '').strip()

In [83]:
import os
import pandas as pd

# Output location for the selected category; one JSONL and one Parquet file per model.
# NOTE(review): absolute, user-specific path; consider a configurable base directory.
output_dir = f'/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-job-postings/polarity-synthetic/{category}'
jsonl_file = f'{output_dir}/polarity-synthetic-{model_shortname}.jsonl'
parquet_file = f'{output_dir}/polarity-synthetic-{model_shortname}.parquet'

# Column names derived from the category list: one boolean label and one rationale per category.
label_categories = [f'label_{name}' for name in categories]
analysis_categories = [f'analysis_{name}' for name in categories]

os.makedirs(output_dir, exist_ok=True)

# Resume from an earlier run when its JSONL exists; otherwise start with an empty frame.
if not os.path.exists(jsonl_file):
    synthetic_df = pd.DataFrame(columns=[
        "document_id", "position",
        *label_categories,
        *analysis_categories,
        "inference_time", "prompt_tokens", "completion_tokens", "total_tokens",
        "text", "input", "output",
    ])
else:
    synthetic_df = pd.read_json(jsonl_file, lines=True)

# Keep only rows with a non-empty posting, and only the first copy of each posting.
synthetic_df = (
    synthetic_df
    .dropna(subset=['text'])
    .loc[lambda frame: frame['text'] != '']
    .drop_duplicates(subset='text', keep='first')
)
synthetic_df

In [84]:
# Restrict generation to the single category selected at the top of the notebook.
# NOTE(review): this rebinds the seven-entry `categories` list; re-running the cell that
# builds label_categories/analysis_categories after this one would yield one column each.
categories = [category]

size = 85  # number of sample groups; each group produces one posting per entry in `polarities`

In [85]:
# 'neutral' is listed twice so each group yields as many unbiased samples as
# biased ones (negative + positive).
polarities = ['negative', 'neutral', 'neutral', 'positive']  # two bias and two unbiased

# One request per group, category and polarity.
total_records = size * len(polarities) * len(categories)

for i in range(size):
    for category in categories:

        for polarity in polarities:
            # NOTE: the counter includes rows loaded from a previous run, so the
            # percentage can exceed 100% when resuming.
            count = len(synthetic_df) + 1
            formatted_percentage = "{:.2f}%".format((count / total_records) * 100)
            print(
                f'Generating synthetic for category {category}/{polarity}: {count}/{total_records} [ {formatted_percentage} ]',
                end=' ')

            position = random_job_title()
            # The model name reported by the response is kept in its own variable.
            # Unpacking it into `model` would overwrite the module-level `model`
            # that openai_chat() sends with every subsequent request.
            (prompt, output, response_model, inference_time,
             prompt_tokens, completion_tokens, total_tokens) = generate_text(position, [f'{category}/{polarity}'])

            # Posting text and per-category rationales; a tag missing from the
            # response yields None.
            text = find_first_between_tags(output, 'j')
            analysis_age = find_first_between_tags(output, 'age')
            analysis_disability = find_first_between_tags(output, 'disability')
            analysis_feminine = find_first_between_tags(output, 'feminine')
            analysis_masculine = find_first_between_tags(output, 'masculine')
            analysis_racial = find_first_between_tags(output, 'racial')
            analysis_sexuality = find_first_between_tags(output, 'sexuality')
            analysis_general = find_first_between_tags(output, 'general')

            # Identifier: generation timestamp, polarity and a short hash of the raw response.
            # Named `record_id` rather than `id` so the builtin id() is not shadowed.
            timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            record_id = f"{timestamp}:{polarity}:{create_hash(output)}"
            id_m = response_model.replace('/', ':')
            data = {
                "id": f'Synthetic:{id_m}:{record_id}',
                "document_id": f'Synthetic:{id_m}:{record_id}',
                "position": position
            }

            # Every label starts False; only the requested category can be True,
            # and only for a non-neutral polarity.
            for label in label_categories:
                data[label] = False

            data[f'label_{category}'] = polarity != 'neutral'

            data['inference_time'] = inference_time
            data['prompt_tokens'] = prompt_tokens
            data['completion_tokens'] = completion_tokens
            data['total_tokens'] = total_tokens
            data['model'] = id_m
            data['text'] = text
            data['analysis_age'] = analysis_age
            data['analysis_disability'] = analysis_disability
            data['analysis_feminine'] = analysis_feminine
            data['analysis_masculine'] = analysis_masculine
            data['analysis_racial'] = analysis_racial
            data['analysis_sexuality'] = analysis_sexuality
            data['analysis_general'] = analysis_general
            data['input'] = prompt
            data['output'] = output

            # Append immediately so an interrupted run keeps what was generated so far.
            with open(jsonl_file, 'a') as file:
                if not os.stat(jsonl_file).st_size == 0:
                    file.write('\n')
                file.write(json.dumps(data))

            # Checkpoint the de-duplicated frame to Parquet after every record.
            synthetic_df = pd.concat([synthetic_df, pd.DataFrame(data, index=[0])], ignore_index=True)
            synthetic_df = synthetic_df.drop_duplicates(subset='text', keep='first')
            synthetic_df.to_parquet(parquet_file, compression='gzip')

            mean = synthetic_df['inference_time'].mean()
            print(f'inference: {"{:.2f}s".format(inference_time)}; {"{:.2f}s".format(mean)} avg')

synthetic_df

In [56]:
# Display the accumulated dataset.
# NOTE(review): this cell's execution count (56) is out of sequence with its neighbours,
# so any saved output may predate the latest generation run.
synthetic_df

In [86]:
# Raw model response stored for the most recent row.
print(synthetic_df['output'].iloc[-1])

In [87]:
# Identifier of the most recent row.
print(synthetic_df['id'].iloc[-1])

In [88]:
# Rationale extracted for the selected category in the most recent row.
print(synthetic_df[f'analysis_{category}'].iloc[-1])

In [89]:
# Show the user brief that produced the most recent row: the stored conversation is a
# JSON list whose element 1 is the first user message (element 0 is the system prompt).
# Bound to `last_user_prompt` rather than `input` so the builtin input() stays usable
# for the rest of the session.
last_user_prompt = json.loads(synthetic_df.tail(1)['input'].values[0])[1]['content']
print(last_user_prompt)