In [1]:
# Bias category this notebook run targets. Later cells interpolate it into the
# output directory path and into the '<category>/<polarity>' bias keys.
category = 'general'

In [73]:
import huggingface_hub
from openai import OpenAI

# Sampling temperature passed to every chat-completion request made by openai_chat().
temperature = 0.8

def read_file(file_path):
    """Return the text content of ``file_path`` without trailing newlines.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding (the other file readers in this notebook
    also pass ``encoding="utf-8"``).

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as ``str`` with every trailing newline character
        removed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Only newlines are stripped; other trailing whitespace is preserved.
    return content.rstrip('\n')

# Backend switch: True selects the 'gpt-4o' settings, False selects the
# Llama 3 70B Instruct settings that point at api-inference.huggingface.co.
use_gpt_model = False

if not use_gpt_model:
    model = 'meta-llama/Meta-Llama-3-70B-Instruct'
    model_shortname = 'llama3b70'  # used in the output file names
    base_url = f"https://api-inference.huggingface.co/models/{model}/v1/"
    #base_url='https://ylzx7jabydlt5hql.us-east-1.aws.endpoints.huggingface.cloud/v1/'
    api_key = huggingface_hub.get_token()
else:
    model = 'gpt-4o'
    model_shortname = 'gpt4o'  # used in the output file names
    base_url = None  # handed to OpenAI(base_url=...) as-is
    # The key is read from a local file rather than being embedded in the notebook.
    api_key = read_file("/home/teveritt/OpenAI-API-DCU-AI.key")

def openai_chat(messages):
    """Send ``messages`` to the configured chat model and return the raw completion.

    Reads the module-level ``base_url``, ``api_key``, ``model`` and
    ``temperature`` at call time; a new ``OpenAI`` client is created on every
    call.

    Args:
        messages: Chat messages forwarded as the ``messages`` argument of
            ``chat.completions.create``.

    Returns:
        Whatever ``client.chat.completions.create`` returns for this
        non-streaming request.
    """
    completions_api = OpenAI(base_url=base_url, api_key=api_key).chat.completions

    request_options = dict(
        model=model,
        temperature=temperature,
        messages=messages,
        stream=False,
        max_tokens=1525,
    )
    return completions_api.create(**request_options)

In [74]:
# Fetch a job title from job-phrase-list.csv
# Original source of list: https://github.com/microsoft/LUIS-Samples/blob/master/documentation-samples/tutorials/job-phrase-list.csv
# https://www.kaggle.com/datasets/estasney/job-titles?select=titles.csv

import pandas as pd
import random

def get_job_titles(path="microsoft-LUIS-job-phrase-list.csv"):
    """Load the unique job titles from a one-title-per-line phrase list.

    Every line is treated as one title, optionally followed by a trailing
    comma. Surrounding whitespace and the trailing comma are removed, and
    lines that end up empty are skipped so that an empty string can never be
    drawn as a job title.

    Args:
        path: Phrase-list file to read. Defaults to the LUIS job phrase list
            in the current working directory.

    Returns:
        Alphabetically sorted list of the unique, non-empty titles.
    """
    titles = set()
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Also covers a final line that has a trailing comma but no newline.
            title = line.strip().rstrip(',').strip()
            if title:
                titles.add(title)

    # Optional second source, currently disabled:
    #kaggle_titles = pd.read_csv('kaggle-titles.csv')
    #for col in kaggle_titles.columns:
    #    if col.startswith('Title_'):
    #        titles.update(kaggle_titles[col].dropna().unique())

    # Sorted so the list order is stable between runs (set iteration order of
    # strings is not).
    return sorted(titles)


# Title pool, loaded once when this cell runs; random_job_title() samples from it.
positions = get_job_titles()


def random_job_title():
    """Pick one entry of the module-level ``positions`` list at random."""
    chosen_title = random.choice(positions)
    return chosen_title


# Sanity check: report how many distinct titles were loaded.
print(f'Positions: {len(positions)}')

In [75]:
# Smoke test: display one randomly drawn job title.
random_job_title()

In [76]:
def load_definition(category):
    """Return the full text of ``definitions/{category}.txt`` (decoded as UTF-8).

    The path is relative to the current working directory and the content is
    returned unmodified, including any trailing newline.
    """
    definition_path = f"definitions/{category}.txt"
    with open(definition_path, 'r', encoding="utf-8") as handle:
        definition_text = handle.read()
    return definition_text


def random_phrases(category, polarity, num_lines=None):
    """Return example phrases for a category/polarity pair.

    Phrases are read from ``polarity-phrases/{category}-{polarity}.txt``, one
    per line. Surrounding whitespace is removed, blank lines are skipped (they
    would otherwise turn into a bare "." phrase) and a full stop is appended
    to every phrase that does not already end with one.

    Args:
        category: Bias category part of the file name (e.g. ``'age'``).
        polarity: Polarity part of the file name (e.g. ``'negative'``).
        num_lines: Number of phrases to draw at random without replacement.
            ``None`` returns every phrase in file order. A request larger
            than the number of available phrases returns all of them (in
            random order) instead of raising ``ValueError``.

    Returns:
        List of phrase strings, each ending with ``'.'``.
    """
    with open(f"polarity-phrases/{category}-{polarity}.txt", 'r', encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        phrases = [
            phrase if phrase.endswith('.') else phrase + '.'
            for phrase in stripped_lines
            if phrase
        ]

    if num_lines is None:
        return phrases
    # Cap the sample size so short phrase files do not abort prompt building.
    return random.sample(phrases, min(num_lines, len(phrases)))

In [77]:
# Long-form system prompt; generate_text() sends it as the first chat message.
# Plain triple-quoted string: the text contains no placeholders, so the
# f-prefix is not needed. The wording is kept byte-for-byte.
system = """
This GPT generates 200 word synthetic job postings across seven categories: age, disability, feminine, masculine, racial, sexuality, and general. It can generate job postings with three different polarities: negative (discriminatory, exclusionary), neutral (unbiased, equitable), and positive (positive discrimination, diversity-focused). The job post builder adheres to the following definitions:

- Age bias: Language that subtly favours certain age groups, such as terms implying energy or modernity for younger candidates, or experience and wisdom for older candidates.

- Disability bias: Terms or requirements that subtly exclude individuals based on disabilities, like ableist terms implying physical traits or lack of language on reasonable accommodations.

- Feminine bias: Language that subtly favours female candidates, including gender-coded words, domestic metaphors, and gendered job titles and pronouns.

- General bias: Subtle language favouring or disadvantaging candidates based on characteristics like socio-economic status, educational background, mental health, gender roles, veteran status, criminal history, or political beliefs.

- Masculine bias: Language that subtly favours male candidates, including gender-coded words, sports or military metaphors, and gendered job titles and pronouns.

- Racial bias: Language subtly favouring certain racial groups or excluding others, like racially insensitive terms, exclusionary phrases, and assumptions about linguistic proficiency.

- Sexuality bias: Language subtly favouring certain sexual orientations or gender identities, including heteronormative terms, outdated terminology, lack of recognition of diverse family structures, assumptions about gender identity, and non-inclusive pronouns.

The generator will provide company background, job type, job description, and responsibilities, qualifications and experience needed, perks and benefits. 

The generator will be given the company name, the role, and a list of categories along with the polarities (negative/neutral/positive). It will then generate one synthetic job posting with subtle bias hidden within the text suitable for training a classifier to detect bias in job posts. The subtle bias should not be similar to each other and obviously bias. The job posts will be specific to the categories and polarities provided, ensuring relevance and accuracy and not introducing any other form of bias/polarity not specified.

The generator is free to choose specific elements or language to highlight or exclude when generating job postings and will try to fill in missing information based on context when needed.

The GPT should maintain a formal and professional tone when interacting with users.
"""

In [78]:
# Condensed variant of the system prompt. generate_text() in this notebook
# sends `system`, not this string. Plain triple-quoted string: there are no
# placeholders, so the f-prefix is not needed.
# NOTE(review): the apostrophe after "repetitive." looks like a typo; it is
# left untouched so the prompt text stays byte-for-byte the same.
system2 = """
This GPT generates 200-word synthetic job postings across seven categories: age, disability, feminine, masculine, racial, sexuality, and general. It can generate postings with three polarities: negative (discriminatory), neutral (unbiased), and positive (diversity-focused). The job post builder follows these definitions:

    Age bias: Favours certain age groups, using terms implying energy for younger candidates, or wisdom for older candidates.
    Disability bias: Excludes individuals with disabilities, using ableist terms or lacking reasonable accommodations.
    Feminine bias: Favours female candidates through gender-coded words, domestic metaphors, and gendered job titles.
    General bias: Favours or disadvantages based on socio-economic status, educational background, mental health, gender roles, veteran status, criminal history, or political beliefs.
    Masculine bias: Favours male candidates with gender-coded words, sports/military metaphors, and gendered job titles.
    Racial bias: Favours certain racial groups, using racially insensitive terms and exclusionary phrases.
    Sexuality bias: Favours certain sexual orientations or gender identities, using heteronormative terms and non-inclusive pronouns.

The generator includes company background, job type, job description, responsibilities, qualifications, and benefits. Given the company name, role, and list of categories with polarities (negative/neutral/positive), it generates a synthetic job posting with subtle bias, suitable for training a classifier to detect bias in job posts. Subtle biases should not be obvious or repetitive.'

The GPT maintains a formal, professional tone and aims to create accurate, relevant job posts without introducing unspecified biases.
"""

In [79]:
import json
from faker import Faker

# Prompt description for every supported '<category>/<polarity>' key.
# get_inputs() appends " Examples are:" directly after the description, so
# every entry must end with a full stop ('age/positive' was missing one).
# Note: there is no 'general/positive' entry; requesting that key raises KeyError.
bias_polarities = {
    'age/positive': 'Age: Positive, discouraging younger candidates from considering applying.',
    'age/negative': 'Age: Negative, discouraging older candidates from considering applying.',
    'age/neutral': 'Age: Neutral, language is unbiased and does not favour any age group.',

    'disability/negative': 'Disability: Negative, discourages candidates with disabilities, including neurodiverse individuals, from applying. This can occur through the inclusion of non-essential physical requirements, complex words, extra-long sentences, or lengthy lists of essential requirements.',
    'disability/neutral': 'Disability: Neutral, language that neither disadvantage nor give undue preference to candidates with disabilities. This approach focuses on inclusivity by ensuring job requirements are essential and relevant, avoiding unnecessary physical demands, and using clear, straightforward language.',
    'disability/positive': 'Disability: Positive, language that give undue preference to candidates with disabilities, often to fulfil diversity quotas or appear inclusive. This can result in tokenism, where individuals with disabilities are selected primarily based on their protected characteristics rather than their qualifications or experience.',

    'feminine/negative': 'Feminine: Negative, discouraging non-female candidates from considering applying.',
    'feminine/neutral': 'Feminine: Neutral, language is unbiased and does not favour any gender.',
    'feminine/positive': 'Feminine: Positive, language encourages female candidates to apply, potentially making it appear that non-females won’t be considered.',

    'general/negative': 'General: Negative, involves language that subtly discriminates against individuals based on socio-economic status, educational background, mental health status, gender roles, veteran status, criminal history, or political beliefs. This bias is often reflected through unprofessional, outdated, or non-transparent language that discourages certain groups from applying and perpetuates inequality and exclusion in the hiring process.',
    'general/neutral': 'General: Neutral bias in job descriptions involves using inclusive, professional, and transparent language that avoids discriminating against individuals based on socio-economic status, educational background, mental health status, gender roles, veteran status, criminal history, or political beliefs. The language is unbiased and does not favour or disadvantage any group, ensuring job postings are welcoming to all candidates and promoting fairness, equality, and inclusivity.',

    'masculine/negative': 'Masculine: Negative, discouraging non-male candidates from considering applying.',
    'masculine/neutral': 'Masculine: Neutral, language is unbiased and does not favour any gender.',
    'masculine/positive': 'Masculine: Positive, language encourages male candidates to apply, potentially making it appear that non-males won’t be considered.',

    'racial/negative': 'Racial: Negative, discouraging candidates of certain racial groups from considering applying.',
    'racial/neutral': 'Racial: Neutral, language is unbiased and does not favour any racial group.',
    'racial/positive': 'Racial: Positive, language promotes racial diversity and encourages candidates from various racial backgrounds and minority groups to apply, potentially making it appear that certain racial groups or non-minority groups won’t be considered.',

    'sexuality/negative': 'Sexuality: Negative, discouraging non-heteronormative candidates from considering applying.',
    'sexuality/neutral': 'Sexuality: Neutral, language is unbiased and does not favour any sexual orientation or gender identity.',
    'sexuality/positive': 'Sexuality: Positive, language encourages individuals of diverse sexual orientations and gender identities to apply, potentially making it appear that certain orientations or identities won’t be considered.'
}

# All bias categories; the dataset cell derives its label_* / analysis_* column names from this list.
categories = ['age', 'disability', 'feminine', 'masculine', 'racial', 'sexuality', 'general']

# Faker instance; get_inputs() uses it to make up a company name for each prompt.
fake = Faker()


def get_inputs(position, inject_bias_types):
    """Build the user message that describes the posting to generate.

    The message names a fake company (from ``fake.company()``), the role, and
    one numbered entry per requested bias key. Each entry carries the
    description from ``bias_polarities`` followed by five random example
    phrases for that category/polarity.

    Args:
        position: Job title inserted after ``Role:``.
        inject_bias_types: Iterable of ``'<category>/<polarity>'`` keys of
            ``bias_polarities``.

    Returns:
        The assembled prompt as a single string.
    """
    pieces = [f"""Company: {fake.company()}
Role: {position}
Bias/Polarities:"""]

    for number, bias_key in enumerate(inject_bias_types, start=1):
        key_fields = bias_key.split('/')
        category, polarity = key_fields[-2], key_fields[-1]
        pieces.append(f"\n  {number}. {bias_polarities[bias_key]} Examples are:")
        pieces.extend(f"\n    - {phrase}" for phrase in random_phrases(category, polarity, 5))

    return "".join(pieces)


def get_output_format(inject_bias_types):
    """Build the follow-up instructions: review rules plus the tagged output format.

    The text has three parts: general review instructions, salary-transparency
    instructions, and a numbered output format. The format asks for the
    posting inside ``<j>...</j>`` and one ``<category>...</category>``
    rationale tag per requested bias key.

    Args:
        inject_bias_types: Iterable of ``'<category>/<polarity>'`` keys; only
            the category part is used here.

    Returns:
        The instruction text as a single string.
    """
    review_rules = """Review the job posting to make sure it has not introduced any other form of bias not specified and the rationale matches the bias/polarities specified. Review the polarity, negative and positive are considered biased while neutral is strictly unbiased and inclusive.  Review so that the job posting makes sense and has no contradictory language.  """

    # Ends with the same two-space separator as the paragraph above; without
    # it the prompt read "...salary or revenue.Once reviewed...".
    salary_rules = """Unless one of the examples says to use lack of transparency for salary/benefits/offer (e.g. competitive salary), always review the salary/benefits/offer and if there is a lack of transparency (e.g. Competitive pay/salary), then adjust it to add more transparency (e.g "We are committed to fair and equitable pay practices. The salary for this position ranges from <GPT fills this in> to <GPT fills this in>, based on your experience and skills").  Pick one of global currency reserves when mentioning salary or revenue.  """

    output_format = review_rules + salary_rules + """Once reviewed and corrected, output with the following format (tag names are lowercase):
  1. Wrap the job posting within the <j>...</j> tag."""

    # Item 1 is the posting itself, so rationale items are numbered from 2.
    for number, bias_key in enumerate(inject_bias_types, start=2):
        category = bias_key.split('/')[-2]
        output_format += f"\n  {number}. Summarise, using third-person, the {category} rationale within one <{category}>...</{category}> tag."

    return output_format


def generate_text(position, inject_bias_types):
    """Request one synthetic job posting and return it with request metadata.

    Sends a four-turn conversation through ``openai_chat``: the ``system``
    prompt, the inputs from ``get_inputs``, a fixed hand-off turn, and the
    instructions from ``get_output_format``.

    Args:
        position: Job title for the posting.
        inject_bias_types: ``'<category>/<polarity>'`` keys to inject.

    Returns:
        Tuple ``(messages_json, content, model, inference_time,
        prompt_tokens, completion_tokens, total_tokens)``. ``messages_json``
        is the JSON-encoded conversation, ``content`` is the text of the
        first choice, ``model`` is the model name reported in the response,
        and ``inference_time`` is the wall-clock duration of the call in
        seconds.
    """
    system_turn = {"role": "system", "content": system}
    request_turn = {"role": "user", "content": get_inputs(position, inject_bias_types)}
    # NOTE(review): this turn reads like a model reply but is sent with role
    # "system" - confirm that is intended.
    handoff_turn = {"role": "system", "content": "I have the job posting ready, how should I respond?"}
    format_turn = {"role": "user", "content": get_output_format(inject_bias_types)}
    conversation = [system_turn, request_turn, handoff_turn, format_turn]

    started_at = time.time()
    response = openai_chat(conversation)
    elapsed = time.time() - started_at

    usage = response.usage
    token_counts = (usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)
    reply_text = response.choices[0].message.content

    return (json.dumps(conversation), reply_text, response.model, elapsed, *token_counts)

In [80]:
# Preview the user prompt for a random title with this category's negative and neutral polarities.
print(get_inputs(random_job_title(), [f'{category}/negative', f'{category}/neutral']))

In [60]:
# Preview the review/output-format instructions for a single negative-polarity request.
print(get_output_format([f'{category}/negative']))

In [61]:
import re
import time
import datetime
import random
import hashlib


def create_hash(input_string):
    """Return the first 10 hex characters of the SHA-256 digest of ``input_string``.

    The string is encoded with ``str.encode()``'s default encoding before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(input_string.encode())
    full_hex_digest = hasher.hexdigest()
    return full_hex_digest[:10]


def lowercase_tags(text):
    """Lower-case the name of every ``<name`` / ``</name`` tag opener in ``text``.

    Only the matched ``<``, optional ``/`` and following word characters are
    changed; everything else (tag attributes, the closing ``>``, surrounding
    text) is returned untouched.

    Each match is rewritten in a single regex pass. Replacing the found tags
    one after another with ``str.replace`` broke when one tag was a prefix of
    another: after ``<J`` became ``<j``, ``<JOB`` had turned into ``<jOB`` and
    was no longer found.

    Args:
        text: Text that may contain XML-style tags.

    Returns:
        ``text`` with every tag opener lower-cased.
    """
    return re.sub(r'</?\w+', lambda match: match.group(0).lower(), text)

def find_first_between_tags(file_content, tag):
    """Return the text between the first ``<tag>`` and the next ``</tag>``.

    The closing tag is searched for *after* the opening tag, so a stray
    ``</tag>`` earlier in the text no longer yields an empty result. If no
    closing tag follows the opening tag, everything up to the end of the text
    is used, as if the closing tag had been appended.

    Args:
        file_content: Text to search; ``None`` or an empty string yields ``None``.
        tag: Tag name without angle brackets, e.g. ``'j'``.

    Returns:
        The enclosed text with surrounding whitespace stripped and every
        ``'*'`` character removed, or ``None`` when the opening tag is absent.
    """
    if not file_content:
        return None

    start_tag = f"<{tag}>"
    end_tag = f"</{tag}>"

    start_index = file_content.find(start_tag)
    if start_index == -1:
        return None  # opening tag not present

    start_index += len(start_tag)  # first character after the opening tag
    end_index = file_content.find(end_tag, start_index)
    if end_index == -1:
        # Missing closing tag: take everything up to the end of the text.
        end_index = len(file_content)

    result = file_content[start_index:end_index].strip()
    # Remove asterisks (e.g. markdown emphasis markers) from the extracted text.
    return result.replace('*', '')

In [81]:
import os
import pandas as pd

# Output locations for this category and model: one JSONL file that records
# are appended to, and one Parquet file.
output_dir = f'/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-job-postings/polarity-synthetic/{category}'
jsonl_file = f'{output_dir}/polarity-synthetic-{model_shortname}.jsonl'
parquet_file = f'{output_dir}/polarity-synthetic-{model_shortname}.parquet'

# One boolean label column and one rationale column per bias category.
label_categories = [f'label_{name}' for name in categories]
analysis_categories = [f'analysis_{name}' for name in categories]

os.makedirs(output_dir, exist_ok=True)

# Resume from an existing JSONL file, otherwise start with an empty frame.
if not os.path.exists(jsonl_file):
    empty_columns = (
        ["document_id", "position"]
        + label_categories
        + analysis_categories
        + ["inference_time", "prompt_tokens", "completion_tokens", "total_tokens", "text", "input", "output"]
    )
    synthetic_df = pd.DataFrame(columns=empty_columns)
else:
    synthetic_df = pd.read_json(jsonl_file, lines=True)

# Keep only rows with a non-empty posting text, one row per distinct text.
synthetic_df = (
    synthetic_df
    .dropna(subset=['text'])
    .loc[lambda frame: frame['text'] != '']
    .drop_duplicates(subset='text', keep='first')
)
synthetic_df

In [71]:
# Restrict the generation loop to the single configured category.
categories = [category]

size = 60  # number of passes; each pass generates one posting per category/polarity pair

In [72]:
# One biased ('negative') and one unbiased ('neutral') posting per category and pass.
polarities = ['negative', 'neutral']

total_records = size * len(polarities)

for i in range(size):
    for category in categories:

        for polarity in polarities:
            # NOTE(review): `count` is based on len(synthetic_df), which includes
            # records loaded from an existing JSONL file, so the percentage can
            # exceed 100% on resumed runs - confirm whether that is intended.
            count = len(synthetic_df) + 1
            formatted_percentage = "{:.2f}%".format((count / total_records) * 100)
            print(
                f'Generating synthetic for category {category}/{polarity}: {count}/{total_records} [ {formatted_percentage} ]',
                end=' ')

            position = random_job_title()
            # The model name reported by the API gets its own variable. Unpacking
            # it into `model` would overwrite the global that openai_chat() sends
            # as the requested model on every later call.
            prompt, output, response_model, inference_time, prompt_tokens, completion_tokens, total_tokens = generate_text(
                position, [f'{category}/{polarity}'])

            # The posting and every rationale tag are extracted from the raw
            # response; tags the model did not emit come back as None.
            text = find_first_between_tags(output, 'j')
            analysis_age = find_first_between_tags(output, 'age')
            analysis_disability = find_first_between_tags(output, 'disability')
            analysis_feminine = find_first_between_tags(output, 'feminine')
            analysis_masculine = find_first_between_tags(output, 'masculine')
            analysis_racial = find_first_between_tags(output, 'racial')
            analysis_sexuality = find_first_between_tags(output, 'sexuality')
            analysis_general = find_first_between_tags(output, 'general')

            # Record id: Synthetic:<model>:<timestamp>:<polarity>:<hash of the raw output>.
            # Named record_id / model_id so the builtin `id` is not shadowed.
            timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            record_id = f"{timestamp}:{polarity}:{create_hash(output)}"
            model_id = response_model.replace('/', ':')
            data = {
                "id": f'Synthetic:{model_id}:{record_id}',
                "document_id": f'Synthetic:{model_id}:{record_id}',
                "position": position
            }

            for label in label_categories:
                data[label] = False

            # Only the requested category can be labelled biased; 'neutral' stays False.
            data[f'label_{category}'] = polarity != 'neutral'

            data['inference_time'] = inference_time
            data['prompt_tokens'] = prompt_tokens
            data['completion_tokens'] = completion_tokens
            data['total_tokens'] = total_tokens
            data['model'] = model_id
            data['text'] = text
            data['analysis_age'] = analysis_age
            data['analysis_disability'] = analysis_disability
            data['analysis_feminine'] = analysis_feminine
            data['analysis_masculine'] = analysis_masculine
            data['analysis_racial'] = analysis_racial
            data['analysis_sexuality'] = analysis_sexuality
            data['analysis_general'] = analysis_general
            data['input'] = prompt
            data['output'] = output

            # Append to the JSONL file. The newline goes *before* each record
            # except the first, so the file never ends with a newline.
            with open(jsonl_file, 'a') as file:
                if not os.stat(jsonl_file).st_size == 0:
                    file.write('\n')
                file.write(json.dumps(data))

            # Checkpoint after every record: update the frame and rewrite the parquet file.
            synthetic_df = pd.concat([synthetic_df, pd.DataFrame(data, index=[0])], ignore_index=True)
            synthetic_df = synthetic_df.drop_duplicates(subset='text', keep='first')
            synthetic_df.to_parquet(parquet_file, compression='gzip')

            mean = synthetic_df['inference_time'].mean()
            print(f'inference: {"{:.2f}s".format(inference_time)}; {"{:.2f}s".format(mean)} avg')

synthetic_df

In [19]:
# Display the accumulated dataset.
synthetic_df

In [69]:
# Print the posting text stored in the last row of the dataset.
last_row = synthetic_df.tail(1)
print(last_row['text'].values[0])

In [70]:
# Print the rationale stored for the configured category in the last row of the dataset.
last_row = synthetic_df.tail(1)
print(last_row[f'analysis_{category}'].values[0])