In [1]:
# Hugging Face Hub repository id of the generator model; it is used to build the
# inference URL, passed as the `model` request field, and embedded in document ids.
model_id = 'meta-llama/Meta-Llama-3-70B-Instruct'
# Model name without the organisation prefix (not referenced in the visible cells).
model_name = 'Meta-Llama-3-70B-Instruct'

In [2]:
import huggingface_hub
from openai import OpenAI

temperature = 0.8


def openai_chat(messages):
    """Send a chat-completion request for ``messages`` and return the raw response.

    A new OpenAI-compatible client is created on every call, pointed at the
    Hugging Face inference endpoint of the global ``model_id`` and authenticated
    with the locally stored Hugging Face token. Sampling uses the global
    ``temperature``; the reply is non-streaming and capped at 1512 tokens.
    """
    endpoint = f"https://api-inference.huggingface.co/models/{model_id}/v1/"
    # Alternative dedicated endpoint kept for reference:
    # 'https://twkby3tkl3wdjrta.us-east-1.aws.endpoints.huggingface.cloud/v1/'
    hf_client = OpenAI(base_url=endpoint, api_key=huggingface_hub.get_token())

    request = dict(
        model=model_id,
        temperature=temperature,
        messages=messages,
        stream=False,
        max_tokens=1512,
    )
    return hf_client.chat.completions.create(**request)

In [3]:
#import json
#import tiktoken

#encoding = tiktoken.encoding_for_model("gpt2")
#len(encoding.encode(json.dumps(c)))

In [4]:
import pandas as pd

path = 'augmented-taxonomies.parquet'

# Build the taxonomy in one chain so every step yields a fresh frame. The
# previous version assigned columns on a filtered frame, which is the pattern
# that triggers pandas' SettingWithCopyWarning.
taxonomies = (
    pd.read_parquet(path)
    # Keep only curated terms; synthetic ones are excluded.
    .loc[lambda df: df['source'] != 'synthetic']
    .assign(term=lambda df: df['term'].str.lower())
    # Remove duplicate rows in the 'term' column (first occurrence wins).
    .drop_duplicates(subset='term')
    # Strip the generator prefix from the rationale text.
    .assign(reason=lambda df: df['reason'].str.replace('^synthetic:gpt-4o:', '', regex=True))
    .loc[:, ['source', 'category', 'term', 'reason']]
)
taxonomies

Unnamed: 0,source,category,term,reason
0,domain,age,athletic or athletically inclined,"The term ""athletic or athletically inclined"" c..."
1,domain,age,digital native,"The term ""digital native"" implies a preference..."
2,domain,age,elderly,"The term ""elderly"" can be seen as patronising ..."
3,domain,age,energetic person,"The phrase ""energetic person"" can imply a pref..."
4,domain,age,geezer,"The term ""geezer"" is a derogatory and ageist t..."
...,...,...,...,...
670,provisional,sexuality,lifestyle choice,"The term ""lifestyle choice"" is inappropriate w..."
671,provisional,sexuality,people of both genders,"The phrase ""people of both genders"" implies a ..."
672,provisional,sexuality,sexual preference,"The term ""sexual preference"" can imply that se..."
673,provisional,sexuality,transsexual,"The term ""transsexual"" is often seen as medica..."


In [5]:
taxonomies['source'].value_counts()

source
domain         621
provisional     32
Name: count, dtype: int64

In [6]:
taxonomies['category'].value_counts()

category
masculine     193
racial        117
general       112
disability     97
feminine       56
sexuality      54
age            24
Name: count, dtype: int64

# Generate Synthetic samples

In [None]:
# Fetch a job title from job-phrase-list.csv
# Original source of list: https://github.com/microsoft/LUIS-Samples/blob/master/documentation-samples/tutorials/job-phrase-list.csv
# https://www.kaggle.com/datasets/estasney/job-titles?select=titles.csv

import random


def get_job_titles():
    """Collect the unique job titles from the two local title lists.

    Sources:
      * ``microsoft-LUIS-job-phrase-list.csv`` - one title per line, optionally
        followed by a trailing comma.
      * ``kaggle-titles.csv`` - every column whose name starts with ``Title_``.

    Titles are converted to ``str`` and stripped; blank entries are dropped so
    an empty job title can never be drawn.

    Returns:
        list[str]: de-duplicated titles in sorted order. Sorting makes the
        result independent of set iteration order, so seeding ``random``
        gives reproducible draws.
    """
    titles = set()

    with open("microsoft-LUIS-job-phrase-list.csv", "r") as file:
        for line in file:
            # Drop the line terminator and the optional trailing comma.
            title = line.strip().rstrip(',').strip()
            if title:
                titles.add(title)

    kaggle_titles = pd.read_csv('kaggle-titles.csv')
    for col in kaggle_titles.columns:
        if col.startswith('Title_'):
            for raw_title in kaggle_titles[col].dropna().unique():
                title = str(raw_title).strip()
                if title:
                    titles.add(title)

    return sorted(titles)


# Load the pool of job titles once; every draw below reuses this list.
positions = get_job_titles()

def random_job_title():
    """Return one job title picked at random from the module-level ``positions`` list."""
    return random.choice(positions)


# Report how many distinct titles are available.
print(f'Positions: {len(positions)}')

In [43]:
random_job_title()

'Senior Account Executive / Solutions Architect'

In [8]:
def load_definition(category):
    """Return the full text of ``definitions/<category>.txt`` (read as UTF-8)."""
    definition_path = f"definitions/{category}.txt"
    with open(definition_path, mode='r', encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [9]:
def _iter_sample_terms(value):
    """Yield ``(term, reason)`` pairs for one category's sample.

    ``value`` may be a DataFrame with ``term`` and ``reason`` columns (one pair
    per row) or any mapping-like object holding a single ``term``/``reason``.
    """
    if hasattr(value, 'iterrows'):  # DataFrame: emit one pair per sampled row
        for _, row in value.iterrows():
            yield row['term'], row['reason']
    else:
        yield value['term'], value['reason']


def generate_prompt(job_title, samples, include_few_shot=False):
    """Build the instruction prompt for generating one job description.

    Args:
        job_title: Role to write the description for (surrounding whitespace
            is stripped).
        samples: Mapping of bias category -> sampled taxonomy terms. Each value
            is either a DataFrame with ``term``/``reason`` columns or a single
            mapping with those two keys. An empty mapping requests a neutral
            (bias-free) description.
        include_few_shot: When True, append the built-in few-shot examples
            after the word-limit requirement. Defaults to False.

    Returns:
        str: The prompt text. Categories present in ``samples`` are requested
        as biases; every other taxonomy category is listed as one to avoid.

    Note:
        Reads the module-level ``taxonomies`` frame and loads each category's
        definition through ``load_definition`` on every call.
    """
    # Detailed categories of bias
    all_categories = taxonomies['category'].unique()
    definitions = {category: load_definition(category) for category in all_categories}

    # Ensure only valid categories are used
    valid_categories = samples.keys()
    invalid_categories = [item for item in all_categories if item not in valid_categories]
    inject_bias = len(valid_categories) > 0

    # Constructing Few-shot examples
    few_shot_examples = """
    <example>
    Job Title: Project Manager
    Categories: [age, masculine]
    Terms with Rationale:
      - Term: "energetic", Rationale: favors younger candidates
      - Term: "competitive", Rationale: resonates more with male candidates
    Job Description: We are seeking an energetic Project Manager who thrives in a competitive environment. This role is ideal for a recent graduate who is eager to lead challenging projects.
    </example>
    
    <example>
    Job Title: Customer Service Representative
    Categories: [disability, feminine]
    Terms with Rationale:
      - Term: "must be able to stand for long periods", Rationale: excludes individuals with certain disabilities
      - Term: "nurturing", Rationale: resonates more with female candidates
    Job Description: Our company is looking for a nurturing Customer Service Representative. Candidates must be able to stand for long periods and handle multiple tasks simultaneously.
    </example>
    """

    # Construct the advanced prompt
    job_title = job_title.strip()
    if inject_bias:
        prompt = f"The study of reducing the biases and terms provided is important, but an example is important to demonstrate what implicit bias can manifest in real job descriptions. Your task is to generate a complete job description for the role '{job_title}' that demonstrates the subtleties of bias within the specified framework, while maintaining inclusivity and reducing bias in all other respects. Adhere to the following requirements:\n\n"
    else:
        prompt = f"Your task is to generate a complete job description for the role '{job_title}' while maintaining inclusivity and reducing bias in all other respects. Adhere to the following requirements:\n\n"

    # `i` numbers the requirements; it advances only for sections actually emitted.
    i = 1
    prompt += f"{i}. Max 400 words."
    i = i + 1

    # Add Few-shot examples (opt-in)
    if include_few_shot:
        prompt += few_shot_examples + "\n\n"

    if inject_bias:
        prompt += f"\n\n{i}. Introduce implicit biases in the following categories:"
        i = i + 1
        for category in valid_categories:
            prompt += f"\n - {category}: {definitions[category]}"
        prompt += f"\n\n{i}. Use the following terms and their rationales to craft the job description:"
        i = i + 1
        for key, value in samples.items():
            # One bullet per sampled term. Formatting the sample object itself
            # would embed a pandas Series repr (index, dtype) in the prompt.
            for term, reason in _iter_sample_terms(value):
                prompt += f"\n  - Category {key}, term '{term}'. {reason}"

    if len(invalid_categories) > 0:
        prompt += f"\n\n{i}. It is crucial that you do not introduce implicit biases from the following categories:"
        i = i + 1
        for category in invalid_categories:
            prompt += f"\n - {category}: {definitions[category]}"

    # Add Chain-of-Thought guidance
    steps = ["Review the job title and understand the role."]
    if inject_bias:
        steps.append("Identify how to subtly introduce the specified biases using the provided terms.")
        steps.append("Construct a professional and realistic job description embedding the specified biases.")
        steps.append(
            "Attempt to classify the categories the job description contains, if you find any not specified then rewrite the job description so that it only contains the specified bias and no other ones.")
        steps.append("Wrap the job description within <j>...</j> tag.")
        for category in valid_categories:
            steps.append(
                f"Think about the {category} bias introduced and wrap the two-sentence explanation within the <{category}>...</{category}> tag.")
    else:
        steps.append("Construct a professional and realistic job description.")
        steps.append("Attempt to classify the categories the job description contains, if you find any then rewrite the job description so that it does NOT contains the specified bias.")
        steps.append("Wrap the job description within <j>...</j> tag.")

    prompt += "\n\n\nStep-by-Step Instructions:\n"
    for idx, step in enumerate(steps):
        prompt += f"\n{idx + 1}. {step}"
    return prompt

In [10]:
import json


def generate_text(position, category_terms, dry_run=False, retry_delay=3600, max_retries=None):
    """Generate one job description through the chat endpoint.

    Args:
        position: Job title to write the description for.
        category_terms: Mapping of bias category -> sampled terms, forwarded to
            ``generate_prompt``.
        dry_run: When True, only build the prompt; no request is sent.
        retry_delay: Seconds to wait after a failed request before retrying
            (default one hour, as before).
        max_retries: Maximum number of retries after the first failure;
            ``None`` (default) retries indefinitely, as before.

    Returns:
        tuple: ``(messages_json, content, inference_time, prompt_tokens,
        completion_tokens, total_tokens)``. All fields after the first are
        ``None`` in a dry run.

    Raises:
        Exception: Whatever the request raised, once ``max_retries`` is
        exhausted. Errors raised while building the prompt propagate
        immediately, because retrying cannot fix them.
    """
    # Local import: the notebook-level `import time` only appears in a later cell.
    import time

    # Build the prompt outside the retry loop so a prompt bug surfaces at once
    # instead of being retried after every delay.
    m = [{
        "role": "user",
        "content": generate_prompt(position, category_terms)
    }]
    serialized = json.dumps(m)

    if dry_run:
        return serialized, None, None, None, None, None

    failures = 0
    # Iterative retry; the recursive version grew the call stack on every failure.
    while True:
        try:
            start_time = time.time()
            output = openai_chat(m)
            inference_time = time.time() - start_time

            usage = output.usage
            content = output.choices[0].message.content
            return (serialized, content, inference_time,
                    usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)
        except Exception as e:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(e)
            time.sleep(retry_delay)


In [30]:
import math

# --- Dataset sizing -----------------------------------------------------
test_data_size = 3
min_test_split_size = 0.05

# Bias categories a sample can be labelled with.
categories = ['age', 'disability', 'feminine', 'masculine', 'racial', 'sexuality', 'general']
# One split per category plus one more.
splits = 1 + len(categories)
# Target number of samples per category.
size = 5000 #int(math.ceil((test_data_size / min_test_split_size) / splits)) 
# Maximum number of additional categories per sample
max_additional_categories = len(categories)

# Column names derived from the category list.
label_categories = [f'label_{name}' for name in categories]
analysis_categories = [f'analysis_{name}' for name in categories]

In [12]:
# Directory holding the generated dataset files.
# NOTE(review): absolute, user-specific path; the notebook only runs unchanged on a
# machine that has this directory.
output_dir = '/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-job-postings/pass4-B'
# JSON Lines file that the generation loop appends one record to per sample.
output_file = f'{output_dir}/synthetic-biased-job-descriptions-sync.jsonl'

In [47]:
# Load a previously saved snapshot of the dataset (the `.a.parquet` file in output_dir).
synthetic_df = pd.read_parquet(f'{output_dir}/synthetic-biased-job-descriptions.a.parquet')
synthetic_df

Unnamed: 0,document_id,position,label_age,label_disability,label_feminine,label_masculine,label_racial,label_sexuality,label_general,inference_time,...,text,analysis_age,analysis_disability,analysis_feminine,analysis_masculine,analysis_racial,analysis_sexuality,analysis_general,input,output
0,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,financial examiner,True,True,False,True,True,False,True,14.452569,...,Job Title: Senior Marketing Strategist\n\nWe a...,"The term ""senior"" can imply a preference for o...","The term ""attack"" can be distressing and is be...",,"The term ""team player"" can be biased as it mig...","Referring to people as ""the undocumented"" can ...",,"The term ""feminazi"" is highly derogatory and o...","[{""role"": ""user"", ""content"": ""The study of red...",Here is a job description that demonstrates th...
1,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,aide,True,True,False,True,True,True,True,21.453533,...,Job Title: Gallery Coordinator\n\nWe are seeki...,,"The term ""the differently abled"" is biased, as...",,"The term ""doorman"" is gendered, favouring male...","The term ""blackmail"" associates negative actio...","The term ""stealth"" can imply secrecy or deceit...","The term ""peanut gallery"" has historical class...","[{""role"": ""user"", ""content"": ""The study of red...",Here is the job description that meets the req...
2,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,producer,False,True,True,False,True,False,True,16.085596,...,Job Title: Marketing Manager\n\nWe are seeking...,,The job description contains a disability bias...,The job description contains a feminine bias b...,,The job description contains a racial bias by ...,,The job description contains a general bias by...,"[{""role"": ""user"", ""content"": ""The study of red...",Here is a job description that demonstrates th...
3,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,client services representative,False,False,False,True,False,False,False,9.698606,...,Job Title: Senior Reconnaissance Specialist\n\...,,,,The job description introduces masculine bias ...,,,,"[{""role"": ""user"", ""content"": ""The study of red...",<j>\n\nJob Title: Senior Reconnaissance Specia...
4,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,student data analyst,False,True,False,False,True,False,False,14.578910,...,Job Title: Marketing Director\n\nWe are seekin...,,,,,,,,"[{""role"": ""user"", ""content"": ""The study of red...",Here is a job description that demonstrates th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11829,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,System Operations Technical Officer,False,False,False,False,False,False,False,18.869227,...,Job Title: System Operations Technical Officer...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\nJob Title: System Operations Technical ...
11830,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,"Senior Systems Engineering Mgr - Cloud (SaaS,...",False,False,False,False,False,False,False,24.961207,...,**Job Title:** Senior Systems Engineering Mana...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\n**Job Title:** Senior Systems Engineeri...
11831,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Volunteer Program Manager,False,False,False,False,False,False,False,16.975031,...,Job Title: Volunteer Program Manager\n\nJob Su...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\nJob Title: Volunteer Program Manager\n\...
11832,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,RF Planning,False,False,False,False,False,False,False,17.161938,...,Job Title: RF Planning Engineer\n\nJob Summary...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\nJob Title: RF Planning Engineer\n\nJob ...


In [48]:
# Export the snapshot as JSON Lines. NOTE: this replaces any existing output_file,
# which is the same file the generation loop appends new records to.
synthetic_df.to_json(output_file, orient='records', lines=True)

In [49]:
import os
import pandas as pd

# Resume from the JSON Lines file when it exists; otherwise start from an empty
# frame that already has the expected columns.
if os.path.exists(output_file):
    synthetic_df = pd.read_json(output_file, lines=True)
else:
    metric_columns = ["inference_time", "prompt_tokens", "completion_tokens", "total_tokens"]
    synthetic_df = pd.DataFrame(
        columns=["document_id", "position"] + label_categories + analysis_categories
        + metric_columns + ["text", "input", "output"])

# Discard rows without a usable job description, then keep the first copy of
# each distinct description.
synthetic_df = (
    synthetic_df
    .dropna(subset=['text'])
    .loc[lambda frame: frame['text'] != '']
    .drop_duplicates(subset='text', keep='first')
)
synthetic_df

Unnamed: 0,document_id,position,label_age,label_disability,label_feminine,label_masculine,label_racial,label_sexuality,label_general,inference_time,...,text,analysis_age,analysis_disability,analysis_feminine,analysis_masculine,analysis_racial,analysis_sexuality,analysis_general,input,output
0,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,financial examiner,True,True,False,True,True,False,True,14.452569,...,Job Title: Senior Marketing Strategist\n\nWe a...,"The term ""senior"" can imply a preference for o...","The term ""attack"" can be distressing and is be...",,"The term ""team player"" can be biased as it mig...","Referring to people as ""the undocumented"" can ...",,"The term ""feminazi"" is highly derogatory and o...","[{""role"": ""user"", ""content"": ""The study of red...",Here is a job description that demonstrates th...
1,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,aide,True,True,False,True,True,True,True,21.453533,...,Job Title: Gallery Coordinator\n\nWe are seeki...,,"The term ""the differently abled"" is biased, as...",,"The term ""doorman"" is gendered, favouring male...","The term ""blackmail"" associates negative actio...","The term ""stealth"" can imply secrecy or deceit...","The term ""peanut gallery"" has historical class...","[{""role"": ""user"", ""content"": ""The study of red...",Here is the job description that meets the req...
2,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,producer,False,True,True,False,True,False,True,16.085596,...,Job Title: Marketing Manager\n\nWe are seeking...,,The job description contains a disability bias...,The job description contains a feminine bias b...,,The job description contains a racial bias by ...,,The job description contains a general bias by...,"[{""role"": ""user"", ""content"": ""The study of red...",Here is a job description that demonstrates th...
3,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,client services representative,False,False,False,True,False,False,False,9.698606,...,Job Title: Senior Reconnaissance Specialist\n\...,,,,The job description introduces masculine bias ...,,,,"[{""role"": ""user"", ""content"": ""The study of red...",<j>\n\nJob Title: Senior Reconnaissance Specia...
4,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,student data analyst,False,True,False,False,True,False,False,14.578910,...,Job Title: Marketing Director\n\nWe are seekin...,,,,,,,,"[{""role"": ""user"", ""content"": ""The study of red...",Here is a job description that demonstrates th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12702,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,System Operations Technical Officer,False,False,False,False,False,False,False,18.869227,...,Job Title: System Operations Technical Officer...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\nJob Title: System Operations Technical ...
12703,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,"Senior Systems Engineering Mgr - Cloud (SaaS,...",False,False,False,False,False,False,False,24.961207,...,**Job Title:** Senior Systems Engineering Mana...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\n**Job Title:** Senior Systems Engineeri...
12704,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,Volunteer Program Manager,False,False,False,False,False,False,False,16.975031,...,Job Title: Volunteer Program Manager\n\nJob Su...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\nJob Title: Volunteer Program Manager\n\...
12705,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,RF Planning,False,False,False,False,False,False,False,17.161938,...,Job Title: RF Planning Engineer\n\nJob Summary...,,,,,,,,"[{""role"": ""user"", ""content"": ""Your task is to ...",<j>\n\nJob Title: RF Planning Engineer\n\nJob ...


In [50]:
# Dictionary to keep track of the count of samples in each category
# Tally how many existing samples carry each bias label.
category_count = {}
for category in categories:
    category_count[category] = synthetic_df[f'label_{category}'].eq(True).sum()

# A sample counts as "neutral" when none of its label columns is set.
binary_df = (
    synthetic_df[label_categories]
    .astype(int)
    .assign(bias=lambda frame: frame.sum(axis=1))
)
category_count['neutral'] = binary_df['bias'].eq(0).sum()
category_count

{'age': 5321,
 'disability': 5266,
 'feminine': 3847,
 'masculine': 3851,
 'racial': 5156,
 'sexuality': 5276,
 'general': 5234,
 'neutral': 3292}

In [51]:
# Override the tally so that 'neutral' is the only key left; the generation loop
# picks the key with the smallest count, so it will then only produce neutral samples.
# NOTE(review): 3292 is a hard-coded copy of the neutral count shown by the previous
# cell and goes stale as soon as more neutral samples exist; confirm before re-running.
category_count = {'neutral': 3292}
category_count

{'neutral': 3292}

In [52]:
import re
import time
import datetime
import pandas as pd
import random
import math
import hashlib


def create_hash(input_string):
    """Return the first 10 hex characters of the SHA-256 digest of ``input_string``."""
    digest = hashlib.sha256(input_string.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:10]

def lowercase_tags(text):
    """Lower-case the name of every opening or closing tag in ``text``.

    Only the ``<name`` / ``</name`` prefix of each tag is changed; all other
    text is left untouched. Each tag is rewritten in place in a single pass.
    The earlier approach applied one ``str.replace`` per tag, so a tag such as
    ``<aGE>`` was left partly upper-cased when a shorter tag ``<aG>`` occurred
    in the same text.

    Args:
        text: Model output, or ``None`` (e.g. from a dry run).

    Returns:
        The text with lower-cased tag names, or ``None`` if ``text`` is ``None``.
    """
    if text is None:
        return None
    return re.sub(r'<\/?\w+', lambda match: match.group(0).lower(), text)

def extract_job_posting(text):
    """Return the stripped content of the first ``<j>...</j>`` pair in ``text``.

    When no such pair exists the input is returned unchanged.
    """
    first_match = re.search(r'<j>(.*?)</j>', text, re.DOTALL)
    if first_match is None:
        return text
    return first_match.group(1).strip()


def find_first_between_tags(file_content, tag):
    """Return the stripped text between the first ``<tag>`` and its closing tag.

    The closing tag is searched for only after the opening tag, so a stray
    ``</tag>`` earlier in the text no longer produces an empty result.

    Args:
        file_content: Text to search, or ``None`` (e.g. from a dry run).
        tag: Tag name without angle brackets.

    Returns:
        The enclosed text with surrounding whitespace removed, or ``None``
        when the input is empty/``None`` or a complete pair is not found.
    """
    if not file_content:
        return None

    start_tag = f"<{tag}>"
    end_tag = f"</{tag}>"

    start_index = file_content.find(start_tag)
    if start_index == -1:
        return None  # opening tag missing

    start_index += len(start_tag)  # adjust to index after the start tag
    end_index = file_content.find(end_tag, start_index)
    if end_index == -1:
        return None  # closing tag missing after the opening tag

    return file_content[start_index:end_index].strip()


def min_category():
    """Return the key of ``category_count`` with the smallest count.

    Ties go to the key that comes first in the dictionary.
    """
    return min(category_count, key=category_count.get)


dry_run = False

# Generate samples for the least-represented category until every category has
# reached `size`. The comparison is strict: `<=` produced one sample more than
# the target shown in the progress line.
while category_count[min_category()] < size:
    category = min_category()
    position = random_job_title()

    if not dry_run:
        percentage = (category_count[category] / size) * 100
        formatted_percentage = "{:.2f}%".format(percentage)
        print(f'{len(synthetic_df)} Generating synthetic for category {category}, position "{position}": {category_count[category]}/{size} [ {formatted_percentage} ]')

    # Pick a random set of extra categories and always include the target one.
    additional_categories = random.sample(categories, k=random.randint(0, max_additional_categories))
    additional_categories = set(additional_categories)
    additional_categories.add(category)

    # Neutral samples carry no bias categories at all.
    if category == 'neutral':
        additional_categories.clear()

    # Don't include both to reduce confusing the model
    if 'masculine' in additional_categories and 'feminine' in additional_categories:
        additional_categories.remove('feminine')
        additional_categories.remove('masculine')
        additional_categories.add(random.choice(['masculine', 'feminine']))

    # Sample taxonomy terms per category: the more categories are combined, the
    # fewer terms each one may contribute.
    category_terms = {}
    if len(additional_categories) > 0:
        category_sample = int(math.ceil(max_additional_categories / len(additional_categories)))
        category_sample = random.randint(1, category_sample)

        for cat in additional_categories:
            category_terms[cat] = taxonomies[taxonomies['category'] == cat].sample(category_sample)

    # Generate text sample with category information
    prompt, output, inference_time, prompt_tokens, completion_tokens, total_tokens = generate_text(position,
                                                                                                   category_terms,
                                                                                                   dry_run)

    if output is not None:
        output = lowercase_tags(output)
        text = find_first_between_tags(output, 'j')
        analyses = {cat: find_first_between_tags(output, cat) for cat in categories}
        content_id = create_hash(output)
    else:
        # No model output (dry run): nothing to parse or hash. The running row
        # count stands in for the content hash; the old code read an undefined `i`.
        text = None
        analyses = {cat: None for cat in categories}
        content_id = len(synthetic_df)

    category_count[category] += 1

    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    sample_id = f"{timestamp}:{content_id}"
    id_m = model_id.replace('/', ':')
    data = {
        "document_id": f'Synthetic:{id_m}:{sample_id}',
        "position": position
    }
    for cat in categories:
        data[f'label_{cat}'] = False

    data['inference_time'] = inference_time
    data['prompt_tokens'] = prompt_tokens
    data['completion_tokens'] = completion_tokens
    data['total_tokens'] = total_tokens
    data['text'] = text
    # One analysis column per category, in `categories` order.
    for cat in categories:
        data[f'analysis_{cat}'] = analyses[cat]
    data['input'] = prompt
    data['output'] = output

    for cat in additional_categories:
        data[f'label_{cat}'] = True

    if not dry_run:
        # Append the record to the JSON Lines log immediately so an interrupted
        # run loses at most the sample in flight.
        with open(output_file, 'a') as file:
            if not os.stat(output_file).st_size == 0:
                file.write('\n')
            file.write(json.dumps(data))

    synthetic_df = pd.concat([synthetic_df, pd.DataFrame(data, index=[0])], ignore_index=True)
    synthetic_df = synthetic_df.drop_duplicates(subset='text', keep='first')
    if not dry_run:
        # Checkpoint the de-duplicated frame; a dry run must not touch the dataset on disk.
        synthetic_df.to_parquet(f'{output_dir}/synthetic-biased-job-descriptions.parquet', compression='gzip')

    #break

synthetic_df

12707 Generating synthetic for category neutral, position "Community Events Lead": 3292/5000 [ 65.84% ]
12708 Generating synthetic for category neutral, position "Engineering Tech Intern": 3293/5000 [ 65.86% ]
12709 Generating synthetic for category neutral, position "Director - Operations and Software Development": 3294/5000 [ 65.88% ]
12710 Generating synthetic for category neutral, position "Pool Attendant": 3295/5000 [ 65.90% ]
12711 Generating synthetic for category neutral, position "RF/Microwave Engineer & Manager": 3296/5000 [ 65.92% ]
12712 Generating synthetic for category neutral, position "Senior Service Delivery Manager": 3297/5000 [ 65.94% ]
12713 Generating synthetic for category neutral, position "Manager  Architect": 3298/5000 [ 65.96% ]
12714 Generating synthetic for category neutral, position "Intern  Digital and Social Media Assistant": 3299/5000 [ 65.98% ]
12715 Generating synthetic for category neutral, position "National Account Manager for Charter Communications

KeyboardInterrupt: 

In [23]:
print(synthetic_df.tail(1)['text'].values[0])

Job Title: Policy Analyst

Job Summary:

We are seeking a highly skilled Policy Analyst to join our team. As a Policy Analyst, you will play a critical role in analyzing and developing policies that drive our organization's mission forward. You will work closely with cross-functional teams to identify areas for improvement, develop evidence-based recommendations, and implement policy changes that promote equity, inclusivity, and social justice.

Responsibilities:

* Analyze complex data and research to inform policy decisions, ensuring that recommendations are grounded in evidence and best practices.
* Collaborate with stakeholders to identify policy gaps and develop solutions that address the needs of diverse communities.
* Develop and maintain relationships with internal and external partners to advance policy initiatives and build coalitions.
* Design and implement policy evaluation frameworks to measure impact and inform future policy decisions.
* Communicate complex policy issues 

In [58]:
i = """
This GPT is designed to detect implicit bias and non-inclusive language in job descriptions. It will analyse text for specific categories of bias and return two concise sentences (using British spelling and grammar) explaining the detected bias for each relevant category, without providing additional information. The possible categories of bias it can detect are: age, disability, feminine, general, masculine, racial, and sexuality.  Only list the ones detected, ignore the rest.  If none are detected, say "None detected".

Role and Goal: The GPT should identify implicit biases and non-inclusive language in job descriptions, providing a clear and succinct explanation of the bias detected for each relevant category. It should focus only on the labels detected and avoid extraneous information.

Constraints: The GPT should only output two sentences per detected bias and should not include additional commentary or information beyond the explanation of the bias. It should strictly only output the provided categories and adhere to the definitions provided for each category of bias.

Guidelines: The GPT should follow the specific examples and language cues given in the definitions for each category of bias. It should use precise and professional language in its explanations.

Clarification: The GPT should not ask for clarification but should make a best effort to analyze the text and provide accurate detections based on the provided definitions.  If no bias detected, say "None detected".

Personalization: The GPT should maintain a neutral, informative tone and focus on delivering clear, concise explanations of the detected biases.

Output format: For each category detected, wrap the sentences with the category tag.  For example, if racial, age, disability are detected then the output should be: <racial>The concise explanation here.</racial><age>The concise explanation here.</age><disability>The concise explanation here.</disability> 

Age bias definition: Occurs when language or requirements subtly favour certain age groups over others. Common categories include insensitive terms (e.g., "geezer"), language implying energy or modernity (e.g., "young and dynamic", "recent graduate") that favour younger candidates, as well as language implying experience and wisdom (e.g., "seasoned professional", "mature") that favour older candidates.

Disability bias definition: Involves the use of terms or requirements that inadvertently exclude or disadvantage individuals based on disabilities. This can include physical, mental, sensory, or cognitive impairments. Common categories include ableist terms that imply the requirement of a physical trait (e.g., "type 50 words per minute") instead of focusing on the job function (e.g., "enter data at 50 words per minute"), unnecessary physical requirements (e.g., "must be able to lift 50 pounds" for a desk job), and the absence of language regarding reasonable accommodations to ensure that candidates with disabilities are assessed based on their suitability for the role.

Feminine bias definition: Refers to language that subtly favours or resonates more with female candidates.  Common categories include gender-coded words (e.g., "nurturing," "supportive"), domestic or caregiving metaphors, an emphasis on collaborative over individualistic skills, and gendered job titles (e.g., "hostess") and pronouns (e.g., "she/her").

General bias definition: Occurs when language or requirements use derogatory (e.g. "feminazi", "retarded") or outdated terms (e.g. "the disabled"), or subtly favour or disadvantage candidates based on various characteristics. Common categories include socio-economic status (e.g., "blue-collar"), educational background (e.g., "Degree from a top school"), mental health (e.g., "OCD"), gender and family roles (e.g., "clean-shaven", "maternity leave"), veteran status, criminal history, and political or ideological beliefs.

Masculine bias definition: Refers to language that subtly favours or resonates more with male candidates. Common categories include gender-coded words (e.g., "dominant", "competitive"), sports or military metaphors, an emphasis on individualistic over collaborative skills, and gendered job titles (e.g., "salesman") and pronouns (e.g., "he/him").

Racial bias definition: Occurs when language or requirements subtly favour certain racial groups or exclude others. Common categories include racially insensitive terms (e.g., "master/slave", "redneck"), exclusionary phrases (e.g., "brown-bag session", "white/black list"), and assumptions about linguistic proficiency or background (e.g., "native English speaker").

Sexuality bias definition: Occurs when language or requirements subtly favour certain sexual orientations, gender identities, or expressions over others, creating non-inclusive language that can exclude LGBTQ+ individuals. Common categories include terms that enforce heteronormativity (e.g., "the men and women", "opposite sex"), outdated or offensive terminology (e.g., "homosexual", "tranny"), lack of recognition of diverse family structures (e.g., "wife and husband" instead of "partner" or "spouse"), assumptions about gender identity (e.g., "born a man", "sex change"), and non-inclusive pronouns (e.g., "he/she" instead of "they" or "you").
"""

# Ad-hoc check: send the detector instructions in `i` as the system message and the
# first generated job description as the user message, then display the raw response.
m = [{
        "role": "system",
        "content": i
    },
    {
        "role": "user",
        "content": synthetic_df.head(1)['text'].values[0]
    }
]

openai_chat(m)

In [35]:
synthetic_df.to_parquet(f'{output_dir}/synthetic-biased-job-descriptions.parquet', compression='gzip')

In [36]:
# Print the True/False distribution of every label column.
for cat in categories:
    print(synthetic_df[f'label_{cat}'].value_counts())

In [43]:
synthetic_df[label_categories].tail(1)

In [38]:
print(synthetic_df['text'].iloc[-1])

In [207]:
# Longest phrase
# Each text is mapped to a (length, text) tuple so that .max() compares by length
# first (ties fall back to comparing the text itself); [1] takes the text back out.
longest_text = synthetic_df['text'].apply(lambda x: (len(x), x)).max()[1]
longest_text

In [208]:

from transformers import AutoTokenizer


def print_max_tokens(model_id, text=None):
    """Print how many tokens ``text`` encodes to with the tokenizer of ``model_id``.

    Args:
        model_id: Tokenizer identifier passed to ``AutoTokenizer.from_pretrained``.
        text: Text to measure. Defaults to the module-level ``longest_text``,
            which keeps existing one-argument calls working while matching the
            sibling helpers that take the text explicitly.
    """
    if text is None:
        text = longest_text
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    max_tokens = len(tokenizer.encode(text))
    print(f"Max '{model_id}' tokens: {max_tokens}")

def print_encode_decoded(model_id, longest_text):
    """Print the token ids of ``longest_text`` and the text decoded back from them.

    The tokenizer is loaded for ``model_id`` with ``add_prefix_space=True``.
    """
    tok = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    token_ids = tok.encode(longest_text)
    print(f"Tokens: {token_ids}")
    round_trip = tok.decode(token_ids)
    print(f"Decoded tokens: {round_trip}")
    
def print_tokens(model_id, longest_text):
    """Print the token strings that the tokenizer of ``model_id`` produces for ``longest_text``."""
    tok = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    pieces = tok.tokenize(longest_text)
    print(f"Tokens: {pieces}")
    

In [209]:
# Size of the longest generated job description in characters, words and tokens.
max_char = len(longest_text)
max_words = len(longest_text.split())

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
# The loop variable must not be called `model_id`: that name is the global
# generator model used by openai_chat() and in the document ids, and reusing it
# here silently switched the generator to the last tokenizer in the list.
for tokenizer_model_id in ['roberta-base', 'bert-base-uncased', 'microsoft/deberta-v3-small']:
    print_max_tokens(tokenizer_model_id)


In [210]:
import numpy as np

# Source: https://colab.research.google.com/drive/1pddMaJJIHR0O8MND42hfzYRxOPMV82KA?usp=sharing#scrollTo=RkVuiK_loty4

def categorical_entropy(df, labels):
    """Shannon entropy (in bits) of the label distribution across the dataset.

    p(l) = count(l) / sum(count(l) for l in labels)
    H    = sum(p(l) * -log2 p(l) for l in labels)

    Labels with a zero count contribute nothing to the sum (the limit of
    p * log2 p as p -> 0 is 0). Without this, one empty label turned the whole
    result into NaN through ``0 * inf``.

    Args:
        df: Frame with one numeric/boolean column per label.
        labels: Names of the label columns to include.

    Returns:
        float: The entropy, or NaN when no label has any positive count.
    """
    cat_sums = df[labels].sum().to_numpy(dtype=float)
    total = cat_sums.sum()
    if total == 0:
        # No distribution to describe; matches the previous NaN result without warnings.
        return float('nan')

    cat_probs = cat_sums / total
    cat_probs = cat_probs[cat_probs > 0]
    return float(np.sum(cat_probs * -np.log2(cat_probs)))

In [211]:
label_categories

In [212]:
# entropy for original dataset
categorical_entropy(synthetic_df, label_categories)

In [214]:
# Load the de-duplicated dataset for comparison with synthetic_df.
# NOTE(review): absolute, user-specific path in a different directory than output_dir.
dedup_df = pd.read_parquet('/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-jobs/synthetic-biased-job-descriptions-deduped.parquet')
dedup_df

In [215]:
categorical_entropy(dedup_df, label_categories)


In [216]:
# Print the True/False distribution of every label column in the de-duplicated set.
for cat in categories:
    print(dedup_df[f'label_{cat}'].value_counts())

In [223]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Count the True/False values of each label column.
data = {}

for cat in categories:
    counts = dedup_df[f'label_{cat}'].value_counts()
    data[cat] = counts

# Transpose so each category is one bar (row) stacked by label value (column);
# untransposed, the x-axis showed False/True while being labelled "Categories".
# A label value missing for a category becomes 0 instead of NaN.
df = pd.DataFrame(data).T.fillna(0)

# Create the axes explicitly and hand them to pandas: a bare plt.figure(figsize=...)
# is ignored because DataFrame.plot opens its own figure when no `ax` is given.
fig, ax = plt.subplots(figsize=(20, 16))  # set plot figure size here
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of the 7 Categories')
ax.set_xlabel('Categories')
ax.set_ylabel('Counts')
plt.show()

In [226]:
import matplotlib.pyplot as plt
import pandas as pd

# Count the True/False values of each label column.
data = {}

for cat in categories:
    counts = dedup_df[f'label_{cat}'].value_counts()
    data[cat] = counts

# Transpose so each category is one horizontal bar stacked by label value;
# untransposed, the y-axis showed False/True while being labelled "Categories".
# A label value missing for a category becomes 0 instead of NaN.
df = pd.DataFrame(data).T.fillna(0)

# Create the axes explicitly and hand them to pandas: a bare plt.figure(figsize=...)
# is ignored because DataFrame.plot opens its own figure when no `ax` is given.
fig, ax = plt.subplots(figsize=(10, 8))

df.plot(kind='barh', stacked=True, ax=ax)

ax.set_title('Distribution of the 7 Categories')
ax.set_ylabel('Categories')
ax.set_xlabel('Counts')
plt.show()
