In [1]:
import pandas as pd
from openai import OpenAI

def read_file(file_path):
    """Return the text stored at ``file_path`` without trailing newlines.

    Only ``'\\n'`` characters at the very end of the content are removed;
    every other character (including trailing spaces) is kept as-is.
    """
    with open(file_path, 'r') as handle:
        return handle.read().rstrip('\n')


def openai_chat(messages, model='gpt-4o', temperature=0.0, max_tokens=1525,
                api_key_path="/home/teveritt/OpenAI-API-DCU-AI.key"):
    """Send ``messages`` to the OpenAI chat-completions endpoint.

    The request settings used to be hard-coded; they are now keyword
    parameters whose defaults reproduce the previous behaviour, so existing
    ``openai_chat(messages)`` calls are unaffected.

    Args:
        messages: List of chat message dicts (``role`` / ``content``).
        model: Name of the chat model to query.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the number of generated tokens.
        api_key_path: Path of the text file holding the API key; it is read
            with ``read_file`` on every call.

    Returns:
        The non-streaming response object returned by
        ``client.chat.completions.create``.
    """
    client = OpenAI(api_key=read_file(api_key_path))

    return client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
        stream=False,
        max_tokens=max_tokens
    )

In [32]:
def fix_benefits_transparency(text):
    """Ask the chat model to make the pay/benefits wording of a posting explicit.

    A fixed system prompt plus ``text`` (as the user message) is sent through
    ``openai_chat``. The first choice's message content is stripped of
    surrounding whitespace and lightly de-markdowned: ``***`` and ``**`` are
    removed and every remaining ``*`` becomes ``-``.

    Args:
        text: The job posting to review.

    Returns:
        The cleaned-up model reply. The prompt instructs the model to answer
        with ``<SKIP>`` when it finds nothing to adjust.
    """
    system_prompt = """
This GPT will review the benefits/offer of a job posting for any vagueness and if there is a lack of transparency (e.g. Competitive pay/salary), then it will adjust it to add more transparency.  It will pick one of global currency reserves when mentioning any form of money.  Below are some examples (the GPT is free to choose from an example or something else that's more appropriate):
    - The salary for this position ranges from <salary_lower> to <salary_upper> per annum, depending on experience and qualifications.
    - This role offers a starting salary of <salary_lower> per annum. Additionally, you will receive an annual performance bonus of up to 10% of your salary, and a <bonus> sign-on bonus.
    - The initial salary is <salary_lower> per annum with a clear progression path. Employees typically see an increase to <salary_upper> within the first 12 months, subject to performance reviews.
    - We believe in fair pay. The salary for this role is <salary_lower> per annum. We also offer pay equity reviews every six months to ensure fairness and transparency across the company.
    - The salary for this position is <salary_lower> per annum. In addition, our total compensation package includes a <training_budget> annual training budget, comprehensive health insurance, a <wellness_stipend> wellness stipend, and a pension plan with 5% employer contribution.

The GPT is free to choose the wording and it is not restricted to the exact language of the examples.  In other words, it does not have to say 'annum', but can pick annum,per year, annually, each year, yearly, per annum, etc.

Apply these replacements to any job posting provided and don't alter anything else.
If there is nothing to adjust, then respond with <SKIP>.
"""
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    reply = openai_chat(conversation).choices[0].message.content.strip()

    # Longest marker first, so bold/italic runs are removed before the
    # remaining single asterisks are turned into dashes.
    for marker, substitute in (('***', ''), ('**', ''), ('*', '-')):
        reply = reply.replace(marker, substitute)
    return reply

# Fix Benefits Transparency

In [33]:
import pandas as pd

# Read the train/val/test splits of the synthetic "general" dataset and
# stack them into a single frame.
split_frames = [
    pd.read_parquet(f'input/synthetic-general-{split_name}.parquet')
    for split_name in ('train', 'val', 'test')
]
general_bias_df = pd.concat(split_frames)

# Keep only the rows whose `label_general` equals True, then collect their ids.
flagged_rows = general_bias_df['label_general'] == True
general_bias_df = general_bias_df[flagged_rows]
expected_lack_of_transparency = general_bias_df['id'].to_list()
expected_lack_of_transparency

['Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629015347:negative:336bb922ad',
 'Synthetic:gpt-4o-2024-05-13:20240629104859:negative:d1245c949b',
 'Synthetic:gpt-4o-2024-05-13:20240629012354:negative:597d73e4bb',
 'Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629014223:negative:6338d64ef5',
 'Synthetic:gpt-4o-2024-05-13:20240629012138:negative:0a5e972a73',
 'Synthetic:gpt-4o-2024-05-13:20240629011832:negative:7708d346e0',
 'Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629014430:negative:db1c060de4',
 'Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629015136:negative:eff3bda92d',
 'Synthetic:gpt-4o-2024-05-13:20240629012201:negative:d2300a6faa',
 'Synthetic:gpt-4o-2024-05-13:20240629012318:negative:b892540fd5',
 'Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629103259:negative:09534c0cec',
 'Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629013040:negative:bd1566b003',
 'Synthetic:meta-llama:Meta-Llama-3-70B-Instruct:20240629013432:negative:f021f3d4f

In [34]:
import os

processed_file = 'processed-transparency.txt'

# Load the checkpoint of files handled by earlier runs. A missing checkpoint
# just means nothing has been processed yet (first run), so start empty
# instead of failing with FileNotFoundError.
processed = set()
if os.path.isfile(processed_file):
    with open(processed_file, 'r') as checkpoint:
        # Ignore blank lines so a stray newline never becomes a bogus entry.
        processed = {line.strip() for line in checkpoint if line.strip()}

processed = sorted(processed)
print(f'Processed: {processed}')

# Sets give O(1) membership tests inside the loop; the `processed` list keeps
# the order in which entries are written back to the checkpoint file.
processed_lookup = set(processed)
excluded_ids = set(expected_lack_of_transparency)

for split in ['train', 'val', 'test']:
    # Sorted so the processing order is reproducible across runs.
    for entry in sorted(os.listdir(split)):
        if not entry.endswith('.txt'):
            continue
        stem, _ = os.path.splitext(entry)

        # Rebuild the dataset id from the file name: '__' stands for '::' and,
        # for synthetic postings, every remaining '_' stands for ':'.
        posting_id = stem.replace('__', '::')
        if posting_id.startswith('Synthetic'):
            posting_id = posting_id.replace('_', ':')

        # Ids listed in expected_lack_of_transparency are left untouched
        # (presumably so postings labelled as vague keep their wording).
        if posting_id in excluded_ids:
            continue

        posting_path = f'{os.path.join(split, stem)}.txt'
        if posting_path in processed_lookup:
            print(f'Already processed {posting_path}')
            continue

        # Only synthetic postings are rewritten; the isfile check skips
        # anything that is not a regular file (e.g. a directory named *.txt).
        if not stem.startswith('Synthetic') or not os.path.isfile(posting_path):
            continue

        print(f'Processing {posting_path}')
        with open(posting_path, 'r') as posting:
            original_text = posting.read().strip()
        revised_text = fix_benefits_transparency(original_text)

        if '<SKIP>' in revised_text:
            print(f'Skipping {posting_path}')
        elif not revised_text.strip():
            # Never overwrite a posting with an empty model response; leave it
            # unrecorded so a later run retries it.
            print(f'Empty response for {posting_path}; leaving file untouched')
            continue
        else:
            print(f'Cleaning {posting_path}')
            with open(posting_path, 'w') as posting:
                posting.write(revised_text)

        # Persist progress after every file so an interrupted run can resume.
        processed.append(posting_path)
        processed_lookup.add(posting_path)
        with open(processed_file, 'w') as checkpoint:
            checkpoint.writelines(f'{item}\n' for item in processed)
                


Processed: []
Processing train/Synthetic_gpt-4o-2024-05-13_20240628001156_negative_871f89f11b.txt
Skipping train/Synthetic_gpt-4o-2024-05-13_20240628001156_negative_871f89f11b.txt
Processing train/Synthetic_gpt-4o-2024-05-13_20240628001159_neutral_31df24916b.txt
Skipping train/Synthetic_gpt-4o-2024-05-13_20240628001159_neutral_31df24916b.txt
Processing train/Synthetic_gpt-4o-2024-05-13_20240628001504_positive_a78519750e.txt
Skipping train/Synthetic_gpt-4o-2024-05-13_20240628001504_positive_a78519750e.txt
Processing train/Synthetic_gpt-4o-2024-05-13_20240628001915_positive_e826667c6f.txt
Skipping train/Synthetic_gpt-4o-2024-05-13_20240628001915_positive_e826667c6f.txt
Processing train/Synthetic_gpt-4o-2024-05-13_20240628001945_neutral_373be0f13a.txt
Skipping train/Synthetic_gpt-4o-2024-05-13_20240628001945_neutral_373be0f13a.txt
Processing train/Synthetic_gpt-4o-2024-05-13_20240628002025_neutral_1b59b06b98.txt
Skipping train/Synthetic_gpt-4o-2024-05-13_20240628002025_neutral_1b59b06b98.

KeyboardInterrupt: 