In [43]:
# Bias category handled by this notebook. The value is interpolated into the
# <category> tag names, the label_/analysis_ column names and the file paths
# used by the cells below.
category = 'racial'

In [4]:
# Fix any tags that aren't getting closed.
def fix_closing_tag(row, tag=None):
    """Return ``row['output']`` with a missing closing tag appended.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``'output'`` entry.
        tag: Tag name without angle brackets. Defaults to the notebook-level
            ``category`` so the function can still be passed directly to
            ``df.apply(..., axis=1)``.

    Returns:
        The output text, with ``</tag>`` appended when ``<tag>`` is present but
        ``</tag>`` is not. Non-string outputs (e.g. NaN/None) are returned
        unchanged instead of raising ``TypeError``.
    """
    if tag is None:
        tag = category

    output = row['output']
    if not isinstance(output, str):
        # A missing output cannot contain tags; pass it through untouched.
        return output

    opening_tag = f'<{tag}>'
    closing_tag = f'</{tag}>'
    if opening_tag in output and closing_tag not in output:
        return output + closing_tag
    return output

def extract_tag_text(row, tag=None):
    """Extract the text enclosed by ``<tag>`` ... ``</tag>`` in ``row['output']``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``'output'`` entry.
        tag: Tag name without angle brackets. Defaults to the notebook-level
            ``category`` so the function can still be passed directly to
            ``df.apply(..., axis=1)``.

    Returns:
        The text between the first opening tag and the first closing tag that
        follows it, with every ``*`` removed and surrounding whitespace
        stripped. ``None`` when the output is not a string or either tag is
        missing.
    """
    if tag is None:
        tag = category

    output = row['output']
    if not isinstance(output, str):
        return None

    start_tag = f"<{tag}>"
    end_tag = f"</{tag}>"

    start_index = output.find(start_tag)
    if start_index == -1:
        return None  # opening tag not found
    start_index += len(start_tag)  # adjust to index after the start tag

    # Look for the closing tag only after the opening tag, so a stray closing
    # tag earlier in the text cannot produce an empty or wrong slice.
    end_index = output.find(end_tag, start_index)
    if end_index == -1:
        return None  # closing tag not found after the opening tag

    # Drop the asterisks first, then strip, so no whitespace is left behind
    # where the asterisks sat at either end.
    return output[start_index:end_index].replace('*', '').strip()

In [5]:
import os
import pandas as pd

# Input/output locations for the current category.
output_dir = f'/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/synthetic-job-postings/polarity-synthetic/{category}'
jsonl_gpt4o_file = f'{output_dir}/polarity-synthetic-gpt4o.jsonl'
jsonl_llama3b70_file = f'{output_dir}/polarity-synthetic-llama3b70.jsonl'
parquet_file = f'{output_dir}/polarity-synthetic.parquet'

# Load both JSON-lines files and stack them. Each frame keeps its own row
# labels (no ignore_index), so the combined index may contain duplicates.
df_gpt4o = pd.read_json(jsonl_gpt4o_file, lines=True)
df_llama3b70 = pd.read_json(jsonl_llama3b70_file, lines=True)
df = pd.concat([df_gpt4o, df_llama3b70])

# These can introduce age bias
# Raw string: `\+` is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on current Python versions.
df = df[~df.text.str.contains(r"10\+? years", case=False, na=False)]

df = df[~((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == False))] # gpt4o unbiased outputs had a high rate of bias.

# Take an explicit copy so the column assignments below write to an independent
# frame rather than to a filtered slice (avoids SettingWithCopyWarning).
df = df.copy()

df['id'] = df['document_id']

# Remove markdown emphasis markers; any remaining single '*' becomes '-'.
# The order matters: longer runs must be removed before shorter ones.
df['text'] = df['text'].str.replace('***', '', regex=False)
df['text'] = df['text'].str.replace('**', '', regex=False)
df['text'] = df['text'].str.replace('*', '-', regex=False)

# Close unterminated tags first, then extract the tagged analysis text.
df['output'] = df.apply(fix_closing_tag, axis=1)
df[f'analysis_{category}'] = df.apply(extract_tag_text, axis=1)

# Blank out analysis/label columns that belong to other categories.
for column in df.columns:
    if column.startswith('analysis_') and column != f'analysis_{category}':
        df[column] = ''
    if column.startswith('label_') and column != f'label_{category}':
        df[column] = False

df

In [5]:
# List the distinct values found in the 'position' column.
df['position'].unique()

In [6]:
# List the distinct values found in the 'model' column.
df['model'].unique()

In [7]:
# Spot check: draw one random row labelled as biased for this category and
# print its text, label and document ID.
sample = df.loc[df[f'label_{category}'] == True].sample(n=1)

print(sample['text'].iloc[0])
print(f"Biased: {sample[f'label_{category}'].iloc[0]}")
print(f"ID: {sample['document_id'].iloc[0]}")

In [6]:
# Dump every row (label, id, extracted analysis and posting text) to a plain
# text file for manual reading.
# The encoding is explicit so non-ASCII characters in the text are written the
# same way regardless of the platform's default locale encoding.
with open(f'output-{category}.txt', 'w', encoding='utf-8') as f:
    for index, record in df.iterrows():
        # Named record_id rather than `id` so the builtin id() is not shadowed
        # for the rest of the notebook session.
        record_id = record['id']
        analysis = record[f'analysis_{category}']
        bias = record[f'label_{category}']
        text = record['text']
        f.write(f"\n====================================\nBias: {bias} || {record_id}\n{analysis}\n------------------------------\n\n{text}\n\n")


In [121]:
# Draw the gold-set candidates: 78 biased and 200 non-biased Llama-3-70B rows
# plus 78 biased GPT-4o rows.
# A fixed random_state makes the draw repeatable across kernel restarts; 2024
# is the seed already used for the train/val/test split in this notebook.
# .sample(n) raises ValueError if a group holds fewer than n rows.
df_gold = pd.concat([
   df[((df['model'] == 'meta-llama:Meta-Llama-3-70B-Instruct') & (df[f'label_{category}'] == True))].sample(78, random_state=2024),
   df[((df['model'] == 'meta-llama:Meta-Llama-3-70B-Instruct') & (df[f'label_{category}'] == False))].sample(200, random_state=2024),
   df[((df['model'] == 'gpt-4o-2024-05-13') & (df[f'label_{category}'] == True))].sample(78, random_state=2024)
])
# Show how many biased gold rows come from each model.
df_gold[df_gold[f'label_{category}'] == True].value_counts('model')

In [122]:
# Show how many non-biased gold rows come from each model.
df_gold[df_gold[f'label_{category}'] == False].value_counts('model')

In [129]:
#with open(f'gold_ids_{category}.txt', 'w') as f:
#    for index, record in df_gold.iterrows():
#        id = record['id']
#        f.write(f"{id}\n")

In [130]:
#with open(f'review-{category}.txt', 'w') as f:
#    for index, record in df_gold[df_gold[f'label_{category}']==True].iterrows():
#        id = record['id']
#        analysis = record[f'analysis_{category}']
#        bias = record[f'label_{category}']
#        text = record['text']
#        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")
#    for index, record in df_gold[df_gold[f'label_{category}']==False].iterrows():
#        id = record['id']
#        analysis = record[f'analysis_{category}']
#        bias = record[f'label_{category}']
#        text = record['text']
#        f.write(f"\n====================================\nBias: {bias} || {id}\n{analysis}\n------------------------------\n\n{text}\n\n")

In [44]:
# Rebuild the gold set from the IDs stored one per line in
# gold_ids_{category}.txt.
# NOTE(review): the IDs are read as strings, so df['id'] is assumed to hold
# strings as well; otherwise isin() would match nothing. Confirm.
with open(f"gold_ids_{category}.txt", "r", encoding="utf-8") as file:
    ids = file.read().splitlines()

# .copy() makes df_gold an independent frame, so columns can be added to it
# later without writing through a filtered slice of df.
df_gold = df[df['id'].isin(ids)].copy()
df_gold

In [37]:
# Count the gold rows for each value of this category's label column.
df_gold.value_counts(f'label_{category}')

In [38]:
#df_gold.to_parquet(f'review-{category}.parquet', compression='gzip')
#df_gold.to_json(f'review-{category}.jsonl', orient='records', lines=True)

In [39]:
# Show the column names currently present in df_gold.
df_gold.columns

In [46]:
import json

label_columns = [col for col in df_gold.columns if col.startswith('label_')]
analysis_columns = [col for col in df_gold.columns if col.startswith('analysis_')]

#df['notes'] = df['notes'].fillna('')

# df_gold was produced by filtering another frame; copy it before adding
# columns so the assignments below do not write through a slice
# (avoids SettingWithCopyWarning and silently lost writes).
df_gold = df_gold.copy()

df_gold['verified'] = True
df_gold['synthetic'] = True

# Final column order: id, then a label/analysis pair per category, then the
# flags, the text and the packed metadata. Selecting with this list raises
# KeyError if any of the named columns is absent from df_gold.
columns = ['id']
for c in ['age','disability','masculine','feminine','racial','sexuality','general']:
    columns.append(f'label_{c}')
    columns.append(f'analysis_{c}')

columns += ['verified', 'synthetic', 'text', 'metadata']

# Pack the generation details of each row into a single JSON string.
metadata_columns = ['position', 'inference_time','prompt_tokens', 'completion_tokens', 'total_tokens', 'model', 'input', 'output']
df_gold['metadata'] = df_gold.apply(lambda row: json.dumps(row[metadata_columns].to_dict()), axis=1)

# Keep only the final columns. After this the metadata source columns are gone,
# so re-running this cell without reloading df_gold raises KeyError.
df_gold = df_gold[columns]
df_gold

In [47]:
# Show the column names currently present in df_gold.
df_gold.columns

In [48]:
# Inspect the 'metadata' value of the first row of df_gold.
df_gold.head(1)['metadata'].values[0]

In [50]:
from sklearn.model_selection import train_test_split

def split(dataframe):
    """Split ``dataframe`` into train, validation and test parts.

    The first split sends 80% of the rows to the hold-out side
    (``test_size=0.8``), leaving 20% for training; the hold-out side is then
    halved into validation and test. Both splits use ``random_state=2024``.

    NOTE(review): this gives a 20/40/40 train/val/test ratio. Confirm that the
    small training share is intended.

    Returns:
        A ``(train, val, test)`` tuple.
    """
    seed = 2024
    train_part, held_out = train_test_split(dataframe, test_size=0.8, random_state=seed)
    val_part, test_part = train_test_split(held_out, test_size=0.5, random_state=seed)
    return train_part, val_part, test_part

# Split biased and non-biased rows separately so that every partition receives
# its share of both, then recombine them per partition.
df_gold_bias = df_gold.loc[df_gold[f'label_{category}'] == True]
df_gold_unbias = df_gold.loc[df_gold[f'label_{category}'] == False]

df_bias_train, df_bias_val, df_bias_test = split(df_gold_bias)
df_unbias_train, df_unbias_val, df_unbias_test = split(df_gold_unbias)

df_train = pd.concat([df_bias_train, df_unbias_train])
df_val = pd.concat([df_bias_val, df_unbias_val])
df_test = pd.concat([df_bias_test, df_unbias_test])

# Write each partition to synthetic-<category>-<partition>.parquet (gzip).
for partition_name, partition_frame in (('train', df_train), ('val', df_val), ('test', df_test)):
    partition_frame.to_parquet(f'synthetic-{category}-{partition_name}.parquet', compression='gzip')

In [131]:
# Longest phrase
# Texts are ranked by (length, text): the longest wins, and equal lengths are
# resolved by ordinary string comparison.
longest_text = max(df_gold['text'], key=lambda text: (len(text), text))
print(longest_text)

In [132]:

from transformers import AutoTokenizer


def print_max_tokens(model_id, text=None):
    """Print how many token IDs ``model_id``'s tokenizer produces for a text.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        text: Text to encode. Defaults to the notebook-level ``longest_text``,
            which keeps existing one-argument calls working while letting the
            text be passed explicitly, as the sibling helpers do.
    """
    if text is None:
        text = longest_text
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    # The count is the length of whatever encode() returns for this tokenizer.
    max_tokens = len(tokenizer.encode(text))
    print(f"Max '{model_id}' tokens: {max_tokens}")


def print_encode_decoded(model_id, longest_text):
    """Print the encoded token IDs of ``longest_text`` and their decoded form.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        longest_text: Text to encode and decode back.
    """
    tok = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    token_ids = tok.encode(longest_text)
    print(f"Tokens: {token_ids}")
    round_trip = tok.decode(token_ids)
    print(f"Decoded tokens: {round_trip}")


def print_tokens(model_id, longest_text):
    """Print the tokens that ``model_id``'s tokenizer produces for ``longest_text``.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        longest_text: Text to tokenize.
    """
    pieces = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True).tokenize(longest_text)
    print(f"Tokens: {pieces}")


In [133]:
# Report the size of the longest text: characters, whitespace-separated words,
# and the token count under each listed tokenizer.
max_char, max_words = len(longest_text), len(longest_text.split())

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
for model in ('roberta-base', 'bert-base-uncased', 'microsoft/deberta-v3-small'):
    print_max_tokens(model)
