# Step 1: Prompt generator

Create JSONL files covering all permutations of all conditions, to be submitted via the OpenAI Batch API. Because each batch file is limited to 50,000 prompts/queries, it generates 8 versions.

Depends on: `input_data/audit_names.xlsx` (Excel workbook with a "first name" sheet and a "last name" sheet)

Outputs: 
- 1 JSONL file per model to submit to the OpenAI Batch API: `input_data/batch_requests/age_name_edu_{model_version}.jsonl`
- 1 CSV used to track which task_id corresponds to each prompt: `input_data/age_name_edu_seed.csv`

In [None]:
import sys
import os
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import logging
pd.set_option('display.max_rows', 500)

In [None]:
# Values of the first varied condition. Each value is crossed with every
# applicant name and every param_two value when the seed rows are built.
# Left empty here as a placeholder -- presumably filled in before running.
param_one = []

In [None]:
# Notebook display: how many param_one values will be crossed.
len(param_one)

In [None]:
# Values of the second varied condition. Each value is crossed with every
# applicant name and every param_one value when the seed rows are built.
# Left empty here as a placeholder -- presumably filled in before running.
param_two = []
# Notebook display: how many param_two values will be crossed.
len(param_two)

In [None]:
# Model versions to build batch request files for; one JSONL file is
# produced per entry further down.
models = ['gpt-4o-mini-2024-07-18']

# Persist the model list as JSON in the working directory.
with open("models.json", "w") as models_file:
    models_file.write(json.dumps(models))

In [None]:
def read_applicants(names_fn="input_data/audit_names.xlsx"):
    """Build every first-name x last-name pairing within each race group.

    Reads the "first name" and "last name" sheets of the workbook at
    ``names_fn`` (blank cells are replaced with a single space), groups both
    sheets by their ``Race`` column, and pairs each first name with each last
    name of the same race.

    Args:
        names_fn: Path to the Excel workbook holding the two name sheets.

    Returns:
        A list of dicts with the keys ``'Full Name'``, ``'Gender'`` (taken
        from the first-name row) and ``'Race'`` (taken from the last-name
        row).

    Raises:
        SystemExit: With exit code 1 if the workbook cannot be read or the
            names cannot be combined (e.g. missing file, sheet or column, or
            a race that has first names but no last names). The cause is
            logged before exiting.
    """
    try:
        # Load first and last names from the Excel file
        df_first = pd.read_excel(names_fn, sheet_name="first name").fillna(" ")
        df_last = pd.read_excel(names_fn, sheet_name="last name").fillna(" ")

        last_names_by_race = df_last.groupby('Race')

        # Combine first and last names only within the same race.
        # Iterating the groupby yields (race, rows) pairs directly.
        applicants = []
        for race, first_names in df_first.groupby('Race'):
            if race not in last_names_by_race.groups:
                # Say which race is unmatched instead of surfacing a bare
                # KeyError from get_group.
                raise ValueError(
                    f"race {race!r} has first names but no last names"
                )
            last_names = last_names_by_race.get_group(race)

            for _, first_row in first_names.iterrows():
                for _, last_row in last_names.iterrows():
                    applicants.append({
                        'Full Name': f"{first_row['First Name']} {last_row['Last name']}",
                        'Gender': first_row['Gender'],
                        'Race': last_row['Race'],
                    })
        return applicants
    except Exception as e:
        # Report the path that was actually requested, not a hard-coded
        # file name, so a non-default names_fn is identifiable in the log.
        logging.error("Error reading names from %s: %s", names_fn, e)
        sys.exit(1)

In [None]:
# Load every within-race first/last name pairing from the default workbook
# path, then display the resulting list in the notebook.
names_list = read_applicants()
names_list

In [None]:
# Notebook display: total number of generated full names.
len(names_list)

In [None]:
row_list = []
run_id = 0

# Cross every applicant name with every param_one and param_two value and
# emit each combination three times. Every emitted row gets its own
# sequential run_id; the two response columns start out empty (NaN).
for applicant in names_list:
    for first_condition in param_one:
        for second_condition in param_two:
            for _ in range(3):
                seed_row = {
                    'run_id': run_id,
                    'name': applicant['Full Name'],
                    'gender': applicant['Gender'],
                    'race': applicant['Race'],
                    'param_one': first_condition,
                    'param_two': second_condition,
                    'query_response_raw': np.nan,
                    'query_response': np.nan,
                }
                row_list.append(seed_row)
                run_id += 1

In [None]:
# One DataFrame row per seed row (one per run_id); displayed for inspection.
df = pd.DataFrame(row_list)
df

In [None]:
# Sanity check: number of rows (run_ids) in each gender x param_one x param_two cell.
df.groupby(['gender','param_one','param_two']).count()[['run_id']]

In [None]:
# Sanity check: per gender, how many distinct run_id, param_one and param_two values exist.
df.groupby('gender')[['run_id','param_one','param_two']].nunique()

In [None]:
# Save the seed table (without the DataFrame index) so each run_id can be
# matched back to its conditions later.
df.to_csv('input_data/age_name_edu_seed.csv', index=False)

## Generate prompts

In [None]:
# Reload the seed table from disk, so prompt generation can start from this
# cell without rebuilding the rows; displayed for inspection.
df = pd.read_csv('input_data/age_name_edu_seed.csv')
df

In [None]:
def generate_sentence_employer(age, name, education):
    """Return the prompt text for one applicant profile.

    The prompt body is currently a placeholder: none of the three arguments
    is interpolated, so every call returns the same template string.
    """
    # Sketch (kept disabled) of how control conditions could be handled once
    # the real prompt is written: a "None-Control" value blanks out that
    # attribute.
    #
    #     age = f" who is {age}" if age != "None-Control" else ""
    #
    #     education = f"""
    #     Education
    #     {education}
    #     """ if education != "None-Control" else ""
    #
    #     name = name if name != "None-Control" else ""

    prompt = f"""
    PROMPT HERE
    """
    return prompt

In [None]:
# Preview the prompt for one example profile. generate_sentence_employer
# requires all three arguments, so calling it with none raises TypeError.
print(generate_sentence_employer('16', 'Charlie Andersen', 'Some high school (did not complete)'))

In [None]:
# Preview the prompt for every age/name/education combination in which at
# least one attribute is the "None-Control" placeholder.
for age_value in ['16', 'None-Control']:
    for name_value in ['Charlie Andersen', 'None-Control']:
        for education_value in ['Some high school (did not complete)', 'None-Control']:
            combo = (age_value, name_value, education_value)
            # Skip the single combination that has no control attribute.
            if 'None-Control' not in combo:
                continue
            print('; '.join(combo))
            print(generate_sentence_employer(*combo))
            print('---------------------')

In [None]:
# Build one Batch API request file per model. Each seed row becomes one
# /v1/chat/completions request whose custom_id ("task-<run_id>") links the
# response back to its row in the seed CSV.
#
# NOTE(review): all rows for a model go into a single file; nothing here
# enforces the 50,000-requests-per-file limit mentioned in the notebook
# header -- confirm the seed table stays under it.

# Make sure the output directory exists before any file is opened.
os.makedirs("input_data/batch_requests", exist_ok=True)

for version in models:
    print(version)
    tasks = []
    for index, row in df.iterrows():

        task = {
            "custom_id": f"task-{row['run_id']}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                # This is what you would have in your Chat Completions API call
                "model": f"{version}",
                "messages": [
                    {
                        "role": "user",
                        # The seed table stores the two varied conditions as
                        # 'param_one' and 'param_two'; it has no 'age' or
                        # 'education' column. They are passed in the order
                        # given by the seed-building comment
                        # (age x name x education) -- confirm the mapping.
                        "content": generate_sentence_employer(row['param_one'], row['name'], row['param_two'])
                    }
                ],
            }
        }

        tasks.append(task)

    # Keep only the last path component so an identifier such as
    # "org/model" still produces a flat file name.
    version = version.split('/')[-1]
    file_name = f"input_data/batch_requests/age_name_edu_{version}.jsonl"

    # JSONL: one JSON object per line.
    with open(file_name, 'w') as file:
        for obj in tasks:
            file.write(json.dumps(obj) + '\n')

In [None]:
!head -n 1 input_data/batch_requests/*.jsonl

In [None]:
import zipfile

# Source JSONL batch file and the ZIP archive to create from it.
input_file = 'input_data/batch_requests/age_name_edu_gpt-4o-mini-2024-07-18.jsonl'
output_file = 'input_data/batch_requests/age_name_edu_gpt-4o-mini-2024-07-18.zip'

# Deflate-compress the JSONL into the archive, stored under the fixed member
# name 'data.jsonl' regardless of the source file name.
with zipfile.ZipFile(output_file, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(input_file, arcname='data.jsonl')

print(f"File {input_file} has been compressed to {output_file}.")
