# Generate Target Prompts

In [5]:
# Importing libraries
import os
import logging
from dotenv import load_dotenv
import requests
import json
import os
import random
import pandas as pd

from generator import Generator
# Resolve the working directory and the output folders used by later cells
current_dir = os.path.abspath("")
processed_data_dir = os.path.join(current_dir, 'processed')
splits_data_dir = os.path.join(current_dir, 'splits')


# Load environment variables from the .env file
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.CRITICAL,  # Root level: only CRITICAL records are emitted
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('personas.log'),  # Log to a file
        logging.StreamHandler()  # Log to console
    ]
)

# Setting the API key and the model repository
# NOTE: os.environ.get returns None when OPENROUTER_KEY is unset; no check is made here.
OPENROUTER_KEY = os.environ.get('OPENROUTER_KEY')
repository = 'anthropic/claude-3.5-sonnet'
# Short model name (the part after the '/'), reused in output file names
model = repository.split('/')[1]
providers = ['Anthropic']
generator = Generator(repository, OPENROUTER_KEY, providers=providers,)

# Download role info data from GitHub
url = 'https://raw.githubusercontent.com/Jiaxin-Pei/Prompting-with-Social-Roles/refs/heads/main/data/role_info.csv'
roles_info = pd.read_csv(url)

# Keep only the roles flagged as present in MMLU (mmlu == 1)
roles_info = roles_info[roles_info['mmlu'] == 1]

# Load the Persona Hub personas (JSON Lines: one record per line)
url = 'https://raw.githubusercontent.com/tencent-ailab/persona-hub/refs/heads/main/data/persona.jsonl'
personas = pd.read_json(url, lines=True)

# Display the filtered roles table
roles_info

Unnamed: 0,role,role cate,occupation cate,merged_cate,N-gram Frequency (2018-2019),mmlu,interpersonal,gender,align_words,gender_role_cate
2,psychologist,work,psychology,psychology,4.646573e-06,1,0,unknown,,
6,politician,work,politics,politics,5.218259e-06,1,0,unknown,,
7,sheriff,work,politics,politics,7.653504e-06,1,0,unknown,,
9,governer,work,politics,politics,3.078719e-09,1,0,unknown,,
10,geneticist,work,natural science,natural science,2.58022e-07,1,0,unknown,,
19,biologist,work,natural science,natural science,1.110472e-06,1,0,unknown,,
20,physicist,work,natural science,natural science,2.030176e-06,1,0,unknown,,
21,teacher,school,natural science,natural science,5.522994e-05,1,0,unknown,,
22,chemist,work,natural science,natural science,1.986987e-06,1,0,unknown,,
26,ecologist,work,natural science,natural science,2.50523e-07,1,0,unknown,,


In [7]:
# Define the roles dictionary mapping each split to its associated role strings.
# Only the "medicine" split is active; the other splits are commented out.
roles_dict = {
    # "econ": ["economic researcher", "economist", "financial analyst"],
    # "eecs": ["electronics technician", "data scientist", "electrical engineer", "software engineer", "web developer"],
    # "history": ["historian", "archivist", "historical researcher", "archaeologist"],
    # "law": ["bailiff", "lawyer"],
    # "math": ["data analyst", "mathematician", "statistician"],
    "medicine": ["nurse", "doctor", "physician", "dentist", "surgeon"],
    # "natural science": ["geneticist", "biologist", "physicist", "teacher", "chemist", "ecologist"],
    # "politics": ["politician", "sheriff", "enthusiast", "partisan"],
    # "psychology": ["psychologist"],
}

# Collect one record per role with the number of personas mentioning it.
# (We assume that the persona hub data has a column 'persona' containing text.)
results = []
for split, roles in roles_dict.items():
    for role in roles:
        # Case-insensitive substring match; na=False counts missing personas as non-matches.
        # NOTE: str.contains interprets the pattern as a regular expression by default.
        mask = personas['persona'].str.lower().str.contains(role.lower(), na=False)
        count = mask.sum()
        results.append({
            "Split": split,
            "Role": role,
            "Count": count
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)
print("Counts per individual role:")
print(results_df)

# --- Overall statistics across all roles ---
# Named aggregation: each keyword becomes a label in the aggregated result
overall_stats = results_df["Count"].agg(
    min_count="min",
    max_count="max",
    mean_count="mean",
    total_count="sum"
)
overall_stats_df = pd.DataFrame([overall_stats])
print("\nOverall (complessivo) statistics across all roles:")
print(overall_stats_df)


Counts per individual role:
      Split       Role  Count
0  medicine      nurse   1206
1  medicine     doctor   1094
2  medicine  physician    279
3  medicine    dentist    101
4  medicine    surgeon    306

Overall (complessivo) statistics across all roles:
       min_count  max_count  mean_count  total_count
Count      101.0     1206.0       597.2       2986.0


In [10]:
import asyncio
from tqdm.asyncio import tqdm
import random

# Task verbs/phrases; generate_prompt_async draws one at random for each prompt
# and asks the model to start the generated prompt with it.
task_types = [
    "describe", 
    "explain",
    "design",
    "what is",
    "how to",
    "analyze",
    "compare",
    "create",
    "solve",
    "recommend"
]

async def generate_prompt_async(persona, i):
    """
    Ask the model for one short user prompt that fits ``persona``.

    A task type is drawn at random from ``task_types`` and embedded in the
    instruction sent through the module-level ``generator``.

    Args:
        persona: Persona description inserted into the instruction.
        i: Index of the request; only used in the error message.

    Returns:
        A ``(persona, prompt, task_type)`` tuple. ``prompt`` is the stripped
        text that follows the first ``User prompt:`` marker (up to the next
        marker, if there is one), or the whole stripped reply when the marker
        is absent. If anything raises, ``prompt`` is the empty string.
    """
    task_type = random.choice(task_types)

    instruction = f'''Generate a {task_type} prompt that this persona would likely ask:

    Persona: {persona}

    Rules:
    1. The prompt should start with "{task_type}"
    2. Keep it specific and under 15 words
    3. Make it relevant to the persona's background/interests
    4. Your output must start with "User prompt:"

    Examples based on task types:
    - describe: "Describe the key features of a successful marketing campaign"
    - explain: "Explain the process of setting up a home network"
    - design: "Design a logo for a sustainable fashion brand"
    - what is: "What is the difference between UI and UX design?"
    - how to: "How to optimize a website for mobile devices?"
    '''

    marker = 'User prompt:'
    try:
        reply = await generator.generate(instruction)
        # Keep the segment right after the marker when present, else the full reply
        text = reply.split(marker)[1] if marker in reply else reply
        return (persona, text.strip(), task_type)
    except Exception as err:
        # Best effort: report the failure and return an empty prompt for this slot
        print(f'Error processing prompt {i}: {str(err)}')
        return (persona, '', task_type)



async def generate_prompts_async(personas_list, n=228):
    """
    Concurrently generate ``n`` prompts, each for a randomly chosen persona.

    Args:
        personas_list: Non-empty sequence of persona strings to sample from
            (with replacement, one draw per prompt).
        n: Number of prompts to generate.

    Returns:
        A list of ``n`` 3-tuples ``(persona, prompt, task_type)`` in completion
        order (not submission order). A task that raises is recorded as
        ``('ERROR', '', '')`` so every row has the same three fields and the
        list can be loaded into a three-column DataFrame.
    """
    # One coroutine per prompt, each bound to its own random persona
    tasks = [
        generate_prompt_async(random.choice(personas_list), i)
        for i in range(n)
    ]

    results = []
    # The context manager closes the progress bar even if an exception escapes
    with tqdm(total=n, desc='Generating prompts', leave=True) as pbar:
        for task in asyncio.as_completed(tasks):
            try:
                results.append(await task)
            except Exception as e:
                print(f'Error in task: {str(e)}')
                # Placeholder row with the same arity as a successful result
                results.append(('ERROR', '', ''))
            finally:
                pbar.update(1)

    return results


async def main(n_prompts=128):
    """
    Generate and save prompts for every role listed in ``roles_info``.

    For each role, the personas whose text mentions the role are selected and
    ``n_prompts`` prompts are generated from randomly sampled personas. The
    result is written to ``raw/prompts_target_{model}_{role}.csv``. When no
    persona mentions the role, the role name itself is used as the only
    persona and the file name gets a ``_no_persona`` suffix. Roles whose
    output file already exists are skipped.

    Args:
        n_prompts: Number of prompts to generate per role.
    """
    for role in tqdm(roles_info['role']):
        print(f'Generating prompts for role: {role}')
        prompts_file = f'raw/prompts_target_{model}_{role}.csv'

        # Literal, case-insensitive substring match (the same rule the role-count
        # cell uses); na=False keeps missing personas from breaking the mask.
        mask = personas['persona'].str.contains(role, case=False, regex=False, na=False)
        personas_list = personas.loc[mask, 'persona'].tolist()

        if not personas_list:
            print(f'No personas found for role: {role}')
            # Fall back to the bare role name and mark the file accordingly
            personas_list = [role]
            prompts_file = f'raw/prompts_target_{model}_{role}_no_persona.csv'

        # Cached output: do not regenerate (and pay for) prompts that already exist
        if os.path.exists(prompts_file):
            print(f'Prompts file already exists: {prompts_file}')
            continue

        print(f'Generating {n_prompts} prompts with random personas')
        results = await generate_prompts_async(personas_list, n_prompts)

        df = pd.DataFrame(results, columns=['persona', 'prompt', 'task_type'])

        # Create the output directory if needed, then save the results
        os.makedirs(os.path.dirname(prompts_file), exist_ok=True)
        df.to_csv(prompts_file, index=False)

# Run the whole generation pipeline. The bare top-level `await` needs an environment
# that supports it (e.g. an IPython/Jupyter kernel); it is not valid in a plain script.
await main()


  0%|          | 0/35 [00:00<?, ?it/s]

Generating prompts for role: psychologist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:08<00:00, 14.34it/s]
  3%|▎         | 1/35 [00:08<05:04,  8.97s/it]

Generating prompts for role: politician
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 28.78it/s]
  6%|▌         | 2/35 [00:13<03:28,  6.33s/it]

Generating prompts for role: sheriff
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 26.24it/s]
  9%|▊         | 3/35 [00:18<03:02,  5.69s/it]

Generating prompts for role: governer
No personas found for role: governer
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:03<00:00, 33.73it/s]
 11%|█▏        | 4/35 [00:22<02:33,  4.96s/it]

Generating prompts for role: geneticist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:03<00:00, 32.15it/s]
 14%|█▍        | 5/35 [00:26<02:18,  4.62s/it]

Generating prompts for role: biologist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:05<00:00, 24.83it/s]
 17%|█▋        | 6/35 [00:31<02:19,  4.81s/it]

Generating prompts for role: physicist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:05<00:00, 24.13it/s]
 20%|██        | 7/35 [00:36<02:19,  4.99s/it]

Generating prompts for role: teacher
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 26.89it/s]
 23%|██▎       | 8/35 [00:41<02:13,  4.93s/it]

Generating prompts for role: chemist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:09<00:00, 13.58it/s]
 26%|██▌       | 9/35 [00:51<02:45,  6.35s/it]

Generating prompts for role: ecologist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:09<00:00, 13.25it/s]
 29%|██▊       | 10/35 [01:00<03:04,  7.38s/it]

Generating prompts for role: nurse
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:06<00:00, 21.06it/s]
 31%|███▏      | 11/35 [01:06<02:47,  7.00s/it]

Generating prompts for role: doctor
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:09<00:00, 13.52it/s]
 34%|███▍      | 12/35 [01:16<02:58,  7.76s/it]

Generating prompts for role: physician
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:05<00:00, 23.19it/s]
 37%|███▋      | 13/35 [01:21<02:36,  7.10s/it]

Generating prompts for role: dentist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 30.00it/s]
 40%|████      | 14/35 [01:26<02:11,  6.25s/it]

Generating prompts for role: surgeon
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 31.31it/s]
 43%|████▎     | 15/35 [01:30<01:52,  5.62s/it]

Generating prompts for role: data analyst
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:05<00:00, 23.64it/s]
 46%|████▌     | 16/35 [01:35<01:45,  5.57s/it]

Generating prompts for role: mathematician
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 26.82it/s]
 49%|████▊     | 17/35 [01:40<01:36,  5.34s/it]

Generating prompts for role: statistician
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 29.23it/s]
 51%|█████▏    | 18/35 [01:45<01:26,  5.07s/it]

Generating prompts for role: bailiff
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 27.04it/s]
 54%|█████▍    | 19/35 [01:49<01:19,  4.98s/it]

Generating prompts for role: lawyer
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 26.87it/s]
 57%|█████▋    | 20/35 [01:54<01:13,  4.93s/it]

Generating prompts for role: historian
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 31.87it/s]
 60%|██████    | 21/35 [01:58<01:05,  4.67s/it]

Generating prompts for role: archivist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:09<00:00, 14.20it/s]
 63%|██████▎   | 22/35 [02:07<01:17,  5.99s/it]

Generating prompts for role: historical researcher
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:08<00:00, 15.37it/s]
 66%|██████▌   | 23/35 [02:16<01:20,  6.70s/it]

Generating prompts for role: archaeologist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:08<00:00, 14.76it/s]
 69%|██████▊   | 24/35 [02:24<01:20,  7.31s/it]

Generating prompts for role: electronics technician
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 31.28it/s]
 71%|███████▏  | 25/35 [02:29<01:03,  6.36s/it]

Generating prompts for role: data scientist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 31.91it/s]
 74%|███████▍  | 26/35 [02:33<00:51,  5.67s/it]

Generating prompts for role: electrical engineer
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:03<00:00, 34.91it/s]
 77%|███████▋  | 27/35 [02:36<00:40,  5.08s/it]

Generating prompts for role: software engineer
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 28.70it/s]
 80%|████████  | 28/35 [02:41<00:34,  4.91s/it]

Generating prompts for role: web developer
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:08<00:00, 14.27it/s]
 83%|████████▎ | 29/35 [02:50<00:36,  6.14s/it]

Generating prompts for role: economic researcher
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:09<00:00, 13.58it/s]
 86%|████████▌ | 30/35 [02:59<00:35,  7.14s/it]

Generating prompts for role: economist
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:19<00:00,  6.58it/s]
 89%|████████▊ | 31/35 [03:19<00:43, 10.85s/it]

Generating prompts for role: financial analyst
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:05<00:00, 24.86it/s]
 91%|█████████▏| 32/35 [03:24<00:27,  9.15s/it]

Generating prompts for role: enthusiast
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:08<00:00, 15.66it/s]
 94%|█████████▍| 33/35 [03:32<00:17,  8.87s/it]

Generating prompts for role: partisan
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:03<00:00, 34.10it/s]
 97%|█████████▋| 34/35 [03:36<00:07,  7.35s/it]

Generating prompts for role: Embedded Systems AI Engineer
No personas found for role: Embedded Systems AI Engineer
Generating 128 prompts with random personas


Generating prompts: 100%|██████████| 128/128 [00:04<00:00, 31.14it/s]
100%|██████████| 35/35 [03:40<00:00,  6.30s/it]


In [11]:
def dump_json(data, file_path):
    """
    Serialize ``data`` as indented JSON to ``file_path``.

    Missing parent directories are created first. A bare file name (no
    directory component) is written to the current working directory.

    Args:
        data: Any JSON-serializable object.
        file_path: Destination path of the JSON file.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

def download_generated_prompts(prompts_file, role):
    """
    Convert a CSV of generated prompts into a processed JSON instruction file.

    The JSON file keeps the CSV's base name (with ``.csv`` replaced by
    ``.json``) and is written under ``processed_data_dir``.

    Args:
        prompts_file: Path to a CSV file with a ``prompt`` column.
        role: Value stored as ``category`` on every instruction.
    """
    prompts = pd.read_csv(prompts_file)

    # Same base name as the CSV, with a .json extension
    filename = os.path.basename(prompts_file)
    filename = filename.replace('.csv', '.json')
    processed_file_path = os.path.join(processed_data_dir, filename)

    instructions = []
    # Empty prompts are read back from CSV as NaN (a float); dropna() skips
    # them instead of failing on .replace().
    for raw_prompt in prompts['prompt'].dropna():
        # Remove double quotes and asterisks, then surrounding whitespace
        instruction = str(raw_prompt).replace('"', '').replace('*', '').strip()
        if instruction:  # drop prompts that are empty after cleaning
            instructions.append(instruction)

    dataset_json = [{'instruction': instruction, 'category': role} for instruction in instructions]
    dump_json(dataset_json, processed_file_path)

In [13]:
# Convert each role's raw CSV of generated prompts into a processed JSON file.
# NOTE(review): only 'raw/prompts_target_{model}_{role}.csv' is looked up here, so prompts
# saved under another name (e.g. with a '_no_persona' suffix) are reported as missing
# and never processed — confirm this is intended.
for role in tqdm(roles_info['role']):
    try:
        prompts_file = f'raw/prompts_target_{model}_{role}.csv'
        download_generated_prompts(prompts_file, role)
    except Exception as e:
        # Best effort: report the failure and move on to the next role
        print(f'Error processing role {role}: {str(e)}')
        continue

100%|██████████| 35/35 [00:00<00:00, 630.44it/s]

Error processing role governer: [Errno 2] No such file or directory: 'raw/prompts_target_claude-3.5-sonnet_governer.csv'
Error processing role Embedded Systems AI Engineer: [Errno 2] No such file or directory: 'raw/prompts_target_claude-3.5-sonnet_Embedded Systems AI Engineer.csv'





# Download Standard Prompts

In [14]:

def download_file(url, file_path, timeout=60):
    """
    Download ``url`` and save the response body to ``file_path``.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination path; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    parent_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "wb") as file:
        file.write(response.content)


        

In [17]:
def download_alpaca(raw_path='raw/alpaca.csv'):
    """
    Build ``alpaca.json`` (under ``processed_data_dir``) from a local Alpaca CSV.

    Only rows whose ``input`` is missing or blank are kept; each becomes
    ``{'instruction': <stripped text>, 'category': None}``.

    Args:
        raw_path: Path to the CSV file, which must already exist and provide
            ``instruction`` and ``input`` columns.

    Raises:
        FileNotFoundError: If ``raw_path`` does not exist.
    """
    processed_file_path = os.path.join(processed_data_dir, 'alpaca.json')

    dataset = pd.read_csv(raw_path)

    # Normalise to strings first: a column with no values at all is read as
    # float NaN, on which the .str accessor raises.
    inputs = dataset['input'].fillna('').astype(str).str.strip()
    # Keep instructions without an input; skip rows whose instruction is missing
    instructions = dataset.loc[inputs == '', 'instruction'].dropna().astype(str).tolist()

    dataset_json = [{'instruction': instruction.strip(), 'category': None} for instruction in instructions]
    dump_json(dataset_json, processed_file_path)

In [18]:
# Build alpaca.json under processed_data_dir; this reads raw/alpaca.csv, which must already exist on disk.
download_alpaca()

FileNotFoundError: [Errno 2] No such file or directory: 'raw/alpaca.csv'

# Construct Splits

In [None]:
def construct_base_dataset_splits():
    """
    Shuffle the processed base instructions and write train/val/test splits.

    Instructions are read from ``alpaca.json`` in ``processed_data_dir``,
    shuffled after seeding the global ``random`` module with 42, and cut into
    60% train, 20% validation and the remainder as test. The three lists are
    saved as ``base_train.json``, ``base_val.json`` and ``base_test.json`` in
    ``splits_data_dir``.
    """
    # Fractions for train and validation; test receives whatever is left
    train_p, val_p = 0.6, 0.20

    base_instructions = []
    for file in ['alpaca.json']:
        source_path = os.path.join(processed_data_dir, file)
        with open(source_path, 'r') as f:
            base_instructions += json.load(f)

    # Fixed seed so the shuffle is the same on every run
    random.seed(42)
    random.shuffle(base_instructions)

    n_total = len(base_instructions)
    train_end = int(train_p * n_total)
    val_end = train_end + int(val_p * n_total)

    partitions = {
        'base_train.json': base_instructions[:train_end],
        'base_val.json': base_instructions[train_end:val_end],
        'base_test.json': base_instructions[val_end:],
    }
    for split_filename, subset in partitions.items():
        dump_json(subset, os.path.join(splits_data_dir, split_filename))

In [21]:
def construct_target_dataset_splits(model):
    """
    Build one training split per processed target-prompt file of ``model``.

    Every ``.json`` file in ``processed_data_dir`` whose name contains
    ``model`` is loaded, shuffled (global ``random`` seeded with 42 for each
    file) and written to ``splits_data_dir`` as ``target_train_{role}.json``.
    ``role`` is the part of the file name that follows ``{model}_``, without
    the extension.

    Args:
        model: Model name embedded in the processed file names.
    """
    train_p = 1  # the whole file goes to the training split
    role_prefix = f'{model}_'

    # Only JSON files belonging to this model; sorted for a stable processing order
    files = sorted(
        f for f in os.listdir(processed_data_dir)
        if model in f and f.endswith('.json')
    )

    for file in files:
        print(f'Generating splits for {file}')
        # e.g. prompts_target_claude-3.5-sonnet_statistician.json -> statistician
        # Taking everything after '{model}_' keeps roles that contain underscores
        # (or a '_no_persona' suffix) intact, so they cannot collide.
        stem = os.path.splitext(file)[0]
        target_role = stem.split(role_prefix, 1)[-1]
        target_train_path = os.path.join(splits_data_dir, f'target_train_{target_role}.json')

        with open(os.path.join(processed_data_dir, file), 'r') as f:
            target_instructions = list(json.load(f))

        # Re-seed per file so each file's shuffle does not depend on the others
        random.seed(42)
        random.shuffle(target_instructions)

        total_size = len(target_instructions)
        train_size = int(train_p * total_size)

        target_train_instructions = target_instructions[:train_size]

        dump_json(target_train_instructions, target_train_path)


In [22]:
# Base splits are currently disabled; only the per-role target splits are built here.
# construct_base_dataset_splits()
construct_target_dataset_splits(model)

Generating splits for prompts_target_claude-3.5-sonnet_archaeologist.json
Generating splits for prompts_target_claude-3.5-sonnet_archivist.json
Generating splits for prompts_target_claude-3.5-sonnet_bailiff.json
Generating splits for prompts_target_claude-3.5-sonnet_biologist.json
Generating splits for prompts_target_claude-3.5-sonnet_chemist.json
Generating splits for prompts_target_claude-3.5-sonnet_data analyst.json
Generating splits for prompts_target_claude-3.5-sonnet_data scientist.json
Generating splits for prompts_target_claude-3.5-sonnet_dentist.json
Generating splits for prompts_target_claude-3.5-sonnet_doctor.json
Generating splits for prompts_target_claude-3.5-sonnet_ecologist.json
Generating splits for prompts_target_claude-3.5-sonnet_economic researcher.json
Generating splits for prompts_target_claude-3.5-sonnet_economist.json
Generating splits for prompts_target_claude-3.5-sonnet_electrical engineer.json
Generating splits for prompts_target_claude-3.5-sonnet_electronics 

In [None]:
def convert_format(mmlu_data):
    """
    Turn MMLU records into multiple-choice instruction items.

    Args:
        mmlu_data: Iterable of mappings with the keys ``question``,
            ``true_option`` (1-4), ``option1`` .. ``option4`` and ``subject``.

    Returns:
        A list of dicts with ``instruction`` (the question, the four choices
        labelled A-D and an answer cue), ``target_score`` (the letter of the
        correct option) and ``dataset`` (the subject).

    Raises:
        KeyError: If a record lacks a required key or ``true_option`` is not
            one of 1, 2, 3, 4.
    """
    choice_letters = 'ABCD'
    # The correct answer is 1-based: 1 -> A, 2 -> B, 3 -> C, 4 -> D
    answer_to_letter = dict(enumerate(choice_letters, start=1))

    converted = []
    for record in mmlu_data:
        question = record['question']
        answer = record['true_option']
        options = (record['option1'], record['option2'], record['option3'], record['option4'])
        subject = record['subject']

        # One indented "<letter>. <choice>" line per option
        choice_block = ''.join(
            f"\n\t\t\t{letter}. {option}"
            for letter, option in zip(choice_letters, options)
        )

        instruction = (
            f"{question}{choice_block}\n\t\t\t"
            f"Answer with the letter of the correct answer.\n\t\t\t"
            f"Answer:"
        )

        converted.append({
            "instruction": instruction,
            "target_score": answer_to_letter[answer],
            "dataset": subject,
        })

    return converted

def processing_mmlu_data(file):
    """
    Convert one processed MMLU file into instruction format and save it.

    The file is read from ``processed_data_dir``, passed through
    ``convert_format`` and written under the same name to ``splits_data_dir``.

    Args:
        file: Name of the JSON file to convert.

    Raises:
        Exception: Any error raised while reading, converting or writing is
            printed and then re-raised unchanged.
    """
    source_path = os.path.join(processed_data_dir, file)
    destination_path = os.path.join(splits_data_dir, file)

    try:
        with open(source_path, 'r') as handle:
            raw_items = json.load(handle)

        converted_items = convert_format(raw_items)
        print(f"Processed {len(converted_items)} MMLU examples")
        dump_json(converted_items, destination_path)

    except Exception as e:
        # Report the failure, then let it propagate to the caller
        print(f"Error processing data: {str(e)}")
        raise

In [None]:
# Convert each per-subject MMLU test file into the instruction format
for mmlu_test_file in (
    "target_test_natural_science.json",
    "target_test_econ.json",
    "target_test_eecs.json",
    "target_test_law.json",
    "target_test_math.json",
    "target_test_medicine.json",
    "target_test_politics.json",
    "target_test_psychology.json",
):
    processing_mmlu_data(mmlu_test_file)


Processed 590 MMLU examples
Processed 492 MMLU examples
Processed 247 MMLU examples
Processed 200 MMLU examples
Processed 287 MMLU examples
Processed 241 MMLU examples
Processed 200 MMLU examples
Processed 200 MMLU examples
