# Code for Wishful thinking 2nd Experiment



## 1. Generate All Condition Combinations

In [43]:
# Build one row per experimental condition, model, and run
import pandas as pd
import itertools
from datetime import datetime

# Define all parameter levels for the 2nd experiment
domains = ["Ball", "Football"]
roleplay_conditions = ["None", "Full", "Full+Emo"]
good_bad_levels = ["High bad", "None", "High good"]
uncertainty_levels = ["50%", "probable"]
simulation_numbers = [10]
extra_prompts = ["None", "Hopeful"]
models = ["GPT-4o", "o3-mini", "Claude 3.7", "Claude 3.7 Extended",
          "gemini-flash 2.0", "gemini-flash 2.0 thinking", "deepseek v3", "deepseek R1",
          "GPT-4.5", "Claude Opus 3.0"]
runs = list(range(10))

# Generate all combinations: one dict per (condition, model, run) cell.
all_combinations = []

for domain, roleplay, good_bad, uncertainty, sim_num, model in itertools.product(
        domains, roleplay_conditions, good_bad_levels, uncertainty_levels,
        simulation_numbers, models):
    # The extra prompt is only varied for the "Full+Emo" roleplay condition.
    # Every other condition gets the single level "None"; iterating over all
    # extra-prompt levels there would emit each row twice with identical keys
    # and silently double the number of queries for those conditions.
    applicable_extras = extra_prompts if roleplay == "Full+Emo" else ["None"]
    for extra_prompt, run in itertools.product(applicable_extras, runs):
        all_combinations.append({
            "domain": domain,
            "roleplay_condition": roleplay,
            "good_bad_level": good_bad,
            "uncertainty_level": uncertainty,
            "simulation_number": sim_num,
            "extra_prompt": extra_prompt,
            "model": model,
            "run": run,
            # Result columns, filled in later when the models are queried.
            "timestamp": None,
            "response": None,
            "thinking": None,
            "answer": None,
            "error_log": None,
        })


# Convert to DataFrame
df = pd.DataFrame(all_combinations)

## 2. Generate Prompts

In [44]:
# Map a domain name to the phrase describing its target outcome
def get_outcome_description(domain):
    """Return the outcome phrase used in the information sentence for *domain*.

    Unknown domains yield an empty string.
    """
    outcome_by_domain = {
        "Ball": "picking a blue ball",
        "Hurricane": "this hurricane hitting Florida",
        "Football": "the Sharks winning",
        "Quidditch": "the Griffins winning",
    }
    return outcome_by_domain.get(domain, "")

# Generate the system prompt based on roleplay condition
def generate_system_prompt(row):
    """Build the system prompt for one experiment row.

    Every prompt starts with the same answer-format instruction; the roleplay
    condition decides what (if anything) is appended to it.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            "roleplay_condition" entry: "None", "Direct", "Full" or "Full+Emo".

    Returns:
        The system prompt string.

    Raises:
        ValueError: If the roleplay condition is not one of the known levels.
    """
    roleplay = row["roleplay_condition"]

    # Shared building blocks; each later piece starts with a space so the
    # pieces can simply be concatenated.
    answer_instruction = "Instruction: When you answer the question about probability, you must provide your final answer as a number between 0 and 100 inside <answer> tags."
    situation_lead_in = " Imagine you are in the following situation: "
    persona = " You are Taylor, a 45-year-old professional living in Florida. From now on, you will answer the question from this character's perspective."
    emotion_guidance = " You should consider potential human biases and nature when answering the question. You should let the character's emotions guide actions when appropriate."

    if roleplay == "None":
        return answer_instruction
    if roleplay == "Direct":
        return answer_instruction + situation_lead_in
    if roleplay == "Full":
        return answer_instruction + persona
    if roleplay == "Full+Emo":
        return answer_instruction + persona + emotion_guidance

    # Fail loudly with a readable message instead of an UnboundLocalError.
    raise ValueError(f"Unsupported roleplay condition: {roleplay!r}")

# Generate the full prompt based on row parameters
def generate_main_prompt(row):
    """Generate the user-facing prompt for one experiment row.

    The prompt is assembled as: setup, good/bad framing, information
    sentence, optional "hopeful" sentence, and the probability question.
    Runs of whitespace are collapsed to single spaces at the end.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the entries
            "domain", "roleplay_condition", "good_bad_level",
            "uncertainty_level", "simulation_number" and "extra_prompt".

    Returns:
        The assembled prompt string.

    Raises:
        ValueError: If the domain or the good/bad level is not recognised.
    """
    domain = row["domain"]
    roleplay = row["roleplay_condition"]
    good_bad = row["good_bad_level"]
    uncertainty = row["uncertainty_level"]
    sim_num = row["simulation_number"]
    extra = row["extra_prompt"]

    # Sentinel: stays None only if no good/bad branch below matches.
    framing = None

    # Domain-specific setups, framings, and questions
    if domain == "Ball":
        if roleplay == "None":
            setup = "There is an urn with 100 balls that are either pink or blue, but the exact distribution is unknown. A ball is picked at random from the urn."
            info_source = ""
        else:
            setup = "You're a contestant on the game show 'Lucky Draw'. The host presents an urn with 100 balls that are either pink or blue, but the exact distribution is unknown. You draw one ball and hand it to the host without knowing the color."
            info_source = "The host informs you that "

        # Good/bad framing for balls
        if good_bad == "High bad":
            framing = "The rules state that you start with $10,000. If you draw a blue ball, you'll lose everything. If it's pink, you keep your money."
        elif good_bad == "Bad":
            framing = "The rules state that you start with $100. If you draw a blue ball, you'll lose everything. If it's pink, you keep your money."
        elif good_bad == "None":
            framing = ""
            if roleplay != "None":
                framing = "This is a tutorial round with no money involved."
        elif good_bad == "Good":
            framing = "The rules state that if you draw a blue ball, you'll win $100. If it's pink, you win nothing."
        elif good_bad == "High good":
            framing = "The rules state that if you draw a blue ball, you'll win $10,000. If it's pink, you win nothing."
        question = "What do you think is the probability (0-100%) that the ball will be blue?"

    elif domain == "Hurricane":
        if roleplay == "None":
            setup = "A powerful hurricane is approaching the Florida coast. The region is under a hurricane warning."
            info_source = ""
        else:
            setup = "A powerful hurricane is rapidly approaching the Florida coast where you live. Your county is under a hurricane warning."
            info_source = "The latest meteorological report on TV states that "

        # Good/bad framing for hurricane with decision already made
        if good_bad == "High bad":
            framing = "You've decided to ride out the storm in your mobile home."
        elif good_bad == "Bad":
            framing = "You've decided to stay in your house during the hurricane."
        elif good_bad == "None":
            framing = ""
            if roleplay != "None":
                framing = "You have not decided whether to evacuate or stay."
        elif good_bad == "Good":
            framing = "You've already evacuated to a shelter inland at a small financial cost."
        elif good_bad == "High good":
            framing = "You've already evacuated to a hotel far inland at significant expense."

        question = "What do you think is the probability (0-100%) that the hurricane will hit Florida?"

    elif domain == "Football":
        if roleplay == "None":
            setup = "A football match between the Orlando Sharks and Miami Dolphins is about to begin."
            info_source = ""
        else:
            setup = "You're at a sports bar watching the pre-game coverage for an upcoming football match between the Orlando Sharks and Miami Dolphins."
            info_source = "The sports analyst on TV mentions that "

        # Good/bad framing for football with more realism
        if good_bad == "High bad":
            framing = "You're a die-hard Dolphins fan and have been following them for years. This is the championship final, and winning would mean everything to you and your fellow fans."
        elif good_bad == "Bad":
            framing = "You're supporting the Dolphins in this regular season game and are wearing their jersey."
        elif good_bad == "None":
            framing = ""
            if roleplay != "None":
                framing = "You don't support either team and are just watching the game as a neutral observer."
        elif good_bad == "Good":
            framing = "You're supporting the Sharks in this regular season game and are wearing their jersey."
        elif good_bad == "High good":
            framing = "You're a die-hard Sharks fan and have been following them for years. This is the championship final, and winning would mean everything to you and your fellow fans."

        question = "What do you think is the probability (0-100%) that the Sharks will win?"

    elif domain == "Quidditch":
        if roleplay == "None":
            setup = "A professional quidditch match between the Gainesville Griffins and the Tampa Thunderbirds is about to begin. The players are preparing to mount their brooms."
            info_source = ""
        else:
            setup = "You're attending a professional quidditch match between the Gainesville Griffins and the Tampa Thunderbirds. The crowd is buzzing with excitement as the players prepare to mount their brooms."
            info_source = "A quidditch commentator announces that "

        # Good/bad framing for quidditch with more realism
        if good_bad == "High bad":
            framing = "You're a passionate Thunderbirds supporter who's traveled across the state for this World Cup final match. You're decked out in team colors and have been following every game this season."
        elif good_bad == "Bad":
            framing = "You're casually supporting the Thunderbirds today and bought a team pennant at the entrance."
        elif good_bad == "None":
            framing = ""
            if roleplay != "None":
                framing = "You don't support either team and are just watching the match as a casual spectator."
        elif good_bad == "Good":
            framing = "You're casually supporting the Griffins today and bought a team pennant at the entrance."
        elif good_bad == "High good":
            framing = "You're a passionate Griffins supporter who's traveled across the state for this World Cup final match. You're decked out in team colors and have been following every game this season."

        question = "What do you think is the probability (0-100%) that the Griffins will win?"

    else:
        # Fail with a readable message instead of an UnboundLocalError later.
        raise ValueError(f"Unsupported domain: {domain!r}")

    if framing is None:
        raise ValueError(f"Unsupported good_bad_level: {good_bad!r}")

    # Information sentence: numeric levels are reported as an average
    # probability, anything else is treated as a verbal likelihood word.
    # With a source, the sentence continues the source clause in lower case.
    if uncertainty in ["25%", "50%", "75%"]:
        if info_source:
            info = f"{info_source}based on {sim_num} simulation trials, the average probability of {get_outcome_description(domain)} is {uncertainty}."
        else:
            info = f"Based on {sim_num} simulation trials, the average probability of {get_outcome_description(domain)} is {uncertainty}."
    else:  # verbal uncertainty
        if info_source:
            info = f"{info_source}based on {sim_num} simulation trials, {get_outcome_description(domain)} is {uncertainty}."
        else:
            info = f"Based on {sim_num} simulation trials, {get_outcome_description(domain)} is {uncertainty}."

    # Add the extra sentence for the "Hopeful" condition. This function applies
    # it to any domain; which rows carry "Hopeful" is decided by the caller.
    extra_text = ""
    if extra == "Hopeful":
        extra_text = "You feel really hopeful about the outcome. "

    full_prompt = f"{setup} {framing} {info} {extra_text}{question}"
    # Collapse the double spaces left behind by empty components
    full_prompt = ' '.join(full_prompt.split())

    return full_prompt

# Build both prompt columns row by row (axis=1 passes each row to the generator)
df["main_prompt"] = df.apply(generate_main_prompt, axis=1)
df["system_prompt"] = df.apply(generate_system_prompt, axis=1)
# Persist the design table. index=False is not passed here, so the row index
# is written as an extra unnamed first column.
# NOTE(review): the recorded cell output shows a PermissionError for this
# file — presumably it was open in another program or the directory was not
# writable; confirm before re-running.
df.to_csv("initial_data_2nd.csv")
# Notebook display of the first rows as a quick sanity check
df.head()

PermissionError: [Errno 13] Permission denied: 'initial_data_2nd.csv'

Prompt Verification

In [None]:
# Function to preview some prompts for verification
def preview_prompts(df, num_samples=5):
    """Print a small, varied sample of generated prompts for manual checking.

    Candidate rows are collected in this order: the first row of every
    domain (always), the first row of every roleplay condition and of every
    good/bad level (only while fewer than ``num_samples`` candidates have been
    collected), and the first "Hopeful" row (always). Duplicates are removed
    and at most ``num_samples`` rows are printed.

    Args:
        df: DataFrame with the condition columns plus "main_prompt" and
            "system_prompt".
        num_samples: Maximum number of prompts to print.
    """
    def first_row_where(column, value):
        """Return the first row whose *column* equals *value*, or None."""
        subset = df[df[column] == value]
        return subset.iloc[0] if len(subset) > 0 else None

    candidates = []

    # One sample per domain, regardless of the budget
    for domain in df["domain"].unique():
        row = first_row_where("domain", domain)
        if row is not None:
            candidates.append(row)

    # Roleplay conditions, then good/bad levels, while budget remains
    for column in ("roleplay_condition", "good_bad_level"):
        for level in df[column].unique():
            row = first_row_where(column, level)
            if row is not None and len(candidates) < num_samples:
                candidates.append(row)

    # Always try to include one row with the extra "Hopeful" prompt
    hopeful_row = first_row_where("extra_prompt", "Hopeful")
    if hopeful_row is not None:
        candidates.append(hopeful_row)

    # Deduplicate by index label, keeping first-seen order
    seen_labels = set()
    unique_samples = []
    for sample in candidates:
        if sample.name not in seen_labels:
            seen_labels.add(sample.name)
            unique_samples.append(sample)

    # Truncate before printing the header so the announced count matches
    # the number of samples actually shown.
    shown = unique_samples[:num_samples]

    print(f"Previewing {len(shown)} sample prompts:\n")
    for position, sample in enumerate(shown, start=1):
        print(f"Sample {position}:")
        print(f"Domain: {sample['domain']}")
        print(f"Roleplay: {sample['roleplay_condition']}")
        print(f"Good/Bad: {sample['good_bad_level']}")
        print(f"Uncertainty: {sample['uncertainty_level']}")
        print(f"Simulations: {sample['simulation_number']}")
        print(f"Extra: {sample['extra_prompt']}")
        print(f"Main Prompt: \"{sample['main_prompt']}\"")
        print(f"System Prompt: \"{sample['system_prompt']}\"\n")

# Call preview function to verify prompts before proceeding.
# Prints at most 10 sample prompts to the cell output for manual inspection.
preview_prompts(df, num_samples=10)

## 3. Query Models with Retry Logic and Checkpointing

In [35]:
import time
import glob
import os
from datetime import datetime
import openai
import anthropic
import re

# API clients. Every key is read from an environment variable; os.getenv
# returns None when the variable is unset.
anthropic_client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)
openai_client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)
# OpenRouter and DeepSeek are reached through the OpenAI client class by
# pointing base_url at their endpoints.
openrouter_client = openai.OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.getenv("OPENROUTER_API_KEY")
)
deepseek_client = openai.OpenAI(
  base_url="https://api.deepseek.com",
  api_key=os.getenv("DEEPSEEK_API_KEY")
)

# Maps the model labels used in the experiment table to the identifiers sent
# to each API. "Claude 3.7 Extended" has no entry here: query_model passes its
# identifier directly.
model_mapping = {
    "GPT-4o": "gpt-4o",
    "GPT-4.5":"gpt-4.5-preview-2025-02-27",
    "o3-mini": "o3-mini",
    "Claude 3.7":"claude-3-7-sonnet-20250219",
    "Claude Opus 3.0":"claude-3-opus-20240229",
    "gemini-flash 2.0": "google/gemini-2.0-flash-001",
    "gemini-flash 2.0 thinking": "google/gemini-2.5-pro-preview-03-25", # label kept, but routed to Gemini 2.5 Pro since flash thinking doesn't seem to work
    "deepseek v3": "deepseek-chat",
    "deepseek R1": "deepseek-reasoner",
}

class _UnsupportedModelError(ValueError):
    """Raised for a model label with no query branch; retrying cannot help."""


def _query_once(model, system_prompt, main_prompt):
    """Send a single request to the API behind *model*.

    Returns:
        A ``(response_text, thinking_text)`` tuple. ``thinking_text`` is ""
        for models that do not expose their reasoning.

    Raises:
        _UnsupportedModelError: If *model* has no branch below.
        Exception: Whatever the underlying API client raises.
    """
    chat_messages = [{"role": "system", "content": system_prompt},
                     {"role": "user", "content": main_prompt}]

    if model in ["GPT-4o", "GPT-4.5"]:
        # OpenAI API call
        response = openai_client.chat.completions.create(
            model=model_mapping[model],
            messages=chat_messages,
            temperature=0.7,
            max_tokens=2000
        )
        return response.choices[0].message.content, ""

    if model == "o3-mini":
        # o3-mini (medium) API call; no temperature/max_tokens are passed here
        response = openai_client.chat.completions.create(
            model=model_mapping[model],
            messages=chat_messages,
            reasoning_effort="medium"
        )
        return response.choices[0].message.content, ""

    if model in ["Claude 3.7", "Claude Opus 3.0"]:
        # Anthropic API call (system prompt is a separate argument)
        response = anthropic_client.messages.create(
            model=model_mapping[model],
            system=system_prompt,
            messages=[{"role": "user", "content": main_prompt}],
            temperature=0.7,
            max_tokens=2000
        )
        return response.content[0].text, ""

    if model == "Claude 3.7 Extended":
        # Anthropic API call with extended thinking enabled
        response = anthropic_client.messages.create(
            model="claude-3-7-sonnet-20250219",
            system=system_prompt,
            messages=[{"role": "user", "content": main_prompt}],
            temperature=1,
            thinking={
                "type": "enabled",
                "budget_tokens": 5000
            },
            max_tokens=10000
        )
        # Select content blocks by type instead of by position, so an extra
        # or missing block (e.g. redacted thinking) cannot shift the indices.
        answer_text = "".join(
            block.text for block in response.content if block.type == "text")
        thinking_text = "".join(
            block.thinking for block in response.content if block.type == "thinking")
        return answer_text, thinking_text

    if model in ["gemini-flash 2.0", "gemini-flash 2.0 thinking"]:
        response = openrouter_client.chat.completions.create(
            model=model_mapping[model],
            messages=chat_messages,
            temperature=0.7,
            max_tokens=8000
        )
        return response.choices[0].message.content, ""

    if model in ["deepseek v3", "deepseek R1"]:
        response = deepseek_client.chat.completions.create(
            model=model_mapping[model],
            messages=chat_messages,
            temperature=0.7,
            max_tokens=2000
        )
        message = response.choices[0].message
        if model == "deepseek R1":
            return message.content, message.reasoning_content
        return message.content, ""

    raise _UnsupportedModelError(f"Unsupported model: {model}")


def query_model(row):
    """Query the model named in *row*, retrying failed requests.

    Up to five attempts are made. After a failed attempt the function sleeps
    with exponential backoff (2, 4, 8, 16 seconds) before trying again. An
    unsupported model label is reported immediately without retrying, because
    no retry can make it succeed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the entries
            "model", "main_prompt" and "system_prompt".

    Returns:
        A 3-tuple ``(response, thinking, error)``:
        on success ``(response_text, thinking_text, None)``;
        on failure ``(None, None, error_message)``, where the message
        describes the last failed attempt.
    """
    model = row["model"]
    main_prompt = row["main_prompt"]
    system_prompt = row["system_prompt"]
    max_retries = 5
    retry_delay = 2  # seconds

    for attempt in range(max_retries):
        try:
            answer_text, thinking_text = _query_once(model, system_prompt, main_prompt)
            return answer_text, thinking_text, None
        except _UnsupportedModelError as e:
            # Configuration error, not a transient API failure: do not retry.
            return None, None, str(e)
        except Exception as e:
            # Broad on purpose: any API/client failure is recorded in the
            # row's error log rather than aborting the whole run.
            error = f"Attempt {attempt+1} failed: {str(e)}"
            if attempt < max_retries - 1:
                time.sleep(retry_delay * (2 ** attempt))  # Exponential backoff
            else:
                return None, None, error
    
def extract_answer(response):
    """Extract the numerical answer from the first <answer>...</answer> pair.

    The tag content may span several lines and may contain surrounding text
    such as a percent sign. A value is returned only when the content holds
    exactly one number; ambiguous content (e.g. a range like "50-60") yields
    None instead of a number glued together from unrelated digits.

    Args:
        response: The model's response text, or None/empty.

    Returns:
        The answer as a float, or None if there is no response, no answer
        tags, or no single unambiguous number inside them.
    """
    if not isinstance(response, str) or not response:
        return None

    # DOTALL lets the answer sit on its own line between the tags.
    tagged = re.search(r'<answer>(.*?)</answer>', response, re.DOTALL)
    if tagged is None:
        return None

    # Unsigned integers or decimals, including a bare leading dot (".5")
    numbers = re.findall(r'\d+(?:\.\d+)?|\.\d+', tagged.group(1))
    if len(numbers) != 1:
        return None
    return float(numbers[0])

def process_model(model_name, df, time_delay=0.2, save_frequency=60):
    """Process all queries for a specific model with resume capability.

    Rows of *df* belonging to *model_name* are queried one by one. Rows that
    already have a response and no logged error are skipped, so calling this
    on a reloaded checkpoint continues where the previous run stopped and
    retries rows that failed.

    Note: This function modifies the passed dataframe in place (and also
    returns it).

    Args:
        model_name: Model label to process (value of the "model" column).
        df: DataFrame with the prompt columns and the result columns
            "timestamp", "response", "thinking", "answer" and "error_log".
        time_delay: Seconds to sleep after each query to avoid rate limits.
        save_frequency: Write a checkpoint CSV after this many processed
            queries; 0 or None disables intermediate checkpoints.

    Returns:
        The same dataframe, with results filled in.
    """
    output_dir = "results/2nd experiment"

    def save_checkpoint():
        """Write the whole dataframe to a timestamped CSV and return its path."""
        save_path = f"{output_dir}/wishful_thinking_2nd_exp_results_{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(save_path, index=False)
        return save_path

    model_df = df[df["model"] == model_name].copy()

    print(f"Processing {len(model_df)} queries for {model_name}...")

    # Count pending items
    pending_count = len(model_df[(model_df["response"].isna()) | (model_df["error_log"].notna())])
    print(f"Found {pending_count} pending or failed queries to process")

    # Make sure checkpoints have somewhere to go before spending API calls.
    os.makedirs(output_dir, exist_ok=True)

    # A column that is entirely empty in a reloaded CSV comes back as float
    # NaN; cast the text columns to object so strings can be stored in them.
    for column in ("timestamp", "response", "thinking", "error_log"):
        df[column] = df[column].astype(object)

    processed_count = 0
    for idx, row in model_df.iterrows():
        # Skip already processed rows with valid responses
        if not pd.isna(row["response"]) and pd.isna(row["error_log"]):
            continue

        # Update with timestamp
        df.at[idx, "timestamp"] = datetime.now().isoformat()

        # Query model
        response, thinking, error = query_model(row)
        df.at[idx, "response"] = response
        df.at[idx, "thinking"] = thinking
        df.at[idx, "error_log"] = error

        # Extract answer
        if response:
            df.at[idx, "answer"] = extract_answer(response)

        # Small delay to avoid rate limits
        time.sleep(time_delay)
        processed_count += 1

        # Save progress every `save_frequency` processed rows
        if save_frequency and processed_count % save_frequency == 0:
            print(f"Progress saved to {save_checkpoint()}")

    # Final save
    print(f"All processing complete. Results saved to {save_checkpoint()}")
    return df

Checking the models

In [None]:
# Smoke test: send a trivial "Hello" prompt to every model.
# df_hello carries only the three columns that query_model() reads.
hello_count = len(models)
df_hello = pd.DataFrame({
    "model": models,
    "main_prompt": ["Hello"] * hello_count,
    "system_prompt": ["You are a helpful assistant."] * hello_count
})

# Testing Models: print each model label followed by the
# (response, thinking, error) tuple returned by query_model().
# The last two entries of `models` are the expensive ones.
for _, hello_row in df_hello.iterrows():
    print(f'{hello_row["model"]} {query_model(hello_row)}')

##  4. Main Execution

In [None]:
# Locate the newest checkpoint CSV written for a given model
def get_latest_result_file(model_name):
    """Return the path of the most recent result file for *model_name*.

    Candidates are the files in "results/2nd experiment/" matching the
    checkpoint naming pattern; the one with the largest os.path.getctime
    value wins. Returns None when no file matches.
    """
    pattern = f"results/2nd experiment/wishful_thinking_2nd_exp_results_{model_name}_*.csv"
    candidates = glob.glob(pattern)
    return max(candidates, key=os.path.getctime) if candidates else None

# We run one model at a time
model_to_resume = models[7]  # Change to your model

# Set to True to ignore any saved checkpoint and process the in-memory
# dataframe from scratch. An explicit flag replaces the previous
# unconditional `latest_file = None`, which made the resume branch
# unreachable.
start_from_scratch = False

latest_file = None if start_from_scratch else get_latest_result_file(model_to_resume)
if latest_file:
    # Load the existing results
    saved_df = pd.read_csv(latest_file)
    print(f"Loaded {len(saved_df)} rows from {latest_file}")

    # To continue processing just this model
    updated_df = process_model(model_to_resume, saved_df, time_delay=0)

else:
    # Process from scratch
    df = process_model(model_to_resume, df)