## Prepare the dataset format required for implementing the experimental manipulation
This is a simple script to randomly select and export the rows from the PRISM dataset (Kirk et al., 2024) to be used to implement the experiment in Study 2. 

In [None]:
##############################################
# Code implemented on:
# - Python Version: 3.11.11
##############################################

################### Imports #################
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from pandas import json_normalize  # For unnesting

################### Configs #################
# Show every column and allow wide rows so DataFrame previews are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
# Apply seaborn's default plotting theme.
sns.set()
# Seed NumPy's global random generator.
np.random.seed(89)

################### Paths #################
# The project root defaults to the original local checkout. Set the
# MY498_PROJECT_ROOT environment variable to run the notebook from another
# location without editing this cell.
PROJECT_ROOT = os.environ.get(
    "MY498_PROJECT_ROOT",
    "/Users/carolinewagner/Desktop/Local/MY498-capstone",
)

# Input: classified data, read below as line-delimited JSON (one record per line).
classified_path = os.path.join(PROJECT_ROOT, "01_data", "07_final_classified_data", "final_data_with_uncertainty.jsonl")

# Output directory for the files needed to implement the experiment.
EXPERIMENT_IMPLEMENTATION_DIR = os.path.join(PROJECT_ROOT, "01_data", "09_experimental_implementation")

# Output directory for descriptive-analysis outputs.
DESCRIPTIVE_ANALYSIS_DIR = os.path.join(PROJECT_ROOT, "03_outputs", "02_descriptive_analyses")

################### Load data #################
classified_data = pd.read_json(classified_path, lines=True)

# (1) Unnest specific nested columns
def unnest_columns(df, columns):
    """Expand dict-valued columns into flat ``<column>_<key>`` columns.

    Args:
        df: DataFrame that may contain columns whose cells are dicts.
        columns: Names of the columns to expand. Names that are not present
            in ``df`` are skipped.

    Returns:
        A new DataFrame in which each expanded column is replaced by one
        column per key, named ``<column>_<key>``. The input frame is not
        modified and the row index is preserved.
    """
    for col in columns:
        if col not in df.columns:
            continue

        # Missing cells (None / NaN) are treated as empty records so they
        # expand to NaN instead of making json_normalize fail.
        records = [
            {} if value is None or (isinstance(value, float) and value != value) else value
            for value in df[col]
        ]
        expanded = json_normalize(records)
        expanded.columns = [f"{col}_{subcol}" for subcol in expanded.columns]

        # json_normalize numbers its rows 0..n-1, while join() matches on index
        # labels. Reuse df's index so rows line up even when df does not have
        # a default RangeIndex (e.g. after filtering).
        expanded.index = df.index

        df = df.drop(columns=[col]).join(expanded)
    return df

nested_columns = ["location", "religion", "ethnicity"]
classified_data = unnest_columns(classified_data, nested_columns)

# (2) Preview the first 100 rows of the flattened data
preview_rows = classified_data.head(100)
display(preview_rows)

################### Define toggles #################
# Set the toggle below to True if you want the outputs generated in this script to be
# saved to disk automatically. In the code shown here it gates the CSV export of the
# sampled prompts.
to_save = True

In [None]:
# The power analyses suggested that 800 total prompts are required for an effect size of 0.01.
# Therefore, I need 400 prompts for each category, and to implement a balanced experimental manipulation,
# I am going to randomly select 200 prompts from the Hobson's Choice and 200 from What/How respectively
# and write the counterfactual for each of those.

# Sampling parameters, named once so comments and code cannot drift apart.
# NOTE(review): the two groups are drawn at different sizes (300 vs 250), both above the
# 200 per group described above. Presumably this leaves a buffer for unusable prompts;
# confirm that the imbalance is intended.
N_HOBSON = 300
N_WHATHOW = 250
SAMPLE_SEED = 42
N_PROMPT_TEMPLATES = 5  # templates are numbered 0-4
TEMPLATE_SEED = 89

# (1) Define relevant columns to retain
columns_to_keep = [
    'utterance_id', 'user_prompt',
    'whathow_q_debiased', 'hobsons_c_debiased'
]

# (2) Filter for Hobson's Choice and What/How prompts
# `== True` is used rather than truthiness so that rows with a missing flag are excluded.
# NOTE(review): a prompt flagged in both categories could be drawn into both samples;
# verify that the two flags are mutually exclusive.
hobson_prompts = classified_data[classified_data['hobsons_c_debiased'] == True]
whathow_prompts = classified_data[classified_data['whathow_q_debiased'] == True]

# (3) Randomly sample N_HOBSON Hobson's Choice and N_WHATHOW What/How prompts
# (sample() raises a ValueError if a group has fewer rows than requested)
hobson_sample = hobson_prompts.sample(n=N_HOBSON, random_state=SAMPLE_SEED)[columns_to_keep].copy()
whathow_sample = whathow_prompts.sample(n=N_WHATHOW, random_state=SAMPLE_SEED)[columns_to_keep].copy()

# (4) Assign random prompt_template (0–4) !!within each group!!
# Given that I have five different prompt types for each experimental manipulation; to
# account for prompt sensitivity, I am going to create a variable that randomly assigns
# a number from 0 to 4 in a new column called prompt_template.
# A locally seeded generator is used instead of NumPy's global state, so re-running this
# cell on its own gives the same assignment. With the seed used in the setup cell (89),
# this should reproduce the draws of a fresh top-to-bottom run, provided nothing else
# consumes the global generator in between.
template_rng = np.random.RandomState(TEMPLATE_SEED)
hobson_sample['prompt_template'] = template_rng.randint(0, N_PROMPT_TEMPLATES, size=len(hobson_sample))
whathow_sample['prompt_template'] = template_rng.randint(0, N_PROMPT_TEMPLATES, size=len(whathow_sample))

# (5) Combine the samples
balanced_prompts = pd.concat([hobson_sample, whathow_sample], ignore_index=True)

# (6) Sort by question type first, then by prompt_template
balanced_prompts = balanced_prompts.sort_values(
    by=['hobsons_c_debiased', 'prompt_template'],
    ascending=[False, True]  # Hobson prompts first
).reset_index(drop=True)

# (7) Preview
display(balanced_prompts.head())

# (8) Ensure folder exists and save to file if enabled
if to_save:
    os.makedirs(EXPERIMENT_IMPLEMENTATION_DIR, exist_ok=True)
    output_path = os.path.join(EXPERIMENT_IMPLEMENTATION_DIR, "01_prompts_for_experimental_manipulation.csv")
    balanced_prompts.to_csv(output_path, index=False)
    print(f"Saved to {output_path}")

Unnamed: 0,utterance_id,user_prompt,whathow_q_debiased,hobsons_c_debiased,prompt_template
0,ut41582,Our current calendar system is confusing and i...,0,1,0
1,ut59249,I would like to know travel spots to visit in ...,0,1,0
2,ut16809,I am speaking at an event soon - i want to mak...,0,1,0
3,ut66462,Why new gen lack work ethics,0,1,0
4,ut51255,tomato sauce does'nt belong on a meat pie,0,1,0


Saved to /Users/carolinewagner/Desktop/Local/MY498-capstone/01_data/09_experimental_implementation/01_prompts_for_experimental_manipulation.csv
