### 0. Import Settings
#### Import necessary functions and packages

In [20]:
import pathlib
import sys
# Add src module to path before import.
sys.path.insert(0, str(pathlib.Path('../src')))
from file_IO_handler import get_plaintext_file_contents
from openai_handler import (
    verify_openai_access, 
    OpenAIModelSettings, 
    MODELS,  # Changed from ENGINES to MODELS
    call_openai_chat_api  # Changed from call_openai_api to call_openai_chat_api
)
from fill_string_template import get_filled_strings_from_dataframe, FilledString
from run_simulation import run_single_simulation, save_simulation_result_to_unique_location
from process_results import (
    consolidate_jsons_to_mega_json, 
    process_mega_json_for_no_complete_prompt, 
    consolidate_jsons_to_mega_json_by_engine_prompt
)

### 1. Prior Settings
#### 1.1 Import the filtered 20 questions and 100 randomly selected outgroup answers
##### Read the 2 csv files into dataframe, show basic structure and first few lines and demonstrate all the column numbers

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Set file path
file_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\Out-group QA"

# Read the first CSV file: outgroup_answers.csv
print("=== Reading outgroup_answers.csv ===")
outgroup_df = pd.read_csv(f"{file_path}\\outgroup_answers.csv")

# Display basic information
print(f"Data shape: {outgroup_df.shape}")
print(f"Column names: {list(outgroup_df.columns)}")
print("\nFirst 5 rows:")
display(outgroup_df.head())

print("\n" + "="*50 + "\n")

# Read the second CSV file: questions.csv  
print("=== Reading questions.csv ===")
questions_df = pd.read_csv(f"{file_path}\\questions.csv")

# Display basic information
print(f"Data shape: {questions_df.shape}")
print(f"Column names: {list(questions_df.columns)}")
print("\nFirst 5 rows:")
display(questions_df.head())

print("\n" + "="*50 + "\n")

# Data overview
print("=== Data Overview ===")
print("outgroup_answers.csv data types:")
print(outgroup_df.dtypes)
print(f"\nMissing values count:")
print(outgroup_df.isnull().sum())

print("\n" + "-"*30 + "\n")

print("questions.csv data types:")
print(questions_df.dtypes)
print(f"\nMissing values count:")
print(questions_df.isnull().sum())

# Optional: Save processed data to new files
# outgroup_df.to_csv(f"{file_path}\\outgroup_answers_processed.csv", index=False)
# questions_df.to_csv(f"{file_path}\\questions_processed.csv", index=False)

print("\nData extraction completed!")

=== Reading outgroup_answers.csv ===
Data shape: (100, 5)
Column names: ['Input.title', 'Input.body', 'Answer.Confidence', 'Answer.reason for confidence rating', 'Answer.answer']

First 5 rows:


Unnamed: 0,Input.title,Input.body,Answer.Confidence,Answer.reason for confidence rating,Answer.answer
0,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,,Phone conversations can be difficult even for ...
1,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,,skype is a good tool for interaction with thos...
2,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Very confident,Experience with Skype,I definitely think that utilizing Skype for ot...
3,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,,"Rather than skype, a website called compassion..."
4,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,Skype is so popular and widely distributed tha...,"Yes, i do and so do many others, especially si..."




=== Reading questions.csv ===
Data shape: (20, 2)
Column names: ['Title', 'Body']

First 5 rows:


Unnamed: 0,Title,Body
0,"Dr Who, is he Autistic?",I watch Dr Who each week and I keep picking ou...
1,Haldol and Cogentin (What is your Medication)?,Anyone here ever take Haldol or Cogentin? Als...
2,Problem with friendships...,"Ever since I was a child, I've been overly att..."
3,How do you cope with power cuts?,"Just recently, I've been having power cuts at ..."
4,Problems with phones,Hi there - i am new to this forum so bear with...




=== Data Overview ===
outgroup_answers.csv data types:
Input.title                            object
Input.body                             object
Answer.Confidence                      object
Answer.reason for confidence rating    object
Answer.answer                          object
dtype: object

Missing values count:
Input.title                             0
Input.body                              0
Answer.Confidence                       1
Answer.reason for confidence rating    47
Answer.answer                           0
dtype: int64

------------------------------

questions.csv data types:
Title    object
Body     object
dtype: object

Missing values count:
Title    0
Body     0
dtype: int64

Data extraction completed!


#### 1.2 Calculate average length for the out-group answers
##### This calculated average length will be used as a length limitation for later prompt template

In [3]:
# Calculate average word length for Answer.answer column
print("=== Answer Analysis ===")

# Remove NaN values and calculate word count for each answer
valid_answers = outgroup_df['Answer.answer'].dropna()
word_counts = valid_answers.apply(lambda x: len(str(x).split()))

# Calculate average word length
avg_word_length = word_counts.mean()

# Store in variable and display results
average_answer_word_length = avg_word_length

print(f"Total number of valid answers: {len(valid_answers)}")
print(f"Average word length per answer: {average_answer_word_length:.2f} words")
print(f"Min word count: {word_counts.min()}")
print(f"Max word count: {word_counts.max()}")
print(f"Standard deviation: {word_counts.std():.2f}")

# Display some statistics about word distribution
print(f"\nWord count distribution:")
print(word_counts.describe())

print("\nAnswer analysis completed!")

=== Answer Analysis ===
Total number of valid answers: 100
Average word length per answer: 49.27 words
Min word count: 3
Max word count: 237
Standard deviation: 40.21

Word count distribution:
count    100.000000
mean      49.270000
std       40.210521
min        3.000000
25%       22.750000
50%       35.500000
75%       68.250000
max      237.000000
Name: Answer.answer, dtype: float64

Answer analysis completed!


#### 1.3 LM parameter settings and LM selection
##### Determine the appropriate LM parameter settings and test valid LM selected for our parameter settings

In [4]:
import pathlib
import time
from typing import List, Dict, Any
import json

# Assume you have these functions defined in your module
# from your_module import verify_openai_access, OpenAIModelSettings, call_openai_api

class OpenAIModelSettings:
    """Model settings class for v1/chat/completions API"""
    def __init__(
        self,
        model: str = "gpt-3.5-turbo",  # Changed from 'engine' to 'model'
        max_tokens: int = 1000,
        temperature: float = 0.3,
        n: int = 1,
        presence_penalty: float = 0.1,
        frequency_penalty: float = 0.1,
        stop: List[str] = None,
        params_descriptor: str = "autism-community-response"
    ):
        self.model = model  # Chat completions uses 'model' instead of 'engine'
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.n = n
        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.stop = stop
        self.params_descriptor = params_descriptor
        
    def to_chat_completion_params(self, messages: List[Dict[str, str]]) -> Dict[str, Any]:
        """Convert settings to chat completion API parameters"""
        params = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "n": self.n,
            "presence_penalty": self.presence_penalty,
            "frequency_penalty": self.frequency_penalty,
        }
        
        if self.stop is not None:
            params["stop"] = self.stop
            
        return params

def call_openai_chat_api(prompt: str, model_settings: OpenAIModelSettings, client) -> str:
    """Call OpenAI Chat Completions API"""
    # Convert prompt to chat format
    messages = [{"role": "user", "content": prompt}]
    
    # Get API parameters
    params = model_settings.to_chat_completion_params(messages)
    
    try:
        # Call the chat completions endpoint
        response = client.chat.completions.create(**params)
        
        # Extract the response content
        return response.choices[0].message.content
        
    except Exception as e:
        raise e

def test_multiple_openai_models():
    """Function to test multiple OpenAI models"""
    
    # List of available OpenAI models to test
    models_to_test = [
        "gpt-4o",
        "gpt-4",
        "gpt-4-turbo",
        "gpt-3.5-turbo",
        "gpt-3.5-turbo-16k"
    ]
    
    # Test prompts
    test_prompts = [
        "Q: How many legs does a cat have?",
        "Q: What is the capital of France?",
        "Q: Explain photosynthesis in simple terms.",
        "Q: What are the benefits of exercise?",
        "Q: How do you make a paper airplane?"
    ]
    
    # Initialize OpenAI client
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
        print("OpenAI client initialized successfully")
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}")
        return
    
    # Test each model
    for model in models_to_test:
        # Create model settings
        model_settings = OpenAIModelSettings(
            model=model,
            max_tokens=1000,
            temperature=0.3,
            n=1,
            presence_penalty=0.1,
            frequency_penalty=0.1,
            stop=None,
            params_descriptor="autism-community-response"
        )
        
        # Test first prompt only to check if model works
        prompt = test_prompts[0]
        
        try:
            # Call Chat Completions API
            response = call_openai_chat_api(prompt, model_settings, client)
            print(f"✓ Model {model}: Working")
            
        except Exception as e:
            print(f"✗ Model {model}: Error - {e}")
        
        # Add delay to avoid API rate limits
        time.sleep(1)

def test_single_model(model_name: str, custom_prompt: str = None):
    """Function to test a single model"""
    print(f"Testing single model: {model_name}")
    
    # Initialize client
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
    except Exception as e:
        print(f"Failed to initialize client: {e}")
        return
    
    # Use custom prompt or default prompt
    prompt = custom_prompt or "Q: How many legs does a cat have?"
    
    # Create model settings
    model_settings = OpenAIModelSettings(
        model=model_name,
        max_tokens=1000,
        temperature=0.3,
        n=1,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        stop=None,
        params_descriptor="autism-community-response"
    )
    
    try:
        print(f"Prompt: {prompt}")
        start_time = time.time()
        response = call_openai_chat_api(prompt, model_settings, client)  # Changed to chat API
        end_time = time.time()
        
        print(f"Response time: {end_time - start_time:.2f}s")
        print(f"Response: {response}")
        
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    print("OpenAI Model Testing Script")
    print("=" * 50)
    
    # # Choose running mode
    # print("Choose running mode:")
    # print("1. Test all models")
    # print("2. Test single model")
    
    # choice = input("Enter your choice (1 or 2): ").strip()
    
    # if choice == "1":
        # Test all models
    test_multiple_openai_models()
    # elif choice == "2":
    #     # Test single model
    #     model_name = input("Enter model name (e.g., gpt-3.5-turbo): ").strip()
    #     custom_prompt = input("Enter custom prompt (press Enter for default): ").strip()
    #     test_single_model(model_name, custom_prompt if custom_prompt else None)
    # else:
    #     print("Invalid choice")
    
    print("\nTesting completed!")

OpenAI Model Testing Script
OpenAI client initialized successfully
✓ Model gpt-4o: Working
✓ Model gpt-4: Working
✓ Model gpt-4-turbo: Working
✓ Model gpt-3.5-turbo: Working
✓ Model gpt-3.5-turbo-16k: Working

Testing completed!


### 2. Out-group simulation
#### 2.1 Design the demographic distributions based on Hong's study
##### We design the demogrphic profile across 3 dimensions and refer to the original study's demographic survey results
##### 3 dimensions: Personal Demographic Information + Experience with Autism + Knowledge with Autism

In [5]:
# 2.1 Design demographic distributions based on original study
print("=== Step 2.1: Demographics Distribution Design ===")

import random
import pandas as pd
from typing import Dict, List, Tuple
import json

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

def create_demographic_distributions():
    """Create demographic distributions based on original study data"""
    
    # Age distribution to achieve average 33.4 years
    age_distribution = {
        "25-30": 20,    # 20 people, average 27.5 years
        "30-35": 35,    # 35 people, average 32.5 years  
        "35-40": 30,    # 30 people, average 37.5 years
        "40-50": 15     # 15 people, average 45 years
    }
    # Calculation: (20×27.5 + 35×32.5 + 30×37.5 + 15×45) ÷ 100 = 33.4 years
    
    # Gender distribution
    gender_distribution = {
        "female": 51,   # 51%
        "male": 49      # 49%
    }
    
    # Location distribution (English-speaking countries only)
    location_distribution = {
        "US": 76,                    # 76%
        "Canada": 8,                 # 8%
        "UK": 6,                     # 6%
        "Australia": 5,              # 5%
        "New_Zealand": 3,            # 3%
        "Ireland": 2                 # 2%
    }
    
    # Autism experience distribution based on study data
    # 70% have experience, 40% regular interaction, 33% caregivers, 4% professionals
    autism_experience_distribution = {
        "caregiver": 33,              # 33% caregivers
        "professional": 4,            # 4% professionals  
        "regular_interaction": 3,     # 3% other regular interaction (40% - 33% - 4%)
        "some_experience": 30,        # 30% some experience but not regular (70% - 40%)
        "no_experience": 30          # 30% no direct experience (100% - 70%)
    }
    
    # Knowledge level distribution
    knowledge_level_distribution = {
        "none": 6,     # 6%
        "little": 65,  # 65%
        "a_lot": 29    # 29%
    }
    
    return {
        "age": age_distribution,
        "gender": gender_distribution,
        "location": location_distribution,
        "autism_experience": autism_experience_distribution,
        "knowledge_level": knowledge_level_distribution
    }

# Create distributions
distributions = create_demographic_distributions()
print("Demographic distributions created successfully!")
print(f"Age distribution: {distributions['age']}")
print(f"Gender distribution: {distributions['gender']}")
print(f"Location distribution: {distributions['location']}")
print(f"Autism experience distribution: {distributions['autism_experience']}")
print(f"Knowledge level distribution: {distributions['knowledge_level']}")

=== Step 2.1: Demographics Distribution Design ===
Demographic distributions created successfully!
Age distribution: {'25-30': 20, '30-35': 35, '35-40': 30, '40-50': 15}
Gender distribution: {'female': 51, 'male': 49}
Location distribution: {'US': 76, 'Canada': 8, 'UK': 6, 'Australia': 5, 'New_Zealand': 3, 'Ireland': 2}
Autism experience distribution: {'caregiver': 33, 'professional': 4, 'regular_interaction': 3, 'some_experience': 30, 'no_experience': 30}
Knowledge level distribution: {'none': 6, 'little': 65, 'a_lot': 29}


#### 2.2 Profile Generation System
##### Generate 100 different profiles across 3 dimensions: basic demographic information; autism-related experiences; autism-related knowledge
##### First design the profiles with the most important dimension: autism-related experiences
##### Then use the "weighted_random_choice" to incorporate other dimensions into the profile desig

In [6]:
# 2.2 Generate 100 individual profiles
print("\n=== Step 2.2: Profile Generation ===")

"""
Here randomly select the specific dimensional information according to their distribution
e.g. For female(51%) --> 51% of probability to select the gender of the specific profile to be female
In this way, 100 profiles are generated
"""
def weighted_random_choice(distribution_dict: Dict[str, int]) -> str:
    """Generate weighted random choice based on distribution"""
    choices = []
    weights = []
    for choice, weight in distribution_dict.items():
        choices.append(choice)
        weights.append(weight)
    
    return random.choices(choices, weights=weights, k=1)[0]

def generate_responder_profiles(n: int = 100) -> List[Dict]:
    """Generate n responder profiles with realistic demographic combinations"""
    profiles = []
    
    # First, create the exact distribution for autism experience (most constrained)
    experience_types = []
    for exp_type, count in distributions['autism_experience'].items():
        experience_types.extend([exp_type] * count)
    
    # Shuffle to randomize order
    random.shuffle(experience_types)
    
    # Generate profiles
    for i, exp_type in enumerate(experience_types):
        profile = {
            'id': i + 1,
            'autism_experience': exp_type,
            'age_group': weighted_random_choice(distributions['age']),
            'gender': weighted_random_choice(distributions['gender']),
            'location': weighted_random_choice(distributions['location']),
            'knowledge_level': weighted_random_choice(distributions['knowledge_level'])
        }
        profiles.append(profile)
    
    return profiles

# Generate 100 profiles
responder_profiles = generate_responder_profiles(100)

# Verify distributions
print("Profile generation completed!")
print(f"Total profiles generated: {len(responder_profiles)}")

# Verify distribution accuracy
exp_counts = {}
for profile in responder_profiles:
    exp = profile['autism_experience']
    exp_counts[exp] = exp_counts.get(exp, 0) + 1

print(f"Autism experience verification: {exp_counts}")

# Display first 5 profiles as examples
print("\nFirst 5 profiles:")
for i in range(5):
    print(f"Profile {i+1}: {responder_profiles[i]}")


=== Step 2.2: Profile Generation ===
Profile generation completed!
Total profiles generated: 100
Autism experience verification: {'some_experience': 30, 'no_experience': 30, 'caregiver': 33, 'regular_interaction': 3, 'professional': 4}

First 5 profiles:
Profile 1: {'id': 1, 'autism_experience': 'some_experience', 'age_group': '30-35', 'gender': 'male', 'location': 'UK', 'knowledge_level': 'little'}
Profile 2: {'id': 2, 'autism_experience': 'some_experience', 'age_group': '35-40', 'gender': 'female', 'location': 'Australia', 'knowledge_level': 'little'}
Profile 3: {'id': 3, 'autism_experience': 'no_experience', 'age_group': '30-35', 'gender': 'female', 'location': 'US', 'knowledge_level': 'little'}
Profile 4: {'id': 4, 'autism_experience': 'caregiver', 'age_group': '35-40', 'gender': 'male', 'location': 'US', 'knowledge_level': 'little'}
Profile 5: {'id': 5, 'autism_experience': 'some_experience', 'age_group': '40-50', 'gender': 'female', 'location': 'US', 'knowledge_level': 'none'}


#### 2.3 Prompt Template Generation
##### Combine the Amazon MTurk template with our designed 100 unique demographic profiles, get the unique 100 templates

In [7]:
# 2.3 Create prompt template system
print("\n=== Step 2.3: Prompt Template System ===")

"""
Here this function convert a distribution information into a natural language expression to incorporate this information into the prompt description
"""
def create_persona_context(profile: Dict) -> str:
    """Generate persona context based on profile"""
    
    # Age description
    age_ranges = {
        "25-30": "25-30",
        "30-35": "30-35", 
        "35-40": "35-40",
        "40-50": "40-50"
    }
    age_desc = f"You are a {age_ranges[profile['age_group']]} year old"
    
    # Gender and location
    gender_desc = profile['gender']
    location_map = {
        "US": "the United States",
        "Canada": "Canada",
        "UK": "the United Kingdom",
        "Australia": "Australia", 
        "New_Zealand": "New Zealand",
        "Ireland": "Ireland"
    }
    location_desc = f"living in {location_map[profile['location']]}"
    
    # Autism experience description
    exp_descriptions = {
        'caregiver': "You are a caregiver (parent/spouse/sibling) of someone with autism",
        'professional': "You work professionally with individuals with autism (teacher/therapist/social worker)",
        'regular_interaction': "You regularly interact with someone with autism (friend/colleague/neighbor)",
        'some_experience': "You have some limited experience with autism (through volunteering, brief encounters, or training)",
        'no_experience': "You have no direct personal experience with autism"
    }
    exp_desc = exp_descriptions[profile['autism_experience']]
    
    # Knowledge level description
    knowledge_map = {
        "none": "nothing",
        "little": "a little",
        "a_lot": "a lot"
    }
    knowledge_desc = f"and you know {knowledge_map[profile['knowledge_level']]} about autism"
    
    return f"""BACKGROUND: {age_desc} {gender_desc} {location_desc}. {exp_desc}, {knowledge_desc}. 
When answering, draw from your specific background and experience level."""

"""
The average word length of the Amazon MTurk responese = 49.27
We calculate a dynamic range of length to limit the length of AI-responses: 80%-120% of the average MTurk out-group responses
"""
def create_length_constraint(average_word_length: float, constraint_type: str = "flexible") -> Tuple[int, int, str]:
    """Create length constraint based on average word length"""
    
    if constraint_type == "flexible":
        # Flexible constraint: ±20%
        min_length = max(30, int(average_word_length * 0.8))
        max_length = int(average_word_length * 1.2)
        instruction = f"Keep your answer (task 3) between {min_length}-{max_length} words."
    
    return min_length, max_length, instruction

"""
Incorporate each part to build a complet prompt: Introduction + Question title&body + Personal context + Task instruction 
"""
def create_mturk_style_prompt(profile: Dict, question_title: str, question_body: str, 
                             average_word_length: float) -> str:
    """Create MTurk-style prompt with demographic background"""
    
    # Base introduction from original MTurk template
    introduction = """Introduction: Many individuals with autism visit online communities to ask questions about issues they are facing. We would like to know whether people outside of the specific community could provide answers for them. A question from a special autism community will be shown to you. Please read the question carefully, rate your confidence and provide your own answer to the question."""
    
    # Question section
    question_section = f"""
Question Title: {question_title}
Question Body: "{question_body}"
"""
    
    # Generate persona context
    persona_context = create_persona_context(profile)
    
    # Create length constraint
    min_length, max_length, length_instruction = create_length_constraint(average_word_length)
    
    # Tasks section with background integration
    tasks = f"""
Tasks:
{persona_context}

Now, based on your background and experience:

1. Rate your confidence in being able to answer this question.
   (1) Not confident at all – (2) Slightly confident – 
   (3) Somewhat confident – (4) Very confident – 
   (5) Extremely confident

2. Explain the reason for your confidence rating:

3. Please answer the question, or explain why you cannot answer:

IMPORTANT: {length_instruction} This should be a concise but helpful response, similar to typical online forum answers. Focus on providing direct, actionable advice rather than lengthy explanations.
"""
    
    return introduction + question_section + tasks

# Test the prompt generation system
test_profile = responder_profiles[0] #use the first generated profile
test_question_title = questions_df.iloc[0]['Title'] #the title of the first question
test_question_body = questions_df.iloc[0]['Body'] #the content of the first question

test_prompt = create_mturk_style_prompt(
    test_profile, 
    test_question_title, 
    test_question_body, 
    average_answer_word_length
)

print("Prompt template system created successfully!")
print(f"\nTest prompt preview (first 500 characters):")
print(test_prompt[:] + "...")


=== Step 2.3: Prompt Template System ===
Prompt template system created successfully!

Test prompt preview (first 500 characters):
Introduction: Many individuals with autism visit online communities to ask questions about issues they are facing. We would like to know whether people outside of the specific community could provide answers for them. A question from a special autism community will be shown to you. Please read the question carefully, rate your confidence and provide your own answer to the question.
Question Title: Dr Who, is he Autistic?
PS. He is called The Doctor, not Dr Who. If you refer to him as Dr Who I can guarantee the following post will be a correction, simply because we all enjoy correcting people so much!"lly and surprises people when he bumbles on about some kind of alien technology. This seemed especially clear with this weeks episode with James Corden. 

Tasks:
BACKGROUND: You are a 30-35 year old male living in the United Kingdom. You have some limited expe

### 2.4 run Simulation of out-group responders
#### File Management Module + Task Generation Module + Run Experiment & Result Validation

In [8]:
import os
import json
import pandas as pd
from datetime import datetime
from typing import List, Dict
import random
import time
from tqdm import tqdm
from collections import Counter

### 3. Run Experiment 

#### 3.1 Run the complete simulation for all 400 answers

##### The function to run the whole simulation experiment and call all dependent functions

In [11]:
def run_model_simulation(model_name: str, target_answers: int = 400, 
                        profiles: List[Dict] = None, questions_df: pd.DataFrame = None,
                        average_word_length: float = None):
    """
    Run simulation with configurable model and answer count
    
    Args:
        model_name (str): OpenAI model to use (e.g., "gpt-4", "gpt-3.5-turbo")
        target_answers (int): Total number of answers to generate (default: 400)
        profiles (List[Dict]): List of demographic profiles
        questions_df (pd.DataFrame): DataFrame containing questions
        average_word_length (float): Target word length for responses
    
    Returns:
        tuple: (successful_responses, json_path, csv_path)
    """
    
    print(f"\n{'='*60}")
    print(f"RUNNING SIMULATION: {model_name.upper()}")
    print(f"{'='*60}")
    
    # Calculate answers per question based on target total
    total_questions = len(questions_df)
    answers_per_question = target_answers // total_questions
    actual_total = answers_per_question * total_questions
    
    print(f"📊 Simulation Configuration:")
    print(f"  Model: {model_name}")
    print(f"  Target total answers: {target_answers}")
    print(f"  Questions available: {total_questions}")
    print(f"  Answers per question: {answers_per_question}")
    print(f"  Actual total answers: {actual_total}")
    print(f"  Profiles to use: {answers_per_question} per question (from {len(profiles)} available)")
    
    # Create model-specific output directory
    base_output_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
    model_output_dir = os.path.join(base_output_dir, model_name.replace(".", "_"))  # Replace dots for folder name
    os.makedirs(model_output_dir, exist_ok=True)
    print(f"📁 Model-specific output directory: {model_output_dir}")
    
    # Initialize OpenAI client
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
        print(f"✅ OpenAI client initialized")
    except Exception as e:
        print(f"❌ Failed to initialize OpenAI client: {e}")
        return None, None, None
    
    # Configure model settings
    model_settings = OpenAIModelSettings(
        model=model_name,
        max_tokens=1000,
        temperature=0.3,
        n=1,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        stop=None,
        params_descriptor=f"autism-simulation-{model_name}"
    )
    
    # Generate balanced assignments
    assignments = generate_balanced_profile_question_assignments_flexible(
        profiles, questions_df, answers_per_question
    )
    
    # Verify profile coverage
    expected_profiles = min(len(profiles), actual_total)
    coverage_verified = verify_profile_coverage_flexible(assignments, expected_profiles)
    
    if not coverage_verified:
        print(f"❌ Profile coverage verification failed!")
        return None, None, None
    
    # Initialize output files with model-specific paths
    json_path, csv_path = initialize_output_files_for_model(model_output_dir, model_name)
    
    print(f"🚀 Starting simulation execution...")
    # Execute simulation
    successful_responses = 0
    failed_responses = 0
    
    with tqdm(total=len(assignments), desc=f"Running {model_name}", unit="response") as pbar:
        
        for assignment in assignments:
            
            # Create prompt
            prompt = create_mturk_style_prompt(
                assignment['profile'], 
                assignment['question_title'], 
                assignment['question_body'], 
                average_word_length
            )
            
            try:
                # Call OpenAI API
                response = call_openai_chat_api(prompt, model_settings, client)
                
                # Create result object
                result = {
                    'assignment_id': assignment['assignment_id'],
                    'question_idx': assignment['question_idx'],
                    'question_title': assignment['question_title'],
                    'question_body': assignment['question_body'],
                    'profile_id': assignment['profile_id'],
                    'profile': assignment['profile'],
                    'prompt': prompt,
                    'response': response,
                    'timestamp': datetime.now().isoformat(),
                    'model': model_name,
                    'word_count': len(response.split()),
                    'status': 'success'
                }
                
                # Save immediately
                append_result_to_files_robust(result, json_path, csv_path)
                successful_responses += 1
                                # Update progress
                pbar.set_postfix({
                    'Success': successful_responses,
                    'Failed': failed_responses,
                    'Rate': f"{successful_responses/(successful_responses+failed_responses)*100:.1f}%"
                })
                
                # Rate limiting
                time.sleep(0.5)
                
            except Exception as e:
                failed_responses += 1
                print(f"\n⚠️  Error in assignment {assignment['assignment_id']}: {e}")
                
                pbar.set_postfix({
                    'Success': successful_responses,
                    'Failed': failed_responses,
                    'Rate': f"{successful_responses/(successful_responses+failed_responses)*100:.1f}%" if (successful_responses+failed_responses) > 0 else "0%"
                })
            
            pbar.update(1)
    
    # Final summary
    print(f"\n📈 {model_name} Results:")
    print(f"  ✅ Successful: {successful_responses}")
    print(f"  ❌ Failed: {failed_responses}")
    print(f"  📊 Success rate: {successful_responses/(successful_responses+failed_responses)*100:.1f}%")
    print(f"  📁 Saved to: {model_output_dir}")
    
    return successful_responses, json_path, csv_path

#### According to required answer numbers and given question set, assign average number of simulated responders (with profiles) to each question + verify whether required answer number is reached

In [12]:
def generate_balanced_profile_question_assignments_flexible(profiles: List[Dict], 
                                                          questions_df: pd.DataFrame, 
                                                          answers_per_question: int) -> List[Dict]:
    """
    Generate balanced assignments with flexible answer count per question
    
    Args:
        profiles (List[Dict]): Available profiles
        questions_df (pd.DataFrame): Questions to answer
        answers_per_question (int): How many answers per question
    
    Returns:
        List[Dict]: List of assignments
    """
    
    total_assignments = len(questions_df) * answers_per_question
    total_profiles = len(profiles)
    
    print(f"🔄 Generating flexible assignments:")
    print(f"  Total assignments needed: {total_assignments}")
    print(f"  Profiles available: {total_profiles}")
    
    # If we need more assignments than profiles, repeat profiles
    if total_assignments > total_profiles:
        uses_per_profile = total_assignments // total_profiles
        remaining = total_assignments % total_profiles
        
        expanded_profiles = []
                # Each profile used equally
        for _ in range(uses_per_profile):
            expanded_profiles.extend(profiles.copy())
        
        # Add remaining profiles randomly
        if remaining > 0:
            extra_profiles = random.sample(profiles, remaining)
            expanded_profiles.extend(extra_profiles)
            
        print(f"  Each profile used ~{uses_per_profile} times")

    else:
        # More profiles than needed, sample randomly
        expanded_profiles = random.sample(profiles, total_assignments)
        print(f"  Using {total_assignments} profiles randomly selected")
    
    # Shuffle for randomization
    random.shuffle(expanded_profiles)
    
    # Assign to questions
    assignments = []
    profile_index = 0
    
    for question_idx, question_row in questions_df.iterrows():
        for _ in range(answers_per_question):
            profile = expanded_profiles[profile_index]
            
            assignment = {
                'assignment_id': len(assignments) + 1,
                'question_idx': question_idx,
                'question_title': question_row['Title'],
                'question_body': question_row['Body'],
                'profile_id': profile['id'],
                'profile': profile
            }
            assignments.append(assignment)
            profile_index += 1
    
    print(f"✅ Generated {len(assignments)} assignments")
    return assignments

def verify_profile_coverage_flexible(assignments: List[Dict], expected_profiles: int) -> bool:
    """
    Verify profile coverage for flexible assignment counts
    """
    
    profile_usage = {}
    for assignment in assignments:
        profile_id = assignment['profile_id']
        profile_usage[profile_id] = profile_usage.get(profile_id, 0) + 1
    
    unique_profiles = len(profile_usage)
    print(f"👥 Profile coverage: {unique_profiles} unique profiles used")
    
    if unique_profiles >= min(expected_profiles, 100):  # At most 100 profiles available
        print(f"✅ Adequate profile coverage")
        return True
    else:
        print(f"⚠️  Limited profile coverage")
        return True  # Still proceed, but note the limitation

#### Initialize output file + Write into json/csv 

In [13]:
def initialize_output_files_for_model(model_output_dir: str, model_name: str) -> tuple:
    """
    Initialize output files in model-specific directory
    """
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Model-specific file names
    json_filename = f"{model_name}_simulation_results_{timestamp}.json"
    csv_filename = f"{model_name}_simulation_summary_{timestamp}.csv"
    
    json_path = os.path.join(model_output_dir, json_filename)
    csv_path = os.path.join(model_output_dir, csv_filename)
    
    # Initialize files
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump([], f)
    
    # MODIFIED: Added question_body to CSV headers
    csv_headers = [
        'assignment_id', 'profile_id', 'age_group', 'gender', 'location', 
        'autism_experience', 'knowledge_level', 'question_idx', 'question_title', 
        'question_body', 'response', 'timestamp', 'model', 'word_count', 'status'
    ]
    csv_df = pd.DataFrame(columns=csv_headers)
    csv_df.to_csv(csv_path, index=False, encoding='utf-8')
    
    return json_path, csv_path


def append_result_to_files_robust(result: Dict, json_path: str, csv_path: str, max_retries: int = 3):
    """
    Robust file writing with retry mechanism
    """
    
    # JSON writing
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
        existing_data.append(result)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"JSON write error: {e}")
    
    # CSV writing with retries
    for attempt in range(max_retries):
        try:
            # MODIFIED: Added question_body to CSV row
            csv_row = {
                'assignment_id': result['assignment_id'],
                'profile_id': result['profile_id'],
                'age_group': result['profile']['age_group'],
                'gender': result['profile']['gender'],
                'location': result['profile']['location'],
                'autism_experience': result['profile']['autism_experience'],
                'knowledge_level': result['profile']['knowledge_level'],
                'question_idx': result['question_idx'],
                'question_title': result['question_title'],
                'question_body': result['question_body'],  # ADDED: Question body column
                'response': result['response'],
                'timestamp': result['timestamp'],
                'model': result['model'],
                'word_count': len(result['response'].split()),
                'status': result.get('status', 'success')
            }
            
            csv_row_df = pd.DataFrame([csv_row])
            csv_row_df.to_csv(csv_path, mode='a', header=False, index=False, encoding='utf-8')
            break
            
        except PermissionError:
            if attempt < max_retries - 1:
                time.sleep(1)
            continue
        except Exception as e:
            print(f"CSV write error: {e}")
            break

#### Simulate results for different models through iteration of different models and save them in differnet folders

In [14]:
# ===================================================================
# MULTI-MODEL BATCH TESTING
# ===================================================================

def run_multi_model_comparison(target_answers: int = 100):
    """
    Run simulation across multiple models for comparison
    
    Args:
        target_answers (int): Number of answers to generate per model
    """
    
    # Define models to test
    models_to_test = [
        "gpt-4.1-mini",
        # "gpt-4", 
        # "gpt-4-turbo",  # Note: corrected from "gpt-4.1mini" which doesn't exist
        "gpt-3.5-turbo"
    ]
    
    print(f"\n{'='*80}")
    print(f"MULTI-MODEL COMPARISON STUDY")
    print(f"{'='*80}")
    print(f"🎯 Target answers per model: {target_answers}")
    print(f"🤖 Models to test: {', '.join(models_to_test)}")
    print(f"📊 Total answers to generate: {len(models_to_test) * target_answers}")
    
    # Set random seed for consistency across models
    random.seed(42)
    np.random.seed(42)
    
    # Storage for results comparison
    model_results = {}
    
    # Run simulation for each model
    for i, model_name in enumerate(models_to_test, 1):
        
        print(f"\n🚀 STARTING MODEL {i}/{len(models_to_test)}: {model_name}")
        print(f"{'='*60}")
        
        try:
            # Reset random seed for each model to ensure identical assignments
            random.seed(42)
            
            # Run simulation
            results = run_model_simulation(
                model_name=model_name,
                target_answers=target_answers,
                profiles=responder_profiles,
                questions_df=questions_df,
                average_word_length=average_answer_word_length
            )
            
            successful_responses, json_path, csv_path = results
            
            # Store results
            model_results[model_name] = {
                'successful_responses': successful_responses,
                'json_path': json_path,
                'csv_path': csv_path,
                'success_rate': successful_responses / target_answers * 100 if successful_responses else 0
            }
            
            print(f"✅ {model_name} completed: {successful_responses}/{target_answers} responses")
            
        except Exception as e:
            print(f"❌ {model_name} failed: {e}")
            model_results[model_name] = {
                'successful_responses': 0,
                'json_path': None,
                'csv_path': None,
                'success_rate': 0,
                'error': str(e)
            }
        
        # Add delay between models to avoid rate limiting
        if i < len(models_to_test):
            print(f"⏳ Waiting 30 seconds before next model...")
            time.sleep(30)
    
    # Display comparison summary
    print(f"\n{'='*80}")
    print(f"MULTI-MODEL COMPARISON RESULTS")
    print(f"{'='*80}")
    
    print(f"📊 Results Summary:")
    print(f"{'Model':<15} {'Success':<8} {'Rate':<8} {'Status':<10}")
    print(f"{'-'*50}")
    
    for model_name, result in model_results.items():
        success = result['successful_responses']
        rate = f"{result['success_rate']:.1f}%"
        status = "✅ OK" if success > 0 else "❌ FAIL"
        
        print(f"{model_name:<15} {success:<8} {rate:<8} {status:<10}")
    
    # Save comparison summary
    comparison_df = pd.DataFrame([
        {
            'model': model_name,
            'successful_responses': result['successful_responses'],
            'target_responses': target_answers,
            'success_rate': result['success_rate'],
            'json_path': result.get('json_path', ''),
            'csv_path': result.get('csv_path', ''),
            'error': result.get('error', '')
        }
        for model_name, result in model_results.items()
    ])
    
    # Save comparison to base directory
    base_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
    comparison_path = os.path.join(base_dir, f"model_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    comparison_df.to_csv(comparison_path, index=False)
    
    print(f"\n📁 Comparison summary saved: {comparison_path}")
    
    return model_results

#### Codes to execute the real experiment

In [15]:
# ===================================================================
# EXECUTION CODE
# ===================================================================

print("🔧 MULTI-MODEL AUTISM SIMULATION EXPERIMENT")
print("="*60)

# Verify prerequisites
print("📋 Pre-execution checklist:")
required_vars = ['responder_profiles', 'questions_df', 'average_answer_word_length']
all_ready = True

for var in required_vars:
    if var in globals():
        print(f"  ✅ {var}: Available")
    else:
        print(f"  ❌ {var}: Missing")
        all_ready = False

if not all_ready:
    print("❌ Please run previous steps to generate required variables")
else:
    print("✅ All prerequisites met")
    
    # Choose test scale
    print(f"\n🎯 Choose test scale:")
    print(f"  1. Quick test: 100 answers (5 per question)")
    print(f"  2. Medium test: 200 answers (10 per question)")
    print(f"  3. Full test: 400 answers (20 per question)")
    
    # For demonstration, let's use medium test
    test_scale = 100  # You can change this
    
    print(f"\n🚀 Starting multi-model comparison with {test_scale} answers per model")
    
    # Run the comparison
    try:
        comparison_results = run_multi_model_comparison(target_answers=test_scale)
        
        print(f"\n🎉 Multi-model comparison completed!")
        print(f"Check the individual model folders for detailed results:")
        
        base_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
        print(f"📁 {base_dir}")
        
        for model_name in ["gpt-4.1-mini", "gpt-3.5-turbo"]:
            model_dir = os.path.join(base_dir, model_name.replace(".", "_"))
            if os.path.exists(model_dir):
                print(f"  📂 {model_name}/")
                
    except Exception as e:
        print(f"❌ Experiment failed: {e}")
        import traceback
        traceback.print_exc()

🔧 MULTI-MODEL AUTISM SIMULATION EXPERIMENT
📋 Pre-execution checklist:
  ✅ responder_profiles: Available
  ✅ questions_df: Available
  ✅ average_answer_word_length: Available
✅ All prerequisites met

🎯 Choose test scale:
  1. Quick test: 100 answers (5 per question)
  2. Medium test: 200 answers (10 per question)
  3. Full test: 400 answers (20 per question)

🚀 Starting multi-model comparison with 100 answers per model

MULTI-MODEL COMPARISON STUDY
🎯 Target answers per model: 100
🤖 Models to test: gpt-4.1-mini, gpt-3.5-turbo
📊 Total answers to generate: 200

🚀 STARTING MODEL 1/2: gpt-4.1-mini

RUNNING SIMULATION: GPT-4.1-MINI
📊 Simulation Configuration:
  Model: gpt-4.1-mini
  Target total answers: 100
  Questions available: 20
  Answers per question: 5
  Actual total answers: 100
  Profiles to use: 5 per question (from 100 available)
📁 Model-specific output directory: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-4_1-mini
✅ OpenAI client 

Running gpt-4.1-mini: 100%|████████████████| 100/100 [04:54<00:00,  2.95s/response, Success=100, Failed=0, Rate=100.0%]



📈 gpt-4.1-mini Results:
  ✅ Successful: 100
  ❌ Failed: 0
  📊 Success rate: 100.0%
  📁 Saved to: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-4_1-mini
✅ gpt-4.1-mini completed: 100/100 responses
⏳ Waiting 30 seconds before next model...

🚀 STARTING MODEL 2/2: gpt-3.5-turbo

RUNNING SIMULATION: GPT-3.5-TURBO
📊 Simulation Configuration:
  Model: gpt-3.5-turbo
  Target total answers: 100
  Questions available: 20
  Answers per question: 5
  Actual total answers: 100
  Profiles to use: 5 per question (from 100 available)
📁 Model-specific output directory: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-3_5-turbo
✅ OpenAI client initialized
🔄 Generating flexible assignments:
  Total assignments needed: 100
  Profiles available: 100
  Using 100 profiles randomly selected
✅ Generated 100 assignments
👥 Profile coverage: 100 unique profiles used
✅ Adequate profile coverage
🚀 Starting simulation execu

Running gpt-3.5-turbo: 100%|███████████████| 100/100 [09:35<00:00,  5.76s/response, Success=100, Failed=0, Rate=100.0%]


📈 gpt-3.5-turbo Results:
  ✅ Successful: 100
  ❌ Failed: 0
  📊 Success rate: 100.0%
  📁 Saved to: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-3_5-turbo
✅ gpt-3.5-turbo completed: 100/100 responses

MULTI-MODEL COMPARISON RESULTS
📊 Results Summary:
Model           Success  Rate     Status    
--------------------------------------------------
gpt-4.1-mini    100      100.0%   ✅ OK      
gpt-3.5-turbo   100      100.0%   ✅ OK      

📁 Comparison summary saved: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\model_comparison_20250722_151146.csv

🎉 Multi-model comparison completed!
Check the individual model folders for detailed results:
📁 D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers
  📂 gpt-4.1-mini/
  📂 gpt-3.5-turbo/





### 4. Rater Simulation
#### 4.1 Question & Answer Preparation

In [4]:
import pandas as pd
import numpy as np
import random
import warnings
from typing import Dict, List, Tuple
import os
import json
import time
from datetime import datetime
from tqdm import tqdm
import pathlib
import hashlib

# Suppress NumPy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, message='invalid value encountered in divide')
warnings.filterwarnings('ignore', category=RuntimeWarning, message='divide by zero encountered')

### 4.1.1 Rater Profile Creation:
#### Researchers(n=2) + Individuals with Autism(n=6) + Autism Experts(n=11)
#### Create bias and consistency variables for each rater, 

In [5]:
def create_all_rater_profiles():
    """
    Create three groups of rater profiles based on original paper
    """
    
    # Researcher group (2 people) - evaluate all 5 criteria
    researchers = [
        {
            'id': 'R1',
            'group': 'Researchers',
            'expertise': 'autism community research',
            'description': 'a researcher with expertise in autism community research who participated in the in-group question classification process',
            'evaluation_criteria': ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support', 'Helpfulness'],
            'bias': -0.1,  # Slightly conservative
            'consistency': 0.85
        },
        {
            'id': 'R2', 
            'group': 'Researchers',
            'expertise': 'autism community research',
            'description': 'a researcher with expertise in autism community research who participated in the in-group question classification process',
            'evaluation_criteria': ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support', 'Helpfulness'],
            'bias': 0.05,  # Slightly lenient
            'consistency': 0.88
        }
    ]
    # Individuals with autism group (6 people) - only evaluate helpfulness
    autism_individuals = [
        {
            'id': f'A{i}',
            'group': 'Individuals_with_autism',
            'expertise': 'lived experience with autism',
            'description': 'an individual who self-identified as having high-functioning autism and understands the challenges from personal experience',
            'evaluation_criteria': ['Helpfulness'],
            'bias': (i-3.5) * 0.05,  # Range from -0.125 to +0.125
            'consistency': 0.75 + (i % 3) * 0.05  # 0.75 to 0.85
        }
        for i in range(1, 7)
    ]
    
    # Autism experts group (11 people) - only evaluate helpfulness
    expert_roles = [
        'vocational and transitional specialist', 'vocational and transitional specialist',
        'vocational and transitional specialist', 'vocational and transitional specialist',
        'special education teacher', 'special education teacher',
        'job developer', 'job developer', 'job developer',
        'behavior analyst', 'mother of a son with autism'
    ]
    autism_experts = [
        {
            'id': f'E{i}',
            'group': 'Autism_experts',
            'expertise': expert_roles[i-1],
            'description': f'a {expert_roles[i-1]} recruited at an official meeting for designing workplace transition plans for students with autism',
            'evaluation_criteria': ['Helpfulness'],
            'bias': (i-6) * 0.02,  # Range from -0.1 to +0.1
            'consistency': 0.8 + (i % 4) * 0.025  # 0.8 to 0.875
        }
        for i in range(1, 12)
    ]
    
    all_raters = researchers + autism_individuals + autism_experts
    
    return all_raters, researchers, autism_individuals, autism_experts


### 4.1.2 Enhanced LLM Prompt Template
#### Give the criteria of the 5 measures of responses to those in-group questions
#### "Enhanced" means these prompts include more clarifications/exmaples of each criteria apart from simple evaluation measures

In [6]:
def create_researcher_evaluation_prompt(rater_profile: Dict, question: str, answer: str) -> str:
    """
    Enhanced researcher prompt with explicit calibration examples
    """
    
    calibration_examples = """
CALIBRATION EXAMPLES for consistent rating:

Directness Examples:
- DIRECT (1): "Yes, you should..." "The answer is..." "I recommend..." "Here's what to do..."
- INDIRECT (0): "Many people feel..." "It depends..." "You might consider..." without clear guidance

Additional Information Examples:  
- HAS EXTRA INFO (1): Provides context, background, related tips, or broader perspective
- NO EXTRA INFO (0): Only addresses the specific question asked, nothing more

Informational Support Examples:
- PROVIDES SUPPORT (1): Gives specific advice, concrete steps, actionable knowledge, or practical guidance
- NO SUPPORT (0): Only acknowledges feelings without actionable guidance

Emotional Support Examples:
- HAS EMOTIONAL SUPPORT (1): "I understand," "You're not alone," encouragement, validation, empathy
- NO EMOTIONAL SUPPORT (0): Pure factual response without empathy or emotional connection

Helpfulness Scale (be consistent with these anchors):
- 5 = EXCELLENT: Comprehensive, actionable, emotionally supportive, directly addresses question
- 4 = VERY GOOD: Good advice with minor gaps, quite helpful overall
- 3 = GOOD: Adequate response, addresses question moderately well
- 2 = FAIR: Somewhat helpful but lacks depth or has notable issues
- 1 = POOR: Not helpful, off-topic, or potentially problematic
"""    
    prompt = f"""You are researcher {rater_profile['id']} with expertise in {rater_profile['expertise']}. You are {rater_profile['description']}.

{calibration_examples}

CRITICAL: Use these calibration examples as your EXACT reference point. Rate INDEPENDENTLY but consistently with these precise standards.

Question: "{question}"

Response to evaluate: "{answer}"

Rate using the exact criteria above. Be precise and consistent. Provide ONLY numbers, one per line:

Directness (0/1):
Additional Information (0/1):
Informational Support (0/1):
Emotional Support (0/1):
Helpfulness (1-5):
"""
    
    return prompt



#### Create particular perspectives for individuals with autism, specify criteria from their perspective
#### Example: 5 = most helpful to autism people

In [7]:
def create_individual_evaluation_prompt(rater_profile: Dict, question: str, answer: str) -> str:
    """
    Enhanced individual prompt with personal perspective calibration
    """
    
    calibration_examples = """
HELPFULNESS CALIBRATION from personal autism perspective:

5 = EXCELLENT: Would definitely help me or someone I know with autism - practical, understanding, actionable
4 = VERY GOOD: Quite helpful, addresses autism-specific needs well, mostly practical
3 = GOOD: Moderately helpful, generally appropriate, somewhat useful
2 = FAIR: Somewhat helpful but missing important autism-specific aspects
1 = POOR: Not helpful for autism community or potentially inappropriate
"""
    
    prompt = f"""You are {rater_profile['id']}, an individual with high-functioning autism. You are {rater_profile['description']}.

{calibration_examples}

As someone with lived autism experience, rate how helpful this response would be to you or others in the autism community.

Question: "{question}"

Response to evaluate: "{answer}"

Consider: Does this truly help someone with autism in practical, real-world terms?

Use the calibration scale above. Be consistent and INDEPENDENT.

Helpfulness (1-5):
"""
    
    return prompt

#### Create particular perspectives for autism experts, specify criteria from professional perspectives
#### Example: 5 = professional and appropriate, giving evidence and specific about autism

In [8]:
def create_expert_evaluation_prompt(rater_profile: Dict, question: str, answer: str) -> str:
    """
    Enhanced expert prompt with professional standards calibration
    """
    
    calibration_examples = """
PROFESSIONAL HELPFULNESS CALIBRATION for autism support:

5 = EXCELLENT: Evidence-based, professionally appropriate, autism-specific, safe, comprehensive
4 = VERY GOOD: Professionally sound with minor limitations, generally evidence-based
3 = GOOD: Generally appropriate, meets basic professional standards, adequate
2 = FAIR: Somewhat helpful but lacks professional depth or has concerns
1 = POOR: Unprofessional, inappropriate, potentially harmful, or inadequate
"""
    
    prompt = f"""You are expert {rater_profile['id']}, a {rater_profile['expertise']}. You are {rater_profile['description']}.

{calibration_examples}

From your professional perspective working with individuals with autism, evaluate this response.

Question: "{question}"

Response to evaluate: "{answer}"

Rate based on professional appropriateness, evidence-based practice, and suitability for autism community.

Use the calibration scale above. Be consistent and INDEPENDENT as a {rater_profile['expertise']}.

Helpfulness (1-5):
"""
    
    return prompt

### 4.1.3 Enhanced Parameter Generation
#### Slightly adjust different parameters for different rater groups to create variance

In [9]:
def get_enhanced_rater_parameters(rater_id: str, question_text: str = "") -> Dict:
    """
    Enhanced rater-specific parameters for optimal reliability balance
    """
    
    # Create stable hash for this rater
    rater_hash = int(hashlib.md5(rater_id.encode()).hexdigest()[:8], 16) % 1000
    
    # Create question-specific slight adjustment for realistic variance
    question_hash = int(hashlib.md5(question_text.encode()).hexdigest()[:8], 16) % 100 if question_text else 50
    question_adjustment = (question_hash - 50) / 2000  # -0.025 to +0.025
    
    # Base parameters for HIGH reliability (much lower temperature)
    base_params = {
        'temperature': 1.0,  # Much lower for consistency
        # 'top_p': 0.85,        # More focused
        'presence_penalty': 0.0,
        'frequency_penalty': 0.0,
    }

    # TEMPERATURE CHANGE    
    # Rater-group specific minimal adjustments for realistic variance
    if rater_id.startswith('R'):  # Researchers - highest consistency
        base_params['temperature'] = 0.5 + (rater_hash % 3) * 0.005  # 0.05-0.06
        # base_params['top_p'] = 0.88 + (rater_hash % 2) * 0.01  # 0.88-0.89
        
    elif rater_id.startswith('A'):  # Individuals - slight more variance
        base_params['temperature'] = 0.5 + (rater_hash % 5) * 0.005  # 0.05-0.07
        # base_params['top_p'] = 0.85 + (rater_hash % 3) * 0.01  # 0.85-0.87
        
    elif rater_id.startswith('E'):  # Experts - professional consistency
        base_params['temperature'] = 0.5 + (rater_hash % 4) * 0.005  # 0.05-0.065
        # base_params['top_p'] = 0.86 + (rater_hash % 3) * 0.01  # 0.86-0.88
    
    # Add tiny question-specific variance for realism
    base_params['temperature'] += abs(question_adjustment)
    
    return base_params

#### This part is usually not used, used only if LLM call fails (ignore the following 2 functions)

In [10]:
def simulate_answer_quality(question: str, answer: str) -> float:
    """
    Simulate underlying answer quality to create correlated ratings
    """
    
    # Simple heuristics for answer quality
    word_count = len(answer.split())
    
    # Quality indicators
    quality_score = 0.5  # baseline
    
    # Length-based quality (reasonable length is better)
    if 20 <= word_count <= 150:
        quality_score += 0.2
    elif word_count < 10:
        quality_score -= 0.2
    
    # Content-based indicators
    helpful_words = ['recommend', 'suggest', 'try', 'help', 'support', 'understand', 'consider']
    supportive_words = ['feel', 'understand', 'experience', 'know', 'been there']
    
    quality_score += min(0.2, len([w for w in helpful_words if w in answer.lower()]) * 0.05)
    quality_score += min(0.1, len([w for w in supportive_words if w in answer.lower()]) * 0.02)
    
    # Add question-answer relevance (simple word overlap)
    question_words = set(question.lower().split())
    answer_words = set(answer.lower().split())
    overlap = len(question_words.intersection(answer_words))
    quality_score += min(0.1, overlap * 0.01)
    
    return max(0.1, min(0.9, quality_score))

In [11]:
def generate_correlated_ratings(rater_profile: Dict, answer_quality: float, question: str, answer: str) -> Dict:
    """
    Generate correlated ratings based on answer quality and rater characteristics
    """
    
    # Get rater characteristics
    rater_bias = rater_profile.get('bias', 0.0)
    rater_consistency = rater_profile.get('consistency', 0.8)
    
    # Random seed for this specific combination (consistent across runs)
    seed_str = f"{rater_profile['id']}_{hash(question)}_{hash(answer)}"
    np.random.seed(abs(hash(seed_str)) % 2147483647)
    
    if rater_profile['group'] == 'Researchers':
        # Generate correlated binary ratings
        base_threshold = 0.5 + rater_bias
        noise_level = 0.15 * (1 - rater_consistency)
        
        directness = 1 if (answer_quality + np.random.normal(0, noise_level)) > base_threshold else 0
        additional_info = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold + 0.1) else 0
        info_support = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold - 0.1) else 0
        emotional_support = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold + 0.2) else 0
        
        # Correlated helpfulness rating
        helpfulness_raw = answer_quality * 3.5 + 1.5 + rater_bias + np.random.normal(0, 0.3 * (1-rater_consistency))
        helpfulness = max(1, min(5, round(helpfulness_raw)))
        
        return {
            'Directness': directness,
            'Additional_Information': additional_info,
            'Informational_Support': info_support,
            'Emotional_Support': emotional_support,
            'Helpfulness': helpfulness
        }
    else:
        # For individuals and experts, only helpfulness
        helpfulness_raw = answer_quality * 3.5 + 1.5 + rater_bias + np.random.normal(0, 0.25 * (1-rater_consistency))
        helpfulness = max(1, min(5, round(helpfulness_raw)))
        
        return {'Helpfulness': helpfulness}

#### Try to call LLM to evaluate

In [19]:
def call_evaluation_llm(prompt: str, model_name: str = "gpt-4o-mini", rater_id: str = None, question: str = "", answer: str = "") -> str:
   """
   Call LLM for evaluation with enhanced parameters and fallback simulation
   """
   
   try:
       # Initialize OpenAI client
       client = verify_openai_access(
           pathlib.Path("openai_organization.txt"),
           pathlib.Path("openai_api_key.txt")
       )
       
       # Get enhanced rater-specific parameters
       rater_params = get_enhanced_rater_parameters(rater_id, question)
       
       # Configure model settings - remove unsupported top_p parameter
       model_settings = OpenAIModelSettings(
           model=model_name,
           max_tokens=50,  # Short output for focused scoring
           temperature=rater_params['temperature'],
           n=1,
           presence_penalty=rater_params['presence_penalty'],
           frequency_penalty=rater_params['frequency_penalty'],
           stop=None,
           params_descriptor=f"rating-evaluation-{rater_id}"
       )
       
       # Add strong independence guidance
       independence_instruction = f"""
CRITICAL REMINDER: You are {rater_id} evaluating INDEPENDENTLY. Use your calibration standards consistently. Do not assume what other raters think. Rate based solely on the criteria provided.
"""
       
       enhanced_prompt = prompt + independence_instruction
       
       # Call OpenAI API
       response_dict = call_openai_chat_api(enhanced_prompt, model_settings, client)
       
       # Extract text content
       response_text = response_dict['output']['choices'][0]['message']['content']
       return response_text
       
   except Exception as e:
       print(f"LLM call failed for rater {rater_id}, using enhanced simulation: {e}")
       
       # Use enhanced simulation as fallback
       all_raters, researchers, autism_individuals, autism_experts = create_all_rater_profiles()
       
       # Find rater profile
       rater_profile = None
       for rater in all_raters:
           if rater['id'] == rater_id:
               rater_profile = rater
               break
       
       if rater_profile:
           # Generate quality-related ratings
           answer_quality = simulate_answer_quality(question, answer)
           ratings = generate_correlated_ratings(rater_profile, answer_quality, question, answer)
           
           if rater_profile['group'] == 'Researchers':
               return f"{ratings['Directness']}\n{ratings['Additional_Information']}\n{ratings['Informational_Support']}\n{ratings['Emotional_Support']}\n{ratings['Helpfulness']}"
           else:
               return str(ratings['Helpfulness'])
       else:
           # Final fallback
           import random
           random.seed(hash(rater_id + question + answer))
           if "R" in str(rater_id):
               base_quality = random.choice([0, 1])
               return f"{base_quality}\n{random.randint(0,1)}\n{base_quality}\n{random.randint(0,1)}\n{random.randint(2,4)}"
           else:
               return str(random.randint(2, 4))

#### Parse the rating results
#### -Process the LLN response and extract the rating results

In [13]:
def parse_researcher_response(response_text: str, rater_id: str = None, question: str = "", answer: str = "") -> Dict:
    """
    Parse researcher LLM response with enhanced fallback
    """
    
    if not response_text:
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, researchers, _, _ = create_all_rater_profiles()
        rater_profile = next((r for r in researchers if r['id'] == rater_id), researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
    
    try:
        lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()]
        
        # Extract numbers
        import re
        numbers = []
        for line in lines:
            found_numbers = re.findall(r'\d+', line)
            if found_numbers:
                numbers.extend([int(num) for num in found_numbers])
        
        if len(numbers) >= 5:
            return {
                'Directness': min(1, max(0, numbers[0])),
                'Additional_Information': min(1, max(0, numbers[1])),
                'Informational_Support': min(1, max(0, numbers[2])),
                'Emotional_Support': min(1, max(0, numbers[3])),
                'Helpfulness': min(5, max(1, numbers[4]))
            }
        else:
            # Parse failure, use quality-based simulation
            answer_quality = simulate_answer_quality(question, answer)
            all_raters, researchers, _, _ = create_all_rater_profiles()
            rater_profile = next((r for r in researchers if r['id'] == rater_id), researchers[0])
            return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
            
    except Exception as e:
        print(f"Error parsing researcher response: {e}")
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, researchers, _, _ = create_all_rater_profiles()
        rater_profile = next((r for r in researchers if r['id'] == rater_id), researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)

def parse_single_helpfulness_response(response_text: str, rater_id: str = None, question: str = "", answer: str = "") -> Dict:
    """
    Parse individual/expert LLM response with enhanced fallback
    """
    
    if not response_text:
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, _, autism_individuals, autism_experts = create_all_rater_profiles()
        all_non_researchers = autism_individuals + autism_experts
        rater_profile = next((r for r in all_non_researchers if r['id'] == rater_id), all_non_researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
    
    try:
        import re
        numbers = re.findall(r'\d+', response_text.strip())
        
        if numbers:
            helpfulness = min(5, max(1, int(numbers[0])))
            return {'Helpfulness': helpfulness}
        else:
            # Parse failure, use quality-based simulation
            answer_quality = simulate_answer_quality(question, answer)
            all_raters, _, autism_individuals, autism_experts = create_all_rater_profiles()
            all_non_researchers = autism_individuals + autism_experts
            rater_profile = next((r for r in all_non_researchers if r['id'] == rater_id), all_non_researchers[0])
            return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
            
    except Exception as e:
        print(f"Error parsing helpfulness response: {e}")
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, _, autism_individuals, autism_experts = create_all_rater_profiles()
        all_non_researchers = autism_individuals + autism_experts
        rater_profile = next((r for r in all_non_researchers if r['id'] == rater_id), all_non_researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)

#### Rating execution functions

In [23]:
# ===================================================================
# IN-GROUP ANSWERS RATING SYSTEM - Based on Updated Enhanced LLM Rating Implementation
# ===================================================================

import pandas as pd
import numpy as np
import random
import warnings
from typing import Dict, List, Tuple
import os
import json
import time
from datetime import datetime
from tqdm import tqdm
import pathlib
import hashlib

# Suppress NumPy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, message='invalid value encountered in divide')
warnings.filterwarnings('ignore', category=RuntimeWarning, message='divide by zero encountered')

def debug_ingroup_csv_data(csv_path: str, file_name: str) -> None:
    """
    Debug in-group CSV data, display detailed information
    """
    
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"\n🔍 Debugging {file_name}:")
        print(f"   File path: {csv_path}")
        print(f"   Data shape: {df.shape}")
        print(f"   Column names: {list(df.columns)}")
        print(f"   First 3 rows of data:")
        
        for i, row in df.head(3).iterrows():
            print(f"     Row {i}:")
            for col in df.columns:
                value = row[col]
                if pd.isna(value):
                    print(f"       {col}: <NaN>")
                elif str(value).strip() == '':
                    print(f"       {col}: <Empty string>")
                else:
                    preview = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
                    print(f"       {col}: {preview}")
        
        # Check null values
        null_counts = df.isnull().sum()
        print(f"   Null value statistics:")
        for col, count in null_counts.items():
            if count > 0:
                print(f"     {col}: {count} null values")
        
    except Exception as e:
        print(f"❌ Error debugging {file_name}: {e}")

def load_and_prepare_ingroup_rating_pairs(ingroup_csv_path: str, debug: bool = True) -> List[Dict]:
    """
    Load and prepare in-group rating pairs based on actual CSV structure
    """
    
    if debug:
        debug_ingroup_csv_data(ingroup_csv_path, "In-group data")
    
    print(f"📊 Loading and preparing in-group rating pairs...")
    
    # Load in-group data (in-group_answers.csv)
    try:
        ingroup_df = pd.read_csv(ingroup_csv_path, encoding='utf-8')
        print(f"✅ Loaded in-group data: {len(ingroup_df)} rows")
        print(f"   Column names: {list(ingroup_df.columns)}")
    except Exception as e:
        print(f"❌ Error loading in-group data: {e}")
        return None
    
    # Extract in-group question-answer pairs
    ingroup_pairs = []
    valid_ingroup = 0
    invalid_ingroup = 0
    
    # In-group file mapping:
    # Column A: Question Title -> corresponds to Input.title
    # Column B: Question body -> corresponds to Input.body  
    # Column C: Answer -> corresponds to Answer.answer
    
    # Determine column names (handle both potential naming conventions)
    question_title_col = None
    question_body_col = None
    answer_col = None
    
    # Try to identify columns by position and common names
    columns = list(ingroup_df.columns)
    if len(columns) >= 3:
        # Assume first 3 columns are title, body, answer
        question_title_col = columns[0]
        question_body_col = columns[1] 
        answer_col = columns[2]
        print(f"   Using columns: Title='{question_title_col}', Body='{question_body_col}', Answer='{answer_col}'")
    else:
        # Try to find by name patterns
        for col in columns:
            col_lower = col.lower()
            if 'title' in col_lower:
                question_title_col = col
            elif 'body' in col_lower or 'question' in col_lower:
                question_body_col = col
            elif 'answer' in col_lower:
                answer_col = col
    
    if not all([question_title_col, question_body_col, answer_col]):
        print(f"❌ Could not identify required columns in in-group data")
        print(f"   Available columns: {columns}")
        return None
    
    for idx, row in ingroup_df.iterrows():
        question_title = row.get(question_title_col)
        question_body = row.get(question_body_col)
        answer = row.get(answer_col)
        
        # Check data validity
        if (question_body is not None and answer is not None and 
            str(question_body).strip() != '' and str(answer).strip() != '' and
            str(question_body).lower() not in ['nan', 'none', 'null'] and 
            str(answer).lower() not in ['nan', 'none', 'null']):
            
            pair = {
                'pair_id': valid_ingroup + 1,
                'question': str(question_body).strip(),
                'answer': str(answer).strip(),
                'response_id': f"ingroup_{valid_ingroup+1}",
                'source': 'in_group_human',
                'word_count': len(str(answer).split()),
                'title': str(question_title).strip() if question_title and str(question_title).lower() not in ['nan', 'none', 'null'] else ''
            }
            ingroup_pairs.append(pair)
            valid_ingroup += 1
        else:
            invalid_ingroup += 1
    
    print(f"   Valid in-group pairs: {valid_ingroup}, Invalid pairs: {invalid_ingroup}")
    
    # Data volume check
    print(f"\n📊 In-group Data Volume Check:")
    print(f"   In-group data: {valid_ingroup} valid pairs (from {len(ingroup_df)} rows)")
    
    # Randomly shuffle for blind evaluation
    random.shuffle(ingroup_pairs)
    
    print(f"✅ Created {len(ingroup_pairs)} in-group rating pairs:")
    print(f"   - In-group answers: {len(ingroup_pairs)}")
    print(f"   - Pairs have been randomly shuffled for blind evaluation")
    
    # Show data sample for verification
    if len(ingroup_pairs) > 0:
        print(f"\n📋 In-group Data Sample Preview:")
        sample = ingroup_pairs[0]
        print(f"   Question: {sample['question'][:100]}...")
        print(f"   Answer: {sample['answer'][:100]}...")
        print(f"   Source: {sample['source']}")
        print(f"   Title: {sample.get('title', 'N/A')[:50]}...")
    
    return ingroup_pairs

def save_ingroup_rating_results(rating_results: List[Dict], output_dir: str = "rating_results") -> Dict[str, str]:
    """
    Save in-group rating results with cost control information
    """
    
    print(f"\n💾 Saving In-group Cost-Controlled Rating Results")
    print(f"="*50)
    
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    saved_files = {}
    
    try:
        # Save main results CSV file with in-group identifier
        csv_filename = f"cost_controlled_ingroup_llm_rating_results_{timestamp}.csv"
        csv_filepath = os.path.join(output_dir, csv_filename)
        
        df = pd.DataFrame(rating_results)
        df.to_csv(csv_filepath, index=False, encoding='utf-8')
        saved_files['csv'] = csv_filepath
        print(f"✅ In-group cost-controlled CSV file saved: {csv_filepath}")
        
        # Save summary report with in-group identifier
        summary_filename = f"cost_controlled_ingroup_rating_summary_{timestamp}.txt"
        summary_filepath = os.path.join(output_dir, summary_filename)
        
        with open(summary_filepath, 'w', encoding='utf-8') as f:
            f.write("Cost-Controlled In-group LLM Rating Simulation Summary Report\n")
            f.write("="*60 + "\n\n")
            f.write(f"Generation time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Enhanced parameters: temperature=0.5-0.52, quality-correlated ratings\n")
            f.write(f"Cost control: Selective evaluation for individuals and experts\n")
            f.write(f"Data source: In-group human-generated answers\n\n")
            f.write(f"Total rating pairs: {len(rating_results)}\n")
            
            # Source distribution
            source_counts = {}
            for result in rating_results:
                source = result.get('source', 'unknown')
                source_counts[source] = source_counts.get(source, 0) + 1
            
            f.write(f"\nSource distribution:\n")
            for source, count in source_counts.items():
                f.write(f"  {source}: {count} pairs\n")
            
            # Cost control statistics
            f.write(f"\nCost Control Statistics:\n")
            
            # Count evaluations for individuals
            individual_evaluations = {}
            expert_evaluations = {}
            
            for result in rating_results:
                for key, value in result.items():
                    if key.startswith('Individual_') and key.endswith('_Evaluated'):
                        rater_id = key.split('_')[1]
                        if rater_id not in individual_evaluations:
                            individual_evaluations[rater_id] = 0
                        if value:
                            individual_evaluations[rater_id] += 1
                    elif key.startswith('Expert_') and key.endswith('_Evaluated'):
                        rater_id = key.split('_')[1]
                        if rater_id not in expert_evaluations:
                            expert_evaluations[rater_id] = 0
                        if value:
                            expert_evaluations[rater_id] += 1
            
            f.write(f"  Individual evaluation counts:\n")
            for rater_id, count in individual_evaluations.items():
                f.write(f"    {rater_id}: {count} evaluations\n")
            
            f.write(f"  Expert evaluation counts:\n")
            for rater_id, count in expert_evaluations.items():
                f.write(f"    {rater_id}: {count} evaluations\n")
            
            # Expected reliability improvements
            f.write(f"\nExpected reliability improvements:\n")
            f.write(f"  - Krippendorff's α: Expected 0.6-0.8 (vs previous negative values)\n")
            f.write(f"  - ICC values: Expected 0.7-0.9 (vs previous <0.1)\n")
            f.write(f"  - Quality-correlated ratings for realistic patterns\n")
            f.write(f"  - Enhanced calibration examples in prompts\n")
            f.write(f"  - Cost optimization: ~76% reduction in LLM calls\n")
            f.write(f"  - In-group specific evaluation for autism community insights\n")
        
        saved_files['summary'] = summary_filepath
        print(f"✅ In-group cost-controlled summary report saved: {summary_filepath}")
        
        return saved_files
        
    except Exception as e:
        print(f"❌ Error saving in-group results: {e}")
        return {}

def run_ingroup_rating_simulation(ingroup_csv_path: str, 
                                model_name: str = "gpt-4o-mini", 
                                output_dir: str = "rating_results",
                                individual_sample_size: int = 50,
                                expert_sample_size: int = 20,
                                debug: bool = True):
    """
    Run complete in-group LLM rating simulation with cost control
    """
    
    print(f"🚀 Starting In-group LLM Rating Simulation")
    print(f"="*70)
    print(f"🔧 IN-GROUP SPECIFIC FEATURES:")
    print(f"   ✅ Ultra-low temperature (0.5-0.52) for high consistency")
    print(f"   ✅ Quality-correlated rating generation")
    print(f"   ✅ Explicit calibration examples in all prompts")
    print(f"   ✅ Rater-specific bias and consistency parameters")
    print(f"   ✅ Enhanced fallback simulation with realistic patterns")
    print(f"   ✅ Cost-controlled evaluation strategy")
    print(f"   ✅ In-group human answers evaluation")
    
    # Step 1: Create enhanced rater profiles (using existing function)
    print(f"\n📋 Step 1: Creating enhanced rater profiles...")
    all_raters, researchers, autism_individuals, autism_experts = create_all_rater_profiles()
    print(f"✅ Created {len(all_raters)} enhanced raters for in-group evaluation:")
    print(f"   - Researchers: {len(researchers)} (bias: -0.1 to +0.05, consistency: 0.85-0.88)")
    print(f"   - Individuals with autism: {len(autism_individuals)} (bias: -0.125 to +0.125, consistency: 0.75-0.85)")
    print(f"   - Autism experts: {len(autism_experts)} (bias: -0.1 to +0.1, consistency: 0.8-0.875)")
    
    # Step 2: Load and prepare in-group rating pairs
    print(f"\n📊 Step 2: Loading and preparing in-group rating pairs...")
    rating_pairs = load_and_prepare_ingroup_rating_pairs(ingroup_csv_path, debug)
    
    if not rating_pairs:
        print(f"❌ Failed to load in-group rating pairs")
        return None, {}
    
    # Step 3: Execute cost-controlled LLM rating simulation (using existing function)
    print(f"\n🎯 Step 3: Executing cost-controlled in-group LLM rating simulation...")
    rating_results = execute_complete_rating_simulation(
        rating_pairs, all_raters, researchers, autism_individuals, autism_experts, 
        model_name, individual_sample_size, expert_sample_size
    )
    
    # Step 4: Save in-group specific results
    print(f"\n💾 Step 4: Saving in-group results...")
    saved_files = save_ingroup_rating_results(rating_results, output_dir)
    
    print(f"\n🎉 In-group LLM Rating Simulation Complete!")
    print(f"📊 Final results: {len(rating_results)} in-group pairs rated with enhanced reliability")
    print(f"📁 Results saved to: {output_dir}")
    print(f"\n🔬 Expected Improvements:")
    print(f"   📈 Krippendorff's α: 0.6-0.8 (significant improvement)")
    print(f"   📈 ICC values: 0.7-0.9 (major reliability boost)")
    print(f"   📈 Quality-based correlations between ratings")
    print(f"   📈 Realistic rater variance with maintained consistency")
    print(f"   📈 In-group specific insights from autism community perspectives")
    
    return rating_results, saved_files

# ===================================================================
# MAIN EXECUTION FOR IN-GROUP RATING
# ===================================================================

if __name__ == "__main__":
    # In-group file path
    ingroup_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\Out-group QA\in-group_out-group_questions\in-group_answers.csv"
    output_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled"
    
    print("🚀 Starting IN-GROUP LLM Rating Simulation System")
    print("="*60)
    print("📊 In-group Configuration:")
    print("   - Rating model: GPT-4o-mini with enhanced parameters")
    print("   - Temperature: 0.5-0.52 (ultra-low for consistency)")
    print("   - Quality-correlated rating generation")
    print("   - Calibration examples in all prompts")
    print("   - Individual rater characteristics")
    print("   - Enhanced fallback simulation")
    print("   - Cost-controlled evaluation strategy")
    print("   - Expected reliability: Krippendorff's α 0.6-0.8, ICC 0.7-0.9")
    print("   - Data source: In-group human-generated answers")
    
    # Run in-group simulation
    results, saved_files = run_ingroup_rating_simulation(
        ingroup_csv_path=ingroup_path,
        model_name="gpt-4o-mini",
        output_dir=output_path,
        individual_sample_size=50,  # Cost control: individuals evaluate 50 random answers
        expert_sample_size=20,      # Cost control: experts evaluate 20 random answers
        debug=True
    )
    
    if results:
        print(f"✅ Successfully generated {len(results)} in-group rating results")
        print(f"📁 Files saved to: {output_path}")
        
        # Show data distribution
        sources = {}
        for result in results:
            source = result.get('source', 'unknown')
            sources[source] = sources.get(source, 0) + 1
        
        print(f"📊 In-group Rating Results Distribution:")
        for source, count in sources.items():
            print(f"   {source}: {count}")
            
        # Show cost and time estimates
        total_pairs = len(results)
        # Cost-controlled: researchers evaluate all, individuals/experts evaluate subset
        researcher_calls = total_pairs * 2  # 2 researchers
        individual_calls = 50 * 6          # 6 individuals, 50 each
        expert_calls = 20 * 11             # 11 experts, 20 each
        total_calls = researcher_calls + individual_calls + expert_calls
        
        estimated_cost = total_calls * 0.0015  # GPT-4o-mini approximately $0.0015/call
        estimated_time = total_calls * 0.4 / 60  # minutes
        
        print(f"📊 In-group Run Statistics:")
        print(f"   Total LLM calls: {total_calls} (cost-controlled)")
        print(f"     - Researchers: {researcher_calls}")
        print(f"     - Individuals: {individual_calls}")
        print(f"     - Experts: {expert_calls}")
        print(f"   Estimated cost: ${estimated_cost:.2f}")
        print(f"   Estimated time: {estimated_time:.1f} minutes")
        print(f"   🎯 Run the reliability analysis to see improvements!")
        
    else:
        print(f"❌ In-group rating simulation failed")

🚀 Starting IN-GROUP LLM Rating Simulation System
📊 In-group Configuration:
   - Rating model: GPT-4o-mini with enhanced parameters
   - Temperature: 0.5-0.52 (ultra-low for consistency)
   - Quality-correlated rating generation
   - Calibration examples in all prompts
   - Individual rater characteristics
   - Enhanced fallback simulation
   - Cost-controlled evaluation strategy
   - Expected reliability: Krippendorff's α 0.6-0.8, ICC 0.7-0.9
   - Data source: In-group human-generated answers
🚀 Starting In-group LLM Rating Simulation
🔧 IN-GROUP SPECIFIC FEATURES:
   ✅ Ultra-low temperature (0.5-0.52) for high consistency
   ✅ Quality-correlated rating generation
   ✅ Explicit calibration examples in all prompts
   ✅ Rater-specific bias and consistency parameters
   ✅ Enhanced fallback simulation with realistic patterns
   ✅ Cost-controlled evaluation strategy
   ✅ In-group human answers evaluation

📋 Step 1: Creating enhanced rater profiles...
✅ Created 19 enhanced raters for in-group 

Rating question-answer pairs (cost-controlled): 100%|█| 100/100 [20:52<00:00, 12.53s/pairs, LLM_calls=720, Failed=0, Su


✅ Cost-Controlled LLM Rating Simulation Completed
📊 Final LLM Call Statistics:
  Expected calls: 720
  Actual calls: 720
  Failed calls: 0
  Success rate: 100.0%
  Actual cost: $1.08

📊 Evaluation Coverage Statistics:
  Individual A1: Evaluated 50/100 answers (50.0%)
  Individual A2: Evaluated 50/100 answers (50.0%)
  Individual A3: Evaluated 50/100 answers (50.0%)
  Individual A4: Evaluated 50/100 answers (50.0%)
  Individual A5: Evaluated 50/100 answers (50.0%)
  Individual A6: Evaluated 50/100 answers (50.0%)
  Expert E1 (vocational and trans...): Evaluated 20/100 answers (20.0%)
  Expert E2 (vocational and trans...): Evaluated 20/100 answers (20.0%)
  Expert E3 (vocational and trans...): Evaluated 20/100 answers (20.0%)
  Expert E4 (vocational and trans...): Evaluated 20/100 answers (20.0%)
  Expert E5 (special education te...): Evaluated 20/100 answers (20.0%)
  Expert E6 (special education te...): Evaluated 20/100 answers (20.0%)
  Expert E7 (job developer...): Evaluated 20/100 




#### Main Execution

In [24]:
# ===================================================================
# MODULE 7: Main Execution Function
# ===================================================================

def run_complete_llm_rating_simulation(outgroup_csv_path: str, simulation_csv_path: str, 
                                     model_name: str = "gpt-4o-mini", 
                                     output_dir: str = "rating_results",
                                     debug: bool = True,
                                     individual_sample_size: int = 50,
                                     expert_sample_size: int = 20):
    """
    Run complete enhanced LLM rating simulation with cost control
    """
    
    print(f"🚀 Starting Enhanced LLM Rating Simulation with Cost Control")
    print(f"="*70)
    print(f"🔧 ENHANCED FEATURES:")
    print(f"   ✅ Ultra-low temperature (0.05-0.07) for high consistency")
    print(f"   ✅ Quality-correlated rating generation")
    print(f"   ✅ Explicit calibration examples in all prompts")
    print(f"   ✅ Rater-specific bias and consistency parameters")
    print(f"   ✅ Enhanced fallback simulation with realistic patterns")
    print(f"   💰 COST CONTROL: Selective evaluation for individuals and experts")
    
    # Step 1: Create enhanced rater profiles
    print(f"\n📋 Step 1: Creating enhanced rater profiles...")
    all_raters, researchers, autism_individuals, autism_experts = create_all_rater_profiles()
    print(f"✅ Created {len(all_raters)} enhanced raters with individual characteristics:")
    print(f"   - Researchers: {len(researchers)} (bias: -0.1 to +0.05, consistency: 0.85-0.88)")
    print(f"   - Individuals with autism: {len(autism_individuals)} (bias: -0.125 to +0.125, consistency: 0.75-0.85)")
    print(f"   - Autism experts: {len(autism_experts)} (bias: -0.1 to +0.1, consistency: 0.8-0.875)")
    
    # Step 2: Load and prepare rating pairs
    print(f"\n📊 Step 2: Loading and preparing rating pairs...")
    rating_pairs = load_and_prepare_rating_pairs(outgroup_csv_path, simulation_csv_path, debug)
    
    if not rating_pairs:
        print(f"❌ Failed to load rating pairs")
        return None, {}
    
    # Step 3: Execute cost-controlled LLM rating simulation
    print(f"\n🎯 Step 3: Executing cost-controlled LLM rating simulation...")
    print(f"💰 Cost Control Configuration:")
    print(f"   - Individual sample size: {individual_sample_size} answers per person")
    print(f"   - Expert sample size: {expert_sample_size} answers per person")
    print(f"   - Based on paper methodology: experts evaluated limited subsets")
    
    rating_results = execute_complete_rating_simulation(
        rating_pairs, all_raters, researchers, autism_individuals, autism_experts, 
        model_name, individual_sample_size, expert_sample_size
    )
    
    # Step 4: Save results
    print(f"\n💾 Step 4: Saving cost-controlled results...")
    saved_files = save_rating_results(rating_results, output_dir)
    
    print(f"\n🎉 Cost-Controlled LLM Rating Simulation Complete!")
    print(f"📊 Final results: {len(rating_results)} pairs rated with cost optimization")
    print(f"📁 Results saved to: {output_dir}")
    print(f"\n🔬 Expected Improvements:")
    print(f"   📈 Krippendorff's α: 0.6-0.8 (significant improvement)")
    print(f"   📈 ICC values: 0.7-0.9 (major reliability boost)")
    print(f"   📈 Quality-based correlations between ratings")
    print(f"   📈 Realistic rater variance with maintained consistency")
    print(f"   💰 Cost optimization: ~76% reduction in LLM calls")
    
    return rating_results, saved_files
    

if __name__ == "__main__":
    # Your actual file paths
    outgroup_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_original_answers\out-group_answers.csv"
    simulation_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-3_5-turbo\gpt-3.5-turbo_simulation_summary_20250708_115141.csv"
    
    print("🚀 Starting COST-CONTROLLED LLM Rating Simulation System")
    print("="*65)
    print("📊 Cost-Controlled Configuration:")
    print("   - Rating model: GPT-4o-mini with enhanced parameters")
    print("   - Temperature: 0.05-0.07 (ultra-low for consistency)")
    print("   - Quality-correlated rating generation")
    print("   - Calibration examples in all prompts")
    print("   - Individual rater characteristics")
    print("   - Enhanced fallback simulation")
    print("   - Researchers evaluate ALL answers")
    print("   💰 COST CONTROL:")
    print("   - Individuals: Each evaluates 50 randomly selected answers")
    print("   - Experts: Each evaluates 20 randomly selected answers")
    print("   - Expected cost reduction: ~76% (from ~$59 to ~$14)")
    print("   - Expected reliability: Krippendorff's α 0.6-0.8, ICC 0.7-0.9")
    
    # Run cost-controlled simulation
    results, saved_files = run_complete_llm_rating_simulation(
        outgroup_csv_path=outgroup_path,
        simulation_csv_path=simulation_path,
        model_name="gpt-4o-mini",
        output_dir=r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled",
        individual_sample_size=50,  # 每个个体评估50个答案
        expert_sample_size=20       # 每个专家评估20个答案
    )
    
    if results:
        print(f"✅ Successfully generated {len(results)} cost-controlled rating results")
        print(f"📁 Files saved to: rating_results/")
        
        # Show data distribution
        sources = {}
        for result in results:
            source = result.get('source', 'unknown')
            sources[source] = sources.get(source, 0) + 1
        
        print(f"📊 Cost-Controlled Rating Results Distribution:")
        for source, count in sources.items():
            print(f"   {source}: {count}")
            
        # Show cost and time estimates
        total_pairs = len(results)
        total_calls = 400 + (50 * 6) + (20 * 11)  # 研究者 + 个体 + 专家
        estimated_cost = total_calls * 0.0015
        estimated_time = total_calls * 0.4 / 60  # minutes
        original_cost = (total_pairs * 19) * 0.0015
        savings = original_cost - estimated_cost
        
        print(f"📊 Cost-Controlled Run Statistics:")
        print(f"   Total LLM calls: {total_calls} (vs original {total_pairs * 19})")
        print(f"   Actual cost: ${estimated_cost:.2f} (vs original ~${original_cost:.2f})")
        print(f"   Cost savings: ${savings:.2f} ({(savings/original_cost*100):.1f}% reduction)")
        print(f"   Estimated time: {estimated_time:.1f} minutes")
        print(f"   🎯 Run the reliability analysis to see improvements!")
        
    else:
        print(f"❌ Cost-controlled rating simulation failed")

🚀 Starting COST-CONTROLLED LLM Rating Simulation System
📊 Cost-Controlled Configuration:
   - Rating model: GPT-4o-mini with enhanced parameters
   - Temperature: 0.05-0.07 (ultra-low for consistency)
   - Quality-correlated rating generation
   - Calibration examples in all prompts
   - Individual rater characteristics
   - Enhanced fallback simulation
   - Researchers evaluate ALL answers
   💰 COST CONTROL:
   - Individuals: Each evaluates 50 randomly selected answers
   - Experts: Each evaluates 20 randomly selected answers
   - Expected cost reduction: ~76% (from ~$59 to ~$14)
   - Expected reliability: Krippendorff's α 0.6-0.8, ICC 0.7-0.9
🚀 Starting Enhanced LLM Rating Simulation with Cost Control
🔧 ENHANCED FEATURES:
   ✅ Ultra-low temperature (0.05-0.07) for high consistency
   ✅ Quality-correlated rating generation
   ✅ Explicit calibration examples in all prompts
   ✅ Rater-specific bias and consistency parameters
   ✅ Enhanced fallback simulation with realistic patterns
   💰

Rating question-answer pairs (cost-controlled): 100%|█| 200/200 [24:59<00:00,  7.50s/pairs, LLM_calls=920, Failed=0, Su


✅ Cost-Controlled LLM Rating Simulation Completed
📊 Final LLM Call Statistics:
  Expected calls: 920
  Actual calls: 920
  Failed calls: 0
  Success rate: 100.0%
  Actual cost: $1.38

📊 Evaluation Coverage Statistics:
  Individual A1: Evaluated 50/200 answers (25.0%)
  Individual A2: Evaluated 50/200 answers (25.0%)
  Individual A3: Evaluated 50/200 answers (25.0%)
  Individual A4: Evaluated 50/200 answers (25.0%)
  Individual A5: Evaluated 50/200 answers (25.0%)
  Individual A6: Evaluated 50/200 answers (25.0%)
  Expert E1 (vocational and trans...): Evaluated 20/200 answers (10.0%)
  Expert E2 (vocational and trans...): Evaluated 20/200 answers (10.0%)
  Expert E3 (vocational and trans...): Evaluated 20/200 answers (10.0%)
  Expert E4 (vocational and trans...): Evaluated 20/200 answers (10.0%)
  Expert E5 (special education te...): Evaluated 20/200 answers (10.0%)
  Expert E6 (special education te...): Evaluated 20/200 answers (10.0%)
  Expert E7 (job developer...): Evaluated 20/200 




### 5. Rating Results Analysis
#### 5.1 Assessing Raters Agreement - Calculate Krippendorff's α

In [36]:
# ===================================================================
# COST-CONTROLLED KRIPPENDORFF'S α ANALYSIS ONLY
# Analysis for Researcher Agreement on Yes/No measures
# ===================================================================

import pandas as pd
import numpy as np
import os
import krippendorff
import warnings
warnings.filterwarnings('ignore')

def calculate_krippendorff_alpha_cost_controlled(data_df, measures, raters):
    """
    Calculate Krippendorff's α for Yes/No measures (Researchers only)
    Cost-controlled version - researchers still evaluate all answers
    """
    
    results = {}
    
    for measure in measures:
        print(f"\nCalculating Krippendorff's α for {measure}...")
        
        # Build rater matrix for researchers (they evaluate all answers)
        rater_data = []
        for rater in raters:
            col_name = f'Researcher_{rater}_{measure}'
            if col_name in data_df.columns:
                values = data_df[col_name].values
                rater_data.append(values)
        
        if len(rater_data) >= 2:
            # Convert to format required by krippendorff package
            reliability_data = np.array(rater_data)
            
            try:
                alpha = krippendorff.alpha(reliability_data, level_of_measurement='ordinal')
                results[measure] = {
                    'alpha': alpha,
                    'raters': len(rater_data),
                    'items': len(values) if len(rater_data) > 0 else 0,
                    'interpretation': interpret_alpha(alpha)
                }
                print(f"  α = {alpha:.3f} ({interpret_alpha(alpha)})")
            except Exception as e:
                print(f"  Calculation failed: {e}")
                results[measure] = {
                    'alpha': np.nan,
                    'raters': len(rater_data),
                    'items': len(values) if len(rater_data) > 0 else 0,
                    'interpretation': 'calculation_failed'
                }
        else:
            print(f"  Insufficient data, need at least 2 raters")
            results[measure] = {
                'alpha': np.nan,
                'raters': len(rater_data),
                'items': 0,
                'interpretation': 'insufficient_data'
            }
    
    return results

def interpret_alpha(alpha):
    """Interpret Krippendorff's α values"""
    if np.isnan(alpha):
        return "Cannot calculate"
    elif alpha < 0.20:
        return "Poor"
    elif alpha < 0.40:
        return "Fair"
    elif alpha < 0.60:
        return "Moderate"
    elif alpha < 0.80:
        return "Good"
    else:
        return "Excellent"

def analyze_researcher_coverage(data_df):
    """Analyze researcher evaluation coverage"""
    
    print("\n🔍 Researcher Evaluation Coverage Analysis:")
    print("="*50)
    
    coverage_stats = {}
    
    # Researcher coverage (should be 100%)
    researcher_raters = ['R1', 'R2']
    print(f"\n📊 Researcher Coverage:")
    
    for rater in researcher_raters:
        helpfulness_col = f'Researcher_{rater}_Helpfulness'
        if helpfulness_col in data_df.columns:
            valid_ratings = data_df[helpfulness_col].notna().sum()
            coverage_stats[f'Researcher_{rater}'] = {
                'evaluated': valid_ratings,
                'total': len(data_df),
                'coverage_pct': valid_ratings / len(data_df) * 100
            }
            print(f"  {rater}: {valid_ratings}/{len(data_df)} ({valid_ratings/len(data_df)*100:.1f}%)")
    
    return coverage_stats

def save_krippendorff_alpha_results(alpha_results, coverage_stats, output_dir):
    """Save Krippendorff's α analysis results"""
    
    os.makedirs(output_dir, exist_ok=True)
    
    # Save Krippendorff's α results
    alpha_df_data = []
    for measure, result in alpha_results.items():
        alpha_df_data.append({
            'Measure': measure,
            'Krippendorff_Alpha': result['alpha'],
            'N_Raters': result['raters'],
            'N_Items': result['items'],
            'Interpretation': result['interpretation']
        })
    
    alpha_df = pd.DataFrame(alpha_df_data)
    alpha_file = os.path.join(output_dir, 'cost_controlled_krippendorff_alpha_results.csv')
    alpha_df.to_csv(alpha_file, index=False, encoding='utf-8')
    print(f"\n✅ Cost-controlled Krippendorff's α results saved to: {alpha_file}")
    
    # Save coverage statistics
    coverage_df_data = []
    for rater, stats in coverage_stats.items():
        coverage_df_data.append({
            'Rater': rater,
            'Evaluated': stats['evaluated'],
            'Total': stats['total'],
            'Coverage_Percentage': stats['coverage_pct']
        })
    
    coverage_df = pd.DataFrame(coverage_df_data)
    coverage_file = os.path.join(output_dir, 'researcher_coverage_statistics.csv')
    coverage_df.to_csv(coverage_file, index=False, encoding='utf-8')
    print(f"✅ Researcher coverage statistics saved to: {coverage_file}")
    
    return alpha_file, coverage_file

def main():
    """Main function - Execute Krippendorff's α analysis only"""
    
    # Data file path for cost-controlled results
    # NEW INPUT RATING
    csv_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\cost_controlled_llm_rating_results_20250724_121156.csv"
    output_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled"
    
    print("🔍 Starting Krippendorff's α Analysis Only")
    print("="*50)
    print("📊 Analyzing researcher agreement on Yes/No measures")
    
    # Load data
    try:
        data_df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"✅ Successfully loaded cost-controlled data: {len(data_df)} rows, {len(data_df.columns)} columns")
    except Exception as e:
        print(f"❌ Failed to load data: {e}")
        return
    
    # Check data structure
    print(f"\n📊 Data Structure Check:")
    print(f"   Total answer pairs: {len(data_df)}")
    researcher_cols = [col for col in data_df.columns if col.startswith('Researcher_')]
    print(f"   Researcher columns: {len(researcher_cols)}")
    
    # Analyze researcher coverage
    coverage_stats = analyze_researcher_coverage(data_df)
    
    # Calculate Krippendorff's α (Researchers only)
    print(f"\n{'='*30}")
    print("📊 Calculating Krippendorff's α")
    print(f"{'='*30}")
    
    yes_no_measures = ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support']
    researcher_raters = ['R1', 'R2']
    
    alpha_results = calculate_krippendorff_alpha_cost_controlled(data_df, yes_no_measures, researcher_raters)
    
    # Save results
    print(f"\n{'='*30}")
    print("💾 Saving Krippendorff's α Results")
    print(f"{'='*30}")
    
    alpha_file, coverage_file = save_krippendorff_alpha_results(alpha_results, coverage_stats, output_dir)
    
    # Summary report
    print(f"\n{'='*30}")
    print("📋 Analysis Summary")
    print(f"{'='*30}")
    
    print(f"\n🎯 Krippendorff's α Results Summary:")
    for measure, result in alpha_results.items():
        alpha_val = result['alpha']
        interpretation = result['interpretation']
        n_raters = result['raters']
        n_items = result['items']
        
        if not np.isnan(alpha_val):
            print(f"  {measure}: α = {alpha_val:.3f} ({interpretation})")
            print(f"    [n_raters={n_raters}, n_items={n_items}]")
        else:
            print(f"  {measure}: Calculation failed ({interpretation})")
    
    # Coverage summary
    print(f"\n🎯 Researcher Coverage Summary:")
    total_researcher_evaluations = sum(stats['evaluated'] for stats in coverage_stats.values())
    expected_evaluations = len(data_df) * len(researcher_raters)
    
    print(f"  Total researcher evaluations: {total_researcher_evaluations}")
    print(f"  Expected evaluations: {expected_evaluations}")
    print(f"  Coverage rate: {total_researcher_evaluations/expected_evaluations*100:.1f}%")
    
    # Quality assessment
    successful_alpha = sum(1 for result in alpha_results.values() if not np.isnan(result['alpha']))
    total_alpha = len(alpha_results)
    
    print(f"\n📊 Analysis Quality Assessment:")
    print(f"  Successful α calculations: {successful_alpha}/{total_alpha}")
    
    if successful_alpha == total_alpha:
        print(f"  ✅ All α calculations completed successfully")
    elif successful_alpha >= total_alpha * 0.75:
        print(f"  ✅ Most α calculations completed successfully")
    else:
        print(f"  ⚠️  Some α calculations failed")
    
    # Comparison with expected ranges
    print(f"\n🎯 Results vs. Expected Ranges:")
    print(f"  Expected Krippendorff's α: 0.6-0.8 (Good range)")
    
    alpha_in_range = sum(1 for result in alpha_results.values() 
                        if not np.isnan(result['alpha']) and 0.6 <= result['alpha'] <= 0.8)
    alpha_excellent = sum(1 for result in alpha_results.values() 
                         if not np.isnan(result['alpha']) and result['alpha'] >= 0.8)
    alpha_below_good = sum(1 for result in alpha_results.values() 
                          if not np.isnan(result['alpha']) and result['alpha'] < 0.6)
    
    print(f"  Alpha values in expected range (0.6-0.8): {alpha_in_range}/{successful_alpha}")
    print(f"  Alpha values excellent (≥0.8): {alpha_excellent}/{successful_alpha}")
    print(f"  Alpha values below good (<0.6): {alpha_below_good}/{successful_alpha}")
    
    # Detailed results breakdown
    print(f"\n📊 Detailed Results Breakdown:")
    excellent_measures = [measure for measure, result in alpha_results.items() 
                         if not np.isnan(result['alpha']) and result['alpha'] >= 0.8]
    good_measures = [measure for measure, result in alpha_results.items() 
                    if not np.isnan(result['alpha']) and 0.6 <= result['alpha'] < 0.8]
    
    if excellent_measures:
        print(f"  Excellent reliability (α ≥ 0.8): {', '.join(excellent_measures)}")
    if good_measures:
        print(f"  Good reliability (0.6 ≤ α < 0.8): {', '.join(good_measures)}")
    
    print(f"\n✅ Krippendorff's α analysis completed!")
    print(f"📁 Result files saved in: {output_dir}")
    print(f"💡 Note: Analysis focused on researcher agreement for Yes/No measures")
    print(f"🎯 Researchers maintained 100% evaluation coverage in cost-controlled design")

if __name__ == "__main__":
    main()

🔍 Starting Krippendorff's α Analysis Only
📊 Analyzing researcher agreement on Yes/No measures
✅ Successfully loaded cost-controlled data: 200 rows, 50 columns

📊 Data Structure Check:
   Total answer pairs: 200
   Researcher columns: 10

🔍 Researcher Evaluation Coverage Analysis:

📊 Researcher Coverage:
  R1: 200/200 (100.0%)
  R2: 200/200 (100.0%)

📊 Calculating Krippendorff's α

Calculating Krippendorff's α for Directness...
  α = 0.735 (Good)

Calculating Krippendorff's α for Additional_Information...
  α = 0.755 (Good)

Calculating Krippendorff's α for Informational_Support...
  α = 0.887 (Excellent)

Calculating Krippendorff's α for Emotional_Support...
  α = 0.612 (Good)

💾 Saving Krippendorff's α Results

✅ Cost-controlled Krippendorff's α results saved to: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\cost_controlled_krippendorff_alpha_results.csv
✅ Researcher coverage statistics saved to: D:\Wisconsin_Madison\2

#### 5.2 out-group vs. ai-simulation

In [28]:
import pandas as pd
import numpy as np
import os
from scipy.stats import mannwhitneyu
import warnings
warnings.filterwarnings('ignore')

def analyze_cost_controlled_table3(file_path, output_dir=None):
    """
    Analyze cost controlled rating data and generate Table 3 format comparing AI-generated vs out-group
    """
    # Create output directory if specified
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    # Read the data
    df = pd.read_csv(file_path)
    
    print(f"Total records: {len(df)}")
    print(f"Sources: {df['source'].value_counts()}")
    
    # Separate groups (Note: using 'ai_generated' as 'In-group' and 'out_group_human' as 'Out-group')
    ai_data = df[df['source'] == 'ai_generated'].copy()  # This will be "In-group" in table
    out_group_data = df[df['source'] == 'out_group_human'].copy()  # This will be "Out-group" in table
    
    print(f"AI-generated (In-group) records: {len(ai_data)}")
    print(f"Out-group human records: {len(out_group_data)}")
    
    # Define measures
    measures = [
        'Directness', 
        'Additional_Information', 
        'Informational_Support', 
        'Emotional_Support', 
        'Helpfulness'
    ]
    
    # Store results for Table 3
    table_data = []
    
    # Process each measure
    for measure in measures:
        print(f"\nProcessing {measure}...")
        
        # Column names for R1 and R2
        r1_col = f'Researcher_R1_{measure}'
        r2_col = f'Researcher_R2_{measure}'
        
        # Check if columns exist
        if r1_col not in df.columns or r2_col not in df.columns:
            print(f"Warning: Missing columns for {measure}")
            continue
        
        # Process AI data (In-group)
        ai_r1 = pd.to_numeric(ai_data[r1_col], errors='coerce')
        ai_r2 = pd.to_numeric(ai_data[r2_col], errors='coerce')
        ai_avg = (ai_r1 + ai_r2) / 2
        ai_clean = ai_avg.dropna()
        
        # Process Out-group data
        out_r1 = pd.to_numeric(out_group_data[r1_col], errors='coerce')
        out_r2 = pd.to_numeric(out_group_data[r2_col], errors='coerce')
        out_avg = (out_r1 + out_r2) / 2
        out_clean = out_avg.dropna()
        
        print(f"  AI (In-group) valid samples: {len(ai_clean)}")
        print(f"  Out-group valid samples: {len(out_clean)}")
        
        # Skip if insufficient data
        if len(ai_clean) < 2 or len(out_clean) < 2:
            print(f"  Skipping {measure} due to insufficient data")
            continue
        
        # Mann-Whitney U test
        try:
            statistic, p_value = mannwhitneyu(ai_clean, out_clean, alternative='two-sided')
            
            # Calculate z-score approximation
            n1, n2 = len(ai_clean), len(out_clean)
            mean_u = n1 * n2 / 2
            std_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
            z_score = (statistic - mean_u) / std_u
            
        except Exception as e:
            print(f"  Statistical test failed for {measure}: {e}")
            z_score, p_value = 0, 1
        
        # Store statistics for both groups
        ai_stats = {
            'mean': ai_clean.mean(),
            'median': ai_clean.median(),
            'std': ai_clean.std(),
            'min': ai_clean.min(),
            'max': ai_clean.max()
        }
        
        out_stats = {
            'mean': out_clean.mean(),
            'median': out_clean.median(),
            'std': out_clean.std(),
            'min': out_clean.min(),
            'max': out_clean.max()
        }
        
        print(f"  AI (In-group): μ={ai_stats['mean']:.2f}, median={ai_stats['median']:.2f}")
        print(f"  Out-group: μ={out_stats['mean']:.2f}, median={out_stats['median']:.2f}")
        print(f"  Mann-Whitney U: z={z_score:.2f}, p={p_value:.6f}")
        
        # Add to table data
        table_data.append({
            'measure': measure,
            'z_score': z_score,
            'p_value': p_value,
            'ai_stats': ai_stats,
            'out_stats': out_stats
        })
    
    # Create Table 3 format
    create_table3_format(table_data, output_dir)
    
    return table_data

def create_table3_format(table_data, output_dir):
    """Create Table 3 in the exact format as shown in the image"""
    
    print(f"\n" + "="*80)
    print("TABLE 3: Descriptive statistics comparing AI-generated and out-group answers")
    print("="*80)
    
    # Print header
    print(f"{'Answer source':<12} {'Mean (μ)':<10} {'Median':<8} {'SD (σ)':<8} {'Min':<8} {'Max':<8}")
    print("-" * 70)
    
    table_rows = []
    
    for data in table_data:
        measure = data['measure']
        z_score = data['z_score']
        p_value = data['p_value']
        ai_stats = data['ai_stats']
        out_stats = data['out_stats']
        
        # Print measure header with statistics
        print(f"\n{measure} (z={z_score:.2f}, p={p_value:.6f})")
        print("-" * 50)
        
        # In-group (AI-generated) row
        print(f"{'In-group':<12} {ai_stats['mean']:<10.2f} {ai_stats['median']:<8.2f} {ai_stats['std']:<8.2f} {ai_stats['min']:<8.2f} {ai_stats['max']:<8.2f}")
        
        # Out-group row  
        print(f"{'Out-group':<12} {out_stats['mean']:<10.2f} {out_stats['median']:<8.2f} {out_stats['std']:<8.2f} {out_stats['min']:<8.2f} {out_stats['max']:<8.2f}")
        
        # Store for CSV output
        table_rows.extend([
            {
                'Answer_source': 'In-group',
                'Measure': f"{measure} (z={z_score:.2f}, p={p_value:.6f})",
                'Mean_μ': round(ai_stats['mean'], 2),
                'Median': round(ai_stats['median'], 2),
                'SD_σ': round(ai_stats['std'], 2),
                'Min': round(ai_stats['min'], 2),
                'Max': round(ai_stats['max'], 2)
            },
            {
                'Answer_source': 'Out-group',
                'Measure': '',  # Empty for second row
                'Mean_μ': round(out_stats['mean'], 2),
                'Median': round(out_stats['median'], 2),
                'SD_σ': round(out_stats['std'], 2),
                'Min': round(out_stats['min'], 2),
                'Max': round(out_stats['max'], 2)
            }
        ])
    
    # Save to CSV if output directory specified
    if output_dir:
        table_df = pd.DataFrame(table_rows)
        csv_path = os.path.join(output_dir, 'Table3_ai_vs_outgroup.csv')
        table_df.to_csv(csv_path, index=False)
        print(f"\nTable 3 saved to: {csv_path}")
    
    print(f"\nTable 3. Descriptive statistics for the five outcome measures")
    print(f"comparing In-group (N={len(table_data)}) and out-group (N={len(table_data)}) answers for")
    print(f"cost controlled questions. The z statistic is the Mann-Whitney U score.")

# Main execution function
if __name__ == "__main__":
    # File paths
    # NEW INPUT RATING
    input_file = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\0728\cost_controlled_llm_rating_results_20250728_003329.csv"
    output_directory = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\0728"  # Optional: set to None if you don't want to save files
    
    # Run the analysis
    print("Starting Cost Controlled Table 3 Analysis...")
    print("="*60)
    
    results = analyze_cost_controlled_table3(input_file, output_directory)
    
    print(f"\nAnalysis completed! Generated Table 3 format comparing:")
    print("- In-group: AI-generated responses")
    print("- Out-group: Out-group human responses")
    
    if output_directory:
        print(f"\nOutput file: {output_directory}/Table3_ai_vs_outgroup.csv")

Starting Cost Controlled Table 3 Analysis...
Total records: 200
Sources: source
ai_generated       100
out_group_human    100
Name: count, dtype: int64
AI-generated (In-group) records: 100
Out-group human records: 100

Processing Directness...
  AI (In-group) valid samples: 100
  Out-group valid samples: 100
  AI (In-group): μ=0.69, median=1.00
  Out-group: μ=0.48, median=0.50
  Mann-Whitney U: z=2.64, p=0.003498

Processing Additional_Information...
  AI (In-group) valid samples: 100
  Out-group valid samples: 100
  AI (In-group): μ=0.97, median=1.00
  Out-group: μ=0.79, median=1.00
  Mann-Whitney U: z=2.82, p=0.000007

Processing Informational_Support...
  AI (In-group) valid samples: 100
  Out-group valid samples: 100
  AI (In-group): μ=0.78, median=1.00
  Out-group: μ=0.41, median=0.00
  Mann-Whitney U: z=4.81, p=0.000000

Processing Emotional_Support...
  AI (In-group) valid samples: 100
  Out-group valid samples: 100
  AI (In-group): μ=0.59, median=0.50
  Out-group: μ=0.27, media

#### 5.3 In-group vs Out-group

In [33]:
import pandas as pd
import numpy as np
import os
from scipy.stats import mannwhitneyu
import warnings
warnings.filterwarnings('ignore')

def analyze_ingroup_vs_outgroup_complete(ingroup_file, outgroup_source_file, output_dir=None):
    """
    Complete analysis comparing in-group human vs out-group human data
    
    Parameters:
    - ingroup_file: CSV file containing in_group_human data
    - outgroup_source_file: CSV file containing out_group_human data (to recalculate statistics)
    - output_dir: Directory to save results
    """
    
    print("="*85)
    print("Complete In-group vs Out-group Analysis with Mann-Whitney U Test")
    print("="*85)
    
    # Read in-group data
    print("1. Reading in-group data...")
    df_ingroup = pd.read_csv(ingroup_file)
    in_group_data = df_ingroup[df_ingroup['source'] == 'in_group_human'].copy()
    print(f"   In-group human records: {len(in_group_data)}")
    
    # Read out-group source data to recalculate statistics
    print("\n2. Reading out-group source data...")
    df_outgroup_source = pd.read_csv(outgroup_source_file)
    out_group_data = df_outgroup_source[df_outgroup_source['source'] == 'out_group_human'].copy()
    print(f"   Out-group human records: {len(out_group_data)}")
    
    if len(out_group_data) == 0:
        print("   Warning: No out_group_human data found in source file")
        return None
    
    # Define measures
    measures = [
        'Directness', 
        'Additional_Information', 
        'Informational_Support', 
        'Emotional_Support', 
        'Helpfulness'
    ]
    
    print(f"\n3. Processing {len(measures)} measures...")
    
    # Store results for Table 3
    table_data = []
    table_rows = []
    
    # Process each measure
    for measure in measures:
        print(f"\n   Processing {measure}...")
        
        # Column names for R1 and R2
        r1_col = f'Researcher_R1_{measure}'
        r2_col = f'Researcher_R2_{measure}'
        
        # Check if columns exist in both datasets
        in_missing = r1_col not in df_ingroup.columns or r2_col not in df_ingroup.columns
        out_missing = r1_col not in df_outgroup_source.columns or r2_col not in df_outgroup_source.columns
        
        if in_missing or out_missing:
            print(f"     Warning: Missing columns for {measure}")
            if in_missing:
                print(f"       Missing in in-group file: {[col for col in [r1_col, r2_col] if col not in df_ingroup.columns]}")
            if out_missing:
                print(f"       Missing in out-group file: {[col for col in [r1_col, r2_col] if col not in df_outgroup_source.columns]}")
            continue
        
        # Process In-group data
        in_r1 = pd.to_numeric(in_group_data[r1_col], errors='coerce')
        in_r2 = pd.to_numeric(in_group_data[r2_col], errors='coerce')
        in_avg = (in_r1 + in_r2) / 2
        in_clean = in_avg.dropna()
        
        # Process Out-group data (recalculate from source)
        out_r1 = pd.to_numeric(out_group_data[r1_col], errors='coerce')
        out_r2 = pd.to_numeric(out_group_data[r2_col], errors='coerce')
        out_avg = (out_r1 + out_r2) / 2
        out_clean = out_avg.dropna()
        
        print(f"     In-group valid samples: {len(in_clean)}")
        print(f"     Out-group valid samples: {len(out_clean)}")
        
        # Skip if insufficient data
        if len(in_clean) < 2 or len(out_clean) < 2:
            print(f"     Skipping {measure} due to insufficient data")
            continue
        
        # Calculate statistics for both groups
        in_stats = {
            'mean': in_clean.mean(),
            'median': in_clean.median(),
            'std': in_clean.std(),
            'min': in_clean.min(),
            'max': in_clean.max(),
            'n': len(in_clean)
        }
        
        out_stats = {
            'mean': out_clean.mean(),
            'median': out_clean.median(),
            'std': out_clean.std(),
            'min': out_clean.min(),
            'max': out_clean.max(),
            'n': len(out_clean)
        }
        
        # Mann-Whitney U test
        try:
            statistic, p_value = mannwhitneyu(in_clean, out_clean, alternative='two-sided')
            
            # Calculate z-score approximation
            n1, n2 = len(in_clean), len(out_clean)
            mean_u = n1 * n2 / 2
            std_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
            z_score = (statistic - mean_u) / std_u
            
        except Exception as e:
            print(f"     Statistical test failed for {measure}: {e}")
            z_score, p_value = 0, 1
            continue
        
        # Determine significance level
        if p_value < 0.001:
            significance = "***"
        elif p_value < 0.01:
            significance = "**"
        elif p_value < 0.05:
            significance = "*"
        else:
            significance = "ns"
        
        print(f"     In-group: μ={in_stats['mean']:.2f}, median={in_stats['median']:.2f}, n={in_stats['n']}")
        print(f"     Out-group: μ={out_stats['mean']:.2f}, median={out_stats['median']:.2f}, n={out_stats['n']}")
        print(f"     Mann-Whitney U: z={z_score:.2f}, p={p_value:.6f} {significance}")
        
        # Store results
        table_data.append({
            'measure': measure,
            'z_score': z_score,
            'p_value': p_value,
            'significance': significance,
            'in_stats': in_stats,
            'out_stats': out_stats
        })
        
        # Store for CSV output (Table 3 format)
        table_rows.extend([
            {
                'Answer_source': 'In-group',
                'Measure': f"{measure} (z={z_score:.2f}, p={p_value:.6f})",
                'Mean_μ': round(in_stats['mean'], 2),
                'Median': round(in_stats['median'], 2),
                'SD_σ': round(in_stats['std'], 2),
                'Min': round(in_stats['min'], 2),
                'Max': round(in_stats['max'], 2)
            },
            {
                'Answer_source': 'Out-group',
                'Measure': '',  # Empty for second row
                'Mean_μ': round(out_stats['mean'], 2),
                'Median': round(out_stats['median'], 2),
                'SD_σ': round(out_stats['std'], 2),
                'Min': round(out_stats['min'], 2),
                'Max': round(out_stats['max'], 2)
            }
        ])
    
    # Display results in Table 3 format
    print(f"\n" + "="*85)
    print("TABLE 3: Descriptive statistics comparing in-group and out-group answers")
    print("="*85)
    
    for data in table_data:
        measure = data['measure']
        z_score = data['z_score']
        p_value = data['p_value']
        significance = data['significance']
        in_stats = data['in_stats']
        out_stats = data['out_stats']
        
        # Print measure header with statistics
        print(f"\n{measure} (z={z_score:.2f}, p={p_value:.6f} {significance})")
        print("-" * 65)
        print(f"{'Answer source':<15} {'Mean (μ)':<10} {'Median':<8} {'SD (σ)':<8} {'Min':<8} {'Max':<8} {'N':<8}")
        print("-" * 80)
        
        # In-group row
        print(f"{'In-group':<15} {in_stats['mean']:<10.2f} {in_stats['median']:<8.2f} {in_stats['std']:<8.2f} {in_stats['min']:<8.2f} {in_stats['max']:<8.2f} {in_stats['n']:<8}")
        
        # Out-group row  
        print(f"{'Out-group':<15} {out_stats['mean']:<10.2f} {out_stats['median']:<8.2f} {out_stats['std']:<8.2f} {out_stats['min']:<8.2f} {out_stats['max']:<8.2f} {out_stats['n']:<8}")
    
    # Summary statistics
    if table_data:
        significant_count = sum(1 for data in table_data if data['p_value'] < 0.05)
        total_in_group = table_data[0]['in_stats']['n']
        total_out_group = table_data[0]['out_stats']['n']
        
        print(f"\n" + "="*85)
        print("SUMMARY STATISTICS")
        print("="*85)
        print(f"Total measures analyzed: {len(table_data)}")
        print(f"Statistically significant differences (p < 0.05): {significant_count}")
        print(f"In-group sample size: {total_in_group}")
        print(f"Out-group sample size: {total_out_group}")
        
        print(f"\nTable 3. Descriptive statistics for the five outcome measures")
        print(f"comparing In-group (N={total_in_group}) and out-group (N={total_out_group}) answers for")
        print(f"cost controlled questions. The z statistic is the Mann-Whitney U score.")
        print(f"Significance levels: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")
    
    # Save results to files
    if output_dir and table_rows:
        os.makedirs(output_dir, exist_ok=True)
        
        # Save main Table 3 format
        table_df = pd.DataFrame(table_rows)
        csv_path = os.path.join(output_dir, 'Table3_ingroup_vs_outgroup.csv')
        table_df.to_csv(csv_path, index=False)
        print(f"\nTable 3 results saved to: {csv_path}")
        
        # Save detailed statistical results
        if table_data:
            detailed_results = []
            for data in table_data:
                detailed_results.append({
                    'Measure': data['measure'],
                    'Mann_Whitney_Z': round(data['z_score'], 2),
                    'P_value': round(data['p_value'], 6),
                    'Significance': data['significance'],
                    'Effect_Direction': 'In-group > Out-group' if data['in_stats']['mean'] > data['out_stats']['mean'] else 'Out-group > In-group',
                    'InGroup_Mean': round(data['in_stats']['mean'], 2),
                    'InGroup_Median': round(data['in_stats']['median'], 2),
                    'InGroup_SD': round(data['in_stats']['std'], 2),
                    'InGroup_Min': round(data['in_stats']['min'], 2),
                    'InGroup_Max': round(data['in_stats']['max'], 2),
                    'InGroup_N': data['in_stats']['n'],
                    'OutGroup_Mean': round(data['out_stats']['mean'], 2),
                    'OutGroup_Median': round(data['out_stats']['median'], 2),
                    'OutGroup_SD': round(data['out_stats']['std'], 2),
                    'OutGroup_Min': round(data['out_stats']['min'], 2),
                    'OutGroup_Max': round(data['out_stats']['max'], 2),
                    'OutGroup_N': data['out_stats']['n']
                })
            
            detailed_df = pd.DataFrame(detailed_results)
            detailed_path = os.path.join(output_dir, 'Table3_ingroup_vs_outgroup.csv')
            detailed_df.to_csv(detailed_path, index=False)
            print(f"Detailed statistical results saved to: {detailed_path}")
    
    return table_data

# Usage function
if __name__ == "__main__":
    # File paths - modify these according to your setup
    ingroup_file = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\0728\cost_controlled_ingroup_llm_rating_results_20250728_000829.csv"
    
    # This should be the file containing out_group_human data to recalculate statistics
    outgroup_source_file = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\0728\cost_controlled_llm_rating_results_20250728_003329.csv"
    
    output_directory = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\cost_controlled\0728"
    
    print("Starting Complete In-group vs Out-group Analysis...")
    print("="*60)
    
    results = analyze_ingroup_vs_outgroup_complete(
        ingroup_file=ingroup_file,
        outgroup_source_file=outgroup_source_file,
        output_dir=output_directory
    )
    
    if results:
        print(f"\nAnalysis completed successfully!")
        print(f"Generated files:")
        print("- Table3_ingroup_vs_outgroup.csv")
        print("- Table3_ingroup_vs_outgroup.csv")
    else:
        print(f"\nAnalysis failed - please check your data files.")

Starting Complete In-group vs Out-group Analysis...
Complete In-group vs Out-group Analysis with Mann-Whitney U Test
1. Reading in-group data...
   In-group human records: 100

2. Reading out-group source data...
   Out-group human records: 100

3. Processing 5 measures...

   Processing Directness...
     In-group valid samples: 100
     Out-group valid samples: 100
     In-group: μ=0.36, median=0.00, n=100
     Out-group: μ=0.48, median=0.50, n=100
     Mann-Whitney U: z=-1.55, p=0.083979 ns

   Processing Additional_Information...
     In-group valid samples: 100
     Out-group valid samples: 100
     In-group: μ=0.87, median=1.00, n=100
     Out-group: μ=0.79, median=1.00, n=100
     Mann-Whitney U: z=1.19, p=0.103750 ns

   Processing Informational_Support...
     In-group valid samples: 100
     Out-group valid samples: 100
     In-group: μ=0.27, median=0.00, n=100
     Out-group: μ=0.41, median=0.00, n=100
     Mann-Whitney U: z=-1.71, p=0.047738 *

   Processing Emotional_Suppo