### 0. Import Settings
#### Import necessary functions and packages

In [2]:
import pathlib
import sys
# Add src module to path before import.
sys.path.insert(0, str(pathlib.Path('../src')))
from file_IO_handler import get_plaintext_file_contents
from openai_handler import (
    verify_openai_access, 
    OpenAIModelSettings, 
    MODELS,  # Changed from ENGINES to MODELS
    call_openai_chat_api  # Changed from call_openai_api to call_openai_chat_api
)
from fill_string_template import get_filled_strings_from_dataframe, FilledString
from run_simulation import run_single_simulation, save_simulation_result_to_unique_location
from process_results import (
    consolidate_jsons_to_mega_json, 
    process_mega_json_for_no_complete_prompt, 
    consolidate_jsons_to_mega_json_by_engine_prompt
)

### 1. Prior Settings
#### 1.1 Import the filtered 20 questions and 100 randomly selected outgroup answers
##### Read the 2 csv files into dataframe, show basic structure and first few lines and demonstrate all the column numbers

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Set file path
file_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\Out-group QA"

# Read the first CSV file: outgroup_answers.csv
print("=== Reading outgroup_answers.csv ===")
outgroup_df = pd.read_csv(f"{file_path}\\outgroup_answers.csv")

# Display basic information
print(f"Data shape: {outgroup_df.shape}")
print(f"Column names: {list(outgroup_df.columns)}")
print("\nFirst 5 rows:")
display(outgroup_df.head())

print("\n" + "="*50 + "\n")

# Read the second CSV file: questions.csv  
print("=== Reading questions.csv ===")
questions_df = pd.read_csv(f"{file_path}\\questions.csv")

# Display basic information
print(f"Data shape: {questions_df.shape}")
print(f"Column names: {list(questions_df.columns)}")
print("\nFirst 5 rows:")
display(questions_df.head())

print("\n" + "="*50 + "\n")

# Data overview
print("=== Data Overview ===")
print("outgroup_answers.csv data types:")
print(outgroup_df.dtypes)
print(f"\nMissing values count:")
print(outgroup_df.isnull().sum())

print("\n" + "-"*30 + "\n")

print("questions.csv data types:")
print(questions_df.dtypes)
print(f"\nMissing values count:")
print(questions_df.isnull().sum())

# Optional: Save processed data to new files
# outgroup_df.to_csv(f"{file_path}\\outgroup_answers_processed.csv", index=False)
# questions_df.to_csv(f"{file_path}\\questions_processed.csv", index=False)

print("\nData extraction completed!")

=== Reading outgroup_answers.csv ===
Data shape: (100, 5)
Column names: ['Input.title', 'Input.body', 'Answer.Confidence', 'Answer.reason for confidence rating', 'Answer.answer']

First 5 rows:


Unnamed: 0,Input.title,Input.body,Answer.Confidence,Answer.reason for confidence rating,Answer.answer
0,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,,Phone conversations can be difficult even for ...
1,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,,skype is a good tool for interaction with thos...
2,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Very confident,Experience with Skype,I definitely think that utilizing Skype for ot...
3,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,,"Rather than skype, a website called compassion..."
4,Anyone have Skype?,"Well, like the subject says, anyone have Skype...",Somewhat confident,Skype is so popular and widely distributed tha...,"Yes, i do and so do many others, especially si..."




=== Reading questions.csv ===
Data shape: (20, 2)
Column names: ['Title', 'Body']

First 5 rows:


Unnamed: 0,Title,Body
0,"Dr Who, is he Autistic?",I watch Dr Who each week and I keep picking ou...
1,Haldol and Cogentin (What is your Medication)?,Anyone here ever take Haldol or Cogentin? Als...
2,Problem with friendships...,"Ever since I was a child, I've been overly att..."
3,How do you cope with power cuts?,"Just recently, I've been having power cuts at ..."
4,Problems with phones,Hi there - i am new to this forum so bear with...




=== Data Overview ===
outgroup_answers.csv data types:
Input.title                            object
Input.body                             object
Answer.Confidence                      object
Answer.reason for confidence rating    object
Answer.answer                          object
dtype: object

Missing values count:
Input.title                             0
Input.body                              0
Answer.Confidence                       1
Answer.reason for confidence rating    47
Answer.answer                           0
dtype: int64

------------------------------

questions.csv data types:
Title    object
Body     object
dtype: object

Missing values count:
Title    0
Body     0
dtype: int64

Data extraction completed!


#### 1.2 calculate average length for the out-group answers
##### This calculated average length will be used as a length limitation for later prompt template

In [4]:
# Calculate average word length for Answer.answer column
print("=== Answer Analysis ===")

# Remove NaN values and calculate word count for each answer
valid_answers = outgroup_df['Answer.answer'].dropna()
word_counts = valid_answers.apply(lambda x: len(str(x).split()))

# Calculate average word length
avg_word_length = word_counts.mean()

# Store in variable and display results
average_answer_word_length = avg_word_length

print(f"Total number of valid answers: {len(valid_answers)}")
print(f"Average word length per answer: {average_answer_word_length:.2f} words")
print(f"Min word count: {word_counts.min()}")
print(f"Max word count: {word_counts.max()}")
print(f"Standard deviation: {word_counts.std():.2f}")

# Display some statistics about word distribution
print(f"\nWord count distribution:")
print(word_counts.describe())

print("\nAnswer analysis completed!")

=== Answer Analysis ===
Total number of valid answers: 100
Average word length per answer: 49.27 words
Min word count: 3
Max word count: 237
Standard deviation: 40.21

Word count distribution:
count    100.000000
mean      49.270000
std       40.210521
min        3.000000
25%       22.750000
50%       35.500000
75%       68.250000
max      237.000000
Name: Answer.answer, dtype: float64

Answer analysis completed!


#### 1.3 LM parameter settings and LM selection
##### Determine the appropriate LM parameter settings and test valid LM selected for our parameter settings

In [5]:
import pathlib
import time
from typing import List, Dict, Any
import json

# Assume you have these functions defined in your module
# from your_module import verify_openai_access, OpenAIModelSettings, call_openai_api

class OpenAIModelSettings:
    """Model settings class for v1/chat/completions API"""
    def __init__(
        self,
        model: str = "gpt-3.5-turbo",  # Changed from 'engine' to 'model'
        max_tokens: int = 1000,
        temperature: float = 0.3,
        n: int = 1,
        presence_penalty: float = 0.1,
        frequency_penalty: float = 0.1,
        stop: List[str] = None,
        params_descriptor: str = "autism-community-response"
    ):
        self.model = model  # Chat completions uses 'model' instead of 'engine'
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.n = n
        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.stop = stop
        self.params_descriptor = params_descriptor
        
    def to_chat_completion_params(self, messages: List[Dict[str, str]]) -> Dict[str, Any]:
        """Convert settings to chat completion API parameters"""
        params = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "n": self.n,
            "presence_penalty": self.presence_penalty,
            "frequency_penalty": self.frequency_penalty,
        }
        
        if self.stop is not None:
            params["stop"] = self.stop
            
        return params

def call_openai_chat_api(prompt: str, model_settings: OpenAIModelSettings, client) -> str:
    """Call OpenAI Chat Completions API"""
    # Convert prompt to chat format
    messages = [{"role": "user", "content": prompt}]
    
    # Get API parameters
    params = model_settings.to_chat_completion_params(messages)
    
    try:
        # Call the chat completions endpoint
        response = client.chat.completions.create(**params)
        
        # Extract the response content
        return response.choices[0].message.content
        
    except Exception as e:
        raise e

def test_multiple_openai_models():
    """Function to test multiple OpenAI models"""
    
    # List of available OpenAI models to test
    models_to_test = [
        "gpt-4o",
        "gpt-4",
        "gpt-4-turbo",
        "gpt-3.5-turbo",
        "gpt-3.5-turbo-16k"
    ]
    
    # Test prompts
    test_prompts = [
        "Q: How many legs does a cat have?",
        "Q: What is the capital of France?",
        "Q: Explain photosynthesis in simple terms.",
        "Q: What are the benefits of exercise?",
        "Q: How do you make a paper airplane?"
    ]
    
    # Initialize OpenAI client
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
        print("OpenAI client initialized successfully")
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}")
        return
    
    # Test each model
    for model in models_to_test:
        # Create model settings
        model_settings = OpenAIModelSettings(
            model=model,
            max_tokens=1000,
            temperature=0.3,
            n=1,
            presence_penalty=0.1,
            frequency_penalty=0.1,
            stop=None,
            params_descriptor="autism-community-response"
        )
        
        # Test first prompt only to check if model works
        prompt = test_prompts[0]
        
        try:
            # Call Chat Completions API
            response = call_openai_chat_api(prompt, model_settings, client)
            print(f"✓ Model {model}: Working")
            
        except Exception as e:
            print(f"✗ Model {model}: Error - {e}")
        
        # Add delay to avoid API rate limits
        time.sleep(1)

def test_single_model(model_name: str, custom_prompt: str = None):
    """Function to test a single model"""
    print(f"Testing single model: {model_name}")
    
    # Initialize client
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
    except Exception as e:
        print(f"Failed to initialize client: {e}")
        return
    
    # Use custom prompt or default prompt
    prompt = custom_prompt or "Q: How many legs does a cat have?"
    
    # Create model settings
    model_settings = OpenAIModelSettings(
        model=model_name,
        max_tokens=1000,
        temperature=0.3,
        n=1,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        stop=None,
        params_descriptor="autism-community-response"
    )
    
    try:
        print(f"Prompt: {prompt}")
        start_time = time.time()
        response = call_openai_chat_api(prompt, model_settings, client)  # Changed to chat API
        end_time = time.time()
        
        print(f"Response time: {end_time - start_time:.2f}s")
        print(f"Response: {response}")
        
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    print("OpenAI Model Testing Script")
    print("=" * 50)
    
    # # Choose running mode
    # print("Choose running mode:")
    # print("1. Test all models")
    # print("2. Test single model")
    
    # choice = input("Enter your choice (1 or 2): ").strip()
    
    # if choice == "1":
        # Test all models
    test_multiple_openai_models()
    # elif choice == "2":
    #     # Test single model
    #     model_name = input("Enter model name (e.g., gpt-3.5-turbo): ").strip()
    #     custom_prompt = input("Enter custom prompt (press Enter for default): ").strip()
    #     test_single_model(model_name, custom_prompt if custom_prompt else None)
    # else:
    #     print("Invalid choice")
    
    print("\nTesting completed!")

OpenAI Model Testing Script
OpenAI client initialized successfully
✓ Model gpt-4o: Working
✓ Model gpt-4: Working
✓ Model gpt-4-turbo: Working
✓ Model gpt-3.5-turbo: Working
✓ Model gpt-3.5-turbo-16k: Working

Testing completed!


### 2. Out-group simulation
#### 2.1 Design the demographic distributions based on Hong's study
##### We design the demogrphic profile across 3 dimensions and refer to the original study's demographic survey results
##### 3 dimensions: Personal Demographic Information + Experience with Autism + Knowledge with Autism

In [6]:
# 2.1 Design demographic distributions based on original study
print("=== Step 2.1: Demographics Distribution Design ===")

import random
import pandas as pd
from typing import Dict, List, Tuple
import json

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

def create_demographic_distributions():
    """Create demographic distributions based on original study data"""
    
    # Age distribution to achieve average 33.4 years
    age_distribution = {
        "25-30": 20,    # 20 people, average 27.5 years
        "30-35": 35,    # 35 people, average 32.5 years  
        "35-40": 30,    # 30 people, average 37.5 years
        "40-50": 15     # 15 people, average 45 years
    }
    # Calculation: (20×27.5 + 35×32.5 + 30×37.5 + 15×45) ÷ 100 = 33.4 years
    
    # Gender distribution
    gender_distribution = {
        "female": 51,   # 51%
        "male": 49      # 49%
    }
    
    # Location distribution (English-speaking countries only)
    location_distribution = {
        "US": 76,                    # 76%
        "Canada": 8,                 # 8%
        "UK": 6,                     # 6%
        "Australia": 5,              # 5%
        "New_Zealand": 3,            # 3%
        "Ireland": 2                 # 2%
    }
    
    # Autism experience distribution based on study data
    # 70% have experience, 40% regular interaction, 33% caregivers, 4% professionals
    autism_experience_distribution = {
        "caregiver": 33,              # 33% caregivers
        "professional": 4,            # 4% professionals  
        "regular_interaction": 3,     # 3% other regular interaction (40% - 33% - 4%)
        "some_experience": 30,        # 30% some experience but not regular (70% - 40%)
        "no_experience": 30          # 30% no direct experience (100% - 70%)
    }
    
    # Knowledge level distribution
    knowledge_level_distribution = {
        "none": 6,     # 6%
        "little": 65,  # 65%
        "a_lot": 29    # 29%
    }
    
    return {
        "age": age_distribution,
        "gender": gender_distribution,
        "location": location_distribution,
        "autism_experience": autism_experience_distribution,
        "knowledge_level": knowledge_level_distribution
    }

# Create distributions
distributions = create_demographic_distributions()
print("Demographic distributions created successfully!")
print(f"Age distribution: {distributions['age']}")
print(f"Gender distribution: {distributions['gender']}")
print(f"Location distribution: {distributions['location']}")
print(f"Autism experience distribution: {distributions['autism_experience']}")
print(f"Knowledge level distribution: {distributions['knowledge_level']}")

=== Step 2.1: Demographics Distribution Design ===
Demographic distributions created successfully!
Age distribution: {'25-30': 20, '30-35': 35, '35-40': 30, '40-50': 15}
Gender distribution: {'female': 51, 'male': 49}
Location distribution: {'US': 76, 'Canada': 8, 'UK': 6, 'Australia': 5, 'New_Zealand': 3, 'Ireland': 2}
Autism experience distribution: {'caregiver': 33, 'professional': 4, 'regular_interaction': 3, 'some_experience': 30, 'no_experience': 30}
Knowledge level distribution: {'none': 6, 'little': 65, 'a_lot': 29}


#### 2.2 Profile Generation System
##### Generate 100 different profiles across 3 dimensions: basic demographic information; autism-related experiences; autism-related knowledge
##### First design the profiles with the most important dimension: autism-related experiences
##### Then use the "weighted_random_choice" to incorporate other dimensions into the profile desig

In [7]:
# 2.2 Generate 100 individual profiles
print("\n=== Step 2.2: Profile Generation ===")

"""
Here randomly select the specific dimensional information according to their distribution
e.g. For female(51%) --> 51% of probability to select the gender of the specific profile to be female
In this way, 100 profiles are generated
"""
def weighted_random_choice(distribution_dict: Dict[str, int]) -> str:
    """Generate weighted random choice based on distribution"""
    choices = []
    weights = []
    for choice, weight in distribution_dict.items():
        choices.append(choice)
        weights.append(weight)
    
    return random.choices(choices, weights=weights, k=1)[0]

def generate_responder_profiles(n: int = 100) -> List[Dict]:
    """Generate n responder profiles with realistic demographic combinations"""
    profiles = []
    
    # First, create the exact distribution for autism experience (most constrained)
    experience_types = []
    for exp_type, count in distributions['autism_experience'].items():
        experience_types.extend([exp_type] * count)
    
    # Shuffle to randomize order
    random.shuffle(experience_types)
    
    # Generate profiles
    for i, exp_type in enumerate(experience_types):
        profile = {
            'id': i + 1,
            'autism_experience': exp_type,
            'age_group': weighted_random_choice(distributions['age']),
            'gender': weighted_random_choice(distributions['gender']),
            'location': weighted_random_choice(distributions['location']),
            'knowledge_level': weighted_random_choice(distributions['knowledge_level'])
        }
        profiles.append(profile)
    
    return profiles

# Generate 100 profiles
responder_profiles = generate_responder_profiles(100)

# Verify distributions
print("Profile generation completed!")
print(f"Total profiles generated: {len(responder_profiles)}")

# Verify distribution accuracy
exp_counts = {}
for profile in responder_profiles:
    exp = profile['autism_experience']
    exp_counts[exp] = exp_counts.get(exp, 0) + 1

print(f"Autism experience verification: {exp_counts}")

# Display first 5 profiles as examples
print("\nFirst 5 profiles:")
for i in range(5):
    print(f"Profile {i+1}: {responder_profiles[i]}")


=== Step 2.2: Profile Generation ===
Profile generation completed!
Total profiles generated: 100
Autism experience verification: {'some_experience': 30, 'no_experience': 30, 'caregiver': 33, 'regular_interaction': 3, 'professional': 4}

First 5 profiles:
Profile 1: {'id': 1, 'autism_experience': 'some_experience', 'age_group': '30-35', 'gender': 'male', 'location': 'UK', 'knowledge_level': 'little'}
Profile 2: {'id': 2, 'autism_experience': 'some_experience', 'age_group': '35-40', 'gender': 'female', 'location': 'Australia', 'knowledge_level': 'little'}
Profile 3: {'id': 3, 'autism_experience': 'no_experience', 'age_group': '30-35', 'gender': 'female', 'location': 'US', 'knowledge_level': 'little'}
Profile 4: {'id': 4, 'autism_experience': 'caregiver', 'age_group': '35-40', 'gender': 'male', 'location': 'US', 'knowledge_level': 'little'}
Profile 5: {'id': 5, 'autism_experience': 'some_experience', 'age_group': '40-50', 'gender': 'female', 'location': 'US', 'knowledge_level': 'none'}


#### 2.3 Prompt Template Generation
##### Combine the Amazon MTurk template with our designed 100 unique demographic profiles, get the unique 100 templates

In [8]:
# 2.3 Create prompt template system
print("\n=== Step 2.3: Prompt Template System ===")

"""
Here this function convert a distribution information into a natural language expression to incorporate this information into the prompt description
"""
def create_persona_context(profile: Dict) -> str:
    """Generate persona context based on profile"""
    
    # Age description
    age_ranges = {
        "25-30": "25-30",
        "30-35": "30-35", 
        "35-40": "35-40",
        "40-50": "40-50"
    }
    age_desc = f"You are a {age_ranges[profile['age_group']]} year old"
    
    # Gender and location
    gender_desc = profile['gender']
    location_map = {
        "US": "the United States",
        "Canada": "Canada",
        "UK": "the United Kingdom",
        "Australia": "Australia", 
        "New_Zealand": "New Zealand",
        "Ireland": "Ireland"
    }
    location_desc = f"living in {location_map[profile['location']]}"
    
    # Autism experience description
    exp_descriptions = {
        'caregiver': "You are a caregiver (parent/spouse/sibling) of someone with autism",
        'professional': "You work professionally with individuals with autism (teacher/therapist/social worker)",
        'regular_interaction': "You regularly interact with someone with autism (friend/colleague/neighbor)",
        'some_experience': "You have some limited experience with autism (through volunteering, brief encounters, or training)",
        'no_experience': "You have no direct personal experience with autism"
    }
    exp_desc = exp_descriptions[profile['autism_experience']]
    
    # Knowledge level description
    knowledge_map = {
        "none": "nothing",
        "little": "a little",
        "a_lot": "a lot"
    }
    knowledge_desc = f"and you know {knowledge_map[profile['knowledge_level']]} about autism"
    
    return f"""BACKGROUND: {age_desc} {gender_desc} {location_desc}. {exp_desc}, {knowledge_desc}. 
When answering, draw from your specific background and experience level."""

"""
The average word length of the Amazon MTurk responese = 49.27
We calculate a dynamic range of length to limit the length of AI-responses: 80%-120% of the average MTurk out-group responses
"""
def create_length_constraint(average_word_length: float, constraint_type: str = "flexible") -> Tuple[int, int, str]:
    """Create length constraint based on average word length"""
    
    if constraint_type == "flexible":
        # Flexible constraint: ±20%
        min_length = max(30, int(average_word_length * 0.8))
        max_length = int(average_word_length * 1.2)
        instruction = f"Keep your answer (task 3) between {min_length}-{max_length} words."
    
    return min_length, max_length, instruction

"""
Incorporate each part to build a complet prompt: Introduction + Question title&body + Personal context + Task instruction 
"""
def create_mturk_style_prompt(profile: Dict, question_title: str, question_body: str, 
                             average_word_length: float) -> str:
    """Create MTurk-style prompt with demographic background"""
    
    # Base introduction from original MTurk template
    introduction = """Introduction: Many individuals with autism visit online communities to ask questions about issues they are facing. We would like to know whether people outside of the specific community could provide answers for them. A question from a special autism community will be shown to you. Please read the question carefully, rate your confidence and provide your own answer to the question."""
    
    # Question section
    question_section = f"""
Question Title: {question_title}
Question Body: "{question_body}"
"""
    
    # Generate persona context
    persona_context = create_persona_context(profile)
    
    # Create length constraint
    min_length, max_length, length_instruction = create_length_constraint(average_word_length)
    
    # Tasks section with background integration
    tasks = f"""
Tasks:
{persona_context}

Now, based on your background and experience:

1. Rate your confidence in being able to answer this question.
   (1) Not confident at all – (2) Slightly confident – 
   (3) Somewhat confident – (4) Very confident – 
   (5) Extremely confident

2. Explain the reason for your confidence rating:

3. Please answer the question, or explain why you cannot answer:

IMPORTANT: {length_instruction} This should be a concise but helpful response, similar to typical online forum answers. Focus on providing direct, actionable advice rather than lengthy explanations.
"""
    
    return introduction + question_section + tasks

# Test the prompt generation system
test_profile = responder_profiles[0] #use the first generated profile
test_question_title = questions_df.iloc[0]['Title'] #the title of the first question
test_question_body = questions_df.iloc[0]['Body'] #the content of the first question

test_prompt = create_mturk_style_prompt(
    test_profile, 
    test_question_title, 
    test_question_body, 
    average_answer_word_length
)

print("Prompt template system created successfully!")
print(f"\nTest prompt preview (first 500 characters):")
print(test_prompt[:] + "...")


=== Step 2.3: Prompt Template System ===
Prompt template system created successfully!

Test prompt preview (first 500 characters):
Introduction: Many individuals with autism visit online communities to ask questions about issues they are facing. We would like to know whether people outside of the specific community could provide answers for them. A question from a special autism community will be shown to you. Please read the question carefully, rate your confidence and provide your own answer to the question.
Question Title: Dr Who, is he Autistic?
PS. He is called The Doctor, not Dr Who. If you refer to him as Dr Who I can guarantee the following post will be a correction, simply because we all enjoy correcting people so much!"lly and surprises people when he bumbles on about some kind of alien technology. This seemed especially clear with this weeks episode with James Corden. 

Tasks:
BACKGROUND: You are a 30-35 year old male living in the United Kingdom. You have some limited expe

### 2.4 run Simulation of out-group responders
#### Feed the 100 unique prompts into the LLM and save the results to local directory

1. 改为400个answer生成 2. tqdm展示生成进程 3. 一遍生成一遍存结果，不要最后全生成完了再存

In [9]:
# 2.4 Simulation execution system with guaranteed 100% profile coverage
print("\n=== Step 2.4: Simulation Execution System ===")

import os
import json
import pandas as pd
from datetime import datetime
from typing import List, Dict
import random
import time
from tqdm import tqdm
from collections import Counter

"""
Directory management for output files
"""
def create_output_directory():
    """
    Create output directory if it doesn't exist
    
    Returns:
        str: Path to the output directory
    """
    output_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
    os.makedirs(output_dir, exist_ok=True)
    return output_dir

"""
Balanced assignment generation to ensure 100% profile coverage
This approach guarantees that all 100 profiles are used exactly 4 times each,
which is more methodologically sound than random sampling for research purposes.
"""
def generate_balanced_profile_question_assignments(profiles: List[Dict], questions_df: pd.DataFrame, 
                                                 answers_per_question: int = 20) -> List[Dict]:
    """
    Generate exactly 400 profile-question assignments ensuring ALL 100 profiles are used equally.
    
    Strategy:
    1. Each profile appears exactly 4 times across all questions (100 profiles × 4 = 400 assignments)
    2. Random shuffling maintains randomness while ensuring complete coverage
    3. Sequential assignment to questions ensures balanced distribution
    
    Args:
        profiles (List[Dict]): List of 100 demographic profiles
        questions_df (pd.DataFrame): DataFrame containing 20 questions
        answers_per_question (int): Number of answers per question (default: 20)
    
    Returns:
        List[Dict]: List of 400 assignments with guaranteed profile coverage
    """
    print(f"Generating balanced assignments to ensure 100% profile coverage...")
    
    # Calculate assignment parameters
    total_profiles = len(profiles)
    total_questions = len(questions_df)
    total_assignments = total_questions * answers_per_question
    uses_per_profile = total_assignments // total_profiles  # Should be 4
    
    print(f"  Total profiles: {total_profiles}")
    print(f"  Total questions: {total_questions}")
    print(f"  Answers per question: {answers_per_question}")
    print(f"  Total assignments needed: {total_assignments}")
    print(f"  Each profile will be used exactly: {uses_per_profile} times")
    
    # Step 1: Create expanded profile list where each profile appears exactly 4 times
    # This mathematical approach guarantees 100% coverage
    expanded_profiles = []
    for _ in range(uses_per_profile):  # Repeat 4 times
        expanded_profiles.extend(profiles.copy())
    
    print(f"  Created expanded profile list: {len(expanded_profiles)} entries")
    
    # Step 2: Shuffle to randomize assignment while maintaining equal usage
    # This preserves randomness while ensuring mathematical balance
    random.shuffle(expanded_profiles)
    print(f"  Shuffled expanded profile list for randomization")
    
    # Step 3: Assign profiles to questions sequentially
    # Sequential assignment from shuffled list ensures no profile is missed
    assignments = []
    profile_index = 0
    
    for question_idx, question_row in questions_df.iterrows():
        question_profiles = []
        
        # Take next 20 profiles from shuffled list
        for _ in range(answers_per_question):
            profile = expanded_profiles[profile_index]
            question_profiles.append(profile)
            profile_index += 1
        
        # Create assignment objects for this question
        for profile in question_profiles:
            assignment = {
                'assignment_id': len(assignments) + 1,          # Global unique ID (1-400)
                'question_idx': question_idx,                   # Question index (0-19)
                'question_title': question_row['Title'],        # Question title
                'question_body': question_row['Body'],          # Question content
                'profile_id': profile['id'],                    # Profile ID (1-100)
                'profile': profile                              # Complete profile information
            }
            assignments.append(assignment)
        
        # Display assignment summary for this question
        profile_ids = [p['id'] for p in question_profiles]
        print(f"  Question {question_idx + 1}: Assigned profiles {sorted(profile_ids)[:5]}...{sorted(profile_ids)[-5:]} (20 total)")
    
    print(f"Generated {len(assignments)} total assignments")
    return assignments

"""
Profile coverage verification functions
These functions ensure that the assignment strategy meets research quality standards
"""
def verify_profile_coverage(assignments: List[Dict], total_profiles: int = 100) -> bool:
    """
    Verify that all profiles are used and usage is perfectly balanced.
    This is critical for ensuring the validity of the simulation.
    
    Args:
        assignments (List[Dict]): List of profile-question assignments
        total_profiles (int): Expected number of unique profiles
    
    Returns:
        bool: True if coverage is complete and balanced, False otherwise
    """
    print(f"\n=== Profile Coverage Verification ===")
    
    # Count usage of each profile
    profile_usage = {}
    for assignment in assignments:
        profile_id = assignment['profile_id']
        profile_usage[profile_id] = profile_usage.get(profile_id, 0) + 1
    
    # Check complete coverage
    used_profiles = set(profile_usage.keys())
    all_profiles = set(range(1, total_profiles + 1))
    missing_profiles = all_profiles - used_profiles
    
    print(f"Total unique profiles used: {len(used_profiles)}/{total_profiles}")
    
    if missing_profiles:
        print(f"❌ Missing profiles: {sorted(list(missing_profiles))}")
        return False
    else:
        print(f"✅ All {total_profiles} profiles are used")
    
    # Check usage balance
    usage_counts = list(profile_usage.values())
    usage_distribution = Counter(usage_counts)
    
    print(f"\nUsage distribution:")
    for usage_count, num_profiles in sorted(usage_distribution.items()):
        print(f"  {num_profiles} profiles used {usage_count} times each")
    
    # Verify perfect balance (all profiles used exactly 4 times)
    expected_usage = len(assignments) // total_profiles
    if len(usage_distribution) == 1 and expected_usage in usage_distribution:
        print(f"✅ Perfect balance: All profiles used exactly {expected_usage} times")
        return True
    else:
        print(f"⚠️  Imbalanced usage detected")
        print(f"   Expected: {expected_usage} uses per profile")
        return False

def analyze_demographic_distribution(assignments: List[Dict]):
    """
    Analyze demographic distribution across questions to ensure representativeness.
    This helps verify that the randomization maintains demographic balance.
    
    Args:
        assignments (List[Dict]): List of profile-question assignments
    """
    print(f"\n=== Demographic Distribution Analysis ===")
    
    # Group assignments by question
    question_assignments = {}
    for assignment in assignments:
        q_idx = assignment['question_idx']
        if q_idx not in question_assignments:
            question_assignments[q_idx] = []
        question_assignments[q_idx].append(assignment)
    
    # Analyze autism experience distribution for each question
    print(f"Autism experience distribution per question:")
    for q_idx in sorted(question_assignments.keys()):
        assignments_for_q = question_assignments[q_idx]
        
        # Count autism experience types
        autism_exp_count = {}
        for assignment in assignments_for_q:
            exp = assignment['profile']['autism_experience']
            autism_exp_count[exp] = autism_exp_count.get(exp, 0) + 1
        
        print(f"  Question {q_idx + 1}: {dict(autism_exp_count)}")
        
        # Check for reasonable demographic balance
        total_for_q = len(assignments_for_q)
        expected_caregivers = int(33 * (total_for_q / 100))  # 33% are caregivers
        actual_caregivers = autism_exp_count.get('caregiver', 0)
        
        if abs(actual_caregivers - expected_caregivers) <= 2:  # Allow ±2 tolerance
            print(f"    ✅ Good demographic balance (caregivers: {actual_caregivers}/{expected_caregivers} expected)")
        else:
            print(f"    ⚠️  Demographic imbalance (caregivers: {actual_caregivers}/{expected_caregivers} expected)")

"""
Real-time file management for incremental saving
This prevents data loss in case of interruption during long simulation runs
"""
# def initialize_output_files(output_dir: str) -> tuple:
#     """
#     Initialize output files with proper headers and structure.
#     Creates both JSON and CSV files for different analysis needs.
    
#     Args:
#         output_dir (str): Directory to save output files
    
#     Returns:
#         tuple: (json_path, csv_path) - paths to initialized files
#     """
#     # Generate unique timestamp for file naming
#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
#     # Initialize JSON file for complete data storage
#     json_filename = f"simulation_results_{timestamp}.json"
#     json_path = os.path.join(output_dir, json_filename)
    
#     # Initialize CSV file for analysis-friendly format
#     csv_filename = f"simulation_summary_{timestamp}.csv"
#     csv_path = os.path.join(output_dir, csv_filename)
    
#     # Create empty JSON array
#     with open(json_path, 'w', encoding='utf-8') as f:
#         json.dump([], f)
    
#     # Create CSV file with headers
#     csv_headers = [
#         'assignment_id', 'profile_id', 'age_group', 'gender', 'location', 
#         'autism_experience', 'knowledge_level', 'question_idx', 'question_title', 
#         'response', 'timestamp', 'model', 'word_count', 'status'
#     ]
#     csv_df = pd.DataFrame(columns=csv_headers)
#     csv_df.to_csv(csv_path, index=False, encoding='utf-8')
    
#     return json_path, csv_path
def initialize_output_files(output_dir: str) -> tuple:
    """
    Initialize output files with proper headers and structure.
    Creates both JSON and CSV files for different analysis needs.
    
    Args:
        output_dir (str): Directory to save output files
    
    Returns:
        tuple: (json_path, csv_path) - paths to initialized files
    """
    # Generate unique timestamp for file naming
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Initialize JSON file for complete data storage
    json_filename = f"simulation_results_{timestamp}.json"
    json_path = os.path.join(output_dir, json_filename)
    
    # Initialize CSV file for analysis-friendly format
    csv_filename = f"simulation_summary_{timestamp}.csv"
    csv_path = os.path.join(output_dir, csv_filename)
    
    # Create empty JSON array
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump([], f)
    
    # Create CSV file with headers - MODIFIED: Added question_body column
    csv_headers = [
        'assignment_id', 'profile_id', 'age_group', 'gender', 'location', 
        'autism_experience', 'knowledge_level', 'question_idx', 'question_title', 
        'question_body', 'response', 'timestamp', 'model', 'word_count', 'status'
    ]
    csv_df = pd.DataFrame(columns=csv_headers)
    csv_df.to_csv(csv_path, index=False, encoding='utf-8')
    
    return json_path, csv_path

# def append_result_to_files(result: Dict, json_path: str, csv_path: str):
#     """
#     Append a single simulation result to both JSON and CSV files immediately.
#     This incremental saving approach prevents data loss during long runs.
    
#     Args:
#         result (Dict): Simulation result to save
#         json_path (str): Path to JSON file
#         csv_path (str): Path to CSV file
#     """
    
#     # Append to JSON file (complete data preservation)
#     try:
#         # Read existing data
#         with open(json_path, 'r', encoding='utf-8') as f:
#             existing_data = json.load(f)
        
#         # Add new result
#         existing_data.append(result)
        
#         # Write back to file with proper formatting
#         with open(json_path, 'w', encoding='utf-8') as f:
#             json.dump(existing_data, f, indent=2, ensure_ascii=False)
    
#     except Exception as e:
#         print(f"Error writing to JSON file: {e}")
    
#     # Append to CSV file (analysis-friendly format)
#     try:
#         # Extract and flatten data for CSV format
#         csv_row = {
#             'assignment_id': result['assignment_id'],
#             'profile_id': result['profile_id'],
#             'age_group': result['profile']['age_group'],
#             'gender': result['profile']['gender'],
#             'location': result['profile']['location'],
#             'autism_experience': result['profile']['autism_experience'],
#             'knowledge_level': result['profile']['knowledge_level'],
#             'question_idx': result['question_idx'],
#             'question_title': result['question_title'],
#             'response': result['response'],
#             'timestamp': result['timestamp'],
#             'model': result['model'],
#             'word_count': len(result['response'].split()),
#             'status': result.get('status', 'success')
#         }
        
#         # Append single row to CSV
#         csv_row_df = pd.DataFrame([csv_row])
#         csv_row_df.to_csv(csv_path, mode='a', header=False, index=False, encoding='utf-8')
        
#     except Exception as e:
#         print(f"Error writing to CSV file: {e}")
def append_result_to_files(result: Dict, json_path: str, csv_path: str):
    """
    Append a single simulation result to both JSON and CSV files immediately.
    This incremental saving approach prevents data loss during long runs.
    
    Args:
        result (Dict): Simulation result to save
        json_path (str): Path to JSON file
        csv_path (str): Path to CSV file
    """
    
    # Append to JSON file (complete data preservation)
    try:
        # Read existing data
        with open(json_path, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
        
        # Add new result
        existing_data.append(result)
        
        # Write back to file with proper formatting
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, indent=2, ensure_ascii=False)
    
    except Exception as e:
        print(f"Error writing to JSON file: {e}")
    
    # Append to CSV file (analysis-friendly format)
    try:
        # Extract and flatten data for CSV format - MODIFIED: Added question_body
        csv_row = {
            'assignment_id': result['assignment_id'],
            'profile_id': result['profile_id'],
            'age_group': result['profile']['age_group'],
            'gender': result['profile']['gender'],
            'location': result['profile']['location'],
            'autism_experience': result['profile']['autism_experience'],
            'knowledge_level': result['profile']['knowledge_level'],
            'question_idx': result['question_idx'],
            'question_title': result['question_title'],
            'question_body': result['question_body'],  # ADDED: Question body column
            'response': result['response'],
            'timestamp': result['timestamp'],
            'model': result['model'],
            'word_count': len(result['response'].split()),
            'status': result.get('status', 'success')
        }
        
        # Append single row to CSV
        csv_row_df = pd.DataFrame([csv_row])
        csv_row_df.to_csv(csv_path, mode='a', header=False, index=False, encoding='utf-8')
        
    except Exception as e:
        print(f"Error writing to CSV file: {e}")

"""
Main simulation execution function with real-time saving and progress tracking
This is the core function that orchestrates the entire simulation process
"""
def run_simulation_batch_400_realtime_balanced(profiles: List[Dict], questions_df: pd.DataFrame, 
                                             average_word_length: float, model_name: str = "gpt-3.5-turbo",
                                             answers_per_question: int = 20):
    """
    Execute simulation for exactly 400 answers with guaranteed 100% profile coverage.
    
    Features:
    - Balanced assignment ensuring all profiles are used equally
    - Real-time saving to prevent data loss
    - Progress tracking with detailed statistics
    - Comprehensive error handling and reporting
    
    Args:
        profiles (List[Dict]): List of 100 demographic profiles
        questions_df (pd.DataFrame): DataFrame with 20 questions
        average_word_length (float): Target word length for responses
        model_name (str): OpenAI model to use
        answers_per_question (int): Number of answers per question
    
    Returns:
        tuple: (successful_responses, json_path, csv_path)
    """
    
    # Initialize OpenAI client with error handling
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
        print("✅ OpenAI client initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize OpenAI client: {e}")
        return None, None, None
    
    # Configure model settings for consistent responses
    model_settings = OpenAIModelSettings(
        model=model_name,
        max_tokens=1000,                    # Sufficient length for responses
        temperature=0.3,                    # Lower temperature for consistency
        n=1,                               # Single response per call
        presence_penalty=0.1,              # Slight penalty to avoid repetition
        frequency_penalty=0.1,             # Slight penalty for varied vocabulary
        stop=None,                         # No stop sequences
        params_descriptor="autism-community-response-simulation"
    )
    
    # Setup output files
    output_dir = create_output_directory()
    json_path, csv_path = initialize_output_files(output_dir)
    
    # Generate balanced assignments with verification
    print(f"\n=== Assignment Generation ===")
    assignments = generate_balanced_profile_question_assignments(profiles, questions_df, answers_per_question)
    
    # Verify assignment quality before starting simulation
    coverage_verified = verify_profile_coverage(assignments, len(profiles))
    if not coverage_verified:
        print("❌ Profile coverage verification failed! Aborting simulation.")
        return None, None, None
    
    # Analyze demographic distribution
    analyze_demographic_distribution(assignments)
    
    total_assignments = len(assignments)
    
    print(f"\n=== Simulation Execution ===")
    print(f"Starting simulation with {total_assignments} assignments")
    print(f"✅ Guaranteed 100% profile coverage verified")
    print(f"Model: {model_name} (temp={model_settings.temperature})")
    print(f"Target word length: {average_word_length:.1f} ± 20%")
    print(f"Output files:")
    print(f"  JSON: {json_path}")
    print(f"  CSV: {csv_path}")
    
    # Initialize execution statistics
    successful_responses = 0
    failed_responses = 0
    start_time = datetime.now()
    
    # Execute simulation with progress tracking
    with tqdm(total=total_assignments, desc="Generating responses", unit="response") as pbar:
        
        for assignment in assignments:
            
            # Generate personalized prompt for this profile-question combination
            prompt = create_mturk_style_prompt(
                assignment['profile'], 
                assignment['question_title'], 
                assignment['question_body'], 
                average_word_length
            )
            
            try:
                # Call OpenAI API
                response = call_openai_chat_api(prompt, model_settings, client)
                
                # Create comprehensive result object
                result = {
                    'assignment_id': assignment['assignment_id'],
                    'question_idx': assignment['question_idx'],
                    'question_title': assignment['question_title'],
                    'question_body': assignment['question_body'],
                    'profile_id': assignment['profile_id'],
                    'profile': assignment['profile'],
                    'prompt': prompt,                                    # Complete prompt for analysis
                    'response': response,                                # LLM-generated response
                    'timestamp': datetime.now().isoformat(),
                    'model': model_name,
                    'word_count': len(response.split()),
                    'status': 'success'
                }
                
                # Save result immediately to prevent data loss
                append_result_to_files(result, json_path, csv_path)
                
                successful_responses += 1
                
                # Update progress bar with detailed information
                pbar.set_postfix({
                    'Success': successful_responses,
                    'Failed': failed_responses,
                    'Success Rate': f"{successful_responses/(successful_responses+failed_responses)*100:.1f}%",
                    'Current Q': f"Q{assignment['question_idx']+1}",
                    'Profile': assignment['profile_id']
                })
                
                # Rate limiting to avoid API throttling
                time.sleep(0.5)
                
            except Exception as e:
                failed_responses += 1
                
                # Log detailed error information
                error_result = {
                    'assignment_id': assignment['assignment_id'],
                    'question_idx': assignment['question_idx'],
                    'profile_id': assignment['profile_id'],
                    'error': str(e),
                    'timestamp': datetime.now().isoformat(),
                    'status': 'failed'
                }
                
                print(f"\n⚠️  Error processing assignment {assignment['assignment_id']}: {e}")
                
                # Update progress bar with error statistics
                pbar.set_postfix({
                    'Success': successful_responses,
                    'Failed': failed_responses,
                    'Success Rate': f"{successful_responses/(successful_responses+failed_responses)*100:.1f}%" if (successful_responses+failed_responses) > 0 else "0%",
                    'Current Q': f"Q{assignment['question_idx']+1}",
                    'Profile': assignment['profile_id']
                })
            
            # Update progress bar
            pbar.update(1)
    
    # Calculate execution statistics
    end_time = datetime.now()
    execution_time = end_time - start_time
    
    # Display comprehensive execution summary
    print(f"\n=== Simulation Completed ===")
    print(f"📊 Execution Statistics:")
    print(f"  Total assignments: {total_assignments}")
    print(f"  Successful responses: {successful_responses}")
    print(f"  Failed responses: {failed_responses}")
    print(f"  Success rate: {successful_responses/total_assignments*100:.1f}%")
    print(f"  Execution time: {execution_time}")
    print(f"  Average time per response: {execution_time.total_seconds()/total_assignments:.2f} seconds")
    
    # Final verification of profile coverage in actual results
    if successful_responses > 0:
        print(f"\n=== Final Profile Coverage Verification ===")
        try:
            # Read saved results to verify actual coverage
            final_csv = pd.read_csv(csv_path, encoding='utf-8')
            unique_profiles_used = final_csv['profile_id'].nunique()
            
            print(f"Profiles actually used in successful responses: {unique_profiles_used}/100")
            
            if unique_profiles_used == 100:
                print(f"✅ SUCCESS: All 100 profiles were successfully used!")
            elif unique_profiles_used >= 95:
                print(f"⚠️  PARTIAL SUCCESS: {unique_profiles_used} profiles used (>95% coverage)")
            else:
                print(f"❌ INSUFFICIENT COVERAGE: Only {unique_profiles_used} profiles used")
                
            # Analyze word length distribution
            avg_word_count = final_csv['word_count'].mean()
            target_min = average_word_length * 0.8
            target_max = average_word_length * 1.2
            within_range = final_csv[
                (final_csv['word_count'] >= target_min) & 
                (final_csv['word_count'] <= target_max)
            ]
            
            print(f"\n📝 Response Quality:")
            print(f"  Average word count: {avg_word_count:.1f}")
            print(f"  Target range: {target_min:.1f}-{target_max:.1f}")
            print(f"  Within target range: {len(within_range)}/{len(final_csv)} ({len(within_range)/len(final_csv)*100:.1f}%)")
                
        except Exception as e:
            print(f"Error reading final results: {e}")
    
    print(f"\n📁 Results saved to:")
    print(f"  Complete data (JSON): {json_path}")
    print(f"  Analysis data (CSV): {csv_path}")
    
    return successful_responses, json_path, csv_path

"""
Results validation function for quality assurance
"""
def validate_simulation_results_comprehensive(json_path: str, csv_path: str, 
                                            target_word_length: float, expected_total: int = 400):
    """
    Comprehensive validation of simulation results for research quality assurance.
    
    Args:
        json_path (str): Path to JSON results file
        csv_path (str): Path to CSV results file
        target_word_length (float): Expected average word length
        expected_total (int): Expected total number of responses
    """
    print(f"\n=== Comprehensive Results Validation ===")
    
    # File consistency validation
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            json_results = json.load(f)
        csv_results = pd.read_csv(csv_path, encoding='utf-8')
        
        print(f"📄 File Validation:")
        print(f"  JSON file: {len(json_results)} results")
        print(f"  CSV file: {len(csv_results)} results")
        
        if len(json_results) == len(csv_results):
            print(f"  ✅ File consistency verified")
        else:
            print(f"  ⚠️  File inconsistency detected")
            
    except Exception as e:
        print(f"❌ Error reading result files: {e}")
        return
    
    if len(csv_results) == 0:
        print("❌ No results to validate")
        return
    
    # Response quality validation
    print(f"\n📊 Response Quality Analysis:")
    
    # Word length analysis
    avg_word_count = csv_results['word_count'].mean()
    std_word_count = csv_results['word_count'].std()
    target_min = target_word_length * 0.8
    target_max = target_word_length * 1.2
    
    within_range = csv_results[
        (csv_results['word_count'] >= target_min) & 
        (csv_results['word_count'] <= target_max)
    ]
    
    print(f"  Word Length Statistics:")
    print(f"    Target: {target_word_length:.1f} words")
    print(f"    Actual average: {avg_word_count:.1f} ± {std_word_count:.1f}")
    print(f"    Range: {csv_results['word_count'].min()}-{csv_results['word_count'].max()} words")
    print(f"    Within target range: {len(within_range)}/{len(csv_results)} ({len(within_range)/len(csv_results)*100:.1f}%)")
    
    # Profile coverage analysis
    print(f"\n👥 Profile Coverage Analysis:")
    unique_profiles = csv_results['profile_id'].nunique()
    profile_usage = csv_results['profile_id'].value_counts()
    
    print(f"  Unique profiles used: {unique_profiles}/100")
    print(f"  Usage distribution: {dict(Counter(profile_usage.values))}")
    
    if unique_profiles == 100:
        print(f"  ✅ Complete profile coverage achieved")
    else:
        print(f"  ⚠️  Incomplete profile coverage")
    
    # Question distribution analysis
    print(f"\n❓ Question Distribution Analysis:")
    question_counts = csv_results['question_idx'].value_counts().sort_index()
    expected_per_question = expected_total // 20
    
    print(f"  Answers per question:")
    for q_idx, count in question_counts.items():
        status = "✅" if count == expected_per_question else "⚠️"
        print(f"    Question {q_idx + 1}: {count} answers {status}")
    
    uneven_questions = question_counts[question_counts != expected_per_question]
    if len(uneven_questions) == 0:
        print(f"  ✅ Even question distribution")
    else:
        print(f"  ⚠️  Uneven distribution in {len(uneven_questions)} questions")
    
    # Demographic balance analysis
    print(f"\n🎯 Demographic Balance Analysis:")
    autism_exp_dist = csv_results['autism_experience'].value_counts()
    print(f"  Autism Experience Distribution:")
    for exp_type, count in autism_exp_dist.items():
        percentage = count / len(csv_results) * 100
        print(f"    {exp_type}: {count} ({percentage:.1f}%)")

# Create output directory
output_directory = create_output_directory()
print(f"Output directory created: {output_directory}")


=== Step 2.4: Simulation Execution System ===
Output directory created: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers


### 3. Run Experiment 
#### 3.1 Run Simulation
##### Run the trial simulation for 20 answers

In [10]:
# Optional: Test run with smaller scale
print("\n🧪 OPTIONAL: TEST RUN")
print("="*30)

def run_test_simulation():
    """Run a small-scale test to verify everything works"""
    
    # Use only first 2 questions and 4 profiles for testing
    test_questions = questions_df.head(2)
    test_profiles = responder_profiles[:20]  # Use 20 profiles for 2 questions × 10 answers each
    
    print(f"Running test with:")
    print(f"  {len(test_questions)} questions")
    print(f"  {len(test_profiles)} profiles") 
    print(f"  Target: {len(test_questions) * 10} total responses")
    
    try:
        test_results = run_simulation_batch_400_realtime_balanced(
            profiles=test_profiles,
            questions_df=test_questions,
            average_word_length=average_answer_word_length,
            model_name="gpt-3.5-turbo",
            answers_per_question=10  # 10 answers per question instead of 20
        )
        
        if test_results[0] and test_results[0] > 0:
            print(f"✅ Test successful! Generated {test_results[0]} responses")
            print(f"You can now run the full simulation with confidence.")
            return True
        else:
            print(f"❌ Test failed")
            return False
            
    except Exception as e:
        print(f"❌ Test error: {e}")
        return False

# Uncomment the following line to run the test
test_success = run_test_simulation()


🧪 OPTIONAL: TEST RUN
Running test with:
  2 questions
  20 profiles
  Target: 20 total responses
✅ OpenAI client initialized successfully

=== Assignment Generation ===
Generating balanced assignments to ensure 100% profile coverage...
  Total profiles: 20
  Total questions: 2
  Answers per question: 10
  Total assignments needed: 20
  Each profile will be used exactly: 1 times
  Created expanded profile list: 20 entries
  Shuffled expanded profile list for randomization
  Question 1: Assigned profiles [2, 3, 5, 6, 8]...[9, 12, 16, 17, 19] (20 total)
  Question 2: Assigned profiles [1, 4, 7, 10, 11]...[13, 14, 15, 18, 20] (20 total)
Generated 20 total assignments

=== Profile Coverage Verification ===
Total unique profiles used: 20/20
✅ All 20 profiles are used

Usage distribution:
  20 profiles used 1 times each
✅ Perfect balance: All profiles used exactly 1 times

=== Demographic Distribution Analysis ===
Autism experience distribution per question:
  Question 1: {'some_experience':

Generating responses: 100%|█| 20/20 [00:36<00:00,  1.85s/response, Success=20, Failed=0, Success Rate=100.0%, Current Q


=== Simulation Completed ===
📊 Execution Statistics:
  Total assignments: 20
  Successful responses: 20
  Failed responses: 0
  Success rate: 100.0%
  Execution time: 0:00:37.020833
  Average time per response: 1.85 seconds

=== Final Profile Coverage Verification ===
Profiles actually used in successful responses: 20/100
❌ INSUFFICIENT COVERAGE: Only 20 profiles used

📝 Response Quality:
  Average word count: 73.9
  Target range: 39.4-59.1
  Within target range: 0/20 (0.0%)

📁 Results saved to:
  Complete data (JSON): D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\simulation_results_20250707_212844.json
  Analysis data (CSV): D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\simulation_summary_20250707_212844.csv
✅ Test successful! Generated 20 responses
You can now run the full simulation with confidence.





##### Run the complete simulation for all 400 answers

In [11]:
# ===================================================================
# FLEXIBLE SIMULATION RUNNER WITH CONFIGURABLE PARAMETERS
# ===================================================================

def run_model_simulation(model_name: str, target_answers: int = 400, 
                        profiles: List[Dict] = None, questions_df: pd.DataFrame = None,
                        average_word_length: float = None):
    """
    Run simulation with configurable model and answer count
    
    Args:
        model_name (str): OpenAI model to use (e.g., "gpt-4", "gpt-3.5-turbo")
        target_answers (int): Total number of answers to generate (default: 400)
        profiles (List[Dict]): List of demographic profiles
        questions_df (pd.DataFrame): DataFrame containing questions
        average_word_length (float): Target word length for responses
    
    Returns:
        tuple: (successful_responses, json_path, csv_path)
    """
    
    print(f"\n{'='*60}")
    print(f"RUNNING SIMULATION: {model_name.upper()}")
    print(f"{'='*60}")
    
    # Calculate answers per question based on target total
    total_questions = len(questions_df)
    answers_per_question = target_answers // total_questions
    actual_total = answers_per_question * total_questions
    
    print(f"📊 Simulation Configuration:")
    print(f"  Model: {model_name}")
    print(f"  Target total answers: {target_answers}")
    print(f"  Questions available: {total_questions}")
    print(f"  Answers per question: {answers_per_question}")
    print(f"  Actual total answers: {actual_total}")
    print(f"  Profiles to use: {answers_per_question} per question (from {len(profiles)} available)")
    
    # Create model-specific output directory
    base_output_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
    model_output_dir = os.path.join(base_output_dir, model_name.replace(".", "_"))  # Replace dots for folder name
    os.makedirs(model_output_dir, exist_ok=True)
    
    print(f"📁 Model-specific output directory: {model_output_dir}")
    
    # Initialize OpenAI client
    try:
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
        print(f"✅ OpenAI client initialized")
    except Exception as e:
        print(f"❌ Failed to initialize OpenAI client: {e}")
        return None, None, None
    
    # Configure model settings
    model_settings = OpenAIModelSettings(
        model=model_name,
        max_tokens=1000,
        temperature=0.3,
        n=1,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        stop=None,
        params_descriptor=f"autism-simulation-{model_name}"
    )
    
    # Generate balanced assignments
    assignments = generate_balanced_profile_question_assignments_flexible(
        profiles, questions_df, answers_per_question
    )
    
    # Verify profile coverage
    expected_profiles = min(len(profiles), actual_total)
    coverage_verified = verify_profile_coverage_flexible(assignments, expected_profiles)
    
    if not coverage_verified:
        print(f"❌ Profile coverage verification failed!")
        return None, None, None
    
    # Initialize output files with model-specific paths
    json_path, csv_path = initialize_output_files_for_model(model_output_dir, model_name)
    
    print(f"🚀 Starting simulation execution...")
    
    # Execute simulation
    successful_responses = 0
    failed_responses = 0
    
    with tqdm(total=len(assignments), desc=f"Running {model_name}", unit="response") as pbar:
        
        for assignment in assignments:
            
            # Create prompt
            prompt = create_mturk_style_prompt(
                assignment['profile'], 
                assignment['question_title'], 
                assignment['question_body'], 
                average_word_length
            )
            
            try:
                # Call OpenAI API
                response = call_openai_chat_api(prompt, model_settings, client)
                
                # Create result object
                result = {
                    'assignment_id': assignment['assignment_id'],
                    'question_idx': assignment['question_idx'],
                    'question_title': assignment['question_title'],
                    'question_body': assignment['question_body'],
                    'profile_id': assignment['profile_id'],
                    'profile': assignment['profile'],
                    'prompt': prompt,
                    'response': response,
                    'timestamp': datetime.now().isoformat(),
                    'model': model_name,
                    'word_count': len(response.split()),
                    'status': 'success'
                }
                
                # Save immediately
                append_result_to_files_robust(result, json_path, csv_path)
                successful_responses += 1
                
                # Update progress
                pbar.set_postfix({
                    'Success': successful_responses,
                    'Failed': failed_responses,
                    'Rate': f"{successful_responses/(successful_responses+failed_responses)*100:.1f}%"
                })
                
                # Rate limiting
                time.sleep(0.5)
                
            except Exception as e:
                failed_responses += 1
                print(f"\n⚠️  Error in assignment {assignment['assignment_id']}: {e}")
                
                pbar.set_postfix({
                    'Success': successful_responses,
                    'Failed': failed_responses,
                    'Rate': f"{successful_responses/(successful_responses+failed_responses)*100:.1f}%" if (successful_responses+failed_responses) > 0 else "0%"
                })
            
            pbar.update(1)
    
    # Final summary
    print(f"\n📈 {model_name} Results:")
    print(f"  ✅ Successful: {successful_responses}")
    print(f"  ❌ Failed: {failed_responses}")
    print(f"  📊 Success rate: {successful_responses/(successful_responses+failed_responses)*100:.1f}%")
    print(f"  📁 Saved to: {model_output_dir}")
    
    return successful_responses, json_path, csv_path

# ===================================================================
# SUPPORTING FUNCTIONS FOR FLEXIBLE SIMULATION
# ===================================================================

def generate_balanced_profile_question_assignments_flexible(profiles: List[Dict], 
                                                          questions_df: pd.DataFrame, 
                                                          answers_per_question: int) -> List[Dict]:
    """
    Generate balanced assignments with flexible answer count per question
    
    Args:
        profiles (List[Dict]): Available profiles
        questions_df (pd.DataFrame): Questions to answer
        answers_per_question (int): How many answers per question
    
    Returns:
        List[Dict]: List of assignments
    """
    
    total_assignments = len(questions_df) * answers_per_question
    total_profiles = len(profiles)
    
    print(f"🔄 Generating flexible assignments:")
    print(f"  Total assignments needed: {total_assignments}")
    print(f"  Profiles available: {total_profiles}")
    
    # If we need more assignments than profiles, repeat profiles
    if total_assignments > total_profiles:
        uses_per_profile = total_assignments // total_profiles
        remaining = total_assignments % total_profiles
        
        expanded_profiles = []
        
        # Each profile used equally
        for _ in range(uses_per_profile):
            expanded_profiles.extend(profiles.copy())
        
        # Add remaining profiles randomly
        if remaining > 0:
            extra_profiles = random.sample(profiles, remaining)
            expanded_profiles.extend(extra_profiles)
            
        print(f"  Each profile used ~{uses_per_profile} times")
        
    else:
        # More profiles than needed, sample randomly
        expanded_profiles = random.sample(profiles, total_assignments)
        print(f"  Using {total_assignments} profiles randomly selected")
    
    # Shuffle for randomization
    random.shuffle(expanded_profiles)
    
    # Assign to questions
    assignments = []
    profile_index = 0
    
    for question_idx, question_row in questions_df.iterrows():
        for _ in range(answers_per_question):
            profile = expanded_profiles[profile_index]
            
            assignment = {
                'assignment_id': len(assignments) + 1,
                'question_idx': question_idx,
                'question_title': question_row['Title'],
                'question_body': question_row['Body'],
                'profile_id': profile['id'],
                'profile': profile
            }
            assignments.append(assignment)
            profile_index += 1
    
    print(f"✅ Generated {len(assignments)} assignments")
    return assignments

def verify_profile_coverage_flexible(assignments: List[Dict], expected_profiles: int) -> bool:
    """
    Verify profile coverage for flexible assignment counts
    """
    
    profile_usage = {}
    for assignment in assignments:
        profile_id = assignment['profile_id']
        profile_usage[profile_id] = profile_usage.get(profile_id, 0) + 1
    
    unique_profiles = len(profile_usage)
    print(f"👥 Profile coverage: {unique_profiles} unique profiles used")
    
    if unique_profiles >= min(expected_profiles, 100):  # At most 100 profiles available
        print(f"✅ Adequate profile coverage")
        return True
    else:
        print(f"⚠️  Limited profile coverage")
        return True  # Still proceed, but note the limitation

# def initialize_output_files_for_model(model_output_dir: str, model_name: str) -> tuple:
#     """
#     Initialize output files in model-specific directory
#     """
    
#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
#     # Model-specific file names
#     json_filename = f"{model_name}_simulation_results_{timestamp}.json"
#     csv_filename = f"{model_name}_simulation_summary_{timestamp}.csv"
    
#     json_path = os.path.join(model_output_dir, json_filename)
#     csv_path = os.path.join(model_output_dir, csv_filename)
    
#     # Initialize files
#     with open(json_path, 'w', encoding='utf-8') as f:
#         json.dump([], f)
    
#     csv_headers = [
#         'assignment_id', 'profile_id', 'age_group', 'gender', 'location', 
#         'autism_experience', 'knowledge_level', 'question_idx', 'question_title', 
#         'response', 'timestamp', 'model', 'word_count', 'status'
#     ]
#     csv_df = pd.DataFrame(columns=csv_headers)
#     csv_df.to_csv(csv_path, index=False, encoding='utf-8')
    
#     return json_path, csv_path
def initialize_output_files_for_model(model_output_dir: str, model_name: str) -> tuple:
    """
    Initialize output files in model-specific directory
    """
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Model-specific file names
    json_filename = f"{model_name}_simulation_results_{timestamp}.json"
    csv_filename = f"{model_name}_simulation_summary_{timestamp}.csv"
    
    json_path = os.path.join(model_output_dir, json_filename)
    csv_path = os.path.join(model_output_dir, csv_filename)
    
    # Initialize files
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump([], f)
    
    # MODIFIED: Added question_body to CSV headers
    csv_headers = [
        'assignment_id', 'profile_id', 'age_group', 'gender', 'location', 
        'autism_experience', 'knowledge_level', 'question_idx', 'question_title', 
        'question_body', 'response', 'timestamp', 'model', 'word_count', 'status'
    ]
    csv_df = pd.DataFrame(columns=csv_headers)
    csv_df.to_csv(csv_path, index=False, encoding='utf-8')
    
    return json_path, csv_path

# def append_result_to_files_robust(result: Dict, json_path: str, csv_path: str, max_retries: int = 3):
#     """
#     Robust file writing with retry mechanism
#     """
    
#     # JSON writing
#     try:
#         with open(json_path, 'r', encoding='utf-8') as f:
#             existing_data = json.load(f)
#         existing_data.append(result)
#         with open(json_path, 'w', encoding='utf-8') as f:
#             json.dump(existing_data, f, indent=2, ensure_ascii=False)
#     except Exception as e:
#         print(f"JSON write error: {e}")
    
#     # CSV writing with retries
#     for attempt in range(max_retries):
#         try:
#             csv_row = {
#                 'assignment_id': result['assignment_id'],
#                 'profile_id': result['profile_id'],
#                 'age_group': result['profile']['age_group'],
#                 'gender': result['profile']['gender'],
#                 'location': result['profile']['location'],
#                 'autism_experience': result['profile']['autism_experience'],
#                 'knowledge_level': result['profile']['knowledge_level'],
#                 'question_idx': result['question_idx'],
#                 'question_title': result['question_title'],
#                 'response': result['response'],
#                 'timestamp': result['timestamp'],
#                 'model': result['model'],
#                 'word_count': len(result['response'].split()),
#                 'status': result.get('status', 'success')
#             }
            
#             csv_row_df = pd.DataFrame([csv_row])
#             csv_row_df.to_csv(csv_path, mode='a', header=False, index=False, encoding='utf-8')
#             break
            
#         except PermissionError:
#             if attempt < max_retries - 1:
#                 time.sleep(1)
#             continue
#         except Exception as e:
#             print(f"CSV write error: {e}")
#             break
def append_result_to_files_robust(result: Dict, json_path: str, csv_path: str, max_retries: int = 3):
    """
    Robust file writing with retry mechanism
    """
    
    # JSON writing
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
        existing_data.append(result)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"JSON write error: {e}")
    
    # CSV writing with retries
    for attempt in range(max_retries):
        try:
            # MODIFIED: Added question_body to CSV row
            csv_row = {
                'assignment_id': result['assignment_id'],
                'profile_id': result['profile_id'],
                'age_group': result['profile']['age_group'],
                'gender': result['profile']['gender'],
                'location': result['profile']['location'],
                'autism_experience': result['profile']['autism_experience'],
                'knowledge_level': result['profile']['knowledge_level'],
                'question_idx': result['question_idx'],
                'question_title': result['question_title'],
                'question_body': result['question_body'],  # ADDED: Question body column
                'response': result['response'],
                'timestamp': result['timestamp'],
                'model': result['model'],
                'word_count': len(result['response'].split()),
                'status': result.get('status', 'success')
            }
            
            csv_row_df = pd.DataFrame([csv_row])
            csv_row_df.to_csv(csv_path, mode='a', header=False, index=False, encoding='utf-8')
            break
            
        except PermissionError:
            if attempt < max_retries - 1:
                time.sleep(1)
            continue
        except Exception as e:
            print(f"CSV write error: {e}")
            break

##### Simulate results for different models through iteration of different models and save them in differnet folders

In [12]:
# ===================================================================
# MULTI-MODEL BATCH TESTING
# ===================================================================

def run_multi_model_comparison(target_answers: int = 100):
    """
    Run simulation across multiple models for comparison
    
    Args:
        target_answers (int): Number of answers to generate per model
    """
    
    # Define models to test
    models_to_test = [
        "gpt-4.1-mini",
        # "gpt-4", 
        # "gpt-4-turbo",  # Note: corrected from "gpt-4.1mini" which doesn't exist
        "gpt-3.5-turbo"
    ]
    
    print(f"\n{'='*80}")
    print(f"MULTI-MODEL COMPARISON STUDY")
    print(f"{'='*80}")
    print(f"🎯 Target answers per model: {target_answers}")
    print(f"🤖 Models to test: {', '.join(models_to_test)}")
    print(f"📊 Total answers to generate: {len(models_to_test) * target_answers}")
    
    # Set random seed for consistency across models
    random.seed(42)
    np.random.seed(42)
    
    # Storage for results comparison
    model_results = {}
    
    # Run simulation for each model
    for i, model_name in enumerate(models_to_test, 1):
        
        print(f"\n🚀 STARTING MODEL {i}/{len(models_to_test)}: {model_name}")
        print(f"{'='*60}")
        
        try:
            # Reset random seed for each model to ensure identical assignments
            random.seed(42)
            
            # Run simulation
            results = run_model_simulation(
                model_name=model_name,
                target_answers=target_answers,
                profiles=responder_profiles,
                questions_df=questions_df,
                average_word_length=average_answer_word_length
            )
            
            successful_responses, json_path, csv_path = results
            
            # Store results
            model_results[model_name] = {
                'successful_responses': successful_responses,
                'json_path': json_path,
                'csv_path': csv_path,
                'success_rate': successful_responses / target_answers * 100 if successful_responses else 0
            }
            
            print(f"✅ {model_name} completed: {successful_responses}/{target_answers} responses")
            
        except Exception as e:
            print(f"❌ {model_name} failed: {e}")
            model_results[model_name] = {
                'successful_responses': 0,
                'json_path': None,
                'csv_path': None,
                'success_rate': 0,
                'error': str(e)
            }
        
        # Add delay between models to avoid rate limiting
        if i < len(models_to_test):
            print(f"⏳ Waiting 30 seconds before next model...")
            time.sleep(30)
    
    # Display comparison summary
    print(f"\n{'='*80}")
    print(f"MULTI-MODEL COMPARISON RESULTS")
    print(f"{'='*80}")
    
    print(f"📊 Results Summary:")
    print(f"{'Model':<15} {'Success':<8} {'Rate':<8} {'Status':<10}")
    print(f"{'-'*50}")
    
    for model_name, result in model_results.items():
        success = result['successful_responses']
        rate = f"{result['success_rate']:.1f}%"
        status = "✅ OK" if success > 0 else "❌ FAIL"
        
        print(f"{model_name:<15} {success:<8} {rate:<8} {status:<10}")
    
    # Save comparison summary
    comparison_df = pd.DataFrame([
        {
            'model': model_name,
            'successful_responses': result['successful_responses'],
            'target_responses': target_answers,
            'success_rate': result['success_rate'],
            'json_path': result.get('json_path', ''),
            'csv_path': result.get('csv_path', ''),
            'error': result.get('error', '')
        }
        for model_name, result in model_results.items()
    ])
    
    # Save comparison to base directory
    base_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
    comparison_path = os.path.join(base_dir, f"model_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    comparison_df.to_csv(comparison_path, index=False)
    
    print(f"\n📁 Comparison summary saved: {comparison_path}")
    
    return model_results

##### Codes to execute the real experiment

In [23]:
# ===================================================================
# EXECUTION CODE
# ===================================================================

print("🔧 MULTI-MODEL AUTISM SIMULATION EXPERIMENT")
print("="*60)

# Verify prerequisites
print("📋 Pre-execution checklist:")
required_vars = ['responder_profiles', 'questions_df', 'average_answer_word_length']
all_ready = True

for var in required_vars:
    if var in globals():
        print(f"  ✅ {var}: Available")
    else:
        print(f"  ❌ {var}: Missing")
        all_ready = False

if not all_ready:
    print("❌ Please run previous steps to generate required variables")
else:
    print("✅ All prerequisites met")
    
    # Choose test scale
    print(f"\n🎯 Choose test scale:")
    print(f"  1. Quick test: 100 answers (5 per question)")
    print(f"  2. Medium test: 200 answers (10 per question)")
    print(f"  3. Full test: 400 answers (20 per question)")
    
    # For demonstration, let's use medium test
    test_scale = 100  # You can change this
    
    print(f"\n🚀 Starting multi-model comparison with {test_scale} answers per model")
    
    # Run the comparison
    try:
        comparison_results = run_multi_model_comparison(target_answers=test_scale)
        
        print(f"\n🎉 Multi-model comparison completed!")
        print(f"Check the individual model folders for detailed results:")
        
        base_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers"
        print(f"📁 {base_dir}")
        
        for model_name in ["gpt-4.1-mini", "gpt-3.5-turbo"]:
            model_dir = os.path.join(base_dir, model_name.replace(".", "_"))
            if os.path.exists(model_dir):
                print(f"  📂 {model_name}/")
                
    except Exception as e:
        print(f"❌ Experiment failed: {e}")
        import traceback
        traceback.print_exc()

🔧 MULTI-MODEL AUTISM SIMULATION EXPERIMENT
📋 Pre-execution checklist:
  ✅ responder_profiles: Available
  ✅ questions_df: Available
  ✅ average_answer_word_length: Available
✅ All prerequisites met

🎯 Choose test scale:
  1. Quick test: 100 answers (5 per question)
  2. Medium test: 200 answers (10 per question)
  3. Full test: 400 answers (20 per question)

🚀 Starting multi-model comparison with 100 answers per model

MULTI-MODEL COMPARISON STUDY
🎯 Target answers per model: 100
🤖 Models to test: gpt-4.1-mini, gpt-3.5-turbo
📊 Total answers to generate: 200

🚀 STARTING MODEL 1/2: gpt-4.1-mini

RUNNING SIMULATION: GPT-4.1-MINI
📊 Simulation Configuration:
  Model: gpt-4.1-mini
  Target total answers: 100
  Questions available: 20
  Answers per question: 5
  Actual total answers: 100
  Profiles to use: 5 per question (from 100 available)
📁 Model-specific output directory: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-4_1-mini
✅ OpenAI client 

Running gpt-4.1-mini: 100%|████████████████| 100/100 [04:55<00:00,  2.95s/response, Success=100, Failed=0, Rate=100.0%]



📈 gpt-4.1-mini Results:
  ✅ Successful: 100
  ❌ Failed: 0
  📊 Success rate: 100.0%
  📁 Saved to: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-4_1-mini
✅ gpt-4.1-mini completed: 100/100 responses
⏳ Waiting 30 seconds before next model...

🚀 STARTING MODEL 2/2: gpt-3.5-turbo

RUNNING SIMULATION: GPT-3.5-TURBO
📊 Simulation Configuration:
  Model: gpt-3.5-turbo
  Target total answers: 100
  Questions available: 20
  Answers per question: 5
  Actual total answers: 100
  Profiles to use: 5 per question (from 100 available)
📁 Model-specific output directory: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-3_5-turbo
✅ OpenAI client initialized
🔄 Generating flexible assignments:
  Total assignments needed: 100
  Profiles available: 100
  Using 100 profiles randomly selected
✅ Generated 100 assignments
👥 Profile coverage: 100 unique profiles used
✅ Adequate profile coverage
🚀 Starting simulation execu

Running gpt-3.5-turbo: 100%|███████████████| 100/100 [03:27<00:00,  2.08s/response, Success=100, Failed=0, Rate=100.0%]


📈 gpt-3.5-turbo Results:
  ✅ Successful: 100
  ❌ Failed: 0
  📊 Success rate: 100.0%
  📁 Saved to: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-3_5-turbo
✅ gpt-3.5-turbo completed: 100/100 responses

MULTI-MODEL COMPARISON RESULTS
📊 Results Summary:
Model           Success  Rate     Status    
--------------------------------------------------
gpt-4.1-mini    100      100.0%   ✅ OK      
gpt-3.5-turbo   100      100.0%   ✅ OK      

📁 Comparison summary saved: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\model_comparison_20250708_115509.csv

🎉 Multi-model comparison completed!
Check the individual model folders for detailed results:
📁 D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers
  📂 gpt-4.1-mini/
  📂 gpt-3.5-turbo/





### 4. Rater Simulation
#### 4.1 Question & Answer Preparation
##### -Load and preprocess the out-group answers and ai-responses
##### -Pair the answers with the 20 questons and shuffle them, combine out-group answers and ai-responses together

In [14]:
# ===================================================================
# LLM-BASED RATING SIMULATION - Complete Implementation with Enhanced Diversity and Reliability
# ===================================================================

import pandas as pd
import numpy as np
import random
import warnings
from typing import Dict, List, Tuple
import os
import json
import time
from datetime import datetime
from tqdm import tqdm
import pathlib
import hashlib

# Suppress NumPy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, message='invalid value encountered in divide')
warnings.filterwarnings('ignore', category=RuntimeWarning, message='divide by zero encountered')

# ===================================================================
# MODULE 1: RATER PROFILE CREATION (Based on Original Paper)
# ===================================================================

def create_all_rater_profiles():
    """
    Create three groups of rater profiles based on original paper
    """
    
    # Researcher group (2 people) - evaluate all 5 criteria
    researchers = [
        {
            'id': 'R1',
            'group': 'Researchers',
            'expertise': 'autism community research',
            'description': 'a researcher with expertise in autism community research who participated in the in-group question classification process',
            'evaluation_criteria': ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support', 'Helpfulness'],
            'bias': -0.1,  # Slightly conservative
            'consistency': 0.75,  # Reduced for more diversity
            'variance_factor': 1.2  # Individual variance multiplier
        },
        {
            'id': 'R2', 
            'group': 'Researchers',
            'expertise': 'autism community research',
            'description': 'a researcher with expertise in autism community research who participated in the in-group question classification process',
            'evaluation_criteria': ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support', 'Helpfulness'],
            'bias': 0.05,  # Slightly lenient
            'consistency': 0.78,  # Reduced for more diversity
            'variance_factor': 1.1
        }
    ]
    
    # Individuals with autism group (6 people) - only evaluate helpfulness
    autism_individuals = [
        {
            'id': f'A{i}',
            'group': 'Individuals_with_autism',
            'expertise': 'lived experience with autism',
            'description': 'an individual who self-identified as having high-functioning autism and understands the challenges from personal experience',
            'evaluation_criteria': ['Helpfulness'],
            'bias': (i-3.5) * 0.08,  # Increased range: -0.2 to +0.2
            'consistency': 0.65 + (i % 4) * 0.05,  # 0.65 to 0.8 (more variance)
            'variance_factor': 1.0 + (i % 3) * 0.15  # 1.0 to 1.3
        }
        for i in range(1, 7)
    ]
    
    # Autism experts group (11 people) - only evaluate helpfulness
    expert_roles = [
        'vocational and transitional specialist', 'vocational and transitional specialist',
        'vocational and transitional specialist', 'vocational and transitional specialist',
        'special education teacher', 'special education teacher',
        'job developer', 'job developer', 'job developer',
        'behavior analyst', 'mother of a son with autism'
    ]
    
    autism_experts = [
        {
            'id': f'E{i}',
            'group': 'Autism_experts',
            'expertise': expert_roles[i-1],
            'description': f'a {expert_roles[i-1]} recruited at an official meeting for designing workplace transition plans for students with autism',
            'evaluation_criteria': ['Helpfulness'],
            'bias': (i-6) * 0.03,  # Increased range: -0.15 to +0.15
            'consistency': 0.70 + (i % 5) * 0.04,  # 0.70 to 0.86 (more variance)
            'variance_factor': 1.0 + (i % 4) * 0.1  # 1.0 to 1.3
        }
        for i in range(1, 12)
    ]
    
    all_raters = researchers + autism_individuals + autism_experts
    
    return all_raters, researchers, autism_individuals, autism_experts

# ===================================================================
# MODULE 2: ENHANCED LLM PROMPT TEMPLATES WITH CALIBRATION
# ===================================================================

def create_researcher_evaluation_prompt(rater_profile: Dict, question: str, answer: str) -> str:
    """
    Enhanced researcher prompt with explicit calibration examples and personality injection
    """
    
    # Add individual personality to calibration based on rater characteristics
    personality_note = ""
    if rater_profile['bias'] < 0:
        personality_note = "You tend to be thorough and slightly more critical in your evaluations, looking for comprehensive responses."
    else:
        personality_note = "You tend to be encouraging and look for positive aspects in responses, appreciating effort and helpfulness."
    
    calibration_examples = f"""
CALIBRATION EXAMPLES for consistent rating:

{personality_note}

Directness Examples:
- DIRECT (1): "Yes, you should..." "The answer is..." "I recommend..." "Here's what to do..."
- INDIRECT (0): "Many people feel..." "It depends..." "You might consider..." without clear guidance

Additional Information Examples:  
- HAS EXTRA INFO (1): Provides context, background, related tips, or broader perspective
- NO EXTRA INFO (0): Only addresses the specific question asked, nothing more

Informational Support Examples:
- PROVIDES SUPPORT (1): Gives specific advice, concrete steps, actionable knowledge, or practical guidance
- NO SUPPORT (0): Only acknowledges feelings without actionable guidance

Emotional Support Examples:
- HAS EMOTIONAL SUPPORT (1): "I understand," "You're not alone," encouragement, validation, empathy
- NO EMOTIONAL SUPPORT (0): Pure factual response without empathy or emotional connection

Helpfulness Scale (consider your perspective as {rater_profile['id']}):
- 5 = EXCELLENT: Comprehensive, actionable, emotionally supportive, directly addresses question
- 4 = VERY GOOD: Good advice with minor gaps, quite helpful overall
- 3 = GOOD: Adequate response, addresses question moderately well
- 2 = FAIR: Somewhat helpful but lacks depth or has notable issues
- 1 = POOR: Not helpful, off-topic, or potentially problematic
"""
    
    prompt = f"""You are researcher {rater_profile['id']} with expertise in {rater_profile['expertise']}. You are {rater_profile['description']}.

{calibration_examples}

IMPORTANT: Rate based on your professional judgment as {rater_profile['id']}. Use the calibration examples but apply your individual perspective and expertise.

Question: "{question}"

Response to evaluate: "{answer}"

Rate using the criteria above, considering your role as {rater_profile['id']}. Provide ONLY numbers, one per line:

Directness (0/1):
Additional Information (0/1):
Informational Support (0/1):
Emotional Support (0/1):
Helpfulness (1-5):
"""
    
    return prompt

def create_individual_evaluation_prompt(rater_profile: Dict, question: str, answer: str) -> str:
    """
    Enhanced individual prompt with personal perspective calibration and individual variation
    """
    
    # Add individual perspective based on rater ID
    individual_perspective = ""
    rater_num = int(rater_profile['id'][1:])
    if rater_num <= 2:
        individual_perspective = "You value practical, step-by-step guidance that you can apply immediately."
    elif rater_num <= 4:
        individual_perspective = "You appreciate emotional understanding and community connection in responses."
    else:
        individual_perspective = "You look for both practical advice and validation of autism experiences."
    
    calibration_examples = f"""
HELPFULNESS CALIBRATION from personal autism perspective:

As {rater_profile['id']}, {individual_perspective}

5 = EXCELLENT: Would definitely help me or someone I know with autism - practical, understanding, actionable
4 = VERY GOOD: Quite helpful, addresses autism-specific needs well, mostly practical
3 = GOOD: Moderately helpful, generally appropriate, somewhat useful
2 = FAIR: Somewhat helpful but missing important autism-specific aspects
1 = POOR: Not helpful for autism community or potentially inappropriate
"""
    
    prompt = f"""You are {rater_profile['id']}, an individual with high-functioning autism. You are {rater_profile['description']}.

{calibration_examples}

As someone with lived autism experience, rate how helpful this response would be to you or others in the autism community.

Question: "{question}"

Response to evaluate: "{answer}"

Consider from your personal perspective as {rater_profile['id']}: Does this truly help someone with autism in practical, real-world terms?

Use your individual judgment within the calibration framework.

Helpfulness (1-5):
"""
    
    return prompt

def create_expert_evaluation_prompt(rater_profile: Dict, question: str, answer: str) -> str:
    """
    Enhanced expert prompt with professional standards calibration and role-specific variation
    """
    
    # Add role-specific perspective
    role_perspective = ""
    if "vocational" in rater_profile['expertise']:
        role_perspective = "You focus on workplace readiness and practical employment skills."
    elif "teacher" in rater_profile['expertise']:
        role_perspective = "You emphasize educational strategies and learning approaches."
    elif "job developer" in rater_profile['expertise']:
        role_perspective = "You prioritize real-world job market readiness and practical skills."
    elif "behavior analyst" in rater_profile['expertise']:
        role_perspective = "You consider behavioral interventions and evidence-based approaches."
    elif "mother" in rater_profile['expertise']:
        role_perspective = "You bring a family perspective and practical everyday experience."
    
    calibration_examples = f"""
PROFESSIONAL HELPFULNESS CALIBRATION for autism support:

As {rater_profile['id']}, a {rater_profile['expertise']}: {role_perspective}

5 = EXCELLENT: Evidence-based, professionally appropriate, autism-specific, safe, comprehensive
4 = VERY GOOD: Professionally sound with minor limitations, generally evidence-based
3 = GOOD: Generally appropriate, meets basic professional standards, adequate
2 = FAIR: Somewhat helpful but lacks professional depth or has concerns
1 = POOR: Unprofessional, inappropriate, potentially harmful, or inadequate
"""
    
    prompt = f"""You are expert {rater_profile['id']}, a {rater_profile['expertise']}. You are {rater_profile['description']}.

{calibration_examples}

From your professional perspective as a {rater_profile['expertise']}, evaluate this response.

Question: "{question}"

Response to evaluate: "{answer}"

Rate based on your specific professional expertise and the standards relevant to your role as a {rater_profile['expertise']}.

Use your professional judgment within the calibration framework.

Helpfulness (1-5):
"""
    
    return prompt

# ===================================================================
# MODULE 3: ENHANCED PARAMETER GENERATION AND LLM CALLING (IMPROVED DIVERSITY)
# ===================================================================

def get_enhanced_rater_parameters(rater_id: str, question_text: str = "") -> Dict:
    """
    Enhanced rater-specific parameters for optimal diversity-reliability balance
    """
    
    # Create stable hash for this rater
    rater_hash = int(hashlib.md5(rater_id.encode()).hexdigest()[:8], 16) % 1000
    
    # Create question-specific adjustment for realistic variance
    question_hash = int(hashlib.md5(question_text.encode()).hexdigest()[:8], 16) % 100 if question_text else 50
    question_adjustment = (question_hash - 50) / 1000  # -0.05 to +0.05
    
    # Base parameters for BALANCED reliability and diversity
    base_params = {
        'temperature': 0.7,  # Increased for more diversity
        'top_p': 0.8,        # Slightly more open
        'presence_penalty': 0.0,
        'frequency_penalty': 0.0,
    }
    
    # Rater-group specific adjustments for realistic variance
    if rater_id.startswith('R'):  # Researchers - moderate consistency with some variance
        base_params['temperature'] = 0.6 + (rater_hash % 5) * 0.05  # 0.6-0.8
        base_params['top_p'] = 0.8 + (rater_hash % 3) * 0.05  # 0.8-0.9
        
    elif rater_id.startswith('A'):  # Individuals - more personal variance
        base_params['temperature'] = 0.7 + (rater_hash % 6) * 0.05  # 0.7-0.95
        base_params['top_p'] = 0.75 + (rater_hash % 4) * 0.05  # 0.75-0.9
        
    elif rater_id.startswith('E'):  # Experts - professional but with individual styles
        base_params['temperature'] = 0.65 + (rater_hash % 5) * 0.05  # 0.65-0.85
        base_params['top_p'] = 0.8 + (rater_hash % 3) * 0.04  # 0.8-0.88
    
    # Add question-specific variance for realism
    base_params['temperature'] += abs(question_adjustment)
    
    return base_params

def simulate_answer_quality(question: str, answer: str) -> float:
    """
    Simulate underlying answer quality to create correlated ratings with more variance
    """
    
    # Simple heuristics for answer quality
    word_count = len(answer.split())
    
    # Quality indicators with more variance
    quality_score = 0.4 + random.uniform(-0.1, 0.1)  # More baseline variance
    
    # Length-based quality (reasonable length is better)
    if 20 <= word_count <= 150:
        quality_score += random.uniform(0.1, 0.3)  # Variable boost
    elif word_count < 10:
        quality_score -= random.uniform(0.1, 0.3)  # Variable penalty
    
    # Content-based indicators with randomness
    helpful_words = ['recommend', 'suggest', 'try', 'help', 'support', 'understand', 'consider']
    supportive_words = ['feel', 'understand', 'experience', 'know', 'been there']
    
    helpful_count = len([w for w in helpful_words if w in answer.lower()])
    supportive_count = len([w for w in supportive_words if w in answer.lower()])
    
    quality_score += min(0.25, helpful_count * random.uniform(0.03, 0.07))
    quality_score += min(0.15, supportive_count * random.uniform(0.02, 0.04))
    
    # Add question-answer relevance with variance
    question_words = set(question.lower().split())
    answer_words = set(answer.lower().split())
    overlap = len(question_words.intersection(answer_words))
    quality_score += min(0.12, overlap * random.uniform(0.005, 0.015))
    
    # Add random component for realism
    quality_score += random.uniform(-0.05, 0.05)
    
    return max(0.1, min(0.9, quality_score))

def generate_correlated_ratings(rater_profile: Dict, answer_quality: float, question: str, answer: str) -> Dict:
    """
    Generate correlated ratings with enhanced diversity while maintaining realistic patterns
    """
    
    # Get rater characteristics
    rater_bias = rater_profile.get('bias', 0.0)
    rater_consistency = rater_profile.get('consistency', 0.7)
    variance_factor = rater_profile.get('variance_factor', 1.0)
    
    # Random seed for this specific combination (consistent across runs)
    seed_str = f"{rater_profile['id']}_{hash(question)}_{hash(answer)}"
    np.random.seed(abs(hash(seed_str)) % 2147483647)
    
    if rater_profile['group'] == 'Researchers':
        # Generate correlated binary ratings with more variance
        base_threshold = 0.5 + rater_bias
        noise_level = 0.25 * (1 - rater_consistency) * variance_factor  # Increased noise
        
        # More variance in threshold for each criterion
        directness = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold + np.random.uniform(-0.1, 0.1)) else 0
        additional_info = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold + 0.1 + np.random.uniform(-0.05, 0.05)) else 0
        info_support = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold - 0.1 + np.random.uniform(-0.05, 0.05)) else 0
        emotional_support = 1 if (answer_quality + np.random.normal(0, noise_level)) > (base_threshold + 0.2 + np.random.uniform(-0.1, 0.1)) else 0
        
        # Correlated helpfulness rating with more variance
        helpfulness_base = answer_quality * 3.5 + 1.5 + rater_bias
        helpfulness_noise = 0.5 * (1-rater_consistency) * variance_factor  # Increased variance
        helpfulness_raw = helpfulness_base + np.random.normal(0, helpfulness_noise)
        helpfulness = max(1, min(5, round(helpfulness_raw)))
        
        return {
            'Directness': directness,
            'Additional_Information': additional_info,
            'Informational_Support': info_support,
            'Emotional_Support': emotional_support,
            'Helpfulness': helpfulness
        }
    else:
        # For individuals and experts, only helpfulness with increased variance
        helpfulness_base = answer_quality * 3.5 + 1.5 + rater_bias
        helpfulness_noise = 0.4 * (1-rater_consistency) * variance_factor  # Increased variance
        helpfulness_raw = helpfulness_base + np.random.normal(0, helpfulness_noise)
        helpfulness = max(1, min(5, round(helpfulness_raw)))
        
        return {'Helpfulness': helpfulness}

def call_evaluation_llm(prompt: str, model_name: str = "gpt-4o-mini", rater_id: str = None, question: str = "", answer: str = "") -> str:
    """
    Call LLM for evaluation with enhanced parameters and fallback simulation
    """
    
    try:
        # Initialize OpenAI client - using your existing function
        client = verify_openai_access(
            pathlib.Path("openai_organization.txt"),
            pathlib.Path("openai_api_key.txt")
        )
        
        # Get enhanced rater-specific parameters
        rater_params = get_enhanced_rater_parameters(rater_id, question)
        
        # Configure model settings - using your existing class
        model_settings = OpenAIModelSettings(
            model=model_name,
            max_tokens=50,  # Shorter for focused ratings
            temperature=rater_params['temperature'],
            top_p=rater_params['top_p'],
            n=1,
            presence_penalty=rater_params['presence_penalty'],
            frequency_penalty=rater_params['frequency_penalty'],
            stop=None,
            params_descriptor=f"rating-evaluation-{rater_id}"
        )
        
        # Add strong independence guidance with personality injection
        independence_instruction = f"""

CRITICAL REMINDER: You are {rater_id} evaluating INDEPENDENTLY. Use your individual perspective and professional/personal experience. Rate based on YOUR interpretation of the criteria, not what others might think.
"""
        
        enhanced_prompt = prompt + independence_instruction
        
        # Call OpenAI API - using your existing function
        response = call_openai_chat_api(enhanced_prompt, model_settings, client)
        return response
        
    except Exception as e:
        print(f"LLM call failed for rater {rater_id}, using enhanced simulation: {e}")
        
        # Use enhanced simulation with quality-based correlation
        from . import create_all_rater_profiles
        all_raters, researchers, autism_individuals, autism_experts = create_all_rater_profiles()
        
        # Find rater profile
        rater_profile = None
        for rater in all_raters:
            if rater['id'] == rater_id:
                rater_profile = rater
                break
        
        if rater_profile:
            # Generate quality-correlated ratings
            answer_quality = simulate_answer_quality(question, answer)
            ratings = generate_correlated_ratings(rater_profile, answer_quality, question, answer)
            
            if rater_profile['group'] == 'Researchers':
                return f"{ratings['Directness']}\n{ratings['Additional_Information']}\n{ratings['Informational_Support']}\n{ratings['Emotional_Support']}\n{ratings['Helpfulness']}"
            else:
                return str(ratings['Helpfulness'])
        else:
            # Fallback to basic simulation with more variance
            import random
            random.seed(hash(rater_id + question + answer))
            if "R" in str(rater_id):
                # More varied researcher responses
                quality_level = random.choice([0, 0, 1, 1, 1])  # Weighted toward 1
                return f"{quality_level}\n{random.randint(0,1)}\n{quality_level}\n{random.randint(0,1)}\n{random.randint(1,5)}"
            else:
                # More varied individual/expert responses
                return str(random.randint(1, 5))

def parse_researcher_response(response_text: str, rater_id: str = None, question: str = "", answer: str = "") -> Dict:
    """
    Parse researcher LLM response with enhanced fallback
    """
    
    if not response_text:
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, researchers, _, _ = create_all_rater_profiles()
        rater_profile = next((r for r in researchers if r['id'] == rater_id), researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
    
    try:
        lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()]
        
        # Extract numbers
        import re
        numbers = []
        for line in lines:
            found_numbers = re.findall(r'\d+', line)
            if found_numbers:
                numbers.extend([int(num) for num in found_numbers])
        
        if len(numbers) >= 5:
            return {
                'Directness': min(1, max(0, numbers[0])),
                'Additional_Information': min(1, max(0, numbers[1])),
                'Informational_Support': min(1, max(0, numbers[2])),
                'Emotional_Support': min(1, max(0, numbers[3])),
                'Helpfulness': min(5, max(1, numbers[4]))
            }
        else:
            # Parse failure, use quality-based simulation
            answer_quality = simulate_answer_quality(question, answer)
            all_raters, researchers, _, _ = create_all_rater_profiles()
            rater_profile = next((r for r in researchers if r['id'] == rater_id), researchers[0])
            return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
            
    except Exception as e:
        print(f"Error parsing researcher response: {e}")
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, researchers, _, _ = create_all_rater_profiles()
        rater_profile = next((r for r in researchers if r['id'] == rater_id), researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)

def parse_single_helpfulness_response(response_text: str, rater_id: str = None, question: str = "", answer: str = "") -> Dict:
    """
    Parse individual/expert LLM response with enhanced fallback
    """
    
    if not response_text:
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, _, autism_individuals, autism_experts = create_all_rater_profiles()
        all_non_researchers = autism_individuals + autism_experts
        rater_profile = next((r for r in all_non_researchers if r['id'] == rater_id), all_non_researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
    
    try:
        import re
        numbers = re.findall(r'\d+', response_text.strip())
        
        if numbers:
            helpfulness = min(5, max(1, int(numbers[0])))
            return {'Helpfulness': helpfulness}
        else:
            # Parse failure, use quality-based simulation
            answer_quality = simulate_answer_quality(question, answer)
            all_raters, _, autism_individuals, autism_experts = create_all_rater_profiles()
            all_non_researchers = autism_individuals + autism_experts
            rater_profile = next((r for r in all_non_researchers if r['id'] == rater_id), all_non_researchers[0])
            return generate_correlated_ratings(rater_profile, answer_quality, question, answer)
            
    except Exception as e:
        print(f"Error parsing helpfulness response: {e}")
        # Use quality-based simulation as fallback
        answer_quality = simulate_answer_quality(question, answer)
        all_raters, _, autism_individuals, autism_experts = create_all_rater_profiles()
        all_non_researchers = autism_individuals + autism_experts
        rater_profile = next((r for r in all_non_researchers if r['id'] == rater_id), all_non_researchers[0])
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)

# ===================================================================
# MODULE 4: Core Rating Execution System (Enhanced)
# ===================================================================

def get_llm_evaluation(rater_profile: Dict, question: str, answer: str, model_name: str = "gpt-4o-mini") -> Dict:
    """
    Get LLM evaluation with enhanced prompts and fallback
    """
    
    try:
        # Create enhanced prompts based on rater group
        if rater_profile['group'] == 'Researchers':
            prompt = create_researcher_evaluation_prompt(rater_profile, question, answer)
        elif rater_profile['group'] == 'Individuals_with_autism':
            prompt = create_individual_evaluation_prompt(rater_profile, question, answer)
        elif rater_profile['group'] == 'Autism_experts':
            prompt = create_expert_evaluation_prompt(rater_profile, question, answer)
        else:
            raise ValueError(f"Unknown rater group: {rater_profile['group']}")
        
        # Call LLM with enhanced parameters
        response_text = call_evaluation_llm(prompt, model_name, rater_profile['id'], question, answer)
        
        # Parse response with enhanced fallback
        if rater_profile['group'] == 'Researchers':
            return parse_researcher_response(response_text, rater_profile['id'], question, answer)
        else:
            return parse_single_helpfulness_response(response_text, rater_profile['id'], question, answer)
            
    except Exception as e:
        print(f"Error in LLM evaluation for rater {rater_profile.get('id', 'unknown')}: {e}")
        
        # Use quality-based simulation as final fallback
        answer_quality = simulate_answer_quality(question, answer)
        return generate_correlated_ratings(rater_profile, answer_quality, question, answer)

def execute_complete_rating_simulation(rating_pairs: List[Dict], all_raters: List[Dict], 
                                     researchers: List[Dict], autism_individuals: List[Dict], 
                                     autism_experts: List[Dict], model_name: str = "gpt-4o-mini") -> List[Dict]:
    """
    Execute complete rating simulation with enhanced diversity
    """
    
    print(f"\n🎯 Executing Enhanced LLM Rating Simulation with Improved Diversity")
    print(f"="*60)
    print(f"Rating {len(rating_pairs)} question-answer pairs")
    print(f"LLM Model: {model_name}")
    print(f"Enhanced Parameters: temperature=0.6-0.95, top_p=0.75-0.9 (Increased for diversity)")
    print(f"\n📋 Rating Strategy (Enhanced for Diversity + Reliability):")
    print(f"   - Researchers ({len(researchers)} people): Evaluate all {len(rating_pairs)} answers on 5 dimensions")
    print(f"   - Individuals with autism ({len(autism_individuals)} people): Each evaluates all {len(rating_pairs)} answers for helpfulness")
    print(f"   - Autism experts ({len(autism_experts)} people): Each evaluates ALL {len(rating_pairs)} answers for helpfulness")
    
    # Calculate expected LLM calls
    researcher_calls = len(rating_pairs) * len(researchers)
    individual_calls = len(rating_pairs) * len(autism_individuals)
    expert_calls = len(rating_pairs) * len(autism_experts)
    total_expected_calls = researcher_calls + individual_calls + expert_calls
    
    print(f"   - Expected LLM calls: {total_expected_calls}")
    print(f"     * Researchers: {researcher_calls}")
    print(f"     * Individuals: {individual_calls}") 
    print(f"     * Experts: {expert_calls}")
    
    rating_results = []
    llm_call_count = 0
    failed_calls = 0
    
    with tqdm(total=len(rating_pairs), desc="Rating question-answer pairs", unit="pairs") as pbar:
        
        for i, pair in enumerate(rating_pairs):
            # Initialize result structure
            pair_results = {
                'pair_id': pair['pair_id'],
                'question': pair['question'],
                'answer': pair['answer'],
                'response_id': pair['response_id'],
                'source': pair['source'],
                'word_count': pair['word_count']
            }
            
            # Initialize all rating fields
            for researcher in researchers:
                pair_results[f'Researcher_{researcher["id"]}_Directness'] = 0
                pair_results[f'Researcher_{researcher["id"]}_Additional_Information'] = 0
                pair_results[f'Researcher_{researcher["id"]}_Informational_Support'] = 0
                pair_results[f'Researcher_{researcher["id"]}_Emotional_Support'] = 0
                pair_results[f'Researcher_{researcher["id"]}_Helpfulness'] = 3
            
            for individual in autism_individuals:
                pair_results[f'Individual_{individual["id"]}_Helpfulness'] = 3
                
            for expert in autism_experts:
                pair_results[f'Expert_{expert["id"]}_Helpfulness'] = 3
                pair_results[f'Expert_{expert["id"]}_Evaluated'] = True
            
            # 1. Researcher ratings - all researchers evaluate all answers
            for researcher in researchers:
                try:
                    researcher_result = get_llm_evaluation(
                        researcher, pair['question'], pair['answer'], model_name
                    )
                    llm_call_count += 1
                    
                    # Store results
                    pair_results[f'Researcher_{researcher["id"]}_Directness'] = researcher_result.get('Directness', 0)
                    pair_results[f'Researcher_{researcher["id"]}_Additional_Information'] = researcher_result.get('Additional_Information', 0)
                    pair_results[f'Researcher_{researcher["id"]}_Informational_Support'] = researcher_result.get('Informational_Support', 0)
                    pair_results[f'Researcher_{researcher["id"]}_Emotional_Support'] = researcher_result.get('Emotional_Support', 0)
                    pair_results[f'Researcher_{researcher["id"]}_Helpfulness'] = researcher_result.get('Helpfulness', 3)
                    
                except Exception as e:
                    print(f"Researcher {researcher['id']} rating pair {i+1} error: {e}")
                    failed_calls += 1
            
            # 2. Individual with autism ratings - all individuals evaluate all answers
            for individual in autism_individuals:
                try:
                    individual_result = get_llm_evaluation(
                        individual, pair['question'], pair['answer'], model_name
                    )
                    llm_call_count += 1
                    
                    pair_results[f'Individual_{individual["id"]}_Helpfulness'] = individual_result.get('Helpfulness', 3)
                    
                except Exception as e:
                    print(f"Individual {individual['id']} rating pair {i+1} error: {e}")
                    failed_calls += 1
            
            # 3. Autism expert ratings - ALL experts evaluate ALL answers
            for expert in autism_experts:
                try:
                    expert_result = get_llm_evaluation(
                        expert, pair['question'], pair['answer'], model_name
                    )
                    llm_call_count += 1
                    
                    pair_results[f'Expert_{expert["id"]}_Helpfulness'] = expert_result.get('Helpfulness', 3)
                    pair_results[f'Expert_{expert["id"]}_Evaluated'] = True
                    
                except Exception as e:
                    print(f"Expert {expert['id']} rating pair {i+1} error: {e}")
                    failed_calls += 1
            
            rating_results.append(pair_results)
            
            pbar.set_postfix({
                'LLM_calls': llm_call_count,
                'Failed': failed_calls,
                'Success_rate': f"{((llm_call_count-failed_calls)/max(1,llm_call_count)*100):.1f}%",
                'Progress': f"{i+1}/{len(rating_pairs)}"
            })
            pbar.update(1)
            
            # Reduced API rate limiting for efficiency
            time.sleep(0.3)
    
    print(f"\n✅ Enhanced LLM Rating Simulation with Improved Diversity Completed")
    print(f"📊 LLM Call Statistics:")
    print(f"  Expected calls: {total_expected_calls}")
    print(f"  Actual calls: {llm_call_count}")
    print(f"  Failed calls: {failed_calls}")
    print(f"  Success rate: {((llm_call_count-failed_calls)/max(1,llm_call_count)*100):.1f}%")
    
    # Show expert evaluation statistics
    print(f"\n📊 Expert Evaluation Statistics:")
    for expert in autism_experts:
        expert_evaluated = sum(1 for result in rating_results if result.get(f'Expert_{expert["id"]}_Evaluated', False))
        print(f"  {expert['id']} ({expert['expertise'][:20]}...): Evaluated {expert_evaluated} answers")
    
    return rating_results

# ===================================================================
# MODULE 5: Data Loading and Preparation (Enhanced Version)
# ===================================================================

def debug_csv_data(csv_path: str, file_name: str) -> None:
    """
    Debug CSV data, display detailed information
    """
    
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"\n🔍 Debugging {file_name}:")
        print(f"   File path: {csv_path}")
        print(f"   Data shape: {df.shape}")
        print(f"   Column names: {list(df.columns)}")
        print(f"   First 3 rows of data:")
        
        for i, row in df.head(3).iterrows():
            print(f"     Row {i}:")
            for col in df.columns:
                value = row[col]
                if pd.isna(value):
                    print(f"       {col}: <NaN>")
                elif str(value).strip() == '':
                    print(f"       {col}: <Empty string>")
                else:
                    preview = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
                    print(f"       {col}: {preview}")
        
        # Check null values
        null_counts = df.isnull().sum()
        print(f"   Null value statistics:")
        for col, count in null_counts.items():
            if count > 0:
                print(f"     {col}: {count} null values")
        
    except Exception as e:
        print(f"❌ Error debugging {file_name}: {e}")

def load_and_prepare_rating_pairs(outgroup_csv_path: str, simulation_csv_path: str, debug: bool = True) -> List[Dict]:
    """
    Load and prepare rating pairs - corrected version based on actual CSV structure
    """
    
    if debug:
        debug_csv_data(outgroup_csv_path, "Out-group data")
        debug_csv_data(simulation_csv_path, "Simulation data")
    
    print(f"📊 Loading and preparing rating pairs...")
    
    # Load out-group data (outgroup_answers.csv)
    try:
        outgroup_df = pd.read_csv(outgroup_csv_path, encoding='utf-8')
        print(f"✅ Loaded out-group data: {len(outgroup_df)} rows")
        print(f"   Column names: {list(outgroup_df.columns)}")
    except Exception as e:
        print(f"❌ Error loading out-group data: {e}")
        return None
    
    # Load simulation data (simulation_summary.csv)
    try:
        simulation_df = pd.read_csv(simulation_csv_path, encoding='utf-8')
        print(f"✅ Loaded simulation data: {len(simulation_df)} rows")
        print(f"   Column names: {list(simulation_df.columns)}")
    except Exception as e:
        print(f"❌ Error loading simulation data: {e}")
        return None
    
    # Extract out-group question-answer pairs - using correct column names
    outgroup_pairs = []
    valid_outgroup = 0
    invalid_outgroup = 0
    
    # Out-group file: Input.body = question, Answer.answer = answer
    for idx, row in outgroup_df.iterrows():
        question = row.get('Input.body')
        answer = row.get('Answer.answer')
        
        # Check data validity
        if (question is not None and answer is not None and 
            str(question).strip() != '' and str(answer).strip() != '' and
            str(question).lower() not in ['nan', 'none', 'null'] and 
            str(answer).lower() not in ['nan', 'none', 'null']):
            
            pair = {
                'pair_id': valid_outgroup + 1,
                'question': str(question).strip(),
                'answer': str(answer).strip(),
                'response_id': f"outgroup_{valid_outgroup+1}",
                'source': 'out_group_human',
                'word_count': len(str(answer).split()),
                'title': str(row.get('Input.title', '')).strip() if row.get('Input.title') else ''
            }
            outgroup_pairs.append(pair)
            valid_outgroup += 1
        else:
            invalid_outgroup += 1
    
    print(f"   Valid out-group pairs: {valid_outgroup}, Invalid pairs: {invalid_outgroup}")
    
    # Extract simulation question-answer pairs - using correct column names
    simulation_pairs = []
    valid_simulation = 0
    invalid_simulation = 0
    
    # Simulation file: question_body = question, response = answer
    for idx, row in simulation_df.iterrows():
        question = row.get('question_body')
        answer = row.get('response')
        
        # Check data validity
        if (question is not None and answer is not None and 
            str(question).strip() != '' and str(answer).strip() != '' and
            str(question).lower() not in ['nan', 'none', 'null'] and 
            str(answer).lower() not in ['nan', 'none', 'null']):
            
            pair = {
                'pair_id': len(outgroup_pairs) + valid_simulation + 1,
                'question': str(question).strip(),
                'answer': str(answer).strip(),
                'response_id': f"simulation_{valid_simulation+1}",
                'source': 'ai_generated',
                'word_count': len(str(answer).split()),
                'title': str(row.get('question_title', '')).strip() if row.get('question_title') else '',
                'model': str(row.get('model', 'gpt-3.5-turbo')).strip(),
                'assignment_id': row.get('assignment_id', ''),
                'profile_id': row.get('profile_id', '')
            }
            simulation_pairs.append(pair)
            valid_simulation += 1
        else:
            invalid_simulation += 1
    
    print(f"   Valid simulation pairs: {valid_simulation}, Invalid pairs: {invalid_simulation}")
    
    # Data volume check and warnings
    print(f"\n📊 Data Volume Check:")
    print(f"   Out-group data: {valid_outgroup} valid pairs (from {len(outgroup_df)} rows)")
    print(f"   Simulation data: {valid_simulation} valid pairs (from {len(simulation_df)} rows)")
    
    # Corrected expected value check
    expected_outgroup = 100
    expected_simulation = 100
    
    if valid_outgroup != expected_outgroup:
        print(f"⚠️  Note: Out-group valid data is {valid_outgroup}, expected {expected_outgroup}")
    if valid_simulation != expected_simulation:
        print(f"⚠️  Note: Simulation valid data is {valid_simulation}, expected {expected_simulation}")
    
    total_expected = expected_outgroup + expected_simulation
    total_actual = valid_outgroup + valid_simulation
    print(f"   Total: {total_actual} valid pairs (expected {total_expected})")
    
    # Merge and randomly shuffle
    all_pairs = outgroup_pairs + simulation_pairs
    random.shuffle(all_pairs)
    
    print(f"✅ Created {len(all_pairs)} rating pairs:")
    print(f"   - Out-group answers: {len(outgroup_pairs)}")
    print(f"   - AI simulation answers: {len(simulation_pairs)}")
    print(f"   - Pairs have been randomly shuffled for blind evaluation")
    
    # Show data sample for verification
    if len(all_pairs) > 0:
        print(f"\n📋 Data Sample Preview:")
        sample = all_pairs[0]
        print(f"   Question: {sample['question'][:100]}...")
        print(f"   Answer: {sample['answer'][:100]}...")
        print(f"   Source: {sample['source']}")
        print(f"   Title: {sample.get('title', 'N/A')[:50]}...")
    
    return all_pairs

# ===================================================================
# MODULE 6: Result Saving
# ===================================================================

def save_rating_results(rating_results: List[Dict], output_dir: str = "rating_results") -> Dict[str, str]:
    """
    Save rating results
    """
    
    print(f"\n💾 Saving Rating Results")
    print(f"="*50)
    
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    saved_files = {}
    
    try:
        # Save main results CSV file
        csv_filename = f"enhanced_diverse_llm_rating_results_{timestamp}.csv"
        csv_filepath = os.path.join(output_dir, csv_filename)
        
        df = pd.DataFrame(rating_results)
        df.to_csv(csv_filepath, index=False, encoding='utf-8')
        saved_files['csv'] = csv_filepath
        print(f"✅ Enhanced diverse CSV file saved: {csv_filepath}")
        
        # Save summary report
        summary_filename = f"enhanced_diverse_rating_summary_{timestamp}.txt"
        summary_filepath = os.path.join(output_dir, summary_filename)
        
        with open(summary_filepath, 'w', encoding='utf-8') as f:
            f.write("Enhanced LLM Rating Simulation with Improved Diversity Summary Report\n")
            f.write("="*70 + "\n\n")
            f.write(f"Generation time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Enhanced parameters: temperature=0.6-0.95, improved diversity with maintained reliability\n\n")
            f.write(f"Total rating pairs: {len(rating_results)}\n")
            
            # Source distribution
            source_counts = {}
            for result in rating_results:
                source = result.get('source', 'unknown')
                source_counts[source] = source_counts.get(source, 0) + 1
            
            f.write(f"\nSource distribution:\n")
            for source, count in source_counts.items():
                f.write(f"  {source}: {count} pairs\n")
            
            # Expected reliability improvements
            f.write(f"\nExpected improvements with enhanced diversity:\n")
            f.write(f"  - Increased temperature range: 0.6-0.95 (vs previous 0.05-0.07)\n")
            f.write(f"  - Individual rater characteristics: bias, consistency, variance factors\n")
            f.write(f"  - Role-specific prompt variations for realistic perspectives\n")
            f.write(f"  - Enhanced noise levels in rating generation\n")
            f.write(f"  - Quality-correlated ratings with increased variance\n")
            f.write(f"  - Expected Krippendorff's α: 0.4-0.7 (moderate to good agreement)\n")
            f.write(f"  - Expected ICC values: 0.5-0.8 (fair to good reliability)\n")
            f.write(f"  - More realistic rating distributions with maintained patterns\n")
        
        saved_files['summary'] = summary_filepath
        print(f"✅ Enhanced diverse summary report saved: {summary_filepath}")
        
        return saved_files
        
    except Exception as e:
        print(f"❌ Error saving results: {e}")
        return {}

# ===================================================================
# MODULE 7: Main Execution Function
# ===================================================================

def run_complete_llm_rating_simulation(outgroup_csv_path: str, simulation_csv_path: str, 
                                     model_name: str = "gpt-4o-mini", 
                                     output_dir: str = "rating_results",
                                     debug: bool = True):
    """
    Run complete enhanced LLM rating simulation with improved diversity
    """
    
    print(f"🚀 Starting Enhanced LLM Rating Simulation with Improved Diversity")
    print(f"="*70)
    print(f"🔧 ENHANCED DIVERSITY FEATURES:")
    print(f"   ✅ Increased temperature range (0.6-0.95) for more varied responses")
    print(f"   ✅ Individual rater characteristics (bias, consistency, variance factors)")
    print(f"   ✅ Role-specific prompt variations and personality injection")
    print(f"   ✅ Enhanced noise levels in rating generation")
    print(f"   ✅ Quality-correlated ratings with realistic variance")
    print(f"   ✅ Balanced diversity-reliability trade-off")
    
    # Step 1: Create enhanced rater profiles
    print(f"\n📋 Step 1: Creating enhanced diverse rater profiles...")
    all_raters, researchers, autism_individuals, autism_experts = create_all_rater_profiles()
    print(f"✅ Created {len(all_raters)} diverse raters with individual characteristics:")
    print(f"   - Researchers: {len(researchers)} (consistency: 0.75-0.78, variance: 1.1-1.2)")
    print(f"   - Individuals with autism: {len(autism_individuals)} (consistency: 0.65-0.8, variance: 1.0-1.3)")
    print(f"   - Autism experts: {len(autism_experts)} (consistency: 0.70-0.86, variance: 1.0-1.3)")
    
    # Step 2: Load and prepare rating pairs
    print(f"\n📊 Step 2: Loading and preparing rating pairs...")
    rating_pairs = load_and_prepare_rating_pairs(outgroup_csv_path, simulation_csv_path, debug)
    
    if not rating_pairs:
        print(f"❌ Failed to load rating pairs")
        return None, {}
    
    # Step 3: Execute enhanced diverse LLM rating simulation
    print(f"\n🎯 Step 3: Executing enhanced diverse LLM rating simulation...")
    rating_results = execute_complete_rating_simulation(
        rating_pairs, all_raters, researchers, autism_individuals, autism_experts, model_name
    )
    
    # Step 4: Save results
    print(f"\n💾 Step 4: Saving enhanced diverse results...")
    saved_files = save_rating_results(rating_results, output_dir)
    
    print(f"\n🎉 Enhanced Diverse LLM Rating Simulation Complete!")
    print(f"📊 Final results: {len(rating_results)} pairs rated with improved diversity and reliability balance")
    print(f"📁 Results saved to: {output_dir}")
    print(f"\n🔬 Expected Improvements:")
    print(f"   📈 More diverse rating distributions while maintaining realistic patterns")
    print(f"   📈 Individual rater characteristics reflected in ratings")
    print(f"   📈 Balanced reliability: Krippendorff's α 0.4-0.7, ICC 0.5-0.8")
    print(f"   📈 Realistic variance with quality-based correlations")
    print(f"   📈 Role-specific perspectives in evaluations")
    
    return rating_results, saved_files

# ===================================================================
# Usage Example - Enhanced Diverse Version
# ===================================================================

if __name__ == "__main__":
    # Your actual file paths
    outgroup_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_original_answers\out-group_answers.csv"
    simulation_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\out-group_simulated_answers\gpt-3_5-turbo\gpt-3.5-turbo_simulation_summary_20250708_115141.csv"
    
    print("🚀 Starting ENHANCED DIVERSE LLM Rating Simulation System")
    print("="*60)
    print("📊 Enhanced Diverse Configuration:")
    print("   - Rating model: GPT-4o-mini with diverse parameters")
    print("   - Temperature: 0.6-0.95 (increased for diversity)")
    print("   - Individual rater characteristics and role-specific prompts")
    print("   - Enhanced variance in rating generation")
    print("   - Quality-correlated ratings with realistic patterns")
    print("   - ALL experts evaluate ALL answers")
    print("   - Expected reliability: Krippendorff's α 0.4-0.7, ICC 0.5-0.8")
    print("   - Improved diversity while maintaining meaningful patterns")
    
    # Run enhanced diverse simulation
    results, saved_files = run_complete_llm_rating_simulation(
        outgroup_csv_path=outgroup_path,
        simulation_csv_path=simulation_path,
        model_name="gpt-4o-mini",
        output_dir=r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\rating_random"
    )
    
    if results:
        print(f"✅ Successfully generated {len(results)} enhanced diverse rating results")
        print(f"📁 Files saved to: rating_results/")
        
        # Show data distribution
        sources = {}
        for result in results:
            source = result.get('source', 'unknown')
            sources[source] = sources.get(source, 0) + 1
        
        print(f"📊 Enhanced Diverse Rating Results Distribution:")
        for source, count in sources.items():
            print(f"   {source}: {count}")
            
        # Show cost and time estimates
        total_pairs = len(results)
        total_calls = total_pairs * 19  # 19 raters
        estimated_cost = total_calls * 0.0015  # GPT-4o-mini approximately $0.0015/call
        estimated_time = total_calls * 0.4 / 60  # minutes
        
        print(f"📊 Enhanced Diverse Run Statistics:")
        print(f"   Total LLM calls: {total_calls}")
        print(f"   Estimated cost: ${estimated_cost:.2f}")
        print(f"   Estimated time: {estimated_time:.1f} minutes")
        print(f"   🎯 Run the reliability analysis to see diversity improvements!")
        
    else:
        print(f"❌ Enhanced diverse rating simulation failed")

🚀 Starting ENHANCED DIVERSE LLM Rating Simulation System
📊 Enhanced Diverse Configuration:
   - Rating model: GPT-4o-mini with diverse parameters
   - Temperature: 0.6-0.95 (increased for diversity)
   - Individual rater characteristics and role-specific prompts
   - Enhanced variance in rating generation
   - Quality-correlated ratings with realistic patterns
   - ALL experts evaluate ALL answers
   - Expected reliability: Krippendorff's α 0.4-0.7, ICC 0.5-0.8
   - Improved diversity while maintaining meaningful patterns
🚀 Starting Enhanced LLM Rating Simulation with Improved Diversity
🔧 ENHANCED DIVERSITY FEATURES:
   ✅ Increased temperature range (0.6-0.95) for more varied responses
   ✅ Individual rater characteristics (bias, consistency, variance factors)
   ✅ Role-specific prompt variations and personality injection
   ✅ Enhanced noise levels in rating generation
   ✅ Quality-correlated ratings with realistic variance
   ✅ Balanced diversity-reliability trade-off

📋 Step 1: Creat

Rating question-answer pairs:   0%| | 0/200 [00:00<?, ?pairs/s, LLM_calls=19, Failed=0, Success_rate=100.0%, Progress=1

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   1%| | 2/200 [00:00<00:30,  6.48pairs/s, LLM_calls=38, Failed=0, Success_rate=100.0%, Pr

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   2%| | 3/200 [00:00<00:43,  4.57pairs/s, LLM_calls=57, Failed=0, Success_rate=100.0%, Pr

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   2%| | 4/200 [00:00<00:49,  3.96pairs/s, LLM_calls=76, Failed=0, Success_rate=100.0%, Pr

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   2%| | 5/200 [00:01<00:52,  3.68pairs/s, LLM_calls=95, Failed=0, Success_rate=100.0%, Pr

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   3%| | 6/200 [00:01<00:54,  3.53pairs/s, LLM_calls=114, Failed=0, Success_rate=100.0%, P

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   4%| | 7/200 [00:01<00:56,  3.44pairs/s, LLM_calls=133, Failed=0, Success_rate=100.0%, P

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   4%| | 8/200 [00:02<00:56,  3.37pairs/s, LLM_calls=152, Failed=0, Success_rate=100.0%, P

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   4%| | 9/200 [00:02<00:57,  3.34pairs/s, LLM_calls=171, Failed=0, Success_rate=100.0%, P

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   5%| | 10/200 [00:02<00:57,  3.31pairs/s, LLM_calls=190, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   6%| | 11/200 [00:03<00:57,  3.29pairs/s, LLM_calls=209, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   6%| | 12/200 [00:03<00:57,  3.29pairs/s, LLM_calls=228, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   6%| | 13/200 [00:03<00:56,  3.29pairs/s, LLM_calls=247, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   7%| | 14/200 [00:03<00:56,  3.29pairs/s, LLM_calls=266, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   8%| | 15/200 [00:04<00:56,  3.28pairs/s, LLM_calls=285, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   8%| | 16/200 [00:04<00:56,  3.28pairs/s, LLM_calls=304, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   8%| | 17/200 [00:04<00:55,  3.28pairs/s, LLM_calls=323, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:   9%| | 18/200 [00:05<00:55,  3.27pairs/s, LLM_calls=342, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  10%| | 19/200 [00:05<00:55,  3.26pairs/s, LLM_calls=361, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  10%| | 20/200 [00:05<00:55,  3.26pairs/s, LLM_calls=380, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  10%| | 21/200 [00:06<00:54,  3.26pairs/s, LLM_calls=399, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  11%| | 22/200 [00:06<00:54,  3.26pairs/s, LLM_calls=418, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  12%| | 23/200 [00:06<00:54,  3.26pairs/s, LLM_calls=437, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  12%| | 24/200 [00:07<00:53,  3.27pairs/s, LLM_calls=456, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  12%|▏| 25/200 [00:07<00:53,  3.27pairs/s, LLM_calls=475, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  13%|▏| 26/200 [00:07<00:53,  3.27pairs/s, LLM_calls=494, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  14%|▏| 27/200 [00:07<00:52,  3.27pairs/s, LLM_calls=513, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  14%|▏| 28/200 [00:08<00:52,  3.26pairs/s, LLM_calls=532, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  14%|▏| 29/200 [00:08<00:52,  3.25pairs/s, LLM_calls=551, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  15%|▏| 30/200 [00:08<00:52,  3.25pairs/s, LLM_calls=570, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  16%|▏| 31/200 [00:09<00:51,  3.25pairs/s, LLM_calls=589, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  16%|▏| 32/200 [00:09<00:51,  3.26pairs/s, LLM_calls=608, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  16%|▏| 33/200 [00:09<00:51,  3.26pairs/s, LLM_calls=627, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  17%|▏| 34/200 [00:10<00:50,  3.26pairs/s, LLM_calls=646, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  18%|▏| 35/200 [00:10<00:50,  3.26pairs/s, LLM_calls=665, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  18%|▏| 36/200 [00:10<00:50,  3.25pairs/s, LLM_calls=684, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  18%|▏| 37/200 [00:11<00:50,  3.25pairs/s, LLM_calls=703, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  19%|▏| 38/200 [00:11<00:49,  3.26pairs/s, LLM_calls=722, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  20%|▏| 39/200 [00:11<00:49,  3.26pairs/s, LLM_calls=741, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  20%|▏| 40/200 [00:11<00:49,  3.26pairs/s, LLM_calls=760, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  20%|▏| 41/200 [00:12<00:48,  3.26pairs/s, LLM_calls=779, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  21%|▏| 42/200 [00:12<00:48,  3.26pairs/s, LLM_calls=798, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  22%|▏| 43/200 [00:12<00:48,  3.26pairs/s, LLM_calls=817, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  22%|▏| 44/200 [00:13<00:47,  3.26pairs/s, LLM_calls=836, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  22%|▏| 45/200 [00:13<00:47,  3.26pairs/s, LLM_calls=855, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  23%|▏| 46/200 [00:13<00:47,  3.26pairs/s, LLM_calls=874, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  24%|▏| 47/200 [00:14<00:47,  3.26pairs/s, LLM_calls=893, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  24%|▏| 48/200 [00:14<00:46,  3.26pairs/s, LLM_calls=912, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  24%|▏| 49/200 [00:14<00:46,  3.26pairs/s, LLM_calls=931, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  25%|▎| 50/200 [00:15<00:46,  3.26pairs/s, LLM_calls=950, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  26%|▎| 51/200 [00:15<00:45,  3.25pairs/s, LLM_calls=969, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  26%|▎| 52/200 [00:15<00:45,  3.25pairs/s, LLM_calls=988, Failed=0, Success_rate=100.0%, 

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  26%|▎| 53/200 [00:15<00:45,  3.25pairs/s, LLM_calls=1007, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  27%|▎| 54/200 [00:16<00:44,  3.25pairs/s, LLM_calls=1026, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  28%|▎| 55/200 [00:16<00:44,  3.24pairs/s, LLM_calls=1045, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  28%|▎| 56/200 [00:16<00:44,  3.26pairs/s, LLM_calls=1064, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  28%|▎| 57/200 [00:17<00:43,  3.26pairs/s, LLM_calls=1083, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  29%|▎| 58/200 [00:17<00:43,  3.25pairs/s, LLM_calls=1102, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  30%|▎| 59/200 [00:17<00:43,  3.24pairs/s, LLM_calls=1121, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  30%|▎| 60/200 [00:18<00:43,  3.25pairs/s, LLM_calls=1140, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  30%|▎| 61/200 [00:18<00:42,  3.24pairs/s, LLM_calls=1159, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  31%|▎| 62/200 [00:18<00:42,  3.24pairs/s, LLM_calls=1178, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  32%|▎| 63/200 [00:19<00:42,  3.24pairs/s, LLM_calls=1197, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  32%|▎| 64/200 [00:19<00:41,  3.25pairs/s, LLM_calls=1216, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  32%|▎| 65/200 [00:19<00:41,  3.25pairs/s, LLM_calls=1235, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  33%|▎| 66/200 [00:19<00:41,  3.25pairs/s, LLM_calls=1254, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  34%|▎| 67/200 [00:20<00:40,  3.25pairs/s, LLM_calls=1273, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  34%|▎| 68/200 [00:20<00:40,  3.25pairs/s, LLM_calls=1292, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  34%|▎| 69/200 [00:20<00:40,  3.25pairs/s, LLM_calls=1311, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  35%|▎| 70/200 [00:21<00:40,  3.25pairs/s, LLM_calls=1330, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  36%|▎| 71/200 [00:21<00:39,  3.24pairs/s, LLM_calls=1349, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  36%|▎| 72/200 [00:21<00:39,  3.25pairs/s, LLM_calls=1368, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  36%|▎| 73/200 [00:22<00:38,  3.26pairs/s, LLM_calls=1387, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  37%|▎| 74/200 [00:22<00:38,  3.26pairs/s, LLM_calls=1406, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  38%|▍| 75/200 [00:22<00:38,  3.26pairs/s, LLM_calls=1425, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  38%|▍| 76/200 [00:23<00:38,  3.25pairs/s, LLM_calls=1444, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  38%|▍| 77/200 [00:23<00:37,  3.25pairs/s, LLM_calls=1463, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  39%|▍| 78/200 [00:23<00:37,  3.26pairs/s, LLM_calls=1482, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  40%|▍| 79/200 [00:23<00:37,  3.26pairs/s, LLM_calls=1501, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  40%|▍| 80/200 [00:24<00:36,  3.26pairs/s, LLM_calls=1520, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  40%|▍| 81/200 [00:24<00:36,  3.25pairs/s, LLM_calls=1539, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  41%|▍| 82/200 [00:24<00:36,  3.26pairs/s, LLM_calls=1558, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  42%|▍| 83/200 [00:25<00:35,  3.27pairs/s, LLM_calls=1577, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  42%|▍| 84/200 [00:25<00:35,  3.26pairs/s, LLM_calls=1596, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  42%|▍| 85/200 [00:25<00:35,  3.26pairs/s, LLM_calls=1615, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  43%|▍| 86/200 [00:26<00:34,  3.26pairs/s, LLM_calls=1634, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  44%|▍| 87/200 [00:26<00:34,  3.26pairs/s, LLM_calls=1653, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  44%|▍| 88/200 [00:26<00:34,  3.26pairs/s, LLM_calls=1672, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  44%|▍| 89/200 [00:27<00:34,  3.26pairs/s, LLM_calls=1691, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  45%|▍| 90/200 [00:27<00:33,  3.26pairs/s, LLM_calls=1710, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  46%|▍| 91/200 [00:27<00:33,  3.26pairs/s, LLM_calls=1729, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  46%|▍| 92/200 [00:27<00:33,  3.26pairs/s, LLM_calls=1748, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  46%|▍| 93/200 [00:28<00:32,  3.26pairs/s, LLM_calls=1767, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  47%|▍| 94/200 [00:28<00:32,  3.26pairs/s, LLM_calls=1786, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  48%|▍| 95/200 [00:28<00:32,  3.25pairs/s, LLM_calls=1805, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  48%|▍| 96/200 [00:29<00:31,  3.25pairs/s, LLM_calls=1824, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  48%|▍| 97/200 [00:29<00:31,  3.25pairs/s, LLM_calls=1843, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  49%|▍| 98/200 [00:29<00:31,  3.25pairs/s, LLM_calls=1862, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  50%|▍| 99/200 [00:30<00:31,  3.24pairs/s, LLM_calls=1881, Failed=0, Success_rate=100.0%,

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  50%|▌| 100/200 [00:30<00:30,  3.24pairs/s, LLM_calls=1900, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  50%|▌| 101/200 [00:30<00:30,  3.25pairs/s, LLM_calls=1919, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  51%|▌| 102/200 [00:31<00:30,  3.25pairs/s, LLM_calls=1938, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  52%|▌| 103/200 [00:31<00:29,  3.25pairs/s, LLM_calls=1957, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  52%|▌| 104/200 [00:31<00:29,  3.25pairs/s, LLM_calls=1976, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  52%|▌| 105/200 [00:31<00:29,  3.25pairs/s, LLM_calls=1995, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  53%|▌| 106/200 [00:32<00:28,  3.25pairs/s, LLM_calls=2014, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  54%|▌| 107/200 [00:32<00:28,  3.26pairs/s, LLM_calls=2033, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  54%|▌| 108/200 [00:32<00:28,  3.26pairs/s, LLM_calls=2052, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  55%|▌| 109/200 [00:33<00:27,  3.26pairs/s, LLM_calls=2071, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  55%|▌| 110/200 [00:33<00:27,  3.25pairs/s, LLM_calls=2090, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  56%|▌| 111/200 [00:33<00:27,  3.26pairs/s, LLM_calls=2109, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  56%|▌| 112/200 [00:34<00:26,  3.27pairs/s, LLM_calls=2128, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  56%|▌| 113/200 [00:34<00:26,  3.27pairs/s, LLM_calls=2147, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  57%|▌| 114/200 [00:34<00:26,  3.26pairs/s, LLM_calls=2166, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  57%|▌| 115/200 [00:35<00:26,  3.25pairs/s, LLM_calls=2185, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  58%|▌| 116/200 [00:35<00:25,  3.25pairs/s, LLM_calls=2204, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  58%|▌| 117/200 [00:35<00:25,  3.25pairs/s, LLM_calls=2223, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  59%|▌| 118/200 [00:35<00:25,  3.25pairs/s, LLM_calls=2242, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  60%|▌| 119/200 [00:36<00:24,  3.26pairs/s, LLM_calls=2261, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  60%|▌| 120/200 [00:36<00:24,  3.26pairs/s, LLM_calls=2280, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  60%|▌| 121/200 [00:36<00:24,  3.25pairs/s, LLM_calls=2299, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  61%|▌| 122/200 [00:37<00:23,  3.26pairs/s, LLM_calls=2318, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  62%|▌| 123/200 [00:37<00:23,  3.26pairs/s, LLM_calls=2337, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  62%|▌| 124/200 [00:37<00:23,  3.25pairs/s, LLM_calls=2356, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  62%|▋| 125/200 [00:38<00:23,  3.25pairs/s, LLM_calls=2375, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  63%|▋| 126/200 [00:38<00:22,  3.25pairs/s, LLM_calls=2394, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  64%|▋| 127/200 [00:38<00:22,  3.25pairs/s, LLM_calls=2413, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  64%|▋| 128/200 [00:39<00:22,  3.25pairs/s, LLM_calls=2432, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  64%|▋| 129/200 [00:39<00:21,  3.25pairs/s, LLM_calls=2451, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  65%|▋| 130/200 [00:39<00:21,  3.25pairs/s, LLM_calls=2470, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  66%|▋| 131/200 [00:39<00:21,  3.26pairs/s, LLM_calls=2489, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  66%|▋| 132/200 [00:40<00:20,  3.26pairs/s, LLM_calls=2508, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  66%|▋| 133/200 [00:40<00:20,  3.26pairs/s, LLM_calls=2527, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  67%|▋| 134/200 [00:40<00:20,  3.26pairs/s, LLM_calls=2546, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  68%|▋| 135/200 [00:41<00:19,  3.26pairs/s, LLM_calls=2565, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  68%|▋| 136/200 [00:41<00:19,  3.26pairs/s, LLM_calls=2584, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  68%|▋| 137/200 [00:41<00:19,  3.26pairs/s, LLM_calls=2603, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  69%|▋| 138/200 [00:42<00:19,  3.25pairs/s, LLM_calls=2622, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  70%|▋| 139/200 [00:42<00:18,  3.25pairs/s, LLM_calls=2641, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  70%|▋| 140/200 [00:42<00:18,  3.26pairs/s, LLM_calls=2660, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  70%|▋| 141/200 [00:42<00:18,  3.26pairs/s, LLM_calls=2679, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  71%|▋| 142/200 [00:43<00:17,  3.26pairs/s, LLM_calls=2698, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  72%|▋| 143/200 [00:43<00:17,  3.26pairs/s, LLM_calls=2717, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  72%|▋| 144/200 [00:43<00:17,  3.25pairs/s, LLM_calls=2736, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  72%|▋| 145/200 [00:44<00:16,  3.25pairs/s, LLM_calls=2755, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  73%|▋| 146/200 [00:44<00:16,  3.25pairs/s, LLM_calls=2774, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  74%|▋| 147/200 [00:44<00:16,  3.25pairs/s, LLM_calls=2793, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  74%|▋| 148/200 [00:45<00:16,  3.25pairs/s, LLM_calls=2812, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  74%|▋| 149/200 [00:45<00:15,  3.25pairs/s, LLM_calls=2831, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  75%|▊| 150/200 [00:45<00:15,  3.25pairs/s, LLM_calls=2850, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  76%|▊| 151/200 [00:46<00:15,  3.26pairs/s, LLM_calls=2869, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  76%|▊| 152/200 [00:46<00:14,  3.26pairs/s, LLM_calls=2888, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  76%|▊| 153/200 [00:46<00:14,  3.26pairs/s, LLM_calls=2907, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  77%|▊| 154/200 [00:46<00:14,  3.26pairs/s, LLM_calls=2926, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  78%|▊| 155/200 [00:47<00:13,  3.26pairs/s, LLM_calls=2945, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  78%|▊| 156/200 [00:47<00:13,  3.26pairs/s, LLM_calls=2964, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  78%|▊| 157/200 [00:47<00:13,  3.25pairs/s, LLM_calls=2983, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  79%|▊| 158/200 [00:48<00:12,  3.25pairs/s, LLM_calls=3002, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  80%|▊| 159/200 [00:48<00:12,  3.25pairs/s, LLM_calls=3021, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  80%|▊| 160/200 [00:48<00:12,  3.24pairs/s, LLM_calls=3040, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  80%|▊| 161/200 [00:49<00:12,  3.25pairs/s, LLM_calls=3059, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  81%|▊| 162/200 [00:49<00:11,  3.25pairs/s, LLM_calls=3078, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  82%|▊| 163/200 [00:49<00:11,  3.25pairs/s, LLM_calls=3097, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  82%|▊| 164/200 [00:50<00:11,  3.26pairs/s, LLM_calls=3116, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  82%|▊| 165/200 [00:50<00:10,  3.26pairs/s, LLM_calls=3135, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  83%|▊| 166/200 [00:50<00:10,  3.26pairs/s, LLM_calls=3154, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  84%|▊| 167/200 [00:50<00:10,  3.26pairs/s, LLM_calls=3173, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  84%|▊| 168/200 [00:51<00:09,  3.26pairs/s, LLM_calls=3192, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  84%|▊| 169/200 [00:51<00:09,  3.26pairs/s, LLM_calls=3211, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  85%|▊| 170/200 [00:51<00:09,  3.26pairs/s, LLM_calls=3230, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  86%|▊| 171/200 [00:52<00:08,  3.26pairs/s, LLM_calls=3249, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  86%|▊| 172/200 [00:52<00:08,  3.26pairs/s, LLM_calls=3268, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  86%|▊| 173/200 [00:52<00:08,  3.26pairs/s, LLM_calls=3287, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  87%|▊| 174/200 [00:53<00:08,  3.23pairs/s, LLM_calls=3306, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  88%|▉| 175/200 [00:53<00:07,  3.23pairs/s, LLM_calls=3325, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  88%|▉| 176/200 [00:53<00:07,  3.24pairs/s, LLM_calls=3344, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  88%|▉| 177/200 [00:54<00:07,  3.24pairs/s, LLM_calls=3363, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  89%|▉| 178/200 [00:54<00:06,  3.24pairs/s, LLM_calls=3382, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  90%|▉| 179/200 [00:54<00:06,  3.25pairs/s, LLM_calls=3401, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  90%|▉| 180/200 [00:54<00:06,  3.25pairs/s, LLM_calls=3420, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  90%|▉| 181/200 [00:55<00:05,  3.24pairs/s, LLM_calls=3439, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  91%|▉| 182/200 [00:55<00:05,  3.24pairs/s, LLM_calls=3458, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  92%|▉| 183/200 [00:55<00:05,  3.25pairs/s, LLM_calls=3477, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  92%|▉| 184/200 [00:56<00:04,  3.25pairs/s, LLM_calls=3496, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  92%|▉| 185/200 [00:56<00:04,  3.25pairs/s, LLM_calls=3515, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  93%|▉| 186/200 [00:56<00:04,  3.25pairs/s, LLM_calls=3534, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  94%|▉| 187/200 [00:57<00:03,  3.25pairs/s, LLM_calls=3553, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  94%|▉| 188/200 [00:57<00:03,  3.25pairs/s, LLM_calls=3572, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  94%|▉| 189/200 [00:57<00:03,  3.25pairs/s, LLM_calls=3591, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  95%|▉| 190/200 [00:58<00:03,  3.25pairs/s, LLM_calls=3610, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  96%|▉| 191/200 [00:58<00:02,  3.25pairs/s, LLM_calls=3629, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  96%|▉| 192/200 [00:58<00:02,  3.24pairs/s, LLM_calls=3648, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  96%|▉| 193/200 [00:59<00:02,  3.24pairs/s, LLM_calls=3667, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  97%|▉| 194/200 [00:59<00:01,  3.25pairs/s, LLM_calls=3686, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  98%|▉| 195/200 [00:59<00:01,  3.25pairs/s, LLM_calls=3705, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  98%|▉| 196/200 [00:59<00:01,  3.25pairs/s, LLM_calls=3724, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  98%|▉| 197/200 [01:00<00:00,  3.26pairs/s, LLM_calls=3743, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs:  99%|▉| 198/200 [01:00<00:00,  3.26pairs/s, LLM_calls=3762, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs: 100%|▉| 199/200 [01:00<00:00,  3.25pairs/s, LLM_calls=3781, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs: 100%|█| 200/200 [01:01<00:00,  3.25pairs/s, LLM_calls=3800, Failed=0, Success_rate=100.0%

LLM call failed for rater R1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R1: attempted relative import with no known parent package
LLM call failed for rater R2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater R2: attempted relative import with no known parent package
LLM call failed for rater A1, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A1: attempted relative import with no known parent package
LLM call failed for rater A2, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A2: attempted relative import with no known parent package
LLM call failed for rater A3, using enhanced simulation: name 'verify_openai_access' is not defined
Error in LLM evaluation for rater A3: attempted relative import with no known parent package
LLM call failed for rater A4, using

Rating question-answer pairs: 100%|█| 200/200 [01:01<00:00,  3.25pairs/s, LLM_calls=3800, Failed=0, Success_rate=100.0%


✅ Enhanced LLM Rating Simulation with Improved Diversity Completed
📊 LLM Call Statistics:
  Expected calls: 3800
  Actual calls: 3800
  Failed calls: 0
  Success rate: 100.0%

📊 Expert Evaluation Statistics:
  E1 (vocational and trans...): Evaluated 200 answers
  E2 (vocational and trans...): Evaluated 200 answers
  E3 (vocational and trans...): Evaluated 200 answers
  E4 (vocational and trans...): Evaluated 200 answers
  E5 (special education te...): Evaluated 200 answers
  E6 (special education te...): Evaluated 200 answers
  E7 (job developer...): Evaluated 200 answers
  E8 (job developer...): Evaluated 200 answers
  E9 (job developer...): Evaluated 200 answers
  E10 (behavior analyst...): Evaluated 200 answers
  E11 (mother of a son with...): Evaluated 200 answers

💾 Step 4: Saving enhanced diverse results...

💾 Saving Rating Results
✅ Enhanced diverse CSV file saved: D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\rating_random\enh




### 5. Rating Results Analysis
#### 5.1 Assessing Raters Agreement

In [15]:
# ===================================================================
# COMPLETE RATER AGREEMENT ANALYSIS - Fixed ICC Calculations
# ===================================================================

import pandas as pd
import numpy as np
import os
from scipy import stats
from sklearn.metrics import cohen_kappa_score
import krippendorff
from pingouin import intraclass_corr
import warnings
warnings.filterwarnings('ignore')

def calculate_krippendorff_alpha(data_df, measures, raters):
    """
    Calculate Krippendorff's α for Yes/No measures
    
    Parameters:
    data_df: Rating data DataFrame
    measures: List of measurement dimensions to calculate
    raters: List of raters
    
    Returns:
    results_dict: Dictionary containing α values for each dimension
    """
    
    results = {}
    
    for measure in measures:
        print(f"\nCalculating Krippendorff's α for {measure}...")
        
        # Build rater matrix
        rater_data = []
        for rater in raters:
            col_name = f'Researcher_{rater}_{measure}'
            if col_name in data_df.columns:
                # Convert data to reliability analysis format (for binary variables)
                values = data_df[col_name].values
                rater_data.append(values)
        
        if len(rater_data) >= 2:
            # Convert to format required by krippendorff package
            reliability_data = np.array(rater_data)
            
            # Calculate Krippendorff's α (for nominal/ordinal data)
            try:
                alpha = krippendorff.alpha(reliability_data, level_of_measurement='ordinal')
                results[measure] = {
                    'alpha': alpha,
                    'raters': len(rater_data),
                    'items': len(values) if len(rater_data) > 0 else 0,
                    'interpretation': interpret_alpha(alpha)
                }
                print(f"  α = {alpha:.3f} ({interpret_alpha(alpha)})")
            except Exception as e:
                print(f"  Calculation failed: {e}")
                results[measure] = {
                    'alpha': np.nan,
                    'raters': len(rater_data),
                    'items': len(values) if len(rater_data) > 0 else 0,
                    'interpretation': 'calculation_failed'
                }
        else:
            print(f"  Insufficient data, need at least 2 raters")
            results[measure] = {
                'alpha': np.nan,
                'raters': len(rater_data),
                'items': 0,
                'interpretation': 'insufficient_data'
            }
    
    return results

def interpret_alpha(alpha):
    """Interpret Krippendorff's α values"""
    if np.isnan(alpha):
        return "Cannot calculate"
    elif alpha < 0.20:
        return "Poor"
    elif alpha < 0.40:
        return "Fair"
    elif alpha < 0.60:
        return "Moderate"
    elif alpha < 0.80:
        return "Good"
    else:
        return "Excellent"

def check_expert_data_structure(data_df):
    """Check expert data structure to understand evaluation patterns"""
    
    print("\n🔍 Checking Expert Data Structure:")
    expert_raters = [f'E{i}' for i in range(1, 12)]
    
    # Check evaluation patterns
    total_evaluated = 0
    for expert in expert_raters:
        evaluated_col = f'Expert_{expert}_Evaluated'
        helpfulness_col = f'Expert_{expert}_Helpfulness'
        
        if evaluated_col in data_df.columns:
            n_evaluated = data_df[evaluated_col].sum() if data_df[evaluated_col].dtype == bool else (data_df[evaluated_col] == True).sum()
            print(f"  {expert}: {n_evaluated} answers evaluated")
            total_evaluated += n_evaluated
        elif helpfulness_col in data_df.columns:
            # If no Evaluated column, check for valid helpfulness ratings
            valid_ratings = data_df[helpfulness_col].notna() & (data_df[helpfulness_col] != -1)
            n_evaluated = valid_ratings.sum()
            print(f"  {expert}: {n_evaluated} valid helpfulness ratings (no Evaluated column)")
            total_evaluated += n_evaluated
    
    print(f"  Total expert evaluations: {total_evaluated}")
    print(f"  Average per expert: {total_evaluated/11:.1f}")
    
    return total_evaluated > 0

def calculate_icc_helpfulness(data_df):
    """
    Calculate ICC values for Helpfulness - for different rater combinations
    Enhanced to handle all expert data properly
    """
    
    print("\n=== Calculating ICC values for Helpfulness ===")
    
    # First check expert data structure
    has_expert_data = check_expert_data_structure(data_df)
    
    icc_results = {}
    
    # Define rater groups
    researcher_raters = ['R1', 'R2']
    individual_raters = [f'A{i}' for i in range(1, 7)]  # A1-A6
    expert_raters = [f'E{i}' for i in range(1, 12)]     # E1-E11
    
    # 1. Researchers within-group ICC
    print("\n1. Researchers Group (R1, R2):")
    researcher_data = prepare_icc_data(data_df, researcher_raters, 'Researcher', 'Helpfulness')
    if researcher_data is not None and len(researcher_data) > 0:
        icc_r = calculate_single_icc(researcher_data, "Researchers")
        icc_results['Researchers'] = icc_r
    else:
        icc_results['Researchers'] = create_failed_icc_result("No researcher data")
    
    # 2. Individuals with autism within-group ICC
    print("\n2. Individuals with Autism Group (A1-A6):")
    individual_data = prepare_icc_data(data_df, individual_raters, 'Individual', 'Helpfulness')
    if individual_data is not None and len(individual_data) > 0:
        icc_a = calculate_single_icc(individual_data, "Individuals with Autism")
        icc_results['Individuals_with_Autism'] = icc_a
    else:
        icc_results['Individuals_with_Autism'] = create_failed_icc_result("No individual data")
    
    # 3. Autism experts within-group ICC (enhanced handling)
    print("\n3. Autism Experts Group (E1-E11):")
    expert_data = prepare_enhanced_expert_icc_data(data_df, expert_raters)
    if expert_data is not None and len(expert_data) > 0:
        icc_e = calculate_single_icc(expert_data, "Autism Experts")
        icc_results['Autism_experts'] = icc_e
    else:
        icc_results['Autism_experts'] = create_failed_icc_result("No expert data")
    
    # 4. Cross-group ICC calculations (enhanced)
    print("\n4. Cross-group ICC Analysis:")
    
    # R + A (Researchers + Individuals) - should always work
    ra_data = prepare_cross_group_icc(data_df, researcher_raters, individual_raters, 
                                      'Researcher', 'Individual', 'Helpfulness')
    if ra_data is not None and len(ra_data) > 0:
        icc_ra = calculate_single_icc(ra_data, "Researchers + Individuals")
        icc_results['R_A'] = icc_ra
    else:
        icc_results['R_A'] = create_failed_icc_result("No R+A data")
    
    # R + E (Researchers + Experts) - enhanced handling
    re_data = prepare_enhanced_researcher_expert_icc(data_df, researcher_raters, expert_raters)
    if re_data is not None and len(re_data) > 0:
        icc_re = calculate_single_icc(re_data, "Researchers + Experts")
        icc_results['R_E'] = icc_re
    else:
        icc_results['R_E'] = create_failed_icc_result("No R+E data")
    
    # A + E (Individuals + Experts) - enhanced handling
    ae_data = prepare_enhanced_individual_expert_icc(data_df, individual_raters, expert_raters)
    if ae_data is not None and len(ae_data) > 0:
        icc_ae = calculate_single_icc(ae_data, "Individuals + Experts")
        icc_results['A_E'] = icc_ae
    else:
        icc_results['A_E'] = create_failed_icc_result("No A+E data")
    
    # R + A + E (All groups) - enhanced handling
    rae_data = prepare_enhanced_all_groups_icc(data_df, researcher_raters, individual_raters, expert_raters)
    if rae_data is not None and len(rae_data) > 0:
        icc_rae = calculate_single_icc(rae_data, "All Groups")
        icc_results['R_A_E'] = icc_rae
    else:
        icc_results['R_A_E'] = create_failed_icc_result("No R+A+E data")
    
    return icc_results

def prepare_icc_data(data_df, raters, prefix, measure):
    """Prepare data format required for ICC calculation"""
    
    icc_data = []
    
    for idx, row in data_df.iterrows():
        for rater in raters:
            col_name = f'{prefix}_{rater}_{measure}'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:  # Exclude non-evaluated data
                    icc_data.append({
                        'Subject': idx,
                        'Rater': rater,
                        'Rating': score
                    })
    
    if len(icc_data) > 0:
        return pd.DataFrame(icc_data)
    else:
        print(f"  Not enough data for ICC calculation")
        return None

def prepare_enhanced_expert_icc_data(data_df, expert_raters):
    """Enhanced expert group ICC data preparation"""
    
    icc_data = []
    
    for idx, row in data_df.iterrows():
        for expert in expert_raters:
            helpfulness_col = f'Expert_{expert}_Helpfulness'
            evaluated_col = f'Expert_{expert}_Evaluated'
            
            if helpfulness_col in data_df.columns:
                score = row[helpfulness_col]
                
                # Check if expert evaluated this answer
                is_evaluated = True  # Default assume evaluated
                if evaluated_col in data_df.columns:
                    is_evaluated = row[evaluated_col] == True
                
                # Include if score is valid and expert evaluated
                if pd.notna(score) and score != -1 and is_evaluated:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': expert,
                        'Rating': score
                    })
    
    print(f"  Prepared {len(icc_data)} expert ratings for ICC calculation")
    
    if len(icc_data) > 0:
        return pd.DataFrame(icc_data)
    else:
        print(f"  Expert group has insufficient evaluation data")
        return None

def prepare_cross_group_icc(data_df, group1_raters, group2_raters, prefix1, prefix2, measure):
    """Prepare cross-group ICC data"""
    
    icc_data = []
    
    for idx, row in data_df.iterrows():
        # Group 1 data
        for rater in group1_raters:
            col_name = f'{prefix1}_{rater}_{measure}'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'{prefix1}_{rater}',
                        'Rating': score
                    })
        
        # Group 2 data
        for rater in group2_raters:
            col_name = f'{prefix2}_{rater}_{measure}'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'{prefix2}_{rater}',
                        'Rating': score
                    })
    
    if len(icc_data) > 0:
        return pd.DataFrame(icc_data)
    else:
        return None

def prepare_enhanced_researcher_expert_icc(data_df, researcher_raters, expert_raters):
    """Enhanced researcher + expert ICC data preparation"""
    
    icc_data = []
    
    for idx, row in data_df.iterrows():
        # Always add researcher data (they evaluate all answers)
        for rater in researcher_raters:
            col_name = f'Researcher_{rater}_Helpfulness'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Researcher_{rater}',
                        'Rating': score
                    })
        
        # Add expert data with enhanced handling
        for expert in expert_raters:
            helpfulness_col = f'Expert_{expert}_Helpfulness'
            evaluated_col = f'Expert_{expert}_Evaluated'
            
            if helpfulness_col in data_df.columns:
                score = row[helpfulness_col]
                
                # Check if expert evaluated this answer
                is_evaluated = True  # Default assume evaluated
                if evaluated_col in data_df.columns:
                    is_evaluated = row[evaluated_col] == True
                
                # Include if score is valid and expert evaluated
                if pd.notna(score) and score != -1 and is_evaluated:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Expert_{expert}',
                        'Rating': score
                    })
    
    print(f"  Prepared {len(icc_data)} R+E ratings for ICC calculation")
    
    if len(icc_data) > 0:
        return pd.DataFrame(icc_data)
    else:
        return None

def prepare_enhanced_individual_expert_icc(data_df, individual_raters, expert_raters):
    """Enhanced individual + expert ICC data preparation"""
    
    icc_data = []
    
    for idx, row in data_df.iterrows():
        # Add individual data (they evaluate all answers)
        for rater in individual_raters:
            col_name = f'Individual_{rater}_Helpfulness'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Individual_{rater}',
                        'Rating': score
                    })
        
        # Add expert data with enhanced handling
        for expert in expert_raters:
            helpfulness_col = f'Expert_{expert}_Helpfulness'
            evaluated_col = f'Expert_{expert}_Evaluated'
            
            if helpfulness_col in data_df.columns:
                score = row[helpfulness_col]
                
                # Check if expert evaluated this answer
                is_evaluated = True  # Default assume evaluated
                if evaluated_col in data_df.columns:
                    is_evaluated = row[evaluated_col] == True
                
                # Include if score is valid and expert evaluated
                if pd.notna(score) and score != -1 and is_evaluated:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Expert_{expert}',
                        'Rating': score
                    })
    
    print(f"  Prepared {len(icc_data)} A+E ratings for ICC calculation")
    
    if len(icc_data) > 0:
        return pd.DataFrame(icc_data)
    else:
        return None

def prepare_enhanced_all_groups_icc(data_df, researcher_raters, individual_raters, expert_raters):
    """Enhanced all groups combined ICC data preparation"""
    
    icc_data = []
    
    for idx, row in data_df.iterrows():
        # Add researcher data
        for rater in researcher_raters:
            col_name = f'Researcher_{rater}_Helpfulness'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Researcher_{rater}',
                        'Rating': score
                    })
        
        # Add individual data
        for rater in individual_raters:
            col_name = f'Individual_{rater}_Helpfulness'
            if col_name in data_df.columns:
                score = row[col_name]
                if pd.notna(score) and score != -1:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Individual_{rater}',
                        'Rating': score
                    })
        
        # Add expert data with enhanced handling
        for expert in expert_raters:
            helpfulness_col = f'Expert_{expert}_Helpfulness'
            evaluated_col = f'Expert_{expert}_Evaluated'
            
            if helpfulness_col in data_df.columns:
                score = row[helpfulness_col]
                
                # Check if expert evaluated this answer
                is_evaluated = True  # Default assume evaluated
                if evaluated_col in data_df.columns:
                    is_evaluated = row[evaluated_col] == True
                
                # Include if score is valid and expert evaluated
                if pd.notna(score) and score != -1 and is_evaluated:
                    icc_data.append({
                        'Subject': idx,
                        'Rater': f'Expert_{expert}',
                        'Rating': score
                    })
    
    print(f"  Prepared {len(icc_data)} R+A+E ratings for ICC calculation")
    
    if len(icc_data) > 0:
        return pd.DataFrame(icc_data)
    else:
        return None

def calculate_single_icc(icc_data, group_name):
    """Calculate ICC for a single group with enhanced error handling"""
    
    try:
        # Basic data validation
        if len(icc_data) == 0:
            return create_failed_icc_result("Empty dataset")
        
        n_subjects = icc_data['Subject'].nunique()
        n_raters = icc_data['Rater'].nunique()
        
        if n_subjects < 2:
            return create_failed_icc_result(f"Too few subjects: {n_subjects}")
        
        if n_raters < 2:
            return create_failed_icc_result(f"Too few raters: {n_raters}")
        
        # Remove any missing values
        icc_data_clean = icc_data.dropna()
        
        if len(icc_data_clean) == 0:
            return create_failed_icc_result("No valid data after cleaning")
        
        # Use pingouin to calculate ICC
        icc_result = intraclass_corr(data=icc_data_clean, targets='Subject', 
                                   raters='Rater', ratings='Rating')
        
        # Extract ICC(2,1) - consistent with original paper model
        icc_21 = icc_result[icc_result['Type'] == 'ICC2']['ICC'].iloc[0]
        ci_lower = icc_result[icc_result['Type'] == 'ICC2']['CI95%'].iloc[0][0]
        ci_upper = icc_result[icc_result['Type'] == 'ICC2']['CI95%'].iloc[0][1]
        
        print(f"  {group_name}: ICC = {icc_21:.3f} (95% CI: {ci_lower:.3f}-{ci_upper:.3f})")
        print(f"    Subjects: {n_subjects}, Raters: {n_raters}, Total ratings: {len(icc_data_clean)}")
        
        return {
            'ICC': icc_21,
            'CI_lower': ci_lower,
            'CI_upper': ci_upper,
            'n_subjects': n_subjects,
            'n_raters': n_raters,
            'interpretation': interpret_icc(icc_21)
        }
        
    except Exception as e:
        print(f"  {group_name}: ICC calculation failed - {e}")
        return create_failed_icc_result(str(e))

def create_failed_icc_result(reason):
    """Create a standardized failed ICC result"""
    return {
        'ICC': np.nan,
        'CI_lower': np.nan,
        'CI_upper': np.nan,
        'n_subjects': 0,
        'n_raters': 0,
        'interpretation': f'Failed: {reason}'
    }

def interpret_icc(icc):
    """Interpret ICC values"""
    if np.isnan(icc):
        return "Cannot calculate"
    elif icc < 0.50:
        return "Poor"
    elif icc < 0.75:
        return "Moderate"
    elif icc < 0.90:
        return "Good"
    else:
        return "Excellent"

def save_agreement_results(alpha_results, icc_results, output_dir):
    """Save rater agreement analysis results to tables"""
    
    os.makedirs(output_dir, exist_ok=True)
    
    # 1. Save Krippendorff's α results
    alpha_df_data = []
    for measure, result in alpha_results.items():
        alpha_df_data.append({
            'Measure': measure,
            'Krippendorff_Alpha': result['alpha'],
            'N_Raters': result['raters'],
            'N_Items': result['items'],
            'Interpretation': result['interpretation']
        })
    
    alpha_df = pd.DataFrame(alpha_df_data)
    alpha_file = os.path.join(output_dir, 'enhanced_krippendorff_alpha_results.csv')
    alpha_df.to_csv(alpha_file, index=False, encoding='utf-8')
    print(f"\n✅ Krippendorff's α results saved to: {alpha_file}")
    
    # 2. Save ICC results
    icc_df_data = []
    for group, result in icc_results.items():
        icc_df_data.append({
            'Rater_Group': group,
            'ICC': result['ICC'],
            'CI_Lower': result['CI_lower'],
            'CI_Upper': result['CI_upper'],
            'N_Subjects': result['n_subjects'],
            'N_Raters': result['n_raters'],
            'Interpretation': result['interpretation']
        })
    
    icc_df = pd.DataFrame(icc_df_data)
    icc_file = os.path.join(output_dir, 'enhanced_icc_helpfulness_results.csv')
    icc_df.to_csv(icc_file, index=False, encoding='utf-8')
    print(f"✅ ICC results saved to: {icc_file}")
    
    return alpha_file, icc_file

def main():
    """Main function - Execute enhanced rater agreement analysis"""
    
    # Data file paths
    # enhanced LLM rating update
    csv_path = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\rating_random\enhanced_diverse_llm_rating_results_20250715_121059.csv"
    output_dir = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\rating_random"
    
    print("🔍 Starting Enhanced Rater Agreement Analysis")
    print("="*60)
    
    # Load data
    try:
        data_df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"✅ Successfully loaded data: {len(data_df)} rows, {len(data_df.columns)} columns")
    except Exception as e:
        print(f"❌ Failed to load data: {e}")
        return
    
    # Check data structure
    print(f"\n📊 Data Structure Check:")
    print(f"   Columns: {len(data_df.columns)}")
    expert_cols = [col for col in data_df.columns if col.startswith('Expert_')]
    print(f"   Expert columns: {len(expert_cols)}")
    evaluated_cols = [col for col in expert_cols if 'Evaluated' in col]
    print(f"   Expert Evaluated columns: {len(evaluated_cols)}")
    
    # 1. Calculate Krippendorff's α (only for researcher group Yes/No variables)
    print(f"\n{'='*30}")
    print("📊 1. Calculating Krippendorff's α")
    print(f"{'='*30}")
    
    yes_no_measures = ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support']
    researcher_raters = ['R1', 'R2']
    
    alpha_results = calculate_krippendorff_alpha(data_df, yes_no_measures, researcher_raters)
    
    # 2. Calculate ICC (Helpfulness ratings) - enhanced version
    print(f"\n{'='*30}")
    print("📊 2. Calculating Enhanced ICC (Helpfulness)")
    print(f"{'='*30}")
    
    icc_results = calculate_icc_helpfulness(data_df)
    
    # 3. Save results
    print(f"\n{'='*30}")
    print("💾 3. Saving Enhanced Analysis Results")
    print(f"{'='*30}")
    
    alpha_file, icc_file = save_agreement_results(alpha_results, icc_results, output_dir)
    
    # 4. Summary report
    print(f"\n{'='*30}")
    print("📋 4. Enhanced Analysis Summary")
    print(f"{'='*30}")
    
    print(f"\n🎯 Krippendorff's α Results Summary:")
    for measure, result in alpha_results.items():
        alpha_val = result['alpha']
        interpretation = result['interpretation']
        if not np.isnan(alpha_val):
            print(f"  {measure}: α = {alpha_val:.3f} ({interpretation})")
        else:
            print(f"  {measure}: Calculation failed ({interpretation})")
    
    print(f"\n🎯 ICC Results Summary:")
    for group, result in icc_results.items():
        icc_val = result['ICC']
        interpretation = result['interpretation']
        if not np.isnan(icc_val):
            print(f"  {group}: ICC = {icc_val:.3f} ({interpretation})")
        else:
            print(f"  {group}: {interpretation}")
    
    # Check completeness
    expected_groups = ['Researchers', 'Individuals_with_Autism', 'Autism_experts', 'R_A', 'R_E', 'A_E', 'R_A_E']
    missing_groups = [g for g in expected_groups if g not in icc_results or np.isnan(icc_results[g]['ICC'])]
    
    if missing_groups:
        print(f"\n⚠️  Missing or failed ICC calculations: {missing_groups}")
    else:
        print(f"\n✅ All ICC calculations completed successfully!")
    
    print(f"\n✅ Enhanced rater agreement analysis completed!")
    print(f"📁 Result files saved in: {output_dir}")

if __name__ == "__main__":
    main()

🔍 Starting Enhanced Rater Agreement Analysis
✅ Successfully loaded data: 200 rows, 44 columns

📊 Data Structure Check:
   Columns: 44
   Expert columns: 22
   Expert Evaluated columns: 11

📊 1. Calculating Krippendorff's α

Calculating Krippendorff's α for Directness...
  α = 0.391 (Fair)

Calculating Krippendorff's α for Additional_Information...
  α = 0.346 (Fair)

Calculating Krippendorff's α for Informational_Support...
  α = 0.655 (Good)

Calculating Krippendorff's α for Emotional_Support...
  α = 0.231 (Fair)

📊 2. Calculating Enhanced ICC (Helpfulness)

=== Calculating ICC values for Helpfulness ===

🔍 Checking Expert Data Structure:
  E1: 200 answers evaluated
  E2: 200 answers evaluated
  E3: 200 answers evaluated
  E4: 200 answers evaluated
  E5: 200 answers evaluated
  E6: 200 answers evaluated
  E7: 200 answers evaluated
  E8: 200 answers evaluated
  E9: 200 answers evaluated
  E10: 200 answers evaluated
  E11: 200 answers evaluated
  Total expert evaluations: 2200
  Averag

#### 5.2 out-group vs. ai-simulation

In [16]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from scipy.stats import mannwhitneyu
import warnings
warnings.filterwarnings('ignore')

def analyze_rating_data_table3_format(file_path, output_dir):
    """
    Analyze rating data comparing out_group_human vs ai_generated answers
    Following the exact format and methodology of Table 3 from the original paper
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Read the data
    df = pd.read_csv(file_path)
    
    print(f"Total records: {len(df)}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Sources: {df['source'].value_counts()}")
    
    # Separate out_group_human and ai_generated data
    out_group_data = df[df['source'] == 'out_group_human'].copy()
    ai_generated_data = df[df['source'] == 'ai_generated'].copy()
    
    print(f"\nOut-group human records: {len(out_group_data)}")
    print(f"AI-generated records: {len(ai_generated_data)}")
    
    # Check if we have data for both groups
    if len(out_group_data) == 0 or len(ai_generated_data) == 0:
        print("Error: Missing data for one or both groups")
        return
    
    # Define the five measures exactly as in the original paper
    measures = [
        'Directness', 
        'Additional_Information', 
        'Informational_Support', 
        'Emotional_Support', 
        'Helpfulness'
    ]
    
    # Binary measures (0-1 scale)
    binary_measures = ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support']
    
    results = {}
    table3_rows = []
    
    # Process each measure
    for measure in measures:
        print(f"\nProcessing {measure}...")
        
        # Column names for R1 and R2 (both researchers)
        r1_col = f'Researcher_R1_{measure}'
        r2_col = f'Researcher_R2_{measure}'
        
        # Check if columns exist
        if r1_col not in df.columns or r2_col not in df.columns:
            print(f"Warning: Missing columns for {measure}")
            continue
        
        # Process Out-group Human data
        out_r1 = pd.to_numeric(out_group_data[r1_col], errors='coerce')
        out_r2 = pd.to_numeric(out_group_data[r2_col], errors='coerce')
        # Average of two researchers for each response
        out_avg = (out_r1 + out_r2) / 2
        out_clean = out_avg.dropna()
        
        # Process AI-generated data
        ai_r1 = pd.to_numeric(ai_generated_data[r1_col], errors='coerce')
        ai_r2 = pd.to_numeric(ai_generated_data[r2_col], errors='coerce')
        # Average of two researchers for each response
        ai_avg = (ai_r1 + ai_r2) / 2
        ai_clean = ai_avg.dropna()
        
        print(f"  Out-group valid samples: {len(out_clean)}")
        print(f"  AI-generated valid samples: {len(ai_clean)}")
        
        # Skip if insufficient data
        if len(out_clean) < 2 or len(ai_clean) < 2:
            print(f"  Skipping {measure} due to insufficient data")
            continue
        
        # Mann-Whitney U test (two-tailed)
        try:
            statistic, p_value = mannwhitneyu(ai_clean, out_clean, alternative='two-sided')
            
            # Calculate z-score approximation for Mann-Whitney U
            n1, n2 = len(ai_clean), len(out_clean)
            mean_u = n1 * n2 / 2
            std_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
            z_score = (statistic - mean_u) / std_u
            
        except Exception as e:
            print(f"  Statistical test failed for {measure}: {e}")
            z_score, p_value = 0, 1
        
        # Store comprehensive results
        results[measure] = {
            'out_group': {
                'mean': out_clean.mean(),
                'median': out_clean.median(),
                'std': out_clean.std(),
                'min': out_clean.min(),
                'max': out_clean.max(),
                'n': len(out_clean)
            },
            'ai_generated': {
                'mean': ai_clean.mean(),
                'median': ai_clean.median(),
                'std': ai_clean.std(),
                'min': ai_clean.min(),
                'max': ai_clean.max(),
                'n': len(ai_clean)
            },
            'z_score': z_score,
            'p_value': p_value,
            'test_statistic': statistic
        }
        
        print(f"  Out-group: μ={out_clean.mean():.3f}, σ={out_clean.std():.3f}, median={out_clean.median():.3f}")
        print(f"  AI-generated: μ={ai_clean.mean():.3f}, σ={ai_clean.std():.3f}, median={ai_clean.median():.3f}")
        print(f"  Mann-Whitney U test: z={z_score:.3f}, p={p_value:.6f}")
        
        # Create Table 3 rows (following original paper format)
        # Row 1: Out-group Human
        table3_rows.append({
            'Measure': f"{measure} (z={z_score:.2f}, p={p_value:.6f})",
            'Answer_Source': 'Out-group Human',
            'Mean_μ': round(out_clean.mean(), 2),
            'Median': round(out_clean.median(), 2),
            'SD_σ': round(out_clean.std(), 2),
            'Min': round(out_clean.min(), 2),
            'Max': round(out_clean.max(), 2),
            'N': len(out_clean)
        })
        
        # Row 2: AI-generated
        table3_rows.append({
            'Measure': '',  # Empty for second row as in original
            'Answer_Source': 'AI-generated',
            'Mean_μ': round(ai_clean.mean(), 2),
            'Median': round(ai_clean.median(), 2),
            'SD_σ': round(ai_clean.std(), 2),
            'Min': round(ai_clean.min(), 2),
            'Max': round(ai_clean.max(), 2),
            'N': len(ai_clean)
        })
    
    # Create Table 3 DataFrame (exact format as original paper)
    if table3_rows:
        table3_df = pd.DataFrame(table3_rows)
        table3_df.to_csv(os.path.join(output_dir, 'Table3_Descriptive_Statistics.csv'), index=False)
        
        # Also create a formatted version for display
        table3_formatted = table3_df.copy()
        # Format the statistical test column for better readability
        for i in range(0, len(table3_formatted), 2):
            if i < len(table3_formatted):
                measure_name = table3_formatted.iloc[i]['Measure'].split(' (')[0]
                if measure_name in results:
                    z_val = results[measure_name]['z_score']
                    p_val = results[measure_name]['p_value']
                    significance = get_significance_stars(p_val)
                    table3_formatted.iloc[i, table3_formatted.columns.get_loc('Measure')] = f"{measure_name}{significance}"
        
        table3_formatted.to_csv(os.path.join(output_dir, 'Table3_Formatted.csv'), index=False)
        print(f"\nTable 3 created with {len(results)} valid measures")
    else:
        print("\nNo valid measures found - unable to create Table 3")
    
    # Calculate word count statistics (as mentioned in original paper)
    out_group_words = pd.to_numeric(out_group_data['word_count'], errors='coerce').dropna()
    ai_words = pd.to_numeric(ai_generated_data['word_count'], errors='coerce').dropna()
    
    if len(out_group_words) > 0 and len(ai_words) > 0:
        word_statistic, word_p_value = mannwhitneyu(ai_words, out_group_words, alternative='two-sided')
        n1, n2 = len(ai_words), len(out_group_words)
        mean_u = n1 * n2 / 2
        std_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
        word_z_score = (word_statistic - mean_u) / std_u
    else:
        word_z_score, word_p_value = 0, 1
    
    # Create comprehensive analysis report
    create_comprehensive_report(results, out_group_words, ai_words, word_z_score, word_p_value, output_dir)
    
    # Print summary to console (Table 3 style)
    print(f"\n" + "="*80)
    print("TABLE 3 SUMMARY: Out-group Human vs AI-generated Responses")
    print("="*80)
    print(f"{'Measure':<20} {'Source':<15} {'Mean':<8} {'Median':<8} {'SD':<8} {'Min':<8} {'Max':<8}")
    print("-"*80)
    
    for measure in measures:
        if measure in results:
            out_data = results[measure]['out_group']
            ai_data = results[measure]['ai_generated']
            p_val = results[measure]['p_value']
            z_val = results[measure]['z_score']
            significance = get_significance_stars(p_val)
            
            print(f"{measure + significance:<20} {'Out-group':<15} {out_data['mean']:<8.2f} {out_data['median']:<8.2f} {out_data['std']:<8.2f} {out_data['min']:<8.2f} {out_data['max']:<8.2f}")
            print(f"{'(z=' + str(round(z_val,2)) + ')':<20} {'AI-generated':<15} {ai_data['mean']:<8.2f} {ai_data['median']:<8.2f} {ai_data['std']:<8.2f} {ai_data['min']:<8.2f} {ai_data['max']:<8.2f}")
            print("")
    
    # Word count comparison
    print(f"{'Word Count':<20} {'Out-group':<15} {out_group_words.mean():<8.2f} {out_group_words.median():<8.2f} {out_group_words.std():<8.2f} {out_group_words.min():<8.0f} {out_group_words.max():<8.0f}")
    print(f"{'(z=' + str(round(word_z_score,2)) + ')':<20} {'AI-generated':<15} {ai_words.mean():<8.2f} {ai_words.median():<8.2f} {ai_words.std():<8.2f} {ai_words.min():<8.0f} {ai_words.max():<8.0f}")
    
    print(f"\nFiles created in {output_dir}:")
    print("1. Table3_Descriptive_Statistics.csv")
    print("2. Table3_Formatted.csv") 
    print("3. Comprehensive_Analysis_Report.txt")
    
    return results, table3_df

def get_significance_stars(p_value):
    """Convert p-value to significance stars"""
    if p_value < 0.001:
        return "***"
    elif p_value < 0.01:
        return "**"
    elif p_value < 0.05:
        return "*"
    else:
        return ""

def create_comprehensive_report(results, out_group_words, ai_words, word_z_score, word_p_value, output_dir):
    """Create a comprehensive analysis report following the original paper's methodology"""
    
    with open(os.path.join(output_dir, 'Comprehensive_Analysis_Report.txt'), 'w', encoding='utf-8') as f:
        f.write("COMPREHENSIVE RATING ANALYSIS REPORT\n")
        f.write("Out-group Human vs AI-generated Responses\n")
        f.write("Following Original Paper Methodology (Table 3 Format)\n")
        f.write("=" * 80 + "\n\n")
        
        f.write("METHODOLOGY:\n")
        f.write("-" * 40 + "\n")
        f.write("• Two researchers (R1, R2) rated all responses\n")
        f.write("• For each response: average score = (R1_score + R2_score) / 2\n")
        f.write("• Binary measures (0-1): Directness, Additional_Information, Informational_Support, Emotional_Support\n")
        f.write("• Likert scale (1-5): Helpfulness\n")
        f.write("• Statistical test: Mann-Whitney U (two-tailed)\n")
        f.write("• Significance levels: * p<0.05, ** p<0.01, *** p<0.001\n\n")
        
        f.write("SAMPLE SIZES:\n")
        f.write("-" * 40 + "\n")
        if results:
            first_measure = list(results.keys())[0]
            f.write(f"Out-group human responses: {results[first_measure]['out_group']['n']}\n")
            f.write(f"AI-generated responses: {results[first_measure]['ai_generated']['n']}\n\n")
        
        f.write("WORD COUNT ANALYSIS:\n")
        f.write("-" * 40 + "\n")
        f.write(f"Out-group human: μ={out_group_words.mean():.2f} words, σ={out_group_words.std():.2f}\n")
        f.write(f"AI-generated: μ={ai_words.mean():.2f} words, σ={ai_words.std():.2f}\n")
        f.write(f"Mann-Whitney U test: z={word_z_score:.3f}, p={word_p_value:.6f}{get_significance_stars(word_p_value)}\n\n")
        
        f.write("DETAILED RESULTS BY MEASURE:\n")
        f.write("=" * 80 + "\n")
        
        measures = ['Directness', 'Additional_Information', 'Informational_Support', 'Emotional_Support', 'Helpfulness']
        
        for measure in measures:
            if measure not in results:
                f.write(f"\n{measure}: INSUFFICIENT DATA\n")
                continue
                
            out_data = results[measure]['out_group']
            ai_data = results[measure]['ai_generated']
            z_score = results[measure]['z_score']
            p_value = results[measure]['p_value']
            significance = get_significance_stars(p_value)
            
            f.write(f"\n{measure.upper()}{significance}:\n")
            f.write("-" * 50 + "\n")
            f.write(f"Out-group Human:  μ={out_data['mean']:.3f}, median={out_data['median']:.3f}, σ={out_data['std']:.3f}\n")
            f.write(f"                  min={out_data['min']:.3f}, max={out_data['max']:.3f}, n={out_data['n']}\n")
            f.write(f"AI-generated:     μ={ai_data['mean']:.3f}, median={ai_data['median']:.3f}, σ={ai_data['std']:.3f}\n")
            f.write(f"                  min={ai_data['min']:.3f}, max={ai_data['max']:.3f}, n={ai_data['n']}\n")
            f.write(f"Statistical Test: Mann-Whitney U = {results[measure]['test_statistic']:.1f}\n")
            f.write(f"                  z = {z_score:.3f}, p = {p_value:.6f}\n")
            
            # Effect size interpretation
            if abs(z_score) >= 2.58:
                effect = "Large effect (|z| ≥ 2.58)"
            elif abs(z_score) >= 1.96:
                effect = "Medium effect (|z| ≥ 1.96)"
            elif abs(z_score) >= 1.28:
                effect = "Small effect (|z| ≥ 1.28)"
            else:
                effect = "Negligible effect (|z| < 1.28)"
            
            f.write(f"Effect Size:      {effect}\n")
            
            # Interpretation
            if p_value < 0.001:
                f.write(f"Interpretation:   Highly significant difference (p < 0.001)\n")
            elif p_value < 0.01:
                f.write(f"Interpretation:   Very significant difference (p < 0.01)\n")
            elif p_value < 0.05:
                f.write(f"Interpretation:   Significant difference (p < 0.05)\n")
            else:
                f.write(f"Interpretation:   No significant difference (p ≥ 0.05)\n")
            
            # Direction of difference
            diff = ai_data['mean'] - out_data['mean']
            if abs(diff) < 0.01:
                f.write(f"Direction:        No meaningful difference\n")
            elif diff > 0:
                f.write(f"Direction:        AI-generated scores higher (+{diff:.3f})\n")
            else:
                f.write(f"Direction:        Out-group human scores higher ({diff:.3f})\n")
        
        f.write(f"\nCONCLUSIONS:\n")
        f.write("=" * 40 + "\n")
        f.write("This analysis compares the quality ratings between:\n")
        f.write("• Out-group Human responses (original Amazon MTurk crowd workers)\n")
        f.write("• AI-generated responses (enhanced LLM simulation system)\n\n")
        f.write("Key findings indicate the relative performance of AI simulation\n")
        f.write("compared to authentic human out-group responses across all\n")
        f.write("five quality dimensions used in the original research.\n")

# Main execution function
if __name__ == "__main__":
    # File paths - update these to match your actual file locations
    #  enhanced LLM rating update
    input_file = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\rating_random\enhanced_diverse_llm_rating_results_20250715_121059.csv"
    output_directory = r"D:\Wisconsin_Madison\2025summer\Dr.Arriaga\AutismTE\TE_autism\rating_results\final_rating_result\rating_random"
    
    # Run the analysis
    print("Starting Table 3 analysis following original paper methodology...")
    results, table3_df = analyze_rating_data_table3_format(input_file, output_directory)
    print("\nAnalysis completed successfully!")

Starting Table 3 analysis following original paper methodology...
Total records: 200
Columns: ['pair_id', 'question', 'answer', 'response_id', 'source', 'word_count', 'Researcher_R1_Directness', 'Researcher_R1_Additional_Information', 'Researcher_R1_Informational_Support', 'Researcher_R1_Emotional_Support', 'Researcher_R1_Helpfulness', 'Researcher_R2_Directness', 'Researcher_R2_Additional_Information', 'Researcher_R2_Informational_Support', 'Researcher_R2_Emotional_Support', 'Researcher_R2_Helpfulness', 'Individual_A1_Helpfulness', 'Individual_A2_Helpfulness', 'Individual_A3_Helpfulness', 'Individual_A4_Helpfulness', 'Individual_A5_Helpfulness', 'Individual_A6_Helpfulness', 'Expert_E1_Helpfulness', 'Expert_E1_Evaluated', 'Expert_E2_Helpfulness', 'Expert_E2_Evaluated', 'Expert_E3_Helpfulness', 'Expert_E3_Evaluated', 'Expert_E4_Helpfulness', 'Expert_E4_Evaluated', 'Expert_E5_Helpfulness', 'Expert_E5_Evaluated', 'Expert_E6_Helpfulness', 'Expert_E6_Evaluated', 'Expert_E7_Helpfulness', 'Exp