In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style with the "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the dataset before running the analysis cells, for example:
# df = pd.read_csv('your_data_file.csv')  # replace with the path to your Qualtrics data

# Nothing is loaded in this cell; it only reminds the user which columns
# the analysis functions expect to find in `df`.
print("\n".join([
    "请确保已加载您的数据集到变量 'df' 中",
    "数据集应包含以下关键变量：",
    "- order: 排列顺序变量",
    "- pref_*: 政策偏好问题",
    "- base_*: 基线问题",
    "- err_*: 错误阈值问题",
    "- 各种时间变量: *_First Click, *_Last Click, *_Page Submit, *_Click Count",
]))

In [None]:
# Data Preparation and Variable Organization
def prepare_data(df):
    """Copy the survey frame and catalogue its question and timer columns.

    Args:
        df: Survey responses, one row per participant. Column names must be
            strings.

    Returns:
        A ``(data, organized_data)`` tuple. ``data`` is a copy of ``df``.
        ``organized_data`` maps:

        * ``policy_questions`` / ``baseline_questions`` / ``error_questions``
          to the columns of ``data`` starting with ``pref_`` / ``base_`` /
          ``err_`` whose names contain none of the timer suffixes;
        * ``policy_time_vars`` / ``baseline_time_vars`` / ``error_time_vars``
          to the timer column names built from the fixed theme lists below
          (these names are generated, not checked against ``data``);
        * ``all_time_vars`` to the concatenation of the three timer lists.
    """
    print("开始数据预处理...")

    # Work on a copy so the caller's frame is never modified.
    data = df.copy()

    timer_suffixes = ('First Click', 'Last Click', 'Page Submit', 'Click Count')

    def answer_columns(prefix):
        # Columns with this prefix that are answers rather than timer readings.
        return [
            name for name in data.columns
            if name.startswith(prefix)
            and all(suffix not in name for suffix in timer_suffixes)
        ]

    def timer_columns(stem):
        # The four timer column names recorded for one timed page.
        return [f'{stem}_{suffix}' for suffix in timer_suffixes]

    # The policy block has a single timer for the whole page.
    policy_time_vars = timer_columns('pref')

    # Baseline block: one timer set per theme (nine themes).
    baseline_time_vars = []
    for theme in ('disease', 'armed', 'conv', 'welfare', 'immi', 'vote', 'air', 'firearm', 'auto'):
        baseline_time_vars += timer_columns(f'base_{theme}')

    # Error-threshold block: one timer set per theme (eight themes, no 'firearm').
    error_time_vars = []
    for theme in ('disease', 'armed', 'conv', 'welfare', 'immi', 'vote', 'air', 'auto'):
        error_time_vars += timer_columns(f'err_{theme}')

    organized_data = {
        'policy_questions': answer_columns('pref_'),
        'baseline_questions': answer_columns('base_'),
        'error_questions': answer_columns('err_'),
        'policy_time_vars': policy_time_vars,
        'baseline_time_vars': baseline_time_vars,
        'error_time_vars': error_time_vars,
        'all_time_vars': policy_time_vars + baseline_time_vars + error_time_vars,
    }

    print(f"政策问题数量: {len(organized_data['policy_questions'])}")
    print(f"基线问题数量: {len(organized_data['baseline_questions'])}")
    print(f"错误阈值问题数量: {len(organized_data['error_questions'])}")
    print(f"总时间变量数量: {len(organized_data['all_time_vars'])}")

    return data, organized_data

# Apply data preparation (left commented out until `df` has been loaded):
# data, organized_data = prepare_data(df)
# Tell the user that the function is defined and should be run after loading data.
print("数据预处理函数已定义，请加载数据后运行")

In [None]:
# Order Validation Function
def validate_participant_orders(data):
    """Check that the ``order`` column only holds recognised order codes.

    Prints how many participants fall under each recognised code
    (``PBT``, ``PTB``, ``TBP``, ``BTP``).

    Args:
        data: Survey frame with one row per participant.

    Returns:
        ``True`` when the ``order`` column exists and every value is one of
        the recognised codes; ``False`` when the column is missing or any
        other value (including a missing value) is present.
    """
    print("验证参与者答题顺序...")

    recognised_codes = ['PBT', 'PTB', 'TBP', 'BTP']

    # Without the column there is nothing to validate.
    if 'order' not in data.columns:
        print("错误: 未找到 'order' 变量")
        return False

    orders = data['order']

    # Report the distribution over the recognised codes.
    tally = orders.value_counts()
    print("排列顺序分布:")
    for code in recognised_codes:
        print(f"  {code}: {tally.get(code, 0)} 参与者")

    # Anything outside the recognised set makes the data invalid.
    unexpected = orders[~orders.isin(recognised_codes)].unique()
    if len(unexpected):
        print(f"警告: 发现无效的排列顺序: {unexpected}")
        return False

    print("所有参与者的排列顺序都有效")
    return True

# Function to extract question sequence for each participant
def get_question_sequence(participant_data, order):
    """Translate an order code into the list of question-block names.

    Each letter of the code names one block: ``P`` is Policy, ``B`` is
    Baseline and ``T`` is Error.

    Args:
        participant_data: Not used by this function.
        order: Order code; ``PBT``, ``PTB``, ``TBP`` and ``BTP`` are recognised.

    Returns:
        The block names in the order given by the code, or an empty list
        for any unrecognised code.
    """
    block_names = {'P': 'Policy', 'B': 'Baseline', 'T': 'Error'}
    known_sequences = {
        code: [block_names[letter] for letter in code]
        for code in ('PBT', 'PTB', 'TBP', 'BTP')
    }
    return known_sequences.get(order, [])

In [None]:
# Response Time Analysis Functions
def _timing_fields(row, prefix):
    """Collect the four timer readings stored under ``prefix`` for one row.

    Returns ``None`` when the ``<prefix>_Page Submit`` column is absent from
    the row or its value is missing; otherwise a dict with ``response_time``,
    ``first_click``, ``last_click`` and ``click_count``. Timer columns other
    than ``Page Submit`` that are absent are reported as ``None``.
    """
    submit = row.get(f'{prefix}_Page Submit')
    if pd.isna(submit):
        return None
    return {
        'response_time': submit,
        'first_click': row.get(f'{prefix}_First Click'),
        'last_click': row.get(f'{prefix}_Last Click'),
        'click_count': row.get(f'{prefix}_Click Count'),
    }


def calculate_response_times(data, organized_data):
    """Reshape the wide timer columns into one record per timed page.

    For every participant row, one record is produced for the policy page
    (``pref_*`` timers) and one per baseline (``base_<theme>_*``) and
    error-threshold (``err_<theme>_*``) theme, but only where the page's
    ``Page Submit`` value is present. Timer columns that do not exist in
    ``data`` are treated as missing instead of raising ``KeyError``.

    Args:
        data: Survey frame with an ``order`` column and the timer columns.
        organized_data: Accepted for interface compatibility; not used.

    Returns:
        DataFrame with the columns ``participant_id`` (the row index of
        ``data``), ``question_type``, ``question_sequence`` (position 1-3 of
        the block for the participant's order code; unrecognised codes get 3),
        ``response_time``, ``first_click``, ``last_click``, ``click_count``,
        ``order`` and, for themed pages, ``theme`` and ``theme_order`` (the
        1-based index of the theme in the fixed theme list, not a measured
        presentation order). When no record is produced the frame is empty
        but still carries all of these columns.

    Raises:
        KeyError: If ``data`` has rows but no ``order`` column.
    """
    print("计算响应时间...")

    # Nine baseline themes; the error-threshold block has no 'firearm' theme.
    baseline_themes = ('disease', 'armed', 'conv', 'welfare', 'immi', 'vote', 'air', 'firearm', 'auto')
    error_themes = ('disease', 'armed', 'conv', 'welfare', 'immi', 'vote', 'air', 'auto')

    records = []

    for participant_id, row in data.iterrows():
        order = row['order']

        # Position of each block within the survey for this order code.
        policy_pos = 1 if order in ('PBT', 'PTB') else 3
        baseline_pos = 1 if order == 'BTP' else (2 if order in ('PBT', 'TBP') else 3)
        error_pos = 1 if order == 'TBP' else (2 if order in ('PTB', 'BTP') else 3)

        # Policy questions share a single page timer.
        timing = _timing_fields(row, 'pref')
        if timing is not None:
            records.append({
                'participant_id': participant_id,
                'question_type': 'Policy',
                'question_sequence': policy_pos,
                **timing,
                'order': order,
            })

        # Baseline and error-threshold pages are timed individually per theme.
        themed_blocks = (
            ('Baseline', 'base', baseline_themes, baseline_pos),
            ('Error', 'err', error_themes, error_pos),
        )
        for question_type, prefix, themes, position in themed_blocks:
            for theme_order, theme in enumerate(themes, start=1):
                timing = _timing_fields(row, f'{prefix}_{theme}')
                if timing is None:
                    continue
                records.append({
                    'participant_id': participant_id,
                    'question_type': question_type,
                    'question_sequence': position,
                    'theme': theme,
                    'theme_order': theme_order,
                    **timing,
                    'order': order,
                })

    if records:
        response_df = pd.DataFrame(records)
    else:
        # Keep the schema so downstream groupby/filter calls do not fail
        # with KeyError on an empty result.
        response_df = pd.DataFrame(columns=[
            'participant_id', 'question_type', 'question_sequence',
            'theme', 'theme_order', 'response_time', 'first_click',
            'last_click', 'click_count', 'order',
        ])
    print(f"生成了 {len(response_df)} 条响应时间记录")

    return response_df

def analyze_response_patterns(response_df):
    """Print and return response-time summaries.

    Args:
        response_df: Long-format timing frame with ``question_type``,
            ``question_sequence`` and ``response_time`` columns.

    Returns:
        ``(summary_stats, sequence_stats)``: the first holds count, mean,
        median, std, min and max of ``response_time`` per question type; the
        second holds count, mean and std per (question type, sequence
        position). Both are rounded to two decimals.
    """
    print("分析响应模式...")

    # Overall distribution of response time within each question type.
    per_type = response_df.groupby('question_type')['response_time']
    summary_stats = per_type.agg(['count', 'mean', 'median', 'std', 'min', 'max']).round(2)

    print("响应时间摘要统计:")
    print(summary_stats)

    # Same measure, split further by where the block sat in the survey.
    per_position = response_df.groupby(['question_type', 'question_sequence'])['response_time']
    sequence_stats = per_position.agg(['count', 'mean', 'std']).round(2)

    print("\n按序列位置的响应时间:")
    print(sequence_stats)

    return summary_stats, sequence_stats

In [None]:
# Attention Check Functions
def perform_attention_checks(response_df):
    """Flag timing records and participants that look inattentive.

    Five checks are run, and their findings are appended in this order:

    1. ``Too Fast Response``: response time below 30 s (Policy) or 5 s
       (Baseline/Error); severity ``High`` below half the threshold.
    2. ``Too Slow Response``: response time above 1800 s (Policy) or 300 s
       (Baseline/Error); severity ``High`` above twice the threshold.
    3. ``High Response Time Variability``: participants with at least five
       records whose response-time std exceeds half their mean.
       NOTE(review): this pools all question types, and the Policy timer
       covers a whole page while the others cover single questions, so the
       check may fire for most participants — confirm this is intended.
    4. ``Very Low Click Count``: click count below 2; severity ``High`` at 0.
    5. ``Missing Response``: records whose response time is missing.

    Args:
        response_df: Long-format timing frame with ``participant_id``,
            ``question_type``, ``response_time`` and ``click_count`` columns.

    Returns:
        DataFrame with one row per finding. Every row has
        ``participant_id``, ``issue_type``, ``question_type``,
        ``response_time`` and ``severity``; ``threshold``, ``variability``
        and ``click_count`` are filled only by the checks that set them.
        An empty DataFrame is returned when there are no findings or when
        ``response_df`` itself is empty.
    """
    print("执行注意力检查...")

    # An empty frame (possibly without any columns) has nothing to check;
    # bail out before the column lookups below can raise KeyError.
    if response_df.empty:
        return pd.DataFrame()

    attention_issues = []

    # Plausible duration window in seconds. The policy page holds a whole
    # set of questions, so it gets a much wider window than a single
    # baseline/error question.
    min_reasonable_time = {'Policy': 30, 'Baseline': 5, 'Error': 5}
    max_reasonable_time = {'Policy': 1800, 'Baseline': 300, 'Error': 300}

    # 1. Extremely fast responses (potential random clicking).
    for question_type, threshold in min_reasonable_time.items():
        subset = response_df[response_df['question_type'] == question_type]
        too_fast = subset[subset['response_time'] < threshold]
        for _, row in too_fast.iterrows():
            attention_issues.append({
                'participant_id': row['participant_id'],
                'issue_type': 'Too Fast Response',
                'question_type': question_type,
                'response_time': row['response_time'],
                'threshold': threshold,
                'severity': 'High' if row['response_time'] < threshold / 2 else 'Medium'
            })

    # 2. Extremely slow responses (potential inattention).
    for question_type, threshold in max_reasonable_time.items():
        subset = response_df[response_df['question_type'] == question_type]
        too_slow = subset[subset['response_time'] > threshold]
        for _, row in too_slow.iterrows():
            attention_issues.append({
                'participant_id': row['participant_id'],
                'issue_type': 'Too Slow Response',
                'question_type': question_type,
                'response_time': row['response_time'],
                'threshold': threshold,
                'severity': 'High' if row['response_time'] > threshold * 2 else 'Medium'
            })

    # 3. Inconsistent response times within a participant.
    participant_stats = response_df.groupby('participant_id').agg({
        'response_time': ['mean', 'std', 'count'],
        'click_count': 'mean'
    }).round(2)

    # Flatten the two-level column index into e.g. 'response_time_mean'.
    participant_stats.columns = ['_'.join(col).strip() for col in participant_stats.columns]

    high_variability = participant_stats[
        (participant_stats['response_time_std'] > participant_stats['response_time_mean'] * 0.5) &
        (participant_stats['response_time_count'] >= 5)  # need enough records to judge spread
    ]

    for participant_id in high_variability.index:
        attention_issues.append({
            'participant_id': participant_id,
            'issue_type': 'High Response Time Variability',
            'question_type': 'All',
            'response_time': participant_stats.loc[participant_id, 'response_time_mean'],
            'variability': participant_stats.loc[participant_id, 'response_time_std'],
            'severity': 'Medium'
        })

    # 4. Very low click counts.
    low_clicks = response_df[response_df['click_count'] < 2]
    for _, row in low_clicks.iterrows():
        attention_issues.append({
            'participant_id': row['participant_id'],
            'issue_type': 'Very Low Click Count',
            'question_type': row['question_type'],
            'response_time': row['response_time'],
            'click_count': row['click_count'],
            'severity': 'High' if row['click_count'] == 0 else 'Medium'
        })

    # 5. Records without a response time.
    missing_responses = response_df[response_df['response_time'].isna()]
    for _, row in missing_responses.iterrows():
        attention_issues.append({
            'participant_id': row['participant_id'],
            'issue_type': 'Missing Response',
            'question_type': row['question_type'],
            'response_time': None,
            'severity': 'High'
        })

    return pd.DataFrame(attention_issues)

def identify_problematic_participants(attention_issues_df):
    """Count findings per participant and assign each a category.

    Categories are applied in sequence, later rules overriding earlier ones:
    ``Moderate Issues`` (3+ findings), ``High Issues`` (2+ high-severity
    findings), ``Severe Issues`` (5+ findings). Everyone else stays
    ``Normal``.

    Args:
        attention_issues_df: One row per finding with ``participant_id``,
            ``issue_type`` and ``severity`` columns.

    Returns:
        DataFrame indexed by ``participant_id`` with ``total_issues``,
        ``high_severity_issues`` and ``category``. It contains every
        participant that has at least one finding, including those
        categorised ``Normal``. An empty DataFrame is returned when there
        are no findings.
    """
    print("识别有问题的参与者...")

    if not len(attention_issues_df):
        print("未发现注意力问题")
        return pd.DataFrame()

    # Per-participant tallies: all findings, and the high-severity ones.
    by_participant = attention_issues_df.groupby('participant_id')
    issue_counts = pd.DataFrame({
        'total_issues': by_participant['issue_type'].count(),
        'high_severity_issues': by_participant['severity'].agg(
            lambda severities: (severities == 'High').sum()
        ),
    })

    # Apply the rules in order so that later labels win on overlap.
    issue_counts['category'] = 'Normal'
    category_rules = [
        (issue_counts['total_issues'] >= 3, 'Moderate Issues'),
        (issue_counts['high_severity_issues'] >= 2, 'High Issues'),
        (issue_counts['total_issues'] >= 5, 'Severe Issues'),
    ]
    for matches, label in category_rules:
        issue_counts.loc[matches, 'category'] = label

    # Only the non-Normal participants are reported in the printout.
    flagged = issue_counts[issue_counts['category'] != 'Normal']

    print(f"发现 {len(flagged)} 个有问题的参与者:")
    print(flagged['category'].value_counts())

    return issue_counts

In [None]:
# Visualization Functions
def create_comprehensive_visualizations(response_df, attention_issues_df, issue_counts):
    """
    Draw a 4x3 grid of overview plots and display it.

    Panels, in grid order: (1) response-time histograms per question type,
    (2) mean response time by question type and sequence position,
    (3) click-count histogram, (4) response time vs click count,
    (5) response-time traces for the first 20 participants,
    (6) pie chart of issue types, (7) participant categories,
    (8) mean response time by order and question type,
    (9) Baseline vs Error box plot, (10) click-count box plot per question
    type, (11) heatmap of mean response time for the first 30 participants,
    (12) issue severity counts.

    Args:
        response_df: Long-format timing frame; the panels read its
            ``question_type``, ``question_sequence``, ``response_time``,
            ``click_count``, ``participant_id`` and ``order`` columns.
        attention_issues_df: One row per finding (``issue_type``,
            ``severity``); may be empty, in which case panels 6 and 12 show
            a placeholder text.
        issue_counts: Per-participant frame with a ``category`` column; may
            be empty, in which case panel 7 shows a placeholder text.

    Returns:
        The matplotlib figure, after ``plt.show()`` has been called.
    """
    print("创建可视化图表...")
    
    # One large figure; each plt.subplot call below selects the next cell of
    # the 4x3 grid, so the statement order determines the layout.
    fig = plt.figure(figsize=(20, 24))
    
    # 1. Response Time Distribution by Question Type (overlaid histograms, log-scaled counts)
    plt.subplot(4, 3, 1)
    for question_type in response_df['question_type'].unique():
        subset = response_df[response_df['question_type'] == question_type]
        plt.hist(subset['response_time'], alpha=0.7, label=question_type, bins=30)
    plt.xlabel('Response Time (seconds)')
    plt.ylabel('Frequency')
    plt.title('Response Time Distribution by Question Type')
    plt.legend()
    plt.yscale('log')
    
    # 2. Response Time by Sequence Position (one bar group per question type, one bar per position)
    plt.subplot(4, 3, 2)
    sequence_data = response_df.groupby(['question_type', 'question_sequence'])['response_time'].mean().unstack()
    sequence_data.plot(kind='bar', ax=plt.gca())
    plt.xlabel('Question Type')
    plt.ylabel('Average Response Time (seconds)')
    plt.title('Average Response Time by Sequence Position')
    plt.xticks(rotation=45)
    plt.legend(title='Sequence Position')
    
    # 3. Click Count Distribution (all question types pooled)
    plt.subplot(4, 3, 3)
    plt.hist(response_df['click_count'], bins=20, alpha=0.7, color='skyblue')
    plt.xlabel('Click Count')
    plt.ylabel('Frequency')
    plt.title('Click Count Distribution')
    
    # 4. Response Time vs Click Count Scatter (response time on a log axis)
    plt.subplot(4, 3, 4)
    for question_type in response_df['question_type'].unique():
        subset = response_df[response_df['question_type'] == question_type]
        plt.scatter(subset['click_count'], subset['response_time'], 
                   alpha=0.6, label=question_type, s=20)
    plt.xlabel('Click Count')
    plt.ylabel('Response Time (seconds)')
    plt.title('Response Time vs Click Count')
    plt.legend()
    plt.yscale('log')
    
    # 5. Individual Participant Response Patterns
    plt.subplot(4, 3, 5)
    participant_sample = response_df['participant_id'].unique()[:20]  # first 20 participants only
    for pid in participant_sample:
        participant_data = response_df[response_df['participant_id'] == pid]
        # NOTE(review): the Series is plotted against its own index, i.e. the
        # row labels of response_df, not a per-participant question number.
        plt.plot(participant_data['response_time'], alpha=0.7, linewidth=1)
    plt.xlabel('Question Index')
    plt.ylabel('Response Time (seconds)')
    plt.title('Individual Participant Response Patterns (Sample)')
    plt.yscale('log')
    
    # 6. Attention Issues by Type (pie chart, or placeholder text when there are none)
    plt.subplot(4, 3, 6)
    if len(attention_issues_df) > 0:
        issue_type_counts = attention_issues_df['issue_type'].value_counts()
        plt.pie(issue_type_counts.values, labels=issue_type_counts.index, autopct='%1.1f%%')
        plt.title('Distribution of Attention Issues')
    else:
        plt.text(0.5, 0.5, 'No attention issues found', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Distribution of Attention Issues')
    
    # 7. Problematic Participants Overview (green = Normal, orange = Moderate, red = everything else)
    plt.subplot(4, 3, 7)
    if len(issue_counts) > 0:
        category_counts = issue_counts['category'].value_counts()
        colors = ['green' if cat == 'Normal' else 'orange' if 'Moderate' in cat else 'red' for cat in category_counts.index]
        plt.bar(category_counts.index, category_counts.values, color=colors)
        plt.xlabel('Participant Category')
        plt.ylabel('Number of Participants')
        plt.title('Participant Categories by Issue Severity')
        plt.xticks(rotation=45)
    else:
        plt.text(0.5, 0.5, 'No problematic participants', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Participant Categories by Issue Severity')
    
    # 8. Response Time Trends by Order (one bar group per order code)
    plt.subplot(4, 3, 8)
    order_trends = response_df.groupby(['order', 'question_type'])['response_time'].mean().unstack()
    order_trends.plot(kind='bar', ax=plt.gca())
    plt.xlabel('Order')
    plt.ylabel('Average Response Time (seconds)')
    plt.title('Response Time by Order and Question Type')
    plt.xticks(rotation=45)
    plt.legend(title='Question Type')
    
    # 9. Baseline vs Error Question Comparison (left blank unless both types have data)
    plt.subplot(4, 3, 9)
    baseline_data = response_df[response_df['question_type'] == 'Baseline']
    error_data = response_df[response_df['question_type'] == 'Error']
    
    if len(baseline_data) > 0 and len(error_data) > 0:
        plt.boxplot([baseline_data['response_time'], error_data['response_time']], 
                   labels=['Baseline', 'Error'])
        plt.ylabel('Response Time (seconds)')
        plt.title('Baseline vs Error Questions Response Time')
        plt.yscale('log')
    
    # 10. Click Count by Question Type (one box per question type)
    plt.subplot(4, 3, 10)
    click_data = response_df.groupby('question_type')['click_count'].apply(list)
    plt.boxplot(click_data.values, labels=click_data.index)
    plt.xlabel('Question Type')
    plt.ylabel('Click Count')
    plt.title('Click Count Distribution by Question Type')
    
    # 11. Response Time Heatmap by Participant and Question Type
    plt.subplot(4, 3, 11)
    if len(response_df) > 0:
        # Mean response time per participant (rows) and question type (columns)
        heatmap_data = response_df.pivot_table(
            values='response_time', 
            index='participant_id', 
            columns='question_type', 
            aggfunc='mean'
        )
        # Show only the first 30 participants for readability
        heatmap_subset = heatmap_data.head(30)
        sns.heatmap(heatmap_subset, cmap='YlOrRd', cbar_kws={'label': 'Response Time (seconds)'})
        plt.title('Response Time Heatmap (First 30 Participants)')
        plt.xlabel('Question Type')
        plt.ylabel('Participant ID')
    
    # 12. Issue Severity Distribution (bars coloured by severity; unknown labels are gray)
    plt.subplot(4, 3, 12)
    if len(attention_issues_df) > 0:
        severity_counts = attention_issues_df['severity'].value_counts()
        colors = {'High': 'red', 'Medium': 'orange', 'Low': 'yellow'}
        severity_colors = [colors.get(sev, 'gray') for sev in severity_counts.index]
        plt.bar(severity_counts.index, severity_counts.values, color=severity_colors)
        plt.xlabel('Issue Severity')
        plt.ylabel('Number of Issues')
        plt.title('Distribution of Issue Severity')
    else:
        plt.text(0.5, 0.5, 'No issues found', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Distribution of Issue Severity')
    
    plt.tight_layout()
    plt.show()
    
    return fig

def create_individual_participant_analysis(response_df, problematic_participants):
    """Plot per-participant response times for the participants with most issues.

    Up to ten participants with the largest ``total_issues`` each get one
    panel in a 2x5 grid. Within a panel, every timing record is a point
    whose x position is its response time (log scale) and whose y position
    is its question type (Policy, Baseline, Error on rows 0, 1, 2).

    Args:
        response_df: Long-format timing frame with ``participant_id``,
            ``question_type`` and ``response_time`` columns.
        problematic_participants: Frame indexed by participant id with
            ``total_issues`` and ``high_severity_issues`` columns. When it
            is empty, nothing is plotted.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if len(problematic_participants) == 0:
        print("没有发现有问题参与者，跳过个体分析")
        return

    print("创建个体参与者详细分析...")

    question_types = ['Policy', 'Baseline', 'Error']

    # nlargest already caps the selection at ten rows, one per panel.
    top_problematic = problematic_participants.nlargest(10, 'total_issues')

    fig, axes = plt.subplots(2, 5, figsize=(25, 10))
    axes = axes.flatten()

    for ax, (participant_id, row) in zip(axes, top_problematic.iterrows()):
        participant_data = response_df[response_df['participant_id'] == participant_id]

        # One horizontal row of points per question type.
        for level, question_type in enumerate(question_types):
            subset = participant_data[participant_data['question_type'] == question_type]
            if len(subset) > 0:
                ax.scatter(subset['response_time'], [level] * len(subset),
                           label=question_type, alpha=0.7, s=50)

        ax.set_xlabel('Response Time (seconds)')
        ax.set_ylabel('Question Type')
        ax.set_title(f'Participant {participant_id}\nIssues: {row["total_issues"]}, High: {row["high_severity_issues"]}')
        # The log scale belongs on the response-time axis. The y axis only
        # carries the category rows 0/1/2, and a log y axis cannot display
        # the row at 0.
        ax.set_xscale('log')
        ax.set_yticks([0, 1, 2])
        ax.set_yticklabels(question_types)
        ax.legend()

    # Hide the panels left over when fewer than ten participants are shown.
    for unused_ax in axes[len(top_problematic):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Main Analysis Pipeline
def run_complete_analysis(df):
    """Run every analysis step on the survey frame, in order.

    Steps: data preparation, order validation (a failed validation only
    prints a warning; the pipeline continues), response-time extraction,
    pattern summary, attention checks, participant categorisation, overview
    plots, per-participant plots and the printed summary report.

    Args:
        df: Raw survey frame, one row per participant.

    Returns:
        Dict with ``response_df``, ``attention_issues_df``, ``issue_counts``,
        ``summary_stats`` and ``sequence_stats``.
    """
    banner = "=" * 60

    def announce(step_number, title):
        # Header printed before each pipeline step.
        print(f"\n{step_number}. {title}")

    print(banner)
    print("开始完整的注意力检查分析")
    print(banner)

    announce(1, "数据预处理...")
    data, organized_data = prepare_data(df)

    announce(2, "验证答题顺序...")
    if not validate_participant_orders(data):
        print("警告: 发现答题顺序问题，请检查数据")

    announce(3, "计算响应时间...")
    response_df = calculate_response_times(data, organized_data)

    announce(4, "分析响应模式...")
    summary_stats, sequence_stats = analyze_response_patterns(response_df)

    announce(5, "执行注意力检查...")
    attention_issues_df = perform_attention_checks(response_df)

    announce(6, "识别有问题的参与者...")
    issue_counts = identify_problematic_participants(attention_issues_df)

    announce(7, "创建可视化图表...")
    create_comprehensive_visualizations(response_df, attention_issues_df, issue_counts)

    announce(8, "个体参与者分析...")
    create_individual_participant_analysis(response_df, issue_counts)

    announce(9, "生成摘要报告...")
    generate_summary_report(response_df, attention_issues_df, issue_counts)

    results = {}
    results['response_df'] = response_df
    results['attention_issues_df'] = attention_issues_df
    results['issue_counts'] = issue_counts
    results['summary_stats'] = summary_stats
    results['sequence_stats'] = sequence_stats
    return results

def generate_summary_report(response_df, attention_issues_df, issue_counts):
    """Print a text summary of the analysis results.

    Sections: basic counts, mean/median response time per question type,
    findings by type and severity, participant categories and the five
    participants with the most findings.

    Args:
        response_df: Long-format timing frame with ``participant_id``,
            ``question_type`` and ``response_time`` columns. It may have
            zero rows; the per-participant average is then reported as 0.0
            instead of raising ``ZeroDivisionError``.
        attention_issues_df: One row per finding (``participant_id``,
            ``issue_type``, ``severity``); may be empty.
        issue_counts: Per-participant frame with ``total_issues``,
            ``high_severity_issues`` and ``category``; may be empty.

    Returns:
        None. Everything is written with ``print``.
    """
    separator = "=" * 60

    print("\n" + separator)
    print("分析摘要报告")
    print(separator)

    # Basic statistics
    total_participants = response_df['participant_id'].nunique()
    total_responses = len(response_df)
    # Guard the average against an input without any participants.
    responses_per_participant = (
        total_responses / total_participants if total_participants else 0.0
    )

    print("\n基本统计:")
    print(f"  总参与者数量: {total_participants}")
    print(f"  总响应数量: {total_responses}")
    print(f"  平均每个参与者响应数: {responses_per_participant:.1f}")

    # Response time statistics
    print("\n响应时间统计:")
    for question_type in response_df['question_type'].unique():
        subset = response_df[response_df['question_type'] == question_type]
        avg_time = subset['response_time'].mean()
        median_time = subset['response_time'].median()
        print(f"  {question_type}: 平均 {avg_time:.1f}秒, 中位数 {median_time:.1f}秒")

    # Attention issues summary
    if len(attention_issues_df) > 0:
        print("\n注意力问题统计:")
        print(f"  总问题数量: {len(attention_issues_df)}")
        print(f"  涉及参与者: {attention_issues_df['participant_id'].nunique()}")

        print("  问题类型分布:")
        for issue_type, count in attention_issues_df['issue_type'].value_counts().items():
            print(f"    {issue_type}: {count}")

        print("  严重程度分布:")
        for severity, count in attention_issues_df['severity'].value_counts().items():
            print(f"    {severity}: {count}")
    else:
        print("\n注意力问题统计: 未发现注意力问题")

    # Problematic participants summary
    if len(issue_counts) > 0:
        print("\n有问题参与者统计:")
        for category, count in issue_counts['category'].value_counts().items():
            print(f"  {category}: {count} 参与者")

        # The five participants with the most findings
        most_problematic = issue_counts.nlargest(5, 'total_issues')
        print("\n问题最多的5个参与者:")
        for participant_id, row in most_problematic.iterrows():
            print(f"  参与者 {participant_id}: {row['total_issues']} 个问题, {row['high_severity_issues']} 个高严重性问题")
    else:
        print("\n有问题参与者统计: 未发现有问题参与者")

    print("\n" + separator)
    print("分析完成")
    print(separator)

# Example usage (uncomment when you have your data loaded)
# results = run_complete_analysis(df)

In [None]:
# Additional Analysis Functions for Specific Requirements

def compare_question_difficulty(response_df):
    """Report mean response times per question type and per theme.

    For the Baseline and Error question types, the mean response time of
    each theme is printed in ascending order (skipped for a type without
    records).

    Args:
        response_df: Long-format timing frame with ``question_type``,
            ``response_time`` and, for themed records, ``theme`` columns.

    Returns:
        Series of mean response time indexed by question type.
    """
    print("比较问题难度...")

    # Overall mean per question type; this is the return value.
    question_means = response_df.groupby('question_type')['response_time'].mean()

    # Themed question types get an additional per-theme breakdown.
    themed_sections = (
        ('Baseline', "\n基线问题主题平均响应时间:"),
        ('Error', "\n错误阈值问题主题平均响应时间:"),
    )
    for question_type, heading in themed_sections:
        records = response_df[response_df['question_type'] == question_type]
        if len(records) == 0:
            continue
        theme_means = records.groupby('theme')['response_time'].mean().sort_values()
        print(heading)
        for theme, mean_time in theme_means.items():
            print(f"  {theme}: {mean_time:.1f}秒")

    return question_means

def analyze_sequential_patterns(response_df):
    """Summarise response time by ``theme_order`` for themed questions.

    Args:
        response_df: Long-format timing frame with ``question_type``,
            ``theme_order`` and ``response_time`` columns.

    Returns:
        ``(baseline_sequence, error_sequence)``. Each element is a frame of
        mean, std and count of response time indexed by ``theme_order``, or
        ``None`` when that question type has no records.
    """
    print("分析序列模式...")

    def summarise(question_type, heading):
        # Per-position statistics for one question type; None without records.
        records = response_df[response_df['question_type'] == question_type]
        if len(records) == 0:
            return None
        by_position = records.groupby('theme_order')['response_time'].agg(['mean', 'std', 'count'])
        print(heading)
        for pos, row in by_position.iterrows():
            print(f"  位置 {pos}: 平均 {row['mean']:.1f}秒 (标准差: {row['std']:.1f}, 样本数: {row['count']})")
        return by_position

    baseline_sequence = summarise('Baseline', "\n基线问题序列位置分析:")
    error_sequence = summarise('Error', "\n错误阈值问题序列位置分析:")

    return baseline_sequence, error_sequence

def create_sequence_visualization(response_df):
    """Show three side-by-side plots of response time.

    Left: histogram of Policy response times with log-scaled counts.
    Middle and right: mean response time by ``theme_order`` for Baseline
    and Error questions. A panel is left empty when its question type has
    no records.

    Args:
        response_df: Long-format timing frame with ``question_type``,
            ``response_time`` and ``theme_order`` columns.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    policy_ax = axes[0]

    # Policy questions share a single timer, so only a distribution is shown.
    policy_times = response_df[response_df['question_type'] == 'Policy']
    if len(policy_times) > 0:
        policy_ax.hist(policy_times['response_time'], bins=20, alpha=0.7, color='blue')
        policy_ax.set_xlabel('Response Time (seconds)')
        policy_ax.set_ylabel('Frequency')
        policy_ax.set_title('Policy Questions Response Time')
        policy_ax.set_yscale('log')

    # Themed question types: mean response time at each position.
    position_panels = (
        (axes[1], 'Baseline', 'green', 'Baseline Questions: Response Time by Position'),
        (axes[2], 'Error', 'red', 'Error Threshold Questions: Response Time by Position'),
    )
    for ax, question_type, line_color, panel_title in position_panels:
        records = response_df[response_df['question_type'] == question_type]
        if len(records) == 0:
            continue
        mean_by_position = records.groupby('theme_order')['response_time'].mean()
        ax.plot(mean_by_position.index, mean_by_position.values, 'o-',
                color=line_color, linewidth=2, markersize=8)
        ax.set_xlabel('Question Sequence Position')
        ax.set_ylabel('Average Response Time (seconds)')
        ax.set_title(panel_title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Function to export results
def export_results(results, filename_prefix="attention_analysis"):
    """Write the analysis result frames to CSV files.

    Files are named ``<filename_prefix>_response_data.csv``,
    ``<filename_prefix>_attention_issues.csv`` and
    ``<filename_prefix>_participant_issues.csv``. The response data is
    always written; the other two only when their frame is non-empty. The
    participant file keeps its index, the other two drop it.

    Args:
        results: Dict with ``response_df``, ``attention_issues_df`` and
            ``issue_counts`` frames.
        filename_prefix: Prefix (optionally including a directory) shared
            by the output files.

    Returns:
        None.
    """
    print("导出分析结果...")

    # Response data is exported unconditionally.
    response_path = f"{filename_prefix}_response_data.csv"
    results['response_df'].to_csv(response_path, index=False)
    print(f"响应数据已导出到: {response_path}")

    # Findings are exported only when there are any.
    issues = results['attention_issues_df']
    if len(issues) > 0:
        issues_path = f"{filename_prefix}_attention_issues.csv"
        issues.to_csv(issues_path, index=False)
        print(f"注意力问题已导出到: {issues_path}")

    # Per-participant counts keep the index, which holds the participant id.
    counts = results['issue_counts']
    if len(counts) > 0:
        counts_path = f"{filename_prefix}_participant_issues.csv"
        counts.to_csv(counts_path, index=True)
        print(f"参与者问题统计已导出到: {counts_path}")

    print("所有结果已成功导出")

# Final notebook message: confirm the functions are defined and list the
# steps for running the analysis.
print("\n".join([
    "所有分析函数已定义完成！",
    "\n使用方法:",
    "1. 加载您的数据集到变量 'df'",
    "2. 运行: results = run_complete_analysis(df)",
    "3. 可选: export_results(results) 导出结果",
]))