# Air Quality Prediction Study Analysis

This notebook analyzes experimental data from the trust and uncertainty visualization study.
The study examines how different visualization conditions affect user trust, confidence, and decision-making across two phases.

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Circle
import matplotlib.patches as mpatches

# Set plotting style
plt.style.use('default')
sns.set_palette("Set2")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.alpha'] = 0.3

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

## 1. Data Loading and Preprocessing

In [2]:
# Load all CSV files from the data directory
data_dir = Path('./data')
csv_files = list(data_dir.glob('user_*.csv'))
print(f"Found {len(csv_files)} participant data files:")
for file in csv_files:
    print(f"  - {file.name}")

Found 5 participant data files:
  - user_6657381117794993965781_2025-12-08T17-16-29-651.csv
  - user_67425624930886458_2025-12-08T17-23-33-555.csv
  - user_9830_2025-12-08T04-16-47-915.csv
  - user_1765213036676_2025-12-08T16-57-16-674.csv
  - user_62199458482518274623771_2025-12-08T17-37-25-103.csv


In [3]:
# Function to load and clean individual participant data
def load_participant_data(file_path):
    """Load a single participant CSV file and clean the data"""
    try:
        df = pd.read_csv(file_path)
        
        # Extract participant ID from filename if not in data
        if 'participant_id' not in df.columns or df['participant_id'].isna().all():
            participant_id = file_path.stem.split('_')[1]  # Extract from filename
            df['participant_id'] = participant_id
        
        # Clean condition IDs and names
        if 'condition_id' in df.columns:
            df['condition_id'] = df['condition_id'].fillna('unknown')
        
        return df
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

# Load all participant data
all_data = []
for file_path in csv_files:
    participant_data = load_participant_data(file_path)
    if participant_data is not None:
        all_data.append(participant_data)
        print(f"Loaded data for participant: {participant_data['participant_id'].iloc[0] if not participant_data['participant_id'].isna().all() else 'unknown'}")

# Combine all participant data
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)
    print(f"\nCombined dataset shape: {combined_data.shape}")
else:
    print("No data loaded successfully")

Loaded data for participant: 66573c811d17794993965781
Loaded data for participant: 67aced4b25f62493088645b8
Loaded data for participant: nan
Loaded data for participant: test
Loaded data for participant: 62e199458482518274623771

Combined dataset shape: (92, 46)


In [4]:
# Examine the data structure
print("Column names:")
print(combined_data.columns.tolist())
print(f"\nDataset shape: {combined_data.shape}")
print(f"\nUnique trial types:")
print(combined_data['trial_type'].value_counts())

Column names:
['city_a_estimate', 'city_b_estimate', 'click_events', 'comprehension_ease', 'condition', 'condition_id', 'condition_name', 'confidence_label', 'confidence_rating', 'data_trust', 'display_format', 'end_time', 'hover_events', 'interaction_log', 'internal_node_id', 'participant_id', 'percent_score', 'phase', 'phase1_complete', 'phase2_complete', 'predictions_shown', 'probability_estimate', 'question_order', 'response', 'responses', 'round', 'rt', 'rt_total', 'skeptical_rating', 'start_time', 'stimulus', 'success', 'time_elapsed', 'time_on_viz', 'total_interactions', 'total_questions', 'total_score', 'travel_choice', 'trial_index', 'trial_type', 'trust_composite', 'usability_composite', 'usability_difficulty', 'view_history', 'visualization_literacy_score', 'visualization_shown']

Dataset shape: (92, 46)

Unique trial types:
trial_type
html-button-response    30
prediction-task         10
trust-survey            10
survey-text              9
fullscreen               8
vis-li

In [5]:
# Filter for relevant trial types (prediction tasks and surveys)
relevant_trials = combined_data[
    combined_data['trial_type'].isin([
        'prediction-task', 'vis-literacy', 'trust-survey', 
        'personality-survey', 'survey-text', 'survey-multi-choice'
    ])
].copy()

print(f"Filtered dataset shape: {relevant_trials.shape}")
print(f"\nTrial types in filtered data:")
print(relevant_trials['trial_type'].value_counts())

Filtered dataset shape: (44, 46)

Trial types in filtered data:
trial_type
prediction-task        10
trust-survey           10
survey-text             9
vis-literacy            5
personality-survey      5
survey-multi-choice     5
Name: count, dtype: int64


In [6]:
# Examine condition distribution
print("Unique conditions:")
condition_counts = relevant_trials['condition_id'].value_counts(dropna=False)
print(condition_counts)

print("\nCondition names:")
condition_names = relevant_trials[['condition_id', 'condition_name']].drop_duplicates().dropna()
for _, row in condition_names.iterrows():
    print(f"  {row['condition_id']}: {row['condition_name']}")

Unique conditions:
condition_id
condition_7_buggy         14
condition_9_combined      14
condition_5_pi_hover       7
condition_0_historical     5
unknown                    4
Name: count, dtype: int64

Condition names:
  condition_7_buggy: Buggy Control
  condition_0_historical: Historical Only
  condition_5_pi_hover: PI Plot + Hover
  condition_9_combined: Combined PI + Ensemble


In [7]:
# Separate Phase 1 and Phase 2 data
prediction_data = relevant_trials[relevant_trials['trial_type'] == 'prediction-task'].copy()

# Phase separation logic
phase1_data = prediction_data[prediction_data['phase'] == 1].copy()
phase2_data = prediction_data[prediction_data['phase'] == 2].copy()

print(f"Phase 1 data: {len(phase1_data)} rows")
print(f"Phase 2 data: {len(phase2_data)} rows")

# Get visualization literacy data
vis_literacy_data = relevant_trials[relevant_trials['trial_type'] == 'vis-literacy'].copy()
print(f"Visualization literacy data: {len(vis_literacy_data)} rows")

# Get trust survey data
trust_data = relevant_trials[relevant_trials['trial_type'] == 'trust-survey'].copy()
print(f"Trust survey data: {len(trust_data)} rows")

Phase 1 data: 5 rows
Phase 2 data: 5 rows
Visualization literacy data: 5 rows
Trust survey data: 10 rows


## 2. Basic Statistics Tables by Condition

Each table shows conditions as rows and response variables as columns, with participant response lists in each cell.

In [8]:
def create_condition_response_table(data, response_columns, title="Response Table"):
    """
    Create a table where each row is a condition and each column is a response variable.
    Each cell contains a list of participant responses.
    """
    # Get unique conditions
    conditions = sorted(data['condition_id'].dropna().unique())
    
    # Initialize results dictionary
    results = {}
    
    for condition in conditions:
        condition_data = data[data['condition_id'] == condition]
        condition_responses = {}
        
        for col in response_columns:
            if col in condition_data.columns:
                responses = condition_data[col].dropna().tolist()
                condition_responses[col] = responses
            else:
                condition_responses[col] = []
        
        results[condition] = condition_responses
    
    # Convert to DataFrame
    df = pd.DataFrame(results).T
    
    print(f"\n{title}")
    print("=" * len(title))
    return df

# Function to display response summary statistics
def display_response_summary(df, title="Summary"):
    """
    Display summary statistics for response lists in each cell
    """
    print(f"\n{title} - Response Counts and Basic Stats")
    print("-" * (len(title) + 30))
    
    for condition in df.index:
        print(f"\nCondition: {condition}")
        for col in df.columns:
            responses = df.loc[condition, col]
            if isinstance(responses, list) and responses:
                numeric_responses = [r for r in responses if isinstance(r, (int, float)) and not pd.isna(r)]
                if numeric_responses:
                    print(f"  {col}: n={len(numeric_responses)}, mean={np.mean(numeric_responses):.2f}, responses={numeric_responses}")
                else:
                    print(f"  {col}: n={len(responses)}, responses={responses[:5]}{'...' if len(responses) > 5 else ''}")
            else:
                print(f"  {col}: No responses")
    
    return df

### Phase 1 Responses (Baseline - No Visualization)

In [9]:
# Phase 1 response columns
phase1_columns = ['probability_estimate', 'confidence_rating', 'travel_choice']

# Create Phase 1 table
phase1_table = create_condition_response_table(
    phase1_data, 
    phase1_columns, 
    "Phase 1 Responses (No Visualization)"
)

# Display the table
display_response_summary(phase1_table, "Phase 1 Summary")


Phase 1 Responses (No Visualization)

Phase 1 Summary - Response Counts and Basic Stats
---------------------------------------------

Condition: condition_0_historical
  probability_estimate: n=5, mean=70.00, responses=[80.0, 88.0, 50.0, 50.0, 82.0]
  confidence_rating: n=5, mean=3.80, responses=[5.0, 4.0, 3.0, 2.0, 5.0]
  travel_choice: n=5, responses=['City A', 'City A', 'No Preference', 'No Preference', 'City A']


Unnamed: 0,probability_estimate,confidence_rating,travel_choice
condition_0_historical,"[80.0, 88.0, 50.0, 50.0, 82.0]","[5.0, 4.0, 3.0, 2.0, 5.0]","[City A, City A, No Preference, No Preference,..."


### Phase 2 Responses (With Visualization)

In [10]:
# Phase 2 response columns
phase2_columns = ['probability_estimate', 'confidence_rating', 'travel_choice', 'data_trust', 'skeptical_rating']

# Create Phase 2 table
phase2_table = create_condition_response_table(
    phase2_data, 
    phase2_columns, 
    "Phase 2 Responses (With Visualization)"
)

# Display the table
display_response_summary(phase2_table, "Phase 2 Summary")


Phase 2 Responses (With Visualization)

Phase 2 Summary - Response Counts and Basic Stats
---------------------------------------------

Condition: condition_5_pi_hover
  probability_estimate: n=1, mean=86.00, responses=[86.0]
  confidence_rating: n=1, mean=5.00, responses=[5.0]
  travel_choice: n=1, responses=['City A']
  data_trust: No responses
  skeptical_rating: No responses

Condition: condition_7_buggy
  probability_estimate: n=2, mean=75.00, responses=[100.0, 50.0]
  confidence_rating: n=2, mean=4.50, responses=[6.0, 3.0]
  travel_choice: n=2, responses=['City A', 'No Preference']
  data_trust: No responses
  skeptical_rating: No responses

Condition: condition_9_combined
  probability_estimate: n=2, mean=67.50, responses=[50.0, 85.0]
  confidence_rating: n=2, mean=4.50, responses=[3.0, 6.0]
  travel_choice: n=2, responses=['No Preference', 'City A']
  data_trust: No responses
  skeptical_rating: No responses


Unnamed: 0,probability_estimate,confidence_rating,travel_choice,data_trust,skeptical_rating
condition_5_pi_hover,[86.0],[5.0],[City A],[],[]
condition_7_buggy,"[100.0, 50.0]","[6.0, 3.0]","[City A, No Preference]",[],[]
condition_9_combined,"[50.0, 85.0]","[3.0, 6.0]","[No Preference, City A]",[],[]


### Visualization Literacy Question Responses by Condition

In [None]:
# Function to parse visualization literacy responses
def parse_vis_literacy_responses(responses_str):
    """Parse the JSON responses from visualization literacy test"""
    if pd.isna(responses_str):
        return []
    try:
        # Clean and parse JSON
        cleaned_str = responses_str.replace("'", '"')
        responses = json.loads(cleaned_str)
        return responses
    except:
        return []

# Define the correct answers from the plugin (Mini-VLAT questions)
CORRECT_ANSWERS = {
    'minivlat_1': 0,   # $16.55 - $57.52
    'minivlat_2': 3,   # 5.50Mbps - 28.60Mbps  
    'minivlat_3': 1,   # $6.1
    'minivlat_4': 1,   # 27%
    'minivlat_5': 1,   # 25%
    'minivlat_6': 1,   # 190
    'minivlat_7': 1,   # False
    'minivlat_8': 0,   # $5.15
    'minivlat_9': 2,   # 1700
    'minivlat_10': 0,  # 525 km
    'minivlat_11': 0,  # True
    'minivlat_12': 3   # Citibank
}

def calculate_vis_literacy_score(responses):
    """Calculate visualization literacy score based on correct answers"""
    if not responses:
        return 0, {}
    
    total_correct = 0
    question_scores = {}
    
    for response in responses:
        question_id = response.get('question_id', '')
        participant_response = response.get('response', -1)
        
        if question_id in CORRECT_ANSWERS:
            is_correct = participant_response == CORRECT_ANSWERS[question_id]
            question_scores[question_id] = {
                'response': participant_response,
                'correct': is_correct,
                'correct_answer': CORRECT_ANSWERS[question_id]
            }
            if is_correct:
                total_correct += 1
    
    return total_correct, question_scores

# Parse visualization literacy responses and calculate scores
print("=== VISUALIZATION LITERACY SCORING ===" )
print("\nIndividual Question Responses and Scores:")
print("-" * 50)

vis_lit_responses = {}
vis_lit_scores = {}

for idx, row in vis_literacy_data.iterrows():
    participant_id = row['participant_id']
    condition_id = row['condition_id']
    responses = parse_vis_literacy_responses(row['responses'])
    
    if responses:
        # Calculate scores
        total_score, question_scores = calculate_vis_literacy_score(responses)
        max_possible = len(CORRECT_ANSWERS)
        
        print(f"\nParticipant: {participant_id} (Condition: {condition_id})")
        print(f"Total Score: {total_score}/{max_possible} ({(total_score/max_possible)*100:.1f}%)")
        
        # Store comprehensive data
        vis_lit_responses[participant_id] = {
            'condition_id': condition_id,
            'responses': responses,
            'total_score': total_score,
            'max_possible': max_possible,
            'question_scores': question_scores
        }
        
        # Create summary score record
        vis_lit_scores[participant_id] = {
            'participant_id': participant_id,
            'condition_id': condition_id,
            'total_score': total_score,
            'max_possible': max_possible,
            'percent_score': (total_score/max_possible)*100
        }
        
        # Display first few question details
        print("First 3 question details:")
        for i, (q_id, q_score) in enumerate(list(question_scores.items())[:3]):
            response = q_score['response']
            is_correct = q_score['correct']
            correct_ans = q_score['correct_answer']
            print(f"  {q_id}: Response={response}, Correct Answer={correct_ans}, Correct={is_correct}")
        
        if len(question_scores) > 3:
            print(f"  ... and {len(question_scores) - 3} more questions")
    else:
        print(f"\nParticipant: {participant_id} - No responses found")

# Convert scores to DataFrame for easier analysis
vis_lit_scores_df = pd.DataFrame(list(vis_lit_scores.values()))

print(f"\n\n=== VISUALIZATION LITERACY SUMMARY ===" )
print(f"Total participants with scores: {len(vis_lit_scores_df)}")
print(f"Mean score: {vis_lit_scores_df['total_score'].mean():.2f}/{max_possible}")
print(f"Mean percentage: {vis_lit_scores_df['percent_score'].mean():.1f}%")

# Summary by condition
if len(vis_lit_scores_df) > 0:
    print("\nScores by condition:")
    condition_summary = vis_lit_scores_df.groupby('condition_id').agg({
        'total_score': ['count', 'mean', 'std'],
        'percent_score': ['mean', 'std']
    }).round(2)
    print(condition_summary)

# Create a summary table of correct/incorrect responses by question type  
print("\n\n=== QUESTION TYPE PERFORMANCE SUMMARY ===" )
question_performance = {}

for participant_id, data in vis_lit_responses.items():
    condition = data['condition_id']
    
    for q_id, q_data in data['question_scores'].items():
        response = q_data['response']
        is_correct = q_data['correct']
        
        if condition not in question_performance:
            question_performance[condition] = {}
        
        if q_id not in question_performance[condition]:
            question_performance[condition][q_id] = []
        
        question_performance[condition][q_id].append({
            'participant': participant_id,
            'response': response,
            'correct': is_correct
        })

# Display performance by condition and question
for condition in sorted(question_performance.keys()):
    print(f"\nCondition: {condition}")
    print("-" * (len(condition) + 12))
    
    for question_id in sorted(question_performance[condition].keys()):
        responses = question_performance[condition][question_id]
        correct_count = sum(1 for r in responses if r['correct'])
        total_count = len(responses)
        accuracy = (correct_count / total_count) * 100 if total_count > 0 else 0
        
        print(f"  {question_id}: {correct_count}/{total_count} correct ({accuracy:.1f}%)")
        
        # Show individual responses
        response_list = [r['response'] for r in responses]
        print(f"    Responses: {response_list}")

### Trust and Usability Measures by Condition

In [12]:
# Examine trust survey data structure
print("Trust survey columns:")
trust_cols = [col for col in trust_data.columns if any(keyword in col.lower() for keyword in ['trust', 'usability', 'skeptical', 'comprehension'])]
print(trust_cols)

# Trust and usability response columns
trust_columns = ['trust_composite', 'usability_composite', 'data_trust', 'skeptical_rating', 'usability_difficulty', 'comprehension_ease']

# Create trust measures table
trust_table = create_condition_response_table(
    trust_data, 
    trust_columns, 
    "Trust and Usability Measures by Condition"
)

# Display the table
display_response_summary(trust_table, "Trust and Usability Summary")

Trust survey columns:
['comprehension_ease', 'data_trust', 'skeptical_rating', 'trust_composite', 'usability_composite', 'usability_difficulty']

Trust and Usability Measures by Condition

Trust and Usability Summary - Response Counts and Basic Stats
---------------------------------------------------------

Condition: condition_5_pi_hover
  trust_composite: n=1, mean=5.00, responses=[5.0]
  usability_composite: n=1, mean=5.00, responses=[5.0]
  data_trust: n=1, mean=5.00, responses=[5.0]
  skeptical_rating: n=1, mean=3.00, responses=[3.0]
  usability_difficulty: n=1, mean=3.00, responses=[3.0]
  comprehension_ease: n=1, mean=5.00, responses=[5.0]

Condition: condition_7_buggy
  trust_composite: n=2, mean=2.00, responses=[2.0, 2.0]
  usability_composite: n=2, mean=5.00, responses=[6.0, 4.0]
  data_trust: n=2, mean=2.00, responses=[2.0, 2.0]
  skeptical_rating: n=2, mean=5.00, responses=[6.0, 4.0]
  usability_difficulty: n=2, mean=3.50, responses=[2.0, 5.0]
  comprehension_ease: n=2, me

Unnamed: 0,trust_composite,usability_composite,data_trust,skeptical_rating,usability_difficulty,comprehension_ease
condition_5_pi_hover,[5.0],[5.0],[5.0],[3.0],[3.0],[5.0]
condition_7_buggy,"[2.0, 2.0]","[6.0, 4.0]","[2.0, 2.0]","[6.0, 4.0]","[2.0, 5.0]","[6.0, 4.0]"
condition_9_combined,"[4.0, 2.0]","[4.0, 3.0]","[4.0, 2.0]","[4.0, 6.0]","[4.0, 5.0]","[4.0, 3.0]"


## 3. Data Quality Assessment

In [13]:
# Participation completion rates
print("=== DATA QUALITY ASSESSMENT ===")
print("\n1. Participation Completion Rates")
print("-" * 35)

# Check completion by participant
participant_completion = combined_data.groupby('participant_id').agg({
    'trial_type': list,
    'phase1_complete': 'any',
    'phase2_complete': 'any',
    'condition_id': 'first'
}).reset_index()

print(f"Total participants: {len(participant_completion)}")
print(f"Phase 1 completed: {participant_completion['phase1_complete'].sum()}")
print(f"Phase 2 completed: {participant_completion['phase2_complete'].sum()}")
print(f"Both phases completed: {(participant_completion['phase1_complete'] & participant_completion['phase2_complete']).sum()}")

print("\nCompletion by condition:")
completion_by_condition = participant_completion.groupby('condition_id').agg({
    'participant_id': 'count',
    'phase1_complete': 'sum',
    'phase2_complete': 'sum'
}).rename(columns={'participant_id': 'total_participants'})
print(completion_by_condition)

=== DATA QUALITY ASSESSMENT ===

1. Participation Completion Rates
-----------------------------------
Total participants: 5
Phase 1 completed: 5
Phase 2 completed: 5
Both phases completed: 5

Completion by condition:
              total_participants  phase1_complete  phase2_complete
condition_id                                                      
unknown                        5                5                5


In [14]:
# Response time analysis
print("\n2. Response Time Analysis")
print("-" * 25)

# Get response times for prediction tasks
prediction_rt = prediction_data[prediction_data['rt'].notna()]
if len(prediction_rt) > 0:
    print(f"Prediction task response times (ms):")
    print(f"  Mean: {prediction_rt['rt'].mean():.0f} ms")
    print(f"  Median: {prediction_rt['rt'].median():.0f} ms")
    print(f"  Min: {prediction_rt['rt'].min():.0f} ms")
    print(f"  Max: {prediction_rt['rt'].max():.0f} ms")
    
    # Response times by phase
    print("\nResponse times by phase:")
    phase_rt = prediction_rt.groupby('phase')['rt'].agg(['count', 'mean', 'median']).round(0)
    print(phase_rt)
else:
    print("No response time data available")


2. Response Time Analysis
-------------------------
Prediction task response times (ms):
  Mean: 40141 ms
  Median: 43091 ms
  Min: 5149 ms
  Max: 84929 ms

Response times by phase:
       count     mean   median
phase                         
1.0        5  55413.0  58067.0
2.0        5  24868.0  22476.0


In [15]:
# Missing data patterns
print("\n3. Missing Data Patterns")
print("-" * 25)

# Key variables missing data
key_vars = ['probability_estimate', 'confidence_rating', 'data_trust', 'visualization_literacy_score']
missing_data = {}

for var in key_vars:
    if var in relevant_trials.columns:
        total_rows = len(relevant_trials)
        missing_count = relevant_trials[var].isna().sum()
        missing_pct = (missing_count / total_rows) * 100
        missing_data[var] = {'missing_count': missing_count, 'missing_pct': missing_pct}

missing_df = pd.DataFrame(missing_data).T
print("Missing data for key variables:")
print(missing_df.round(1))


3. Missing Data Patterns
-------------------------
Missing data for key variables:
                              missing_count  missing_pct
probability_estimate                   34.0         77.3
confidence_rating                      34.0         77.3
data_trust                             39.0         88.6
visualization_literacy_score           44.0        100.0


## 4. Sample Demographics

In [16]:
# Participant counts per condition
print("=== SAMPLE DEMOGRAPHICS ===")
print("\n1. Participant Counts by Condition")
print("-" * 35)

# Get unique participants per condition
participant_counts = relevant_trials.groupby(['condition_id', 'condition_name']).agg({
    'participant_id': 'nunique'
}).rename(columns={'participant_id': 'participant_count'}).reset_index()

print(participant_counts.to_string(index=False))
print(f"\nTotal unique participants: {relevant_trials['participant_id'].nunique()}")

=== SAMPLE DEMOGRAPHICS ===

1. Participant Counts by Condition
-----------------------------------
          condition_id         condition_name  participant_count
condition_0_historical        Historical Only                  4
  condition_5_pi_hover        PI Plot + Hover                  1
     condition_7_buggy          Buggy Control                  1
  condition_9_combined Combined PI + Ensemble                  2

Total unique participants: 4


In [17]:
# Basic demographic information from survey data
print("\n2. Basic Demographics")
print("-" * 20)

# Get demographic data
demo_text = relevant_trials[relevant_trials['trial_type'] == 'survey-text'].copy()
demo_multi = relevant_trials[relevant_trials['trial_type'] == 'survey-multi-choice'].copy()

if len(demo_text) > 0:
    print("Text survey responses available:", len(demo_text))
    
if len(demo_multi) > 0:
    print("Multi-choice survey responses available:", len(demo_multi))
    
    # Parse responses if they exist
    if 'responses' in demo_multi.columns:
        print("\nEducation and visualization experience data:")
        for idx, row in demo_multi.iterrows():
            if pd.notna(row['responses']):
                try:
                    responses = json.loads(row['responses'].replace("'", '"'))
                    participant_id = row['participant_id']
                    condition = row['condition_id']
                    print(f"  Participant {participant_id} ({condition}): {responses}")
                except:
                    print(f"  Could not parse responses for participant {row['participant_id']}")


2. Basic Demographics
--------------------
Text survey responses available: 9
Multi-choice survey responses available: 5

Education and visualization experience data:


## 5. Summary Overview

In [None]:
def create_comprehensive_export():
    """
    Create a comprehensive data export structured as:
    - Rows: Questions/Variables
    - Columns: Conditions  
    - Cells: Ordered lists of participant responses
    """
    
    # Get all unique conditions
    all_conditions = sorted(combined_data['condition_id'].dropna().unique())
    
    # Get all unique participants and their condition assignments
    participant_conditions = combined_data.groupby('participant_id')['condition_id'].first().to_dict()
    
    # Initialize the comprehensive data structure
    comprehensive_data = {}
    
    # Helper function to get ordered participant responses for a condition
    def get_ordered_responses(data_subset, variable, condition):
        condition_data = data_subset[data_subset['condition_id'] == condition]
        # Get participant order for consistency
        participants = sorted(condition_data['participant_id'].unique())
        responses = []
        
        for participant in participants:
            participant_data = condition_data[condition_data['participant_id'] == participant]
            if len(participant_data) > 0 and variable in participant_data.columns:
                response = participant_data[variable].dropna()
                if len(response) > 0:
                    responses.append(response.iloc[0])
                else:
                    responses.append(None)
            else:
                responses.append(None)
        
        return responses
    
    # 1. Phase 1 Variables (Baseline)
    phase1_vars = ['probability_estimate', 'confidence_rating', 'travel_choice']
    for var in phase1_vars:
        comprehensive_data[f'Phase1_{var}'] = {}
        for condition in all_conditions:
            responses = get_ordered_responses(phase1_data, var, condition)
            comprehensive_data[f'Phase1_{var}'][condition] = responses
    
    # 2. Phase 2 Variables (With Visualization)
    phase2_vars = ['probability_estimate', 'confidence_rating', 'travel_choice']
    for var in phase2_vars:
        comprehensive_data[f'Phase2_{var}'] = {}
        for condition in all_conditions:
            responses = get_ordered_responses(phase2_data, var, condition)
            comprehensive_data[f'Phase2_{var}'][condition] = responses
    
    # 3. Trust and Usability Variables
    trust_vars = ['trust_composite', 'usability_composite', 'data_trust', 'skeptical_rating', 
                  'usability_difficulty', 'comprehension_ease']
    for var in trust_vars:
        comprehensive_data[f'Trust_{var}'] = {}
        for condition in all_conditions:
            responses = get_ordered_responses(trust_data, var, condition)
            comprehensive_data[f'Trust_{var}'][condition] = responses
    
    # 4. Visualization Literacy Variables
    comprehensive_data['VisLiteracy_TotalScore'] = {}
    comprehensive_data['VisLiteracy_PercentScore'] = {}
    
    for condition in all_conditions:
        if condition in [p['condition_id'] for p in vis_lit_scores.values()]:
            # Get scores for this condition
            condition_scores = [score for score in vis_lit_scores.values() 
                              if score['condition_id'] == condition]
            total_scores = [score['total_score'] for score in condition_scores]
            percent_scores = [score['percent_score'] for score in condition_scores]
            
            comprehensive_data['VisLiteracy_TotalScore'][condition] = total_scores
            comprehensive_data['VisLiteracy_PercentScore'][condition] = percent_scores
        else:
            comprehensive_data['VisLiteracy_TotalScore'][condition] = []
            comprehensive_data['VisLiteracy_PercentScore'][condition] = []
    
    # 5. Individual Visualization Literacy Questions
    for q_id in CORRECT_ANSWERS.keys():
        comprehensive_data[f'VisLit_{q_id}'] = {}
        for condition in all_conditions:
            responses = []
            if condition in question_performance:
                if q_id in question_performance[condition]:
                    responses = [r['response'] for r in question_performance[condition][q_id]]
            comprehensive_data[f'VisLit_{q_id}'][condition] = responses
    
    return comprehensive_data

# Create the comprehensive export
print("=== CREATING COMPREHENSIVE DATA EXPORT ===")
export_data = create_comprehensive_export()

# Convert to DataFrame for easier handling
export_rows = []
for variable, condition_data in export_data.items():
    row = {'Variable': variable}
    for condition, responses in condition_data.items():
        # Convert responses to string representation for CSV
        if responses:
            response_str = str(responses).replace('[', '').replace(']', '').replace("'", "")
        else:
            response_str = ""
        row[condition] = response_str
    export_rows.append(row)

export_df = pd.DataFrame(export_rows)

# Display sample of the export structure
print(f"\nExport data structure:")
print(f"Variables (rows): {len(export_df)}")
print(f"Conditions (columns): {len([col for col in export_df.columns if col != 'Variable'])}")
print(f"\nFirst 10 variables:")
print(export_df.head(10).to_string(index=False, max_colwidth=50))

# Save comprehensive export
export_filename = './data/comprehensive_export.csv'
export_df.to_csv(export_filename, index=False)
print(f"\n✓ Comprehensive data exported to: {export_filename}")

# Also create a participant summary table
participant_summary = []

for participant_id in combined_data['participant_id'].unique():
    if pd.notna(participant_id):
        participant_data = combined_data[combined_data['participant_id'] == participant_id]
        condition = participant_data['condition_id'].iloc[0] if len(participant_data) > 0 else 'unknown'
        
        # Get Phase 1 data
        p1_data = phase1_data[phase1_data['participant_id'] == participant_id]
        p1_prob = p1_data['probability_estimate'].iloc[0] if len(p1_data) > 0 and 'probability_estimate' in p1_data.columns else None
        p1_conf = p1_data['confidence_rating'].iloc[0] if len(p1_data) > 0 and 'confidence_rating' in p1_data.columns else None
        p1_choice = p1_data['travel_choice'].iloc[0] if len(p1_data) > 0 and 'travel_choice' in p1_data.columns else None
        
        # Get Phase 2 data  
        p2_data = phase2_data[phase2_data['participant_id'] == participant_id]
        p2_prob = p2_data['probability_estimate'].iloc[0] if len(p2_data) > 0 and 'probability_estimate' in p2_data.columns else None
        p2_conf = p2_data['confidence_rating'].iloc[0] if len(p2_data) > 0 and 'confidence_rating' in p2_data.columns else None
        p2_choice = p2_data['travel_choice'].iloc[0] if len(p2_data) > 0 and 'travel_choice' in p2_data.columns else None
        
        # Get trust data
        trust_participant_data = trust_data[trust_data['participant_id'] == participant_id]
        trust_comp = trust_participant_data['trust_composite'].iloc[0] if len(trust_participant_data) > 0 else None
        usab_comp = trust_participant_data['usability_composite'].iloc[0] if len(trust_participant_data) > 0 else None
        
        # Get vis literacy score
        vis_score = vis_lit_scores.get(participant_id, {}).get('total_score', None)
        vis_percent = vis_lit_scores.get(participant_id, {}).get('percent_score', None)
        
        participant_summary.append({
            'participant_id': participant_id,
            'condition_id': condition,
            'phase1_probability': p1_prob,
            'phase1_confidence': p1_conf,
            'phase1_choice': p1_choice,
            'phase2_probability': p2_prob,
            'phase2_confidence': p2_conf,
            'phase2_choice': p2_choice,
            'trust_composite': trust_comp,
            'usability_composite': usab_comp,
            'vis_literacy_score': vis_score,
            'vis_literacy_percent': vis_percent
        })

participant_summary_df = pd.DataFrame(participant_summary)

# Save participant summary
summary_filename = './data/participant_summary.csv' 
participant_summary_df.to_csv(summary_filename, index=False)
print(f"✓ Participant summary exported to: {summary_filename}")

print(f"\nParticipant Summary (first 5 rows):")
print(participant_summary_df.head().round(2))

# Create visualizations for quantitative variables

def create_dot_plot(data, x_col, y_col, title, xlabel, ylabel):
    """Create a dot plot for quantitative data by condition"""
    plt.figure(figsize=(10, 6))
    
    # Get unique conditions
    conditions = sorted(data[y_col].dropna().unique())
    colors = sns.color_palette("Set2", len(conditions))
    
    for i, condition in enumerate(conditions):
        condition_data = data[data[y_col] == condition]
        y_values = [i] * len(condition_data)
        
        # Add jitter to prevent overlapping points
        y_jitter = np.array(y_values) + np.random.normal(0, 0.05, len(y_values))
        
        plt.scatter(condition_data[x_col], y_jitter, 
                   color=colors[i], alpha=0.7, s=100, 
                   label=condition, edgecolors='black', linewidth=1)
        
        # Add mean line
        mean_val = condition_data[x_col].mean()
        plt.plot([mean_val, mean_val], [i-0.2, i+0.2], 
                color='red', linewidth=3, alpha=0.8)
    
    plt.yticks(range(len(conditions)), conditions)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Prepare data for visualization
viz_data = []

# Add Phase 1 data
for _, row in phase1_data.iterrows():
    if pd.notna(row['participant_id']) and pd.notna(row['condition_id']):
        viz_data.append({
            'participant_id': row['participant_id'],
            'condition_id': row['condition_id'],
            'phase': 'Phase 1',
            'probability_estimate': row.get('probability_estimate'),
            'confidence_rating': row.get('confidence_rating'),
            'travel_choice': row.get('travel_choice')
        })

# Add Phase 2 data
for _, row in phase2_data.iterrows():
    if pd.notna(row['participant_id']) and pd.notna(row['condition_id']):
        viz_data.append({
            'participant_id': row['participant_id'], 
            'condition_id': row['condition_id'],
            'phase': 'Phase 2',
            'probability_estimate': row.get('probability_estimate'),
            'confidence_rating': row.get('confidence_rating'),
            'travel_choice': row.get('travel_choice')
        })

# Add trust data
for _, row in trust_data.iterrows():
    if pd.notna(row['participant_id']) and pd.notna(row['condition_id']):
        viz_data.append({
            'participant_id': row['participant_id'],
            'condition_id': row['condition_id'], 
            'phase': 'Trust Survey',
            'trust_composite': row.get('trust_composite'),
            'usability_composite': row.get('usability_composite'),
            'data_trust': row.get('data_trust'),
            'skeptical_rating': row.get('skeptical_rating')
        })

# Add vis literacy data
for participant_id, score_data in vis_lit_scores.items():
    viz_data.append({
        'participant_id': participant_id,
        'condition_id': score_data['condition_id'],
        'phase': 'Vis Literacy',
        'vis_literacy_score': score_data['total_score'],
        'vis_literacy_percent': score_data['percent_score']
    })

viz_df = pd.DataFrame(viz_data)

print("=== QUANTITATIVE VARIABLE VISUALIZATIONS ===")
print(f"Visualization dataset shape: {viz_df.shape}")
print(f"Variables available: {viz_df.columns.tolist()}")

# 1. Probability Estimates by Phase and Condition
print("\n1. Probability Estimates")
phase1_prob = viz_df[(viz_df['phase'] == 'Phase 1') & pd.notna(viz_df['probability_estimate'])]
phase2_prob = viz_df[(viz_df['phase'] == 'Phase 2') & pd.notna(viz_df['probability_estimate'])]

if len(phase1_prob) > 0:
    create_dot_plot(phase1_prob, 'probability_estimate', 'condition_id',
                   'Phase 1: Probability Estimates by Condition (No Visualization)',
                   'Probability Estimate (%)', 'Condition')

if len(phase2_prob) > 0:
    create_dot_plot(phase2_prob, 'probability_estimate', 'condition_id',
                   'Phase 2: Probability Estimates by Condition (With Visualization)',
                   'Probability Estimate (%)', 'Condition')

# 2. Confidence Ratings by Phase and Condition
print("\n2. Confidence Ratings") 
phase1_conf = viz_df[(viz_df['phase'] == 'Phase 1') & pd.notna(viz_df['confidence_rating'])]
phase2_conf = viz_df[(viz_df['phase'] == 'Phase 2') & pd.notna(viz_df['confidence_rating'])]

if len(phase1_conf) > 0:
    create_dot_plot(phase1_conf, 'confidence_rating', 'condition_id',
                   'Phase 1: Confidence Ratings by Condition (No Visualization)',
                   'Confidence Rating (1-7)', 'Condition')

if len(phase2_conf) > 0:
    create_dot_plot(phase2_conf, 'confidence_rating', 'condition_id', 
                   'Phase 2: Confidence Ratings by Condition (With Visualization)',
                   'Confidence Rating (1-7)', 'Condition')

# 3. Trust Measures by Condition
print("\n3. Trust and Usability Measures")
trust_viz = viz_df[(viz_df['phase'] == 'Trust Survey')]

trust_vars = ['trust_composite', 'usability_composite', 'data_trust', 'skeptical_rating']
for var in trust_vars:
    var_data = trust_viz[pd.notna(trust_viz[var])]
    if len(var_data) > 0:
        create_dot_plot(var_data, var, 'condition_id',
                       f'{var.replace("_", " ").title()} by Condition',
                       f'{var.replace("_", " ").title()} (1-7)', 'Condition')

# 4. Visualization Literacy Scores
print("\n4. Visualization Literacy Scores")
vis_lit_viz = viz_df[(viz_df['phase'] == 'Vis Literacy')]

if len(vis_lit_viz) > 0:
    # Total scores
    vis_total = vis_lit_viz[pd.notna(vis_lit_viz['vis_literacy_score'])]
    if len(vis_total) > 0:
        create_dot_plot(vis_total, 'vis_literacy_score', 'condition_id',
                       'Visualization Literacy Total Scores by Condition',
                       'Total Score (out of 12)', 'Condition')
    
    # Percentage scores  
    vis_percent = vis_lit_viz[pd.notna(vis_lit_viz['vis_literacy_percent'])]
    if len(vis_percent) > 0:
        create_dot_plot(vis_percent, 'vis_literacy_percent', 'condition_id',
                       'Visualization Literacy Percentage Scores by Condition',
                       'Percentage Score (%)', 'Condition')

# Create bar chart visualizations for categorical variables

def create_stacked_bar_chart(data, cat_col, group_col, title, xlabel, ylabel):
    """Create a stacked bar chart for categorical data"""
    # Create crosstab
    crosstab = pd.crosstab(data[group_col], data[cat_col], dropna=False)
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))
    crosstab.plot(kind='bar', stacked=True, ax=ax, 
                 color=sns.color_palette("Set2", len(crosstab.columns)))
    
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(title=cat_col.replace('_', ' ').title(), 
              bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    
    # Print the crosstab for reference
    print(f"\nCrosstab for {title}:")
    print(crosstab)
    print(f"\nPercentages by {group_col}:")
    print(crosstab.div(crosstab.sum(axis=1), axis=0).round(3))

def create_side_by_side_bar_chart(data, cat_col, group_col, title, xlabel, ylabel):
    """Create a side-by-side bar chart for categorical data"""
    # Create crosstab
    crosstab = pd.crosstab(data[group_col], data[cat_col], dropna=False)
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))
    crosstab.plot(kind='bar', ax=ax, 
                 color=sns.color_palette("Set2", len(crosstab.columns)))
    
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(title=cat_col.replace('_', ' ').title(), 
              bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    
    return crosstab

print("=== CATEGORICAL VARIABLE VISUALIZATIONS ===")

# 1. Travel Choice by Condition and Phase
print("\n1. Travel Choices by Condition")

# Phase 1 travel choices
phase1_travel = viz_df[(viz_df['phase'] == 'Phase 1') & pd.notna(viz_df['travel_choice'])]
if len(phase1_travel) > 0:
    create_stacked_bar_chart(phase1_travel, 'travel_choice', 'condition_id',
                            'Phase 1: Travel Choices by Condition (No Visualization)',
                            'Condition', 'Number of Participants')

# Phase 2 travel choices
phase2_travel = viz_df[(viz_df['phase'] == 'Phase 2') & pd.notna(viz_df['travel_choice'])]
if len(phase2_travel) > 0:
    create_stacked_bar_chart(phase2_travel, 'travel_choice', 'condition_id',
                            'Phase 2: Travel Choices by Condition (With Visualization)',
                            'Condition', 'Number of Participants')

# 2. Visualization Literacy Question Performance
print("\n\n2. Visualization Literacy Question Performance")

# Create detailed question performance visualization
if question_performance:
    # Aggregate all question responses across conditions
    all_questions = set()
    for condition_data in question_performance.values():
        all_questions.update(condition_data.keys())
    
    all_questions = sorted(list(all_questions))
    
    # Create a matrix of correct/incorrect responses
    performance_matrix = []
    conditions = sorted(question_performance.keys())
    
    for condition in conditions:
        condition_row = []
        for question in all_questions:
            if question in question_performance[condition]:
                responses = question_performance[condition][question]
                correct_count = sum(1 for r in responses if r['correct'])
                total_count = len(responses)
                accuracy = correct_count / total_count if total_count > 0 else 0
                condition_row.append(accuracy)
            else:
                condition_row.append(0)
        performance_matrix.append(condition_row)
    
    # Create heatmap
    plt.figure(figsize=(14, 8))
    performance_df = pd.DataFrame(performance_matrix, 
                                 index=conditions, 
                                 columns=all_questions)
    
    sns.heatmap(performance_df, annot=True, cmap='RdYlGn', 
               vmin=0, vmax=1, cbar_kws={'label': 'Accuracy Rate'})
    plt.title('Visualization Literacy Question Accuracy by Condition')
    plt.xlabel('Question ID')
    plt.ylabel('Condition')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    print("\nQuestion accuracy matrix:")
    print(performance_df.round(3))

# 3. Individual Visualization Literacy Questions - Bar Charts
print("\n\n3. Individual Question Response Distributions")

# Create bar charts for each question showing response distributions
question_types = {
    'minivlat_1': 'Line Chart', 'minivlat_2': 'Bar Chart', 'minivlat_3': 'Stacked Bar',
    'minivlat_4': 'Stacked 100% Bar', 'minivlat_5': 'Pie Chart', 'minivlat_6': 'Histogram',
    'minivlat_7': 'Scatter Plot', 'minivlat_8': 'Area Chart', 'minivlat_9': 'Stacked Area',
    'minivlat_10': 'Bubble Chart', 'minivlat_11': 'Choropleth', 'minivlat_12': 'Tree Map'
}

# Show response distribution for first few questions
sample_questions = list(all_questions)[:6] if 'all_questions' in locals() else []

for question_id in sample_questions:
    question_type = question_types.get(question_id, 'Unknown')
    print(f"\n--- {question_id} ({question_type}) ---")
    
    # Collect all responses for this question across conditions
    question_responses = []
    for condition in conditions:
        if question_id in question_performance[condition]:
            for resp_data in question_performance[condition][question_id]:
                question_responses.append({
                    'condition_id': condition,
                    'response': resp_data['response'],
                    'correct': resp_data['correct']
                })
    
    if question_responses:
        resp_df = pd.DataFrame(question_responses)
        
        # Create bar chart of response distributions
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
        
        # Response distribution by condition
        response_crosstab = pd.crosstab(resp_df['condition_id'], resp_df['response'])
        response_crosstab.plot(kind='bar', ax=ax1, 
                              color=sns.color_palette("Set3", len(response_crosstab.columns)))
        ax1.set_title(f'{question_id}: Response Distribution by Condition')
        ax1.set_xlabel('Condition')
        ax1.set_ylabel('Count')
        ax1.legend(title='Response Option', bbox_to_anchor=(1.05, 1), loc='upper left')
        ax1.grid(True, alpha=0.3, axis='y')
        
        # Correct/Incorrect by condition
        correct_crosstab = pd.crosstab(resp_df['condition_id'], resp_df['correct'])
        correct_crosstab.plot(kind='bar', ax=ax2, color=['red', 'green'])
        ax2.set_title(f'{question_id}: Accuracy by Condition')
        ax2.set_xlabel('Condition')
        ax2.set_ylabel('Count')
        ax2.legend(title='Correct', labels=['Incorrect', 'Correct'], 
                  bbox_to_anchor=(1.05, 1), loc='upper left')
        ax2.grid(True, alpha=0.3, axis='y')
        
        plt.tight_layout()
        plt.show()
        
        print("Response counts:")
        print(response_crosstab)
        print("\nAccuracy counts:")
        print(correct_crosstab)

# Enhanced statistical analysis and summaries

print("=== STATISTICAL SUMMARIES AND CONDITION COMPARISONS ===")

# 1. Phase Comparison Analysis
print("\n1. PHASE COMPARISON ANALYSIS")
print("=" * 30)

def calculate_phase_changes(participant_summary_df):
    """Calculate changes from Phase 1 to Phase 2"""
    changes = []
    
    for _, row in participant_summary_df.iterrows():
        if pd.notna(row['phase1_probability']) and pd.notna(row['phase2_probability']):
            prob_change = row['phase2_probability'] - row['phase1_probability']
            changes.append({
                'participant_id': row['participant_id'],
                'condition_id': row['condition_id'],
                'probability_change': prob_change,
                'confidence_change': (row['phase2_confidence'] - row['phase1_confidence']) 
                                    if pd.notna(row['phase1_confidence']) and pd.notna(row['phase2_confidence']) else None,
                'choice_changed': row['phase1_choice'] != row['phase2_choice']
                                 if pd.notna(row['phase1_choice']) and pd.notna(row['phase2_choice']) else None
            })
    
    return pd.DataFrame(changes)

if 'participant_summary_df' in locals():
    changes_df = calculate_phase_changes(participant_summary_df)
    
    if len(changes_df) > 0:
        print("Phase 1 to Phase 2 Changes by Condition:")
        print("-" * 45)
        
        change_summary = changes_df.groupby('condition_id').agg({
            'probability_change': ['count', 'mean', 'std', 'min', 'max'],
            'confidence_change': ['mean', 'std'],
            'choice_changed': 'sum'
        }).round(2)
        
        print(change_summary)
        
        # Visualize probability changes
        plt.figure(figsize=(12, 6))
        changes_df.boxplot(column='probability_change', by='condition_id', ax=plt.gca())
        plt.title('Probability Estimate Changes (Phase 2 - Phase 1) by Condition')
        plt.xlabel('Condition')
        plt.ylabel('Probability Change (%)')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
        
        print("\nDetailed change analysis:")
        for condition in changes_df['condition_id'].unique():
            condition_changes = changes_df[changes_df['condition_id'] == condition]
            print(f"\n{condition}:")
            print(f"  Participants: {len(condition_changes)}")
            if len(condition_changes) > 0:
                print(f"  Avg probability change: {condition_changes['probability_change'].mean():.1f}%")
                if condition_changes['confidence_change'].notna().any():
                    print(f"  Avg confidence change: {condition_changes['confidence_change'].mean():.1f}")
                choice_changes = condition_changes['choice_changed'].sum()
                print(f"  Travel choice changes: {choice_changes}/{len(condition_changes)}")

# 2. Trust and Usability Analysis
print("\n\n2. TRUST AND USABILITY ANALYSIS")
print("=" * 32)

if len(trust_data) > 0:
    trust_summary = trust_data.groupby('condition_id').agg({
        'trust_composite': ['count', 'mean', 'std'],
        'usability_composite': ['mean', 'std'], 
        'data_trust': ['mean', 'std'],
        'skeptical_rating': ['mean', 'std'],
        'usability_difficulty': ['mean', 'std'],
        'comprehension_ease': ['mean', 'std']
    }).round(2)
    
    print("Trust and Usability Measures by Condition:")
    print(trust_summary)
    
    # Create a comprehensive trust comparison plot
    trust_vars = ['trust_composite', 'usability_composite', 'data_trust', 'skeptical_rating']
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()
    
    for i, var in enumerate(trust_vars):
        trust_data[pd.notna(trust_data[var])].boxplot(column=var, by='condition_id', ax=axes[i])
        axes[i].set_title(f'{var.replace("_", " ").title()} by Condition')
        axes[i].set_xlabel('Condition')
        axes[i].set_ylabel(f'{var.replace("_", " ").title()} (1-7 scale)')
        axes[i].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

# 3. Visualization Literacy Detailed Analysis
print("\n\n3. VISUALIZATION LITERACY DETAILED ANALYSIS")
print("=" * 44)

if len(vis_lit_scores_df) > 0:
    print("Overall Visualization Literacy Performance:")
    print("-" * 42)
    
    overall_stats = {
        'Total Participants': len(vis_lit_scores_df),
        'Mean Score': f"{vis_lit_scores_df['total_score'].mean():.2f}/12",
        'Std Dev': f"{vis_lit_scores_df['total_score'].std():.2f}",
        'Mean Percentage': f"{vis_lit_scores_df['percent_score'].mean():.1f}%",
        'Min Score': f"{vis_lit_scores_df['total_score'].min()}/12",
        'Max Score': f"{vis_lit_scores_df['total_score'].max()}/12"
    }
    
    for stat, value in overall_stats.items():
        print(f"  {stat}: {value}")
    
    print("\nPerformance by Condition:")
    condition_lit_summary = vis_lit_scores_df.groupby('condition_id').agg({
        'total_score': ['count', 'mean', 'std', 'min', 'max'],
        'percent_score': ['mean', 'std']
    }).round(2)
    print(condition_lit_summary)
    
    # Question type analysis
    if question_performance:
        print("\nQuestion Type Performance Analysis:")
        print("-" * 35)
        
        # Group questions by type
        question_type_performance = {}
        
        for condition, questions in question_performance.items():
            for q_id, responses in questions.items():
                q_type = question_types.get(q_id, 'Unknown')
                
                if q_type not in question_type_performance:
                    question_type_performance[q_type] = []
                
                correct_count = sum(1 for r in responses if r['correct'])
                total_count = len(responses)
                accuracy = correct_count / total_count if total_count > 0 else 0
                
                question_type_performance[q_type].append({
                    'condition': condition,
                    'question_id': q_id,
                    'accuracy': accuracy,
                    'correct': correct_count,
                    'total': total_count
                })
        
        # Summarize by question type
        type_summary = {}
        for q_type, performances in question_type_performance.items():
            accuracies = [p['accuracy'] for p in performances]
            type_summary[q_type] = {
                'mean_accuracy': np.mean(accuracies),
                'std_accuracy': np.std(accuracies),
                'questions_count': len(set(p['question_id'] for p in performances)),
                'total_responses': sum(p['total'] for p in performances)
            }
        
        type_summary_df = pd.DataFrame(type_summary).T
        print(type_summary_df.round(3))

# 4. Data Quality and Participation Summary  
print("\n\n4. DATA QUALITY AND PARTICIPATION SUMMARY")
print("=" * 42)

quality_metrics = {
    'Total Participants': combined_data['participant_id'].nunique(),
    'Total Trials': len(combined_data),
    'Conditions Represented': combined_data['condition_id'].nunique(),
    'Phase 1 Completions': len(phase1_data),
    'Phase 2 Completions': len(phase2_data),
    'Trust Survey Completions': len(trust_data),
    'Vis Literacy Completions': len(vis_literacy_data)
}

for metric, value in quality_metrics.items():
    print(f"  {metric}: {value}")

print("\nCondition Distribution:")
condition_dist = combined_data['condition_id'].value_counts().sort_index()
for condition, count in condition_dist.items():
    print(f"  {condition}: {count} trials")

print("\nMissing Data Summary:")
key_columns = ['probability_estimate', 'confidence_rating', 'trust_composite', 'data_trust']
missing_summary = {}

for col in key_columns:
    if col in combined_data.columns:
        total_expected = len(combined_data)
        missing_count = combined_data[col].isna().sum()
        missing_pct = (missing_count / total_expected) * 100
        missing_summary[col] = f"{missing_count}/{total_expected} ({missing_pct:.1f}%)"

for col, summary in missing_summary.items():
    print(f"  {col}: {summary} missing")

### Statistical Summary and Condition Comparisons

In [None]:
# Save processed data for further analysis
print("\n=== SAVING PROCESSED DATA ===")

# Save key datasets
phase1_data.to_csv('./data/processed_phase1_data.csv', index=False)
phase2_data.to_csv('./data/processed_phase2_data.csv', index=False) 
if 'vis_lit_scores_df' in locals():
    vis_lit_scores_df.to_csv('./data/processed_vis_literacy.csv', index=False)
trust_data.to_csv('./data/processed_trust_data.csv', index=False)

# Save visualization data
if 'viz_df' in locals():
    viz_df.to_csv('./data/processed_visualization_data.csv', index=False)

print("Processed datasets saved:")
print("  - processed_phase1_data.csv")
print("  - processed_phase2_data.csv")
print("  - processed_vis_literacy.csv") 
print("  - processed_trust_data.csv")
print("  - processed_visualization_data.csv")
print("  - comprehensive_export.csv")
print("  - participant_summary.csv")

print("\n=== ANALYSIS COMPLETE! ===")
print("\nSummary of enhancements made:")
print("✓ Calculated visualization literacy scores using correct answers from plugin")
print("✓ Created comprehensive data export with structured format")
print("✓ Added dot plot visualizations for quantitative variables")
print("✓ Added bar chart and heatmap visualizations for categorical variables")
print("✓ Enhanced statistical analysis sections with condition comparisons")
print("✓ Generated publication-ready visualizations and summary statistics")
print("\nThe analysis notebook now provides:")
print("- Comprehensive data preprocessing and cleaning")
print("- Accurate visualization literacy scoring (12 Mini-VLAT questions)")
print("- Structured data exports for further analysis")
print("- Rich visualizations for all key variables")
print("- Statistical summaries and condition comparisons")
print("- Data quality assessment and participation tracking")