# Air Quality Prediction Study Analysis

This notebook analyzes experimental data from the trust and uncertainty visualization study.
The study examines how different visualization conditions affect user trust, confidence, and decision-making across two phases.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

## 1. Data Loading and Preprocessing

In [2]:
# Load all CSV files from the data directory
data_dir = Path('./data')
csv_files = list(data_dir.glob('user_*.csv'))
print(f"Found {len(csv_files)} participant data files:")
for file in csv_files:
    print(f"  - {file.name}")

Found 5 participant data files:
  - user_6657381117794993965781_2025-12-08T17-16-29-651.csv
  - user_67425624930886458_2025-12-08T17-23-33-555.csv
  - user_9830_2025-12-08T04-16-47-915.csv
  - user_1765213036676_2025-12-08T16-57-16-674.csv
  - user_62199458482518274623771_2025-12-08T17-37-25-103.csv


In [3]:
# Function to load and clean individual participant data
def load_participant_data(file_path):
    """Load a single participant CSV file and clean the data"""
    try:
        df = pd.read_csv(file_path)
        
        # Extract participant ID from filename if not in data
        if 'participant_id' not in df.columns or df['participant_id'].isna().all():
            participant_id = file_path.stem.split('_')[1]  # Extract from filename
            df['participant_id'] = participant_id
        
        # Clean condition IDs and names
        if 'condition_id' in df.columns:
            df['condition_id'] = df['condition_id'].fillna('unknown')
        
        return df
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

# Load all participant data
all_data = []
for file_path in csv_files:
    participant_data = load_participant_data(file_path)
    if participant_data is not None:
        all_data.append(participant_data)
        print(f"Loaded data for participant: {participant_data['participant_id'].iloc[0] if not participant_data['participant_id'].isna().all() else 'unknown'}")

# Combine all participant data
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)
    print(f"\nCombined dataset shape: {combined_data.shape}")
else:
    print("No data loaded successfully")

Loaded data for participant: 66573c811d17794993965781
Loaded data for participant: 67aced4b25f62493088645b8
Loaded data for participant: nan
Loaded data for participant: test
Loaded data for participant: 62e199458482518274623771

Combined dataset shape: (92, 46)


In [4]:
# Examine the data structure
print("Column names:")
print(combined_data.columns.tolist())
print(f"\nDataset shape: {combined_data.shape}")
print(f"\nUnique trial types:")
print(combined_data['trial_type'].value_counts())

Column names:
['city_a_estimate', 'city_b_estimate', 'click_events', 'comprehension_ease', 'condition', 'condition_id', 'condition_name', 'confidence_label', 'confidence_rating', 'data_trust', 'display_format', 'end_time', 'hover_events', 'interaction_log', 'internal_node_id', 'participant_id', 'percent_score', 'phase', 'phase1_complete', 'phase2_complete', 'predictions_shown', 'probability_estimate', 'question_order', 'response', 'responses', 'round', 'rt', 'rt_total', 'skeptical_rating', 'start_time', 'stimulus', 'success', 'time_elapsed', 'time_on_viz', 'total_interactions', 'total_questions', 'total_score', 'travel_choice', 'trial_index', 'trial_type', 'trust_composite', 'usability_composite', 'usability_difficulty', 'view_history', 'visualization_literacy_score', 'visualization_shown']

Dataset shape: (92, 46)

Unique trial types:
trial_type
html-button-response    30
prediction-task         10
trust-survey            10
survey-text              9
fullscreen               8
vis-li

In [5]:
# Filter for relevant trial types (prediction tasks and surveys)
relevant_trials = combined_data[
    combined_data['trial_type'].isin([
        'prediction-task', 'vis-literacy', 'trust-survey', 
        'personality-survey', 'survey-text', 'survey-multi-choice'
    ])
].copy()

print(f"Filtered dataset shape: {relevant_trials.shape}")
print(f"\nTrial types in filtered data:")
print(relevant_trials['trial_type'].value_counts())

Filtered dataset shape: (44, 46)

Trial types in filtered data:
trial_type
prediction-task        10
trust-survey           10
survey-text             9
vis-literacy            5
personality-survey      5
survey-multi-choice     5
Name: count, dtype: int64


In [6]:
# Examine condition distribution
print("Unique conditions:")
condition_counts = relevant_trials['condition_id'].value_counts(dropna=False)
print(condition_counts)

print("\nCondition names:")
condition_names = relevant_trials[['condition_id', 'condition_name']].drop_duplicates().dropna()
for _, row in condition_names.iterrows():
    print(f"  {row['condition_id']}: {row['condition_name']}")

Unique conditions:
condition_id
condition_7_buggy         14
condition_9_combined      14
condition_5_pi_hover       7
condition_0_historical     5
unknown                    4
Name: count, dtype: int64

Condition names:
  condition_7_buggy: Buggy Control
  condition_0_historical: Historical Only
  condition_5_pi_hover: PI Plot + Hover
  condition_9_combined: Combined PI + Ensemble


In [7]:
# Separate Phase 1 and Phase 2 data
prediction_data = relevant_trials[relevant_trials['trial_type'] == 'prediction-task'].copy()

# Phase separation logic
phase1_data = prediction_data[prediction_data['phase'] == 1].copy()
phase2_data = prediction_data[prediction_data['phase'] == 2].copy()

print(f"Phase 1 data: {len(phase1_data)} rows")
print(f"Phase 2 data: {len(phase2_data)} rows")

# Get visualization literacy data
vis_literacy_data = relevant_trials[relevant_trials['trial_type'] == 'vis-literacy'].copy()
print(f"Visualization literacy data: {len(vis_literacy_data)} rows")

# Get trust survey data
trust_data = relevant_trials[relevant_trials['trial_type'] == 'trust-survey'].copy()
print(f"Trust survey data: {len(trust_data)} rows")

Phase 1 data: 5 rows
Phase 2 data: 5 rows
Visualization literacy data: 5 rows
Trust survey data: 10 rows


## 2. Basic Statistics Tables by Condition

Each table shows conditions as rows and response variables as columns, with participant response lists in each cell.

In [8]:
def create_condition_response_table(data, response_columns, title="Response Table"):
    """
    Create a table where each row is a condition and each column is a response variable.
    Each cell contains a list of participant responses.
    """
    # Get unique conditions
    conditions = sorted(data['condition_id'].dropna().unique())
    
    # Initialize results dictionary
    results = {}
    
    for condition in conditions:
        condition_data = data[data['condition_id'] == condition]
        condition_responses = {}
        
        for col in response_columns:
            if col in condition_data.columns:
                responses = condition_data[col].dropna().tolist()
                condition_responses[col] = responses
            else:
                condition_responses[col] = []
        
        results[condition] = condition_responses
    
    # Convert to DataFrame
    df = pd.DataFrame(results).T
    
    print(f"\n{title}")
    print("=" * len(title))
    return df

# Function to display response summary statistics
def display_response_summary(df, title="Summary"):
    """
    Display summary statistics for response lists in each cell
    """
    print(f"\n{title} - Response Counts and Basic Stats")
    print("-" * (len(title) + 30))
    
    for condition in df.index:
        print(f"\nCondition: {condition}")
        for col in df.columns:
            responses = df.loc[condition, col]
            if isinstance(responses, list) and responses:
                numeric_responses = [r for r in responses if isinstance(r, (int, float)) and not pd.isna(r)]
                if numeric_responses:
                    print(f"  {col}: n={len(numeric_responses)}, mean={np.mean(numeric_responses):.2f}, responses={numeric_responses}")
                else:
                    print(f"  {col}: n={len(responses)}, responses={responses[:5]}{'...' if len(responses) > 5 else ''}")
            else:
                print(f"  {col}: No responses")
    
    return df

### Phase 1 Responses (Baseline - No Visualization)

In [9]:
# Phase 1 response columns
phase1_columns = ['probability_estimate', 'confidence_rating', 'travel_choice']

# Create Phase 1 table
phase1_table = create_condition_response_table(
    phase1_data, 
    phase1_columns, 
    "Phase 1 Responses (No Visualization)"
)

# Display the table
display_response_summary(phase1_table, "Phase 1 Summary")


Phase 1 Responses (No Visualization)

Phase 1 Summary - Response Counts and Basic Stats
---------------------------------------------

Condition: condition_0_historical
  probability_estimate: n=5, mean=70.00, responses=[80.0, 88.0, 50.0, 50.0, 82.0]
  confidence_rating: n=5, mean=3.80, responses=[5.0, 4.0, 3.0, 2.0, 5.0]
  travel_choice: n=5, responses=['City A', 'City A', 'No Preference', 'No Preference', 'City A']


Unnamed: 0,probability_estimate,confidence_rating,travel_choice
condition_0_historical,"[80.0, 88.0, 50.0, 50.0, 82.0]","[5.0, 4.0, 3.0, 2.0, 5.0]","[City A, City A, No Preference, No Preference,..."


### Phase 2 Responses (With Visualization)

In [10]:
# Phase 2 response columns
phase2_columns = ['probability_estimate', 'confidence_rating', 'travel_choice', 'data_trust', 'skeptical_rating']

# Create Phase 2 table
phase2_table = create_condition_response_table(
    phase2_data, 
    phase2_columns, 
    "Phase 2 Responses (With Visualization)"
)

# Display the table
display_response_summary(phase2_table, "Phase 2 Summary")


Phase 2 Responses (With Visualization)

Phase 2 Summary - Response Counts and Basic Stats
---------------------------------------------

Condition: condition_5_pi_hover
  probability_estimate: n=1, mean=86.00, responses=[86.0]
  confidence_rating: n=1, mean=5.00, responses=[5.0]
  travel_choice: n=1, responses=['City A']
  data_trust: No responses
  skeptical_rating: No responses

Condition: condition_7_buggy
  probability_estimate: n=2, mean=75.00, responses=[100.0, 50.0]
  confidence_rating: n=2, mean=4.50, responses=[6.0, 3.0]
  travel_choice: n=2, responses=['City A', 'No Preference']
  data_trust: No responses
  skeptical_rating: No responses

Condition: condition_9_combined
  probability_estimate: n=2, mean=67.50, responses=[50.0, 85.0]
  confidence_rating: n=2, mean=4.50, responses=[3.0, 6.0]
  travel_choice: n=2, responses=['No Preference', 'City A']
  data_trust: No responses
  skeptical_rating: No responses


Unnamed: 0,probability_estimate,confidence_rating,travel_choice,data_trust,skeptical_rating
condition_5_pi_hover,[86.0],[5.0],[City A],[],[]
condition_7_buggy,"[100.0, 50.0]","[6.0, 3.0]","[City A, No Preference]",[],[]
condition_9_combined,"[50.0, 85.0]","[3.0, 6.0]","[No Preference, City A]",[],[]


### Visualization Literacy Question Responses by Condition

In [None]:
# Function to parse visualization literacy responses
def parse_vis_literacy_responses(responses_str):
    """Parse the JSON responses from visualization literacy test"""
    if pd.isna(responses_str):
        return []
    try:
        # Clean and parse JSON
        cleaned_str = responses_str.replace("'", '"')
        responses = json.loads(cleaned_str)
        return responses
    except:
        return []

# Parse visualization literacy responses
print("=== VISUALIZATION LITERACY RESPONSES ===")
print("\nIndividual Question Responses by Participant:")
print("-" * 50)

vis_lit_responses = {}
for idx, row in vis_literacy_data.iterrows():
    participant_id = row['participant_id']
    condition_id = row['condition_id']
    responses = parse_vis_literacy_responses(row['responses'])
    
    if responses:
        print(f"\nParticipant: {participant_id} (Condition: {condition_id})")
        print(f"Total Questions: {len(responses)}")
        
        # Store responses for table creation
        vis_lit_responses[participant_id] = {
            'condition_id': condition_id,
            'responses': responses
        }
        
        # Display first few responses as example
        for i, q in enumerate(responses[:3]):
            question_id = q.get('question_id', f'Q{i+1}')
            question_type = q.get('question_type', 'unknown')
            response = q.get('response', 'N/A')
            is_correct = q.get('is_correct', False)
            print(f"  {question_id} ({question_type}): Response={response}, Correct={is_correct}")
        
        if len(responses) > 3:
            print(f"  ... and {len(responses) - 3} more questions")
    else:
        print(f"\nParticipant: {participant_id} - No responses found")

# Create a summary table of correct/incorrect responses by question type
print("\n\n=== QUESTION TYPE PERFORMANCE SUMMARY ===")
question_performance = {}

for participant_id, data in vis_lit_responses.items():
    condition = data['condition_id']
    for q in data['responses']:
        question_type = q.get('question_type', 'unknown')
        is_correct = q.get('is_correct', False)
        response = q.get('response', 'N/A')
        
        if condition not in question_performance:
            question_performance[condition] = {}
        
        if question_type not in question_performance[condition]:
            question_performance[condition][question_type] = []
        
        question_performance[condition][question_type].append({
            'participant': participant_id,
            'response': response,
            'correct': is_correct
        })

# Display performance by condition and question type
for condition in sorted(question_performance.keys()):
    print(f"\nCondition: {condition}")
    print("-" * (len(condition) + 12))
    
    for question_type in sorted(question_performance[condition].keys()):
        responses = question_performance[condition][question_type]
        correct_count = sum(1 for r in responses if r['correct'])
        total_count = len(responses)
        accuracy = (correct_count / total_count) * 100 if total_count > 0 else 0
        
        print(f"  {question_type}: {correct_count}/{total_count} correct ({accuracy:.1f}%)")
        
        # Show individual responses
        response_list = [r['response'] for r in responses]
        print(f"    Responses: {response_list}")

### Trust and Usability Measures by Condition

In [12]:
# Examine trust survey data structure
print("Trust survey columns:")
trust_cols = [col for col in trust_data.columns if any(keyword in col.lower() for keyword in ['trust', 'usability', 'skeptical', 'comprehension'])]
print(trust_cols)

# Trust and usability response columns
trust_columns = ['trust_composite', 'usability_composite', 'data_trust', 'skeptical_rating', 'usability_difficulty', 'comprehension_ease']

# Create trust measures table
trust_table = create_condition_response_table(
    trust_data, 
    trust_columns, 
    "Trust and Usability Measures by Condition"
)

# Display the table
display_response_summary(trust_table, "Trust and Usability Summary")

Trust survey columns:
['comprehension_ease', 'data_trust', 'skeptical_rating', 'trust_composite', 'usability_composite', 'usability_difficulty']

Trust and Usability Measures by Condition

Trust and Usability Summary - Response Counts and Basic Stats
---------------------------------------------------------

Condition: condition_5_pi_hover
  trust_composite: n=1, mean=5.00, responses=[5.0]
  usability_composite: n=1, mean=5.00, responses=[5.0]
  data_trust: n=1, mean=5.00, responses=[5.0]
  skeptical_rating: n=1, mean=3.00, responses=[3.0]
  usability_difficulty: n=1, mean=3.00, responses=[3.0]
  comprehension_ease: n=1, mean=5.00, responses=[5.0]

Condition: condition_7_buggy
  trust_composite: n=2, mean=2.00, responses=[2.0, 2.0]
  usability_composite: n=2, mean=5.00, responses=[6.0, 4.0]
  data_trust: n=2, mean=2.00, responses=[2.0, 2.0]
  skeptical_rating: n=2, mean=5.00, responses=[6.0, 4.0]
  usability_difficulty: n=2, mean=3.50, responses=[2.0, 5.0]
  comprehension_ease: n=2, me

Unnamed: 0,trust_composite,usability_composite,data_trust,skeptical_rating,usability_difficulty,comprehension_ease
condition_5_pi_hover,[5.0],[5.0],[5.0],[3.0],[3.0],[5.0]
condition_7_buggy,"[2.0, 2.0]","[6.0, 4.0]","[2.0, 2.0]","[6.0, 4.0]","[2.0, 5.0]","[6.0, 4.0]"
condition_9_combined,"[4.0, 2.0]","[4.0, 3.0]","[4.0, 2.0]","[4.0, 6.0]","[4.0, 5.0]","[4.0, 3.0]"


## 3. Data Quality Assessment

In [13]:
# Participation completion rates
print("=== DATA QUALITY ASSESSMENT ===")
print("\n1. Participation Completion Rates")
print("-" * 35)

# Check completion by participant
participant_completion = combined_data.groupby('participant_id').agg({
    'trial_type': list,
    'phase1_complete': 'any',
    'phase2_complete': 'any',
    'condition_id': 'first'
}).reset_index()

print(f"Total participants: {len(participant_completion)}")
print(f"Phase 1 completed: {participant_completion['phase1_complete'].sum()}")
print(f"Phase 2 completed: {participant_completion['phase2_complete'].sum()}")
print(f"Both phases completed: {(participant_completion['phase1_complete'] & participant_completion['phase2_complete']).sum()}")

print("\nCompletion by condition:")
completion_by_condition = participant_completion.groupby('condition_id').agg({
    'participant_id': 'count',
    'phase1_complete': 'sum',
    'phase2_complete': 'sum'
}).rename(columns={'participant_id': 'total_participants'})
print(completion_by_condition)

=== DATA QUALITY ASSESSMENT ===

1. Participation Completion Rates
-----------------------------------
Total participants: 5
Phase 1 completed: 5
Phase 2 completed: 5
Both phases completed: 5

Completion by condition:
              total_participants  phase1_complete  phase2_complete
condition_id                                                      
unknown                        5                5                5


In [14]:
# Response time analysis
print("\n2. Response Time Analysis")
print("-" * 25)

# Get response times for prediction tasks
prediction_rt = prediction_data[prediction_data['rt'].notna()]
if len(prediction_rt) > 0:
    print(f"Prediction task response times (ms):")
    print(f"  Mean: {prediction_rt['rt'].mean():.0f} ms")
    print(f"  Median: {prediction_rt['rt'].median():.0f} ms")
    print(f"  Min: {prediction_rt['rt'].min():.0f} ms")
    print(f"  Max: {prediction_rt['rt'].max():.0f} ms")
    
    # Response times by phase
    print("\nResponse times by phase:")
    phase_rt = prediction_rt.groupby('phase')['rt'].agg(['count', 'mean', 'median']).round(0)
    print(phase_rt)
else:
    print("No response time data available")


2. Response Time Analysis
-------------------------
Prediction task response times (ms):
  Mean: 40141 ms
  Median: 43091 ms
  Min: 5149 ms
  Max: 84929 ms

Response times by phase:
       count     mean   median
phase                         
1.0        5  55413.0  58067.0
2.0        5  24868.0  22476.0


In [15]:
# Missing data patterns
print("\n3. Missing Data Patterns")
print("-" * 25)

# Key variables missing data
key_vars = ['probability_estimate', 'confidence_rating', 'data_trust', 'visualization_literacy_score']
missing_data = {}

for var in key_vars:
    if var in relevant_trials.columns:
        total_rows = len(relevant_trials)
        missing_count = relevant_trials[var].isna().sum()
        missing_pct = (missing_count / total_rows) * 100
        missing_data[var] = {'missing_count': missing_count, 'missing_pct': missing_pct}

missing_df = pd.DataFrame(missing_data).T
print("Missing data for key variables:")
print(missing_df.round(1))


3. Missing Data Patterns
-------------------------
Missing data for key variables:
                              missing_count  missing_pct
probability_estimate                   34.0         77.3
confidence_rating                      34.0         77.3
data_trust                             39.0         88.6
visualization_literacy_score           44.0        100.0


## 4. Sample Demographics

In [16]:
# Participant counts per condition
print("=== SAMPLE DEMOGRAPHICS ===")
print("\n1. Participant Counts by Condition")
print("-" * 35)

# Get unique participants per condition
participant_counts = relevant_trials.groupby(['condition_id', 'condition_name']).agg({
    'participant_id': 'nunique'
}).rename(columns={'participant_id': 'participant_count'}).reset_index()

print(participant_counts.to_string(index=False))
print(f"\nTotal unique participants: {relevant_trials['participant_id'].nunique()}")

=== SAMPLE DEMOGRAPHICS ===

1. Participant Counts by Condition
-----------------------------------
          condition_id         condition_name  participant_count
condition_0_historical        Historical Only                  4
  condition_5_pi_hover        PI Plot + Hover                  1
     condition_7_buggy          Buggy Control                  1
  condition_9_combined Combined PI + Ensemble                  2

Total unique participants: 4


In [17]:
# Basic demographic information from survey data
print("\n2. Basic Demographics")
print("-" * 20)

# Get demographic data
demo_text = relevant_trials[relevant_trials['trial_type'] == 'survey-text'].copy()
demo_multi = relevant_trials[relevant_trials['trial_type'] == 'survey-multi-choice'].copy()

if len(demo_text) > 0:
    print("Text survey responses available:", len(demo_text))
    
if len(demo_multi) > 0:
    print("Multi-choice survey responses available:", len(demo_multi))
    
    # Parse responses if they exist
    if 'responses' in demo_multi.columns:
        print("\nEducation and visualization experience data:")
        for idx, row in demo_multi.iterrows():
            if pd.notna(row['responses']):
                try:
                    responses = json.loads(row['responses'].replace("'", '"'))
                    participant_id = row['participant_id']
                    condition = row['condition_id']
                    print(f"  Participant {participant_id} ({condition}): {responses}")
                except:
                    print(f"  Could not parse responses for participant {row['participant_id']}")


2. Basic Demographics
--------------------
Text survey responses available: 9
Multi-choice survey responses available: 5

Education and visualization experience data:


## 5. Summary Overview

In [18]:
print("=== STUDY SUMMARY OVERVIEW ===")
print(f"Dataset: {combined_data.shape[0]} total rows, {combined_data.shape[1]} columns")
print(f"Participants: {combined_data['participant_id'].nunique()} unique participants")
print(f"Conditions tested: {combined_data['condition_id'].nunique()} different conditions")
print(f"\nPhases:")
print(f"  Phase 1 (no visualization): {len(phase1_data)} prediction trials")
print(f"  Phase 2 (with visualization): {len(phase2_data)} prediction trials")
print(f"\nData collection timeframe:")
if 'start_time' in combined_data.columns and combined_data['start_time'].notna().any():
    start_times = pd.to_datetime(combined_data['start_time'].dropna())
    print(f"  Earliest: {start_times.min()}")
    print(f"  Latest: {start_times.max()}")
else:
    print(f"  Timeframe not available in data")

print(f"\nExperimental conditions observed:")
unique_conditions = combined_data[['condition_id', 'condition_name']].drop_duplicates().dropna()
for _, row in unique_conditions.iterrows():
    count = combined_data[combined_data['condition_id'] == row['condition_id']]['participant_id'].nunique()
    print(f"  {row['condition_id']}: {row['condition_name']} (n={count})")

=== STUDY SUMMARY OVERVIEW ===
Dataset: 92 total rows, 46 columns
Participants: 5 unique participants
Conditions tested: 5 different conditions

Phases:
  Phase 1 (no visualization): 5 prediction trials
  Phase 2 (with visualization): 5 prediction trials

Data collection timeframe:
  Earliest: 2025-12-08 04:14:57.200000+00:00
  Latest: 2025-12-08 17:26:19.888000+00:00

Experimental conditions observed:
  condition_7_buggy: Buggy Control (n=1)
  condition_0_historical: Historical Only (n=4)
  condition_5_pi_hover: PI Plot + Hover (n=1)
  condition_9_combined: Combined PI + Ensemble (n=2)


In [19]:
# Save processed data for further analysis
print("\n=== SAVING PROCESSED DATA ===")

# Save key datasets
phase1_data.to_csv('./data/processed_phase1_data.csv', index=False)
phase2_data.to_csv('./data/processed_phase2_data.csv', index=False)
vis_lit_scores.to_csv('./data/processed_vis_literacy.csv', index=False)
trust_data.to_csv('./data/processed_trust_data.csv', index=False)

print("Processed datasets saved:")
print("  - processed_phase1_data.csv")
print("  - processed_phase2_data.csv")
print("  - processed_vis_literacy.csv")
print("  - processed_trust_data.csv")
print("\nAnalysis complete!")


=== SAVING PROCESSED DATA ===
Processed datasets saved:
  - processed_phase1_data.csv
  - processed_phase2_data.csv
  - processed_vis_literacy.csv
  - processed_trust_data.csv

Analysis complete!
