In [None]:
import pandas as pd
import numpy as np
import pingouin as pg # For ANOVA
import os
import glob # For finding files
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Beta windows this notebook can analyse; the selected key picks which
# `all_<key>_betas_df` frame the later cells work on (currently "lc").
plot_choices = ("plr", "lc")
chosen_plot = plot_choices[-1]

In [None]:
# --- Step 1.2: Define Paths and Load Summarized Beta Data ---

PARTICIPANT_IDS = ["aaaa", "bbbb", "cccc", "dddd", "eeee", "ffff"]
BETA_CSV_DIR = "data/decon_betas_summaries"


def load_beta_csvs(csv_paths, label):
    """Read every CSV in `csv_paths` and stack the results into one DataFrame.

    Files that cannot be read are reported and skipped, so a single missing
    or malformed file does not abort the whole load.

    Parameters
    ----------
    csv_paths : list of str
        Paths of the beta-summary CSV files to read.
    label : str
        Short window name (e.g. "LC" or "PLR") used in the printed report.

    Returns
    -------
    tuple (list of pandas.DataFrame, pandas.DataFrame)
        The successfully loaded per-file frames, followed by their row-wise
        concatenation (an empty DataFrame when nothing could be loaded).

    Raises
    ------
    KeyError
        If the concatenated data lacks a 'Subject', 'Run' or 'Event_Type'
        column, which the summary report reads.
    """
    loaded_dfs = []
    for f_path in csv_paths:
        try:
            loaded_dfs.append(pd.read_csv(f_path))
        except Exception as e:
            # Best-effort load: report the failing file and keep going.
            print(f"Error loading or processing file {f_path}: {e}")

    if not loaded_dfs:
        print(f"ERROR: No {label} dataframes were successfully loaded. Check individual CSV files.")
        return loaded_dfs, pd.DataFrame()

    if len(loaded_dfs) < len(csv_paths):
        # A partial load leaves some subject/run combinations without data,
        # so make it visible instead of silently continuing.
        print(f"WARNING: Only {len(loaded_dfs)} of {len(csv_paths)} {label} beta CSV files were loaded.")

    combined_df = pd.concat(loaded_dfs, ignore_index=True)
    print(f"\n--- All {label} Betas Loaded and Concatenated ---")
    print(f"Shape of all_{label.lower()}_betas_df: {combined_df.shape}")
    print("Columns:", combined_df.columns.tolist())
    print("\nData snippet (head):")
    print(combined_df.head())
    print("\nUnique Subjects found:", combined_df['Subject'].unique())
    print("Unique Runs found:", combined_df['Run'].unique())
    print("Unique Event_Types found:", combined_df['Event_Type'].unique())
    return loaded_dfs, combined_df


# Build the expected CSV path for every participant and run. The paths are
# constructed, not discovered: nothing here checks that the files exist.
lc_csv_files = []
plr_csv_files = []

for sid in PARTICIPANT_IDS:
    for run in [1, 2]:
        # The "cognitive_window" summaries feed the LC analysis,
        # the "plr_window" summaries feed the PLR analysis.
        lc_csv_files.append(os.path.join(BETA_CSV_DIR, sid, f"{sid}_R{run}_betas_cognitive_window.csv"))
        plr_csv_files.append(os.path.join(BETA_CSV_DIR, sid, f"{sid}_R{run}_betas_plr_window.csv"))

print(f"Expecting {len(lc_csv_files)} LC beta CSV files.")
print(f"Expecting {len(plr_csv_files)} PLR beta CSV files.")

# Load and concatenate all LC beta DataFrames
list_of_lc_dfs, all_lc_betas_df = load_beta_csvs(lc_csv_files, "LC")

# Load and concatenate all PLR beta DataFrames
list_of_plr_dfs, all_plr_betas_df = load_beta_csvs(plr_csv_files, "PLR")


In [None]:
# Display the concatenated PLR betas built in the loading cell
# (an empty DataFrame if no PLR CSV could be read).
all_plr_betas_df

In [None]:
# Pupil_ANOVA_Analysis.ipynb

# ... (Previous cells: Imports, Define Paths, Load & Concatenate Data - Steps 1.1 to 1.2) ...

# --- Step 1.3: Data Cleaning and Factor Creation (window selected by `chosen_plot`) ---
# Builds `anova_df` from the chosen betas frame: keeps the six main-stimulus
# event types, derives the 'MainStimulus' and 'Attention' factor columns, and
# renames the dependent variable column to 'Beta_<chosen_plot>'.


def extract_main_stimulus(event_type_str):
    """Return the stimulus level encoded in an Event_Type label, or NaN if none matches."""
    if 'Standard' in event_type_str:
        return 'Standard'
    elif 'CommOdd' in event_type_str:
        return 'CommonOddball'
    elif 'RareOdd' in event_type_str:
        return 'RareOddball'
    return np.nan


def extract_attention(event_type_str):
    """Return 'attend' or 'divert' from an Event_Type label, or NaN if neither suffix is present."""
    if '_attend' in event_type_str:
        return 'attend'
    elif '_divert' in event_type_str:
        return 'divert'
    return np.nan


# Look the source frame up by name so the emptiness check is applied to the
# frame that `chosen_plot` actually selects (not always the LC one).
selected_betas_df = globals().get(f'all_{chosen_plot}_betas_df')

if selected_betas_df is None or selected_betas_df.empty:
    print(f"ERROR: 'all_{chosen_plot}_betas_df' not found or is empty. Cannot proceed with data cleaning.")
    # Create an empty df to prevent downstream errors if user wants to continue notebook execution
    anova_df = pd.DataFrame()
else:
    print(f"\n--- Preparing Data for ANOVA ({chosen_plot.upper()} Window Betas) ---")

    # 1. Filter for relevant Event_Types (main stimuli only)
    # We want to exclude nuisance regressors like Keypress, Attention Targets, Distractors for the main ANOVA
    main_stim_event_types = [
        'Standard_attend', 'CommOdd_attend', 'RareOdd_attend',
        'Standard_divert', 'CommOdd_divert', 'RareOdd_divert'
    ]
    # .copy() makes anova_df an independent frame: the loaded data stays
    # untouched and adding columns below does not raise SettingWithCopyWarning.
    anova_df = selected_betas_df[selected_betas_df['Event_Type'].isin(main_stim_event_types)].copy()

    if anova_df.empty:
        print("ERROR: No data remaining after filtering for main stimulus event types. Check Event_Type names.")
    else:
        print(f"\nShape after filtering for main stimuli Event_Types: {anova_df.shape}")
        print("Remaining Event_Types for ANOVA:", anova_df['Event_Type'].unique())

        # 2. Create Factor Columns: 'MainStimulus' and 'Attention'
        anova_df['MainStimulus'] = anova_df['Event_Type'].apply(extract_main_stimulus)
        anova_df['Attention'] = anova_df['Event_Type'].apply(extract_attention)

        # 3. Rename the dependent variable column for convenience
        # Find the column(s) whose name contains 'Mean_Beta_'
        dv_col = [col for col in anova_df.columns if 'Mean_Beta_' in col]
        if not dv_col:
            print(f"ERROR: Dependent variable column for {chosen_plot.upper()} betas not found (expected to contain 'Mean_Beta_').")
        else:
            if len(dv_col) > 1:
                # Only one DV column is expected; make an ambiguous match visible.
                print(f"WARNING: Multiple 'Mean_Beta_' columns found {dv_col}; using the first.")
            dv_col_name = dv_col[0]
            beta_col = f'Beta_{chosen_plot}'
            anova_df = anova_df.rename(columns={dv_col_name: beta_col})
            print(f"\nRenamed DV column '{dv_col_name}' to '{beta_col}'.")

            # 4. Check for any missing values created during parsing (shouldn't be any if Event_Types are correct)
            key_cols = ['MainStimulus', 'Attention', beta_col]
            rows_with_missing = anova_df[key_cols].isna().any(axis=1)
            if rows_with_missing.any():
                print("\nWARNING: Missing values found after creating factor columns or in DV. Review anova_df:")
                print(anova_df[rows_with_missing])
            else:
                print("\nNo missing values in key factor columns or DV after parsing.")

            # 5. Display the prepared DataFrame structure
            print("\nPrepared anova_df structure (head):")
            print(anova_df.head())
            print("\nUnique values for new factor columns:")
            print("MainStimulus:", anova_df['MainStimulus'].unique())
            print("Attention:", anova_df['Attention'].unique())

            # Optional: Check number of observations per subject per condition
            # This helps ensure the data is balanced or to understand any imbalances.
            print("\nObservations per Subject x MainStimulus x Attention:")
            print(anova_df.groupby(['Subject', 'MainStimulus', 'Attention']).size().unstack(fill_value=0))

In [None]:
# --- Step 2: 2x3 repeated-measures ANOVA (Attention x MainStimulus) on the selected betas ---
# Reads `anova_df` and `chosen_plot`; leaves a type-normalised `anova_df`
# (string factors, numeric DV, NaN DV rows removed) and the result table `aov`.

if 'anova_df' not in locals() or anova_df.empty:
    print("ERROR: 'anova_df' not found or is empty. Cannot perform ANOVA.")
    raise ValueError("'anova_df' is missing or empty.")

dv_name = f'Beta_{chosen_plot}'
if dv_name not in anova_df.columns:
    # Happens when the preparation cell could not find a 'Mean_Beta_' column to rename.
    print(f"ERROR: DV column '{dv_name}' not found in anova_df. Re-run the data preparation cell.")
    raise KeyError(f"DV column '{dv_name}' is missing from anova_df.")

print("\n--- Performing 2x3 Repeated Measures ANOVA (Attention x MainStimulus) ---")
print(f"DV: {dv_name}")
print("Within-subject factors: Attention, MainStimulus")
print("Subject identifier: Subject")

# Normalise dtypes on a new frame (factors as str, DV numeric with
# unparseable values coerced to NaN) rather than assigning into a frame
# that may be a slice of the loaded data.
anova_df = anova_df.assign(
    Subject=anova_df['Subject'].astype(str),
    MainStimulus=anova_df['MainStimulus'].astype(str),
    Attention=anova_df['Attention'].astype(str),
    **{dv_name: pd.to_numeric(anova_df[dv_name], errors='coerce')},
)

# Drop rows with NaNs in DV if any were coerced
n_missing_dv = anova_df[dv_name].isna().sum()
if n_missing_dv:
    print(f"Warning: Found {n_missing_dv} NaNs in DV. These rows will be dropped for ANOVA.")
    anova_df = anova_df.dropna(subset=[dv_name])

if anova_df.empty:
    print("ERROR: anova_df became empty after NaN drop in DV. Cannot perform ANOVA.")
else:
    try:
        # Perform the repeated measures ANOVA.
        # NOTE(review): per the pingouin documentation, `detailed=True` requests
        # the full ANOVA table, and for two-way designs `correction='auto'`
        # reports Greenhouse-Geisser corrected p-values alongside uncorrected
        # ones (no sphericity test is run for two-way designs) - confirm
        # against the installed pingouin version.
        aov = pg.rm_anova(
            data=anova_df,
            dv=dv_name,                             # Dependent variable
            within=['Attention', 'MainStimulus'],   # Within-subject factors
            subject='Subject',                      # Subject identifier
            detailed=True,
            correction='auto'
        )

        print("\n--- ANOVA Results ---")
        print(aov)

        # --- Interpretation Guidance based on Hypotheses ---
        # H1 (Expectation): Pupil responses will be larger for Oddball stimuli
        #                   compared to Standard stimuli.
        #   -> Look for a significant main effect of 'MainStimulus'.
        #      If significant, follow up with post-hoc tests comparing levels
        #      (Standard vs. CommonOddball, Standard vs. RareOddball, CommonOddball vs. RareOddball).

        # H2 (Attention Modulation): The "expectation effect" (Oddball - Standard difference)
        #                            will be significantly greater in the Attend condition
        #                            compared to the Divert condition.
        #   -> Look for a significant interaction effect: 'Attention * MainStimulus'.
        #      If significant, this is key for H2. Follow up with:
        #          - Simple main effects (e.g., effect of MainStimulus within Attend, then within Divert)
        #          - Or, calculate difference scores (Oddball_Beta - Standard_Beta) for each attention
        #            condition and then t-test these difference scores.

        # Also check main effect of 'Attention'.

    except Exception as e:
        # Deliberate best-effort: report the failure so the rest of the notebook
        # can still run. `aov` is left undefined in that case.
        print(f"ERROR during ANOVA calculation: {e}")
        print("Check data structure, column names, and ensure pingouin is installed.")

In [None]:
# Plotting the ANOVA
# Draws (1) an interaction point plot of the DV by MainStimulus and Attention,
# (2) a bar plot of the DV by Attention, then prints per-Attention mean and SD.
if 'anova_df' in locals() and not anova_df.empty:
    print("\n--- Plotting Means for ANOVA Factors ---")

    plot_dv = f'Beta_{chosen_plot}'
    window_label = chosen_plot.upper()

    # NOTE(review): errorbar='se' is computed by seaborn over every row that
    # falls in a plotted cell. If anova_df holds several rows per subject and
    # condition, this is not the between-subject SE - confirm this is intended.
    fig_interaction, ax_interaction = plt.subplots(figsize=(10, 6))
    sns.pointplot(
        data=anova_df,
        x='MainStimulus',
        y=plot_dv,
        hue='Attention',
        dodge=True, # Separate points for hue levels
        errorbar='se', # Show standard error of the mean
        capsize=.1,
        order=['Standard', 'CommonOddball', 'RareOddball'], # Ensure consistent order
        ax=ax_interaction
    )
    # NOTE(review): the "1250-3000ms" window in the title is hard-coded for
    # every value of chosen_plot - verify it also describes the PLR window.
    ax_interaction.set_title(f'Mean {window_label} Beta (1250-3000ms) by Main Stimulus and Attention', fontsize=15)
    ax_interaction.set_xlabel('Main Stimulus Type', fontsize=12)
    ax_interaction.set_ylabel(f'Mean Beta Coefficient ({window_label} Window)', fontsize=12)
    ax_interaction.axhline(0, color='grey', linestyle='--', linewidth=1)
    ax_interaction.legend(title='Attention Condition', fontsize=10, title_fontsize='11')
    ax_interaction.grid(True, linestyle=':', alpha=0.5)
    fig_interaction.tight_layout()
    plt.show()

    # Bar plot for main effect of Attention
    fig_attention, ax_attention = plt.subplots(figsize=(6, 5))
    sns.barplot(
        data=anova_df,
        x='Attention',
        y=plot_dv,
        # Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so
        # colour by the x variable explicitly; dodge=False keeps one
        # full-width bar per level on older seaborn versions too.
        hue='Attention',
        dodge=False,
        errorbar='se',
        capsize=.1,
        palette={'attend': 'skyblue', 'divert': 'salmon'},
        ax=ax_attention
    )
    # hue duplicates the x axis, so any legend seaborn adds is redundant.
    redundant_legend = ax_attention.get_legend()
    if redundant_legend is not None:
        redundant_legend.remove()
    ax_attention.set_title(f'Main Effect of Attention on {window_label} Beta', fontsize=15)
    ax_attention.set_xlabel('Attention Condition', fontsize=12)
    ax_attention.set_ylabel(f'Mean Beta Coefficient ({window_label} Window)', fontsize=12)
    ax_attention.grid(True, axis='y', linestyle=':', alpha=0.5)
    fig_attention.tight_layout()
    plt.show()

    # Print means and SDs for main effect of Attention (rounded to 2 decimals)
    dv_by_attention = anova_df.groupby('Attention')[plot_dv]
    print(f"\nMean Beta_{window_label} by Attention condition:")
    print(dv_by_attention.mean().round(2))
    print(f"\nSD Beta_{window_label} by Attention condition:")
    print(dv_by_attention.std().round(2))

else:
    print("anova_df not available for plotting means.")