In [1]:
# ## 01 - Data Preprocessing
#
# *Objective:* Load the raw EEG, GSR, and PSY files for all 38 participants,
# clean the data, align the EEG and GSR samples to each task's start/end timestamps,
# and create structured "data windows" for each task.
#
# *Output:* A single pickle file (task_windows.pkl) containing a list of all valid data windows.

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# --- Configuration ---
# Relative paths are resolved against the current working directory.
# Participants are numbered 1..NUM_PARTICIPANTS in the raw file names.
NUM_PARTICIPANTS = 38
RAW_DATA_DIR = '../data/raw/'
PROCESSED_DATA_DIR = '../data/processed/'

# Make sure the output directory is there before anything is written into it.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Every task window ends up in this single pickle file.
OUTPUT_FILE = os.path.join(PROCESSED_DATA_DIR, 'task_windows.pkl')

# --- Main Processing Loop ---
# For each participant: read the EEG, GSR and PSY CSVs, force every column that
# is used below to a numeric dtype, and cut one EEG/GSR window per PSY task row.
# Each window is appended to `all_participants_windows` as a dict with the keys
# 'Participant', 'TaskKey', 'CognitiveLoad', 'EEG_Data' (2-D array, one column
# per entry of `eeg_cols`) and 'GSR_Data' (1-D array).
all_participants_windows = []

# Loop-invariant definitions: built once instead of once per participant.
label_mapping = {1: 0, 2: 1, 3: 2}  # Low, Medium, High
eeg_cols = ['Delta_TP9', 'Theta_TP9', 'Alpha_TP9', 'Beta_TP9', 'Gamma_TP9',
            'Delta_AF7', 'Theta_AF7', 'Alpha_AF7', 'Beta_AF7', 'Gamma_AF7',
            'Delta_AF8', 'Theta_AF8', 'Alpha_AF8', 'Beta_AF8', 'Gamma_AF8',
            'Delta_TP10', 'Theta_TP10', 'Alpha_TP10', 'Beta_TP10', 'Gamma_TP10']
gsr_col = 'GSR Conductance CAL'
eeg_numeric_cols = ['UnixTime'] + eeg_cols
gsr_numeric_cols = ['UnixTime', gsr_col]
psy_numeric_cols = ['routineStart', 'routineEnd', 'Category']
task_cols = ['Key', 'routineStart', 'routineEnd', 'CognitiveLoad']

# Task rows whose Category is not a key of `label_mapping`. They are skipped
# (a NaN label is not a valid window) and counted so the skip is visible.
unlabelled_task_count = 0

print(f"Starting preprocessing for {NUM_PARTICIPANTS} participants...")

for participant_id in tqdm(range(1, NUM_PARTICIPANTS + 1), desc="Processing Participants"):
    # Construct file paths
    eeg_file = os.path.join(RAW_DATA_DIR, f'{participant_id}_EEG.csv')
    gsr_file = os.path.join(RAW_DATA_DIR, f'{participant_id}_GSR.csv')
    psy_file = os.path.join(RAW_DATA_DIR, f'{participant_id}_PSY.csv')

    # Only the reads are guarded: a missing file skips the participant, while
    # any other problem (e.g. a missing column) still surfaces as an error.
    try:
        eeg_df = pd.read_csv(eeg_file, low_memory=False)
        gsr_df = pd.read_csv(gsr_file, low_memory=False)
        psy_df = pd.read_csv(psy_file, low_memory=False)
    except FileNotFoundError:
        print(f"Warning: Files for participant {participant_id} not found. Skipping.")
        continue

    # --- Data Cleaning and Timestamp Conversion ---
    # Coerce every column used below, not just the timestamps: a column read as
    # object dtype (stray text in the CSV) would otherwise reach interpolate()
    # and the saved arrays as non-numeric values. Unparseable cells become NaN.
    eeg_df[eeg_numeric_cols] = eeg_df[eeg_numeric_cols].apply(pd.to_numeric, errors='coerce')
    gsr_df[gsr_numeric_cols] = gsr_df[gsr_numeric_cols].apply(pd.to_numeric, errors='coerce')
    psy_df[psy_numeric_cols] = psy_df[psy_numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Infinite values are removed after coercion so that textual infinities
    # that only became numeric in the step above are caught as well.
    eeg_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    gsr_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    eeg_df.dropna(subset=['UnixTime'], inplace=True)
    gsr_df.dropna(subset=['UnixTime'], inplace=True)
    psy_df.dropna(subset=['routineStart', 'routineEnd'], inplace=True)

    # Define cognitive load labels; categories outside the mapping become NaN
    # and are dropped instead of being stored as windows without a label.
    psy_df['CognitiveLoad'] = psy_df['Category'].map(label_mapping)
    unlabelled_task_count += int(psy_df['CognitiveLoad'].isna().sum())
    psy_df.dropna(subset=['CognitiveLoad'], inplace=True)

    # --- Windowing ---
    eeg_time = eeg_df['UnixTime']
    gsr_time = gsr_df['UnixTime']
    for task_key, start_time, end_time, load in psy_df[task_cols].itertuples(index=False, name=None):
        # Samples with start_time <= UnixTime <= end_time (both ends inclusive).
        # Boolean-mask selection returns new objects, so the source frames are
        # never modified by the repair step below.
        eeg_slice = eeg_df.loc[eeg_time.between(start_time, end_time), eeg_cols]
        gsr_slice = gsr_df.loc[gsr_time.between(start_time, end_time), gsr_col]

        # A window needs at least one sample from both signals.
        if eeg_slice.empty or gsr_slice.empty:
            continue

        # Repair instead of discard: linear interpolation fills gaps in both
        # directions; a column with no valid sample at all falls back to 0.
        eeg_window = eeg_slice.interpolate(method='linear', limit_direction='both').fillna(0)
        gsr_window = gsr_slice.interpolate(method='linear', limit_direction='both').fillna(0)

        all_participants_windows.append({
            'Participant': participant_id,
            'TaskKey': task_key,
            'CognitiveLoad': int(load),
            'EEG_Data': eeg_window.to_numpy(),
            'GSR_Data': gsr_window.to_numpy(),
        })

if unlabelled_task_count:
    print(f"Warning: skipped {unlabelled_task_count} task rows whose Category is not in {sorted(label_mapping)}.")

# --- Save the final processed data ---
# Persist the list of window dicts as one pickle, then print a short summary.
pd.to_pickle(all_participants_windows, OUTPUT_FILE)

print(
    "\nPreprocessing complete.",
    f"Total number of VALID task windows created: {len(all_participants_windows)}",
    f"Processed data saved to: {OUTPUT_FILE}",
    sep="\n",
)



Starting preprocessing for 38 participants...


Processing Participants: 100%|██████████| 38/38 [01:30<00:00,  2.37s/it]



Preprocessing complete.
Total number of VALID task windows created: 1364
Processed data saved to: ../data/processed/task_windows.pkl
