In [1]:
# ## 00 - Consolidate Raw Data (Cleaned and Summarized Version)
#
# **Objective:** Combine the individual raw data files from all 38 participants
# into three master CSV files. This version cleans the data, handles errors, and
# selects only the most relevant columns to create summarized, analysis-ready files.
#
# **Input:** All individual participant files from `../data/raw/` (e.g., `1_EEG.csv`, `1_GSR.csv`, etc.)
#
# **Output:** Three cleaned and summarized CSV files saved in `../data/raw/All_raw_data/`:
# 1. `EEG.csv` (with selected columns)
# 2. `GSR.csv` (with selected columns)
# 3. `PSY.csv` (cleaned)

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# --- Configuration ---
NUM_PARTICIPANTS = 38
RAW_DATA_DIR = '../data/raw/'
OUTPUT_DIR = os.path.join(RAW_DATA_DIR, 'All_raw_data')

# Make sure the destination folder is present before anything is written to it
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created at: {OUTPUT_DIR}")

# --- Define the essential columns to keep for summarization ---
# EEG: 'UnixTime' followed by one '<band>_<site>' column for every band/site
# pair, grouped by site (all five bands of TP9 first, then AF7, AF8, TP10).
_EEG_SITES = ('TP9', 'AF7', 'AF8', 'TP10')
_EEG_BANDS = ('Delta', 'Theta', 'Alpha', 'Beta', 'Gamma')
EEG_COLS_TO_KEEP = ['UnixTime'] + [
    f'{band}_{site}' for site in _EEG_SITES for band in _EEG_BANDS
]
# GSR: the timestamp plus a single measurement column.
GSR_COLS_TO_KEEP = ['UnixTime', 'GSR Conductance CAL']

# --- Helper Function for Memory-Efficient Consolidation ---
def consolidate_files(file_prefix, output_filename, cols_to_keep=None):
    """
    Read, clean, and append every participant's file to one master CSV.

    Files named ``<id>_<file_prefix>.csv`` are read from ``RAW_DATA_DIR`` for
    ids 1..``NUM_PARTICIPANTS`` and written one at a time to
    ``OUTPUT_DIR/output_filename``, so only one participant's data is held in
    memory at any moment. A ``ParticipantID`` column is added to every row.

    Args:
        file_prefix: File-type tag used in the input file names (e.g. 'EEG').
        output_filename: Name of the master CSV created inside ``OUTPUT_DIR``.
        cols_to_keep: Optional list of column names to keep. A column missing
            from a participant's file is created and ends up filled with 0.
            When omitted (or empty), all columns are kept and the first file
            that is read defines the column layout of the master file.

    Returns:
        None. Progress, warnings, and the final status are printed.
    """
    master_file_path = os.path.join(OUTPUT_DIR, output_filename)
    is_first_file = True

    # Column layout every appended chunk must follow. Known up front when a
    # whitelist is given; otherwise taken from the first file that is read.
    master_columns = list(cols_to_keep) + ['ParticipantID'] if cols_to_keep else None

    print(f"\nConsolidating and cleaning {file_prefix} files...")
    for participant_id in tqdm(range(1, NUM_PARTICIPANTS + 1), desc=f"Processing {file_prefix}"):
        file_name = f'{participant_id}_{file_prefix}.csv'

        # Only the read can raise the errors handled here, so keep the try small.
        try:
            df = pd.read_csv(os.path.join(RAW_DATA_DIR, file_name), low_memory=False)
        except FileNotFoundError:
            print(f"Warning: File for participant {participant_id}_{file_prefix}.csv not found. Skipping.")
            continue
        except pd.errors.EmptyDataError:
            # A zero-byte file must not abort the whole consolidation run.
            print(f"Warning: File {file_name} is empty. Skipping.")
            continue

        # --- Data Cleaning ---
        _clean_in_place(df)

        # --- Summarization (Column Selection) ---
        df['ParticipantID'] = participant_id
        if master_columns is None:
            master_columns = list(df.columns)
        elif list(df.columns) != master_columns:
            if not cols_to_keep:
                # Rows are appended without a header, so a differing layout
                # would silently put values under the wrong column names.
                extra = [col for col in df.columns if col not in master_columns]
                if extra:
                    print(f"Warning: {file_name} has columns not present in the "
                          f"master file; dropping {extra}.")
            # Reorders columns, drops unknown ones, and creates missing ones as NaN.
            df = df.reindex(columns=master_columns)

        # Final cleaning of any remaining missing data in relevant columns
        df = df.fillna(0)

        # Append to master file: the first chunk (re)creates it with a header.
        df.to_csv(
            master_file_path,
            index=False,
            mode='w' if is_first_file else 'a',
            header=is_first_file,
        )
        is_first_file = False

    if not is_first_file:
        print(f"Successfully created master {output_filename}")
    else:
        print(f"No {file_prefix} files were found to consolidate.")


def _clean_in_place(df):
    """Null out infinities and drop rows lacking a numeric timestamp, in place.

    The timestamp columns considered are 'UnixTime', 'routineStart' and
    'routineEnd'; each one is handled only if the frame actually has it, so a
    file with just one of the routine columns no longer raises KeyError.
    """
    # Handle infinity values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    time_cols = [col for col in ('UnixTime', 'routineStart', 'routineEnd') if col in df.columns]
    for col in time_cols:
        # Unparseable timestamps become NaN and are removed just below.
        df[col] = pd.to_numeric(df[col], errors='coerce')
    if time_cols:
        df.dropna(subset=time_cols, inplace=True)

# --- Run the Consolidation for Each File Type ---
# Each entry pairs a file-type prefix with its column whitelist; None means
# every column of that file type is kept. The master file is '<prefix>.csv'.
_CONSOLIDATION_JOBS = (
    ('EEG', EEG_COLS_TO_KEEP),
    ('GSR', GSR_COLS_TO_KEEP),
    ('PSY', None),
)
for _prefix, _columns in _CONSOLIDATION_JOBS:
    consolidate_files(_prefix, f'{_prefix}.csv', cols_to_keep=_columns)

print("\nData consolidation, cleaning, and summarization complete.")



Output directory created at: ../data/raw/All_raw_data

Consolidating and cleaning EEG files...


Processing EEG: 100%|██████████| 38/38 [03:10<00:00,  5.02s/it]


Successfully created master EEG.csv

Consolidating and cleaning GSR files...


Processing GSR: 100%|██████████| 38/38 [00:19<00:00,  1.96it/s]


Successfully created master GSR.csv

Consolidating and cleaning PSY files...


Processing PSY: 100%|██████████| 38/38 [00:00<00:00, 320.23it/s]

Successfully created master PSY.csv

Data consolidation, cleaning, and summarization complete.



