In [None]:
# Step 1: Create proper data organization structure

import os
import shutil
import pandas as pd
import numpy as np

print("=== CREATING PROPER DATA ORGANIZATION ===")

# Root of the downloaded dataset and the folder the notebook reorganizes it into.
base_dir = r'D:\impress_project\eeg_signals\data\LRMI-21679035'
organized_dir = os.path.join(base_dir, 'organized_data_v2')

print(f"Base directory: {base_dir}")
print(f"Organized directory: {organized_dir}")

# Sub-folders of the organized layout, in the order they are created/reported:
#   raw_data            - original .mat files
#   paper_preprocessed  - the paper's preprocessed .edf files
#   my_preprocessed     - custom preprocessing results
#   patient_info        - patient metadata
#   events              - event markers
#   results             - analysis results
#   plots               - visualizations
#   code                - analysis scripts
#   temp                - temporary files
folders_to_create = [
    'raw_data',
    'paper_preprocessed',
    'my_preprocessed',
    'patient_info',
    'events',
    'results',
    'plots',
    'code',
    'temp',
]

# Main directory: create it only when absent, and report which case applied.
main_dir_already_there = os.path.exists(organized_dir)
if not main_dir_already_there:
    os.makedirs(organized_dir)
if main_dir_already_there:
    print(f"âœ“ Directory already exists: {organized_dir}")
else:
    print(f"âœ“ Created main directory: {organized_dir}")

# Sub-directories: same create-if-missing logic, one status line per folder.
for folder in folders_to_create:
    folder_path = os.path.join(organized_dir, folder)
    if os.path.exists(folder_path):
        print(f"  Exists: {folder}")
        continue
    os.makedirs(folder_path)
    print(f"  Created: {folder}")

=== CREATING PROPER DATA ORGANIZATION ===
Base directory: D:\impress_project\eeg_signals\data\LRMI-21679035
Organized directory: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2
âœ“ Created main directory: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2
  Created: raw_data
  Created: paper_preprocessed
  Created: my_preprocessed
  Created: patient_info
  Created: events
  Created: results
  Created: plots
  Created: code
  Created: temp


In [None]:
# Step 2: Organize RAW Data (.mat files)
# Collects every motor-imagery .mat file below <base_dir>/sourcedata and copies
# it (flat, original file name) into <organized_dir>/raw_data.

print("\n=== STEP 2: ORGANIZING RAW DATA (.mat files) ===")

raw_source_dir = os.path.join(base_dir, 'sourcedata')

if not os.path.exists(raw_source_dir):
    print(f"âœ— Raw data directory not found: {raw_source_dir}")
    raw_files = []
else:
    # Recursive search; only files that are both .mat and motor-imagery recordings.
    raw_files = [
        os.path.join(dir_path, name)
        for dir_path, _sub_dirs, names in os.walk(raw_source_dir)
        for name in names
        if name.endswith('.mat') and 'task-motor-imagery' in name
    ]
    print(f"Found {len(raw_files)} raw .mat files")

    raw_dest_dir = os.path.join(organized_dir, 'raw_data')
    raw_files_copied = []
    n_raw = len(raw_files)

    for position, source_file in enumerate(raw_files, start=1):
        # Path format: .../sourcedata/sub-01/sub-01_task-motor-imagery_eeg.mat
        # -> the parent folder name is the subject ID.
        parent_dir, file_name = os.path.split(source_file)
        subject_folder = os.path.basename(parent_dir)

        try:
            shutil.copy2(source_file, os.path.join(raw_dest_dir, file_name))
        except Exception as copy_error:
            # Best effort: report the failure and carry on with the next file.
            print(f"  âœ— Failed to copy {source_file}: {copy_error}")
            continue

        raw_files_copied.append((subject_folder, file_name))

        # Progress line every 10 files and for the last one.
        if position % 10 == 0 or position == n_raw:
            print(f"  Copied {position}/{n_raw}: {subject_folder}")

    print(f"\nâœ… Successfully copied: {len(raw_files_copied)} raw files")


=== STEP 1: ORGANIZING RAW DATA (.mat files) ===
Found 50 raw .mat files
  Copied 10/50: sub-10
  Copied 20/50: sub-20
  Copied 30/50: sub-30
  Copied 40/50: sub-40
  Copied 50/50: sub-50

âœ… Successfully copied: 50 raw files


In [None]:
# Step 3: Organize Paper's Preprocessed Data (.edf files)
# Collects every .edf file below <base_dir>/edffile and copies it (flat,
# original file name) into <organized_dir>/paper_preprocessed.

print("\n=== STEP 3: ORGANIZING PAPER'S PREPROCESSED DATA (.edf files) ===")

preprocessed_source_dir = os.path.join(base_dir, 'edffile')

if not os.path.exists(preprocessed_source_dir):
    print(f"âœ— Preprocessed data directory not found: {preprocessed_source_dir}")
    preprocessed_files = []
else:
    # Recursive search for .edf files.
    preprocessed_files = [
        os.path.join(dir_path, name)
        for dir_path, _sub_dirs, names in os.walk(preprocessed_source_dir)
        for name in names
        if name.endswith('.edf')
    ]
    print(f"Found {len(preprocessed_files)} preprocessed .edf files")

    preprocessed_dest_dir = os.path.join(organized_dir, 'paper_preprocessed')
    preprocessed_files_copied = []
    n_preprocessed = len(preprocessed_files)

    for position, source_file in enumerate(preprocessed_files, start=1):
        # Path format: .../edffile/sub-01/eeg/sub-01_task-motor-imagery_eeg.edf
        # -> the subject folder is two levels above the file.
        eeg_dir, file_name = os.path.split(source_file)
        subject_folder = os.path.basename(os.path.dirname(eeg_dir))

        # The original file name is kept so it lines up with the raw data naming.
        new_file_name = file_name

        try:
            shutil.copy2(source_file, os.path.join(preprocessed_dest_dir, new_file_name))
        except Exception as copy_error:
            # Best effort: report the failure and carry on with the next file.
            print(f"  âœ— Failed to copy {source_file}: {copy_error}")
            continue

        preprocessed_files_copied.append((subject_folder, new_file_name))

        # Progress line every 10 files and for the last one.
        if position % 10 == 0 or position == n_preprocessed:
            print(f"  Copied {position}/{n_preprocessed}: {subject_folder}")

    print(f"\nâœ… Successfully copied: {len(preprocessed_files_copied)} preprocessed files")


=== STEP 2: ORGANIZING PAPER'S PREPROCESSED DATA (.edf files) ===
Found 50 preprocessed .edf files
  Copied 10/50: sub-10
  Copied 20/50: sub-20
  Copied 30/50: sub-30
  Copied 40/50: sub-40
  Copied 50/50: sub-50

âœ… Successfully copied: 50 preprocessed files


In [None]:
# Step 4: Create a Master Index File
# Builds one row per organized data file (raw .mat and paper-preprocessed .edf),
# saves it as data_files_index.csv and prints a per-subject availability summary.

print("\n=== STEP 4: CREATING MASTER FILE INDEX ===")


def _subject_number(subject_id):
    """Return the integer after the first '-' in e.g. 'sub-01', or None if it cannot be parsed."""
    try:
        return int(subject_id.split('-')[1])
    except (IndexError, ValueError):
        return None


# Column order of the index. Declared explicitly so the DataFrame (and the CSV
# header) is well-formed even when no data files were found; without it an
# empty index has no 'subject_id' column and the sorting below raises KeyError.
index_columns = ['subject_id', 'data_type', 'format', 'file_name',
                 'file_path', 'source', 'description']

raw_dest_dir = os.path.join(organized_dir, 'raw_data')
preprocessed_dest_dir = os.path.join(organized_dir, 'paper_preprocessed')

# One entry per indexed folder; the two data types only differ in these values.
index_sources = [
    {
        'folder': raw_dest_dir,
        'data_type': 'raw',
        'format': 'mat',
        'source': 'sourcedata',
        'description': 'Raw EEG data with labels',
    },
    {
        'folder': preprocessed_dest_dir,
        'data_type': 'paper_preprocessed',
        'format': 'edf',
        'source': 'edffile',
        'description': 'Paper preprocessed data (0.5-40 Hz filtered)',
    },
]

# Create a comprehensive index of all files
file_info = []
all_subject_ids = set()

for index_source in index_sources:
    if not os.path.exists(index_source['folder']):
        continue

    extension = '.' + index_source['format']
    for file in sorted(os.listdir(index_source['folder'])):
        if not file.endswith(extension):
            continue

        # Subject ID is the part before the first underscore:
        # sub-01_task-motor-imagery_eeg.mat -> sub-01
        subject_id = file.split('_')[0]
        if _subject_number(subject_id) is None:
            # A stray file would otherwise crash the numeric sort further down.
            print(f"  Skipping file with unrecognized subject ID: {file}")
            continue

        all_subject_ids.add(subject_id)
        file_info.append({
            'subject_id': subject_id,
            'data_type': index_source['data_type'],
            'format': index_source['format'],
            'file_name': file,
            'file_path': os.path.join(index_source['folder'], file),
            'source': index_source['source'],
            'description': index_source['description'],
        })

# Create DataFrame
files_df = pd.DataFrame(file_info, columns=index_columns)

if files_df.empty:
    print("  No data files found in the organized folders - the index will be empty")

# Sort numerically by subject (so sub-10 follows sub-09), then by data type
files_df['subject_num'] = files_df['subject_id'].apply(_subject_number)
files_df = files_df.sort_values(['subject_num', 'data_type'])

# Save to CSV
index_file = os.path.join(organized_dir, 'data_files_index.csv')
files_df.to_csv(index_file, index=False)
print(f"âœ“ Master index saved to: {index_file}")

print(f"\nðŸ“Š Dataset Summary:")
print(f"  Total unique subjects: {len(all_subject_ids)}")
print(f"  Raw files (.mat): {len(files_df[files_df['data_type'] == 'raw'])}")
print(f"  Preprocessed files (.edf): {len(files_df[files_df['data_type'] == 'paper_preprocessed'])}")

# Check for missing files
print("\nðŸ“‹ File Availability per Subject:")
subject_summary = files_df.groupby('subject_id')['data_type'].apply(list).reset_index()
# astype(bool) keeps the columns boolean for an empty summary, so `~` below is safe.
subject_summary['has_raw'] = subject_summary['data_type'].apply(lambda x: 'raw' in x).astype(bool)
subject_summary['has_preprocessed'] = subject_summary['data_type'].apply(lambda x: 'paper_preprocessed' in x).astype(bool)

print(subject_summary.head(10).to_string(index=False))

missing_raw = subject_summary[~subject_summary['has_raw']]['subject_id'].tolist()
missing_preprocessed = subject_summary[~subject_summary['has_preprocessed']]['subject_id'].tolist()

if missing_raw:
    print(f"\nâš  Missing raw data for subjects: {missing_raw}")
if missing_preprocessed:
    print(f"âš  Missing preprocessed data for subjects: {missing_preprocessed}")


=== STEP 3: CREATING MASTER FILE INDEX ===
âœ“ Master index saved to: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2\data_files_index.csv

ðŸ“Š Dataset Summary:
  Total unique subjects: 50
  Raw files (.mat): 50
  Preprocessed files (.edf): 50

ðŸ“‹ File Availability per Subject:
subject_id                 data_type  has_raw  has_preprocessed
    sub-01 [paper_preprocessed, raw]     True              True
    sub-02 [paper_preprocessed, raw]     True              True
    sub-03 [paper_preprocessed, raw]     True              True
    sub-04 [paper_preprocessed, raw]     True              True
    sub-05 [paper_preprocessed, raw]     True              True
    sub-06 [paper_preprocessed, raw]     True              True
    sub-07 [paper_preprocessed, raw]     True              True
    sub-08 [paper_preprocessed, raw]     True              True
    sub-09 [paper_preprocessed, raw]     True              True
    sub-10 [paper_preprocessed, raw]     True            

In [None]:
# Step 5: Copy Patient Information and Metadata
# Copies participants.tsv and the dataset-level BIDS side-car files into
# patient_info/, and the shared events table into events/.
print("\n=== STEP 5: COPYING PATIENT INFORMATION ===")

patient_info_dir = os.path.join(organized_dir, 'patient_info')

# --- participants.tsv ---------------------------------------------------------
participants_source = os.path.join(base_dir, 'participants.tsv')
participants_target = os.path.join(patient_info_dir, 'participants.tsv')

if not os.path.exists(participants_source):
    print(f"âœ— participants.tsv not found at: {participants_source}")
else:
    shutil.copy2(participants_source, participants_target)
    print(f"âœ“ Copied participants.tsv to: {participants_target}")

    # Re-read the copy to report how many rows it holds.
    participants_df = pd.read_csv(participants_target, sep='\t')
    print(f"  Contains {len(participants_df)} patients")

# --- dataset-level BIDS metadata ----------------------------------------------
print("\nCopying BIDS metadata files...")
bids_files_to_copy = [
    'dataset_description.json',
    'participants.json',
    'task-motor-imagery_eeg.json',
    'task-motor-imagery_events.json',
    'task-motor-imagery_channels.tsv',
    'task-motor-imagery_electrodes.tsv',
    'task-motor-imagery_coordsystem.json',
    'README.md'
]

for file_name in bids_files_to_copy:
    source_path = os.path.join(base_dir, file_name)
    target_path = os.path.join(patient_info_dir, file_name)

    # Every file in this list is treated as optional: a missing one is only reported.
    if not os.path.exists(source_path):
        print(f"âš  {file_name} not found (optional file)")
        continue

    shutil.copy2(source_path, target_path)
    print(f"âœ“ Copied {file_name}")

# --- event markers (kept in their own folder) -----------------------------------
print("\nCopying event files...")
events_source = os.path.join(base_dir, 'task-motor-imagery_events.tsv')
events_target = os.path.join(organized_dir, 'events', 'task-motor-imagery_events.tsv')

if not os.path.exists(events_source):
    print(f"âœ— Event markers not found at: {events_source}")
else:
    shutil.copy2(events_source, events_target)
    print(f"âœ“ Copied event markers to: {events_target}")

    # Load the copy to report its size and, when present, the distinct trial types.
    events_df = pd.read_csv(events_target, sep='\t')
    print(f"  Events file contains {len(events_df)} markers")
    if 'trial_type' in events_df.columns:
        print(f"  Event types: {events_df['trial_type'].unique()}")


=== STEP 4: COPYING PATIENT INFORMATION ===
âœ“ Copied participants.tsv to: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2\patient_info\participants.tsv
  Contains 50 patients

Copying BIDS metadata files...
âœ“ Copied dataset_description.json
âœ“ Copied participants.json
âœ“ Copied task-motor-imagery_eeg.json
âœ“ Copied task-motor-imagery_events.json
âœ“ Copied task-motor-imagery_channels.tsv
âœ“ Copied task-motor-imagery_electrodes.tsv
âœ“ Copied task-motor-imagery_coordsystem.json
âœ“ Copied README.md

Copying event files...
âœ“ Copied event markers to: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2\events\task-motor-imagery_events.tsv
  Events file contains 120 markers
  Event types: [1 2]


In [None]:
# Step 6: Create Enhanced Configuration File
# Assembles the dataset description section by section and writes it to
# <organized_dir>/config.json. Section and key order match the written JSON.
import json

print("\n=== STEP 6: CREATING ENHANCED CONFIGURATION ===")

# Per-format description of the data available in the organized folder.
data_types_section = {
    "raw": {
        "format": "mat",
        "description": "Original raw data with labels",
        "sampling_rate": 500,
        "contents": "rawdata (trialsÃ—channelsÃ—samples), labels (1=left, 2=right)",
        "preprocessing": "None"
    },
    "paper_preprocessed": {
        "format": "edf",
        "description": "Paper's preprocessed data",
        "sampling_rate": 500,
        "preprocessing": "0.5-40 Hz bandpass filter, baseline correction",
        "reference": "CPz"
    },
    "my_preprocessed": {
        "format": "To be determined",
        "description": "Your custom preprocessing results",
        "location": "my_preprocessed folder"
    }
}

# Trial timing values (periods in seconds).
trial_structure_section = {
    "trials_per_subject": 40,
    "left_hand_trials": 20,
    "right_hand_trials": 20,
    "trial_duration": 8,
    "instruction_period": 2,
    "motor_imagery_period": 4,
    "break_period": 2
}

# Absolute paths: the root first, then one entry per listed sub-folder.
paths_section = {"organized_root": organized_dir}
for sub_folder in ("raw_data", "paper_preprocessed", "my_preprocessed",
                   "patient_info", "events", "results", "plots", "code"):
    paths_section[sub_folder] = os.path.join(organized_dir, sub_folder)

eeg_specifications_section = {
    "system": "ZhenTec NT1 wireless portable EEG",
    "montage": "International 10-10 system",
    "total_channels": 33,
    "eeg_channels": 30,
    "eog_channels": 2,
    "marker_channel": 1,
    "reference": "CPz",
    "ground": "FPz",
    "sampling_rate": 500,
    "filter_applied": "0.5-40 Hz (paper preprocessed)"
}

important_channels_section = {
    "motor_cortex": ["C3", "C4", "Cz"],
    "motor_adjacent": ["FC3", "FC4", "CP3", "CP4"],
    "eog_channels": ["HEOL", "VEOR"],
    "reference": "CPz",
    "ground": "FPz"
}

analysis_parameters_section = {
    "mi_analysis_band": [8, 30],   # Hz, for motor imagery analysis
    "baseline_period": [-1, 0],    # seconds relative to MI onset
    "mi_period": [0, 4],           # seconds of motor imagery
    "erds_window": 0.5,            # seconds for ERD/S calculation
    "expected_accuracy": "72.21% (paper's best method)"
}

# File counts come from the master index built in Step 4. int(...) turns the
# numpy integer returned by .sum() into a plain int, which json can serialize.
is_raw_row = files_df['data_type'] == 'raw'
is_preprocessed_row = files_df['data_type'] == 'paper_preprocessed'
file_index_section = {
    "location": os.path.join(organized_dir, "data_files_index.csv"),
    "total_files": len(files_df),
    "raw_files": int(is_raw_row.sum()),
    "preprocessed_files": int(is_preprocessed_row.sum())
}

# Create detailed configuration
config_content = {
    "dataset": {
        "name": "EEG Motor Imagery for Acute Stroke Patients",
        "source_paper": "An EEG motor imagery dataset for brain computer interface in acute stroke patients",
        "n_subjects_total": 50,
        "n_subjects_available": len(all_subject_ids),
        "data_types": data_types_section,
        "trial_structure": trial_structure_section
    },
    "paths": paths_section,
    "eeg_specifications": eeg_specifications_section,
    "important_channels": important_channels_section,
    "analysis_parameters": analysis_parameters_section,
    "file_index": file_index_section
}

# Save as JSON (UTF-8, non-ASCII characters written verbatim)
config_path = os.path.join(organized_dir, 'config.json')
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config_content, f, indent=2, ensure_ascii=False)

print(f"âœ“ Configuration file created at: {config_path}")


=== STEP 5: CREATING ENHANCED CONFIGURATION ===
âœ“ Configuration file created at: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2\config.json


In [None]:
# Step 7: Create README File with Dataset Information
# Writes <organized_dir>/README.md describing the organized layout, using counts
# taken from the folders on disk and demographics from participants.tsv.

print("\n=== Step 7:CREATING README FILE ===")


def _count_files_with_extension(folder, extension):
    """Return how many entries of `folder` end with `extension` (0 if the folder is missing)."""
    if not os.path.exists(folder):
        return 0
    return sum(1 for name in os.listdir(folder) if name.endswith(extension))


# Get counts from the actual organized structure
raw_data_count = _count_files_with_extension(os.path.join(organized_dir, 'raw_data'), '.mat')
paper_preprocessed_count = _count_files_with_extension(
    os.path.join(organized_dir, 'paper_preprocessed'), '.edf')

# Number of subjects = distinct subject IDs in the index. len(files_df) would
# count FILES (one raw + one preprocessed row per subject), i.e. twice as many.
total_subjects = files_df['subject_id'].nunique() if not files_df.empty else '50'

# Load participants data if available
participants_path = os.path.join(organized_dir, 'patient_info', 'participants.tsv')
if os.path.exists(participants_path):
    participants_df = pd.read_csv(participants_path, sep='\t')
    age_range = f"{participants_df['Age'].min()} - {participants_df['Age'].max()}"
    # .to_dict() yields plain Python ints, so the README does not show numpy reprs.
    gender_dist = participants_df['Gender'].value_counts().to_dict()
else:
    participants_df = None
    age_range = "Unknown"
    gender_dist = "Unknown"

# Create a README with the UPDATED dataset information.
# The folder-tree header uses the real folder name instead of a hard-coded one.
readme_content = f"""
# EEG Motor Imagery Dataset - Organized Structure

## Dataset Information
- Source: "An EEG motor imagery dataset for brain computer interface in acute stroke patients"
- Total subjects: {total_subjects}
- Data formats: .mat (raw) and .edf (preprocessed)
- Sampling rate: 500 Hz
- Channels: 30 EEG + 2 EOG + 1 marker (based on paper)
- Trials per subject: 40 (20 left hand, 20 right hand MI)

## Folder Structure
{os.path.basename(organized_dir)}/
â”œâ”€â”€ raw_data/                    # Original raw .mat files
â”‚   â”œâ”€â”€ sub-01_task-motor-imagery_eeg.mat
â”‚   â”œâ”€â”€ sub-02_task-motor-imagery_eeg.mat
â”‚   â””â”€â”€ ... ({raw_data_count} files total)
â”œâ”€â”€ paper_preprocessed/          # Paper's preprocessed .edf files
â”‚   â”œâ”€â”€ sub-01_task-motor-imagery_eeg.edf
â”‚   â”œâ”€â”€ sub-02_task-motor-imagery_eeg.edf
â”‚   â””â”€â”€ ... ({paper_preprocessed_count} files total)
â”œâ”€â”€ patient_info/               # Patient metadata and demographics
â”‚   â”œâ”€â”€ participants.tsv
â”‚   â”œâ”€â”€ participants.json
â”‚   â””â”€â”€ ...
â”œâ”€â”€ events/                     # Event markers and triggers
â”‚   â”œâ”€â”€ task-motor-imagery_events.tsv
â”‚   â””â”€â”€ ...
â”œâ”€â”€ my_preprocessed/           # Your preprocessing results (empty)
â”œâ”€â”€ results/                   # Analysis results
â”œâ”€â”€ plots/                     # Visualizations
â”œâ”€â”€ code/                      # Analysis scripts
â””â”€â”€ temp/                      # Temporary files

## File Naming Convention
- Raw files: [subject_id]_task-motor-imagery_eeg.mat
- Preprocessed files: [subject_id]_task-motor-imagery_eeg.edf
- Example: sub-01_task-motor-imagery_eeg.mat (raw)
- Example: sub-01_task-motor-imagery_eeg.edf (preprocessed)

## Patient Information
- Total: {len(participants_df) if participants_df is not None else 'Unknown'}
- Age range: {age_range} years
- Gender: {gender_dist}

## Analysis Notes
1. All EEG files have been verified to contain C3 and C4 channels
2. Paper's preprocessed data: 0.5-40 Hz filtered
3. For MI analysis, apply additional 8-30 Hz filter as per paper
4. Event markers should be used to segment 4-second MI periods
5. Raw data (.mat) contains trial labels: 1=left hand MI, 2=right hand MI

## Created on
{pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

readme_path = os.path.join(organized_dir, 'README.md')
with open(readme_path, 'w', encoding='utf-8') as f:
    f.write(readme_content)

print(f"âœ“ README created at: {readme_path}")
print(f"  - Raw data files: {raw_data_count}")
print(f"  - Preprocessed files: {paper_preprocessed_count}")


=== CREATING README FILE ===
âœ“ README created at: D:\impress_project\eeg_signals\data\LRMI-21679035\organized_data_v2\README.md
  - Raw data files: 50
  - Preprocessed files: 50


In [None]:
# Step 8: Verify the Organized Structure
# Reports how many files each organized sub-folder holds, then opens the first
# few preprocessed .edf files (header only) to confirm they are readable.

print("\n=== Step 8: VERIFYING ORGANIZED STRUCTURE ===")

# mne is used only by this cell and is not imported anywhere earlier in the
# notebook; import it here so the check does not fail with a NameError.
try:
    import mne
except ImportError:
    mne = None

# Count files in each directory
print("\nFile counts in organized structure:")
for folder in folders_to_create:
    folder_path = os.path.join(organized_dir, folder)
    if os.path.exists(folder_path):
        files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
        print(f"  {folder}: {len(files)} files")

# Verify EEG files are accessible
print("\nVerifying EEG file accessibility...")
accessible_files = 0
# The .edf files live in 'paper_preprocessed'. The previously used 'eeg_data'
# folder is never created by this notebook, so the check silently did nothing.
eeg_folder = os.path.join(organized_dir, 'paper_preprocessed')

if mne is None:
    print("  âœ— mne is not installed - cannot open .edf files")
elif not os.path.exists(eeg_folder):
    print(f"  âœ— EEG folder not found: {eeg_folder}")
else:
    # Sorted so the same files are checked on every run; .edf files only.
    edf_files = sorted(f for f in os.listdir(eeg_folder) if f.lower().endswith('.edf'))
    for file in edf_files[:3]:  # Check first 3 files
        file_path = os.path.join(eeg_folder, file)
        try:
            # Try to load file info (not full data, for speed)
            raw_test = mne.io.read_raw_edf(file_path, preload=False, verbose=False)
            accessible_files += 1
            print(f"  âœ“ {file}: {len(raw_test.ch_names)} channels, {raw_test.info['sfreq']} Hz")
        except Exception as e:
            print(f"  âœ— {file}: Error - {e}")

# Do not report success when nothing was actually opened.
if accessible_files > 0:
    print(f"\nâœ… Successfully verified {accessible_files} EEG files")
else:
    print("\nâœ— No EEG files could be verified")


=== VERIFYING ORGANIZED STRUCTURE ===

File counts in organized structure:
  raw_data: 50 files
  paper_preprocessed: 50 files
  my_preprocessed: 0 files
  patient_info: 9 files
  events: 1 files
  results: 0 files
  plots: 0 files
  code: 1 files
  temp: 0 files

Verifying EEG file accessibility...

âœ… Successfully verified 0 EEG files


The data loader does not work properly

# Step 9: Create Data Loader Script for Both Data Types


It still does not work properly.

print("\n=== STEP 9: CREATING DATA LOADER SCRIPT ===")

# Source of the stand-alone loader module written to <organized_dir>/code/.
# It is a regular (non-raw) string, so backslashes meant for the generated
# file are doubled here (e.g. '\\t' becomes '\t' in data_loader.py).
loader_script_content = '''"""
Data Loader for EEG Motor Imagery Dataset
Supports both raw (.mat) and preprocessed (.edf) data
"""

import os
import pandas as pd
import numpy as np
import mne
from scipy.io import loadmat
import json

# Folder (relative to the dataset root) that holds the files of each data type.
DATA_TYPE_FOLDERS = {
    'raw': 'raw_data',
    'paper_preprocessed': 'paper_preprocessed',
}


class EEGDatasetLoader:
    """Read access to the organized dataset: file index, EEG data, patient info, events."""

    def __init__(self, base_path=None):
        """Initialize dataset loader.

        base_path: root of the organized dataset (the folder that contains
        config.json). When omitted it is located relative to this script.
        Raises FileNotFoundError if config.json, the file index or
        participants.tsv cannot be opened.
        """
        if base_path is None:
            # This script lives in <root>/code/, so the dataset root is the
            # parent folder. The old '<parent>/organized_data_v2' location is
            # still tried for scripts placed next to the dataset folder.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
            candidates = [parent_dir, os.path.join(parent_dir, 'organized_data_v2')]
            base_path = next(
                (path for path in candidates
                 if os.path.exists(os.path.join(path, 'config.json'))),
                candidates[0],
            )

        self.base_path = base_path

        # Load configuration (written as UTF-8 with non-ASCII characters)
        config_path = os.path.join(base_path, 'config.json')
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)

        # Load file index
        index_path = os.path.join(base_path, 'data_files_index.csv')
        self.file_index = pd.read_csv(index_path)

        # Load patient info
        participants_path = os.path.join(base_path, 'patient_info', 'participants.tsv')
        self.participants = pd.read_csv(participants_path, sep='\\t')

        # Load events if available
        events_path = os.path.join(base_path, 'events', 'task-motor-imagery_events.tsv')
        if os.path.exists(events_path):
            self.events = pd.read_csv(events_path, sep='\\t')
        else:
            self.events = None

        print(f"Dataset loaded: {self.config['dataset']['name']}")
        print(f"Subjects: {self.get_n_subjects()}")
        print(f"Data types available: {self.get_available_data_types()}")

    def _resolve_file_path(self, file_row):
        """Return a usable path for one index row.

        The index stores absolute paths from the machine that created it. If
        that path no longer exists, fall back to the file's expected location
        under base_path; if that is missing too, return the stored path so the
        caller's error message names it.
        """
        stored_path = str(file_row['file_path'])
        if os.path.exists(stored_path):
            return stored_path

        folder = DATA_TYPE_FOLDERS.get(file_row['data_type'])
        if folder is not None:
            candidate = os.path.join(self.base_path, folder, str(file_row['file_name']))
            if os.path.exists(candidate):
                return candidate

        return stored_path

    @staticmethod
    def _find_mat_variable(mat_data, names):
        """Return the first variable called one of `names`, or None.

        Looks at the top level of the loadmat() result first, then inside
        top-level struct variables.
        NOTE(review): whether this dataset nests its variables in a struct has
        not been confirmed here - the nested lookup is a fallback only.
        """
        for name in names:
            if name in mat_data:
                return mat_data[name]

        for key, value in mat_data.items():
            if key.startswith('__'):
                continue  # loadmat bookkeeping entries (__header__, ...)
            field_names = getattr(getattr(value, 'dtype', None), 'names', None)
            if not field_names or value.size == 0:
                continue
            for name in names:
                if name in field_names:
                    # Struct fields come back as object arrays wrapping the value.
                    return value[name].flat[0]

        return None

    def get_n_subjects(self):
        """Get number of unique subjects"""
        return len(self.file_index['subject_id'].unique())

    def get_available_data_types(self):
        """Get available data types"""
        return self.file_index['data_type'].unique().tolist()

    def get_subject_files(self, subject_id, data_type=None):
        """Get file information (index rows) for a subject, optionally for one data type"""
        if data_type:
            files = self.file_index[
                (self.file_index['subject_id'] == subject_id) &
                (self.file_index['data_type'] == data_type)
            ]
        else:
            files = self.file_index[self.file_index['subject_id'] == subject_id]

        return files

    def load_raw_mat_data(self, subject_id):
        """Load raw .mat data for a subject.

        Returns a dict with 'data' (trials x channels x samples), 'labels',
        the three dimensions and the sampling rate. If no 'rawdata' variable
        is found, the unprocessed loadmat() dict is returned instead.
        Raises ValueError if the subject has no raw file or the data is not 3-D.
        """
        files = self.get_subject_files(subject_id, 'raw')

        if len(files) == 0:
            raise ValueError(f"No raw data found for subject {subject_id}")

        file_path = self._resolve_file_path(files.iloc[0])

        print(f"Loading raw .mat data: {subject_id}")
        mat_data = loadmat(file_path)

        rawdata = self._find_mat_variable(mat_data, ('rawdata',))
        if rawdata is None:
            print(f"Warning: 'rawdata' not found in .mat file")
            print(f"Available keys: {list(mat_data.keys())}")
            return mat_data

        rawdata = np.asarray(rawdata)
        if rawdata.ndim != 3:
            raise ValueError(
                f"Expected 3-D rawdata (trials x channels x samples), got shape {rawdata.shape}"
            )

        labels = self._find_mat_variable(mat_data, ('labels', 'label'))
        if labels is not None:
            labels = np.asarray(labels).flatten()

        # Get dimensions
        n_trials, n_channels, n_samples = rawdata.shape

        print(f"  Shape: {rawdata.shape}")
        print(f"  Trials: {n_trials}")
        print(f"  Channels: {n_channels}")
        print(f"  Samples per trial: {n_samples}")
        print(f"  Labels: {labels}")

        return {
            'data': rawdata,
            'labels': labels,
            'subject_id': subject_id,
            'n_trials': n_trials,
            'n_channels': n_channels,
            'n_samples': n_samples,
            'sampling_rate': 500  # From paper
        }

    def load_preprocessed_edf_data(self, subject_id, preload=True):
        """Load paper's preprocessed .edf data.

        With preload=False only the header is read and 'data'/'times' are None.
        Raises ValueError if the subject has no preprocessed file.
        """
        files = self.get_subject_files(subject_id, 'paper_preprocessed')

        if len(files) == 0:
            raise ValueError(f"No preprocessed data found for subject {subject_id}")

        file_path = self._resolve_file_path(files.iloc[0])

        print(f"Loading preprocessed .edf data: {subject_id}")
        raw = mne.io.read_raw_edf(file_path, preload=preload, verbose=False)

        # Get data
        if preload:
            data, times = raw[:]
        else:
            data, times = None, None

        print(f"  Channels: {len(raw.ch_names)}")
        print(f"  Sampling rate: {raw.info['sfreq']} Hz")
        print(f"  Duration: {raw.times[-1]:.2f} seconds" if hasattr(raw, 'times') and len(raw.times) > 0 else "")

        return {
            'raw': raw,
            'data': data,
            'times': times,
            'subject_id': subject_id,
            'channels': raw.ch_names,
            'sampling_rate': raw.info['sfreq'],
            'has_annotations': hasattr(raw, 'annotations') and bool(raw.annotations)
        }

    def get_patient_info(self, subject_id):
        """Get patient information as a dict (plus 'subject_num'), or None if not found"""
        # Extract subject number
        try:
            subject_num = int(subject_id.split('-')[1])
        except (AttributeError, IndexError, ValueError):
            subject_num = None

        # The ID column is matched case-insensitively so both 'Participant_ID'
        # and the BIDS-style 'participant_id' work.
        id_column = next(
            (column for column in self.participants.columns
             if str(column).strip().lower() == 'participant_id'),
            None,
        )
        if id_column is None:
            return None

        # Find in participants
        info = self.participants[self.participants[id_column] == subject_id]

        if len(info) > 0:
            patient_info = info.iloc[0].to_dict()
            patient_info['subject_num'] = subject_num
            return patient_info
        else:
            return None

    def get_events_for_subject(self, subject_id):
        """Get events for a subject (None when no events file was loaded)"""
        if self.events is None:
            return None

        # Extract subject number from subject_id
        subject_num = int(subject_id.split('-')[1])

        # Events might be indexed by subject
        # This depends on the events file structure
        if 'participant_id' in self.events.columns:
            subject_events = self.events[self.events['participant_id'] == subject_id]
        else:
            # Assume all events are in one file, need to separate by trial counts
            # 40 trials per subject, so subject 1 = trials 0-39, subject 2 = trials 40-79, etc.
            trials_per_subject = 40
            start_trial = (subject_num - 1) * trials_per_subject
            end_trial = start_trial + trials_per_subject

            if 'trial_id' in self.events.columns:
                subject_events = self.events[
                    (self.events['trial_id'] >= start_trial) &
                    (self.events['trial_id'] < end_trial)
                ]
            else:
                # Can't separate by subject
                subject_events = self.events

        return subject_events

    def list_all_subjects(self):
        """List all available subjects"""
        return sorted(self.file_index['subject_id'].unique())

    def get_subjects_with_both_data_types(self):
        """Get subjects that have both raw and preprocessed data"""
        subject_counts = self.file_index.groupby('subject_id')['data_type'].nunique()
        return subject_counts[subject_counts >= 2].index.tolist()

    def compare_data_types(self, subject_id):
        """Compare raw and preprocessed data for a subject; returns True on success"""
        print(f"\\n=== Comparing data for {subject_id} ===")

        # Get file info
        raw_files = self.get_subject_files(subject_id, 'raw')
        preprocessed_files = self.get_subject_files(subject_id, 'paper_preprocessed')

        print(f"Raw files: {len(raw_files)}")
        print(f"Preprocessed files: {len(preprocessed_files)}")

        if len(raw_files) > 0 and len(preprocessed_files) > 0:
            # Load small sample to compare
            try:
                raw_data = self.load_raw_mat_data(subject_id)
                preprocessed_data = self.load_preprocessed_edf_data(subject_id, preload=False)

                print(f"\\nComparison:")
                print(f"  Raw: {raw_data['n_trials']} trials, {raw_data['n_channels']} channels")
                print(f"  Preprocessed: {len(preprocessed_data['channels'])} channels")

                # Check if C3 and C4 are present in preprocessed data
                if preprocessed_data['raw'] is not None:
                    channels = preprocessed_data['channels']
                    has_c3 = 'C3' in channels
                    has_c4 = 'C4' in channels
                    print(f"  C3 in preprocessed: {has_c3}")
                    print(f"  C4 in preprocessed: {has_c4}")

                return True
            except Exception as e:
                print(f"Error comparing: {e}")
                return False
        else:
            print("Missing one or both data types")
            return False

# Example usage
if __name__ == "__main__":
    # Initialize loader
    loader = EEGDatasetLoader()

    # List subjects
    subjects = loader.list_all_subjects()[:5]
    print("First 5 subjects:", subjects)

    # Get subjects with both data types
    complete_subjects = loader.get_subjects_with_both_data_types()
    print(f"\\nSubjects with both raw and preprocessed data: {len(complete_subjects)}")

    # Load and compare data for first subject
    if len(complete_subjects) > 0:
        test_subject = complete_subjects[0]
        loader.compare_data_types(test_subject)

        # Load raw data
        print(f"\\n--- Loading raw data for {test_subject} ---")
        raw_data = loader.load_raw_mat_data(test_subject)

        # Load preprocessed data
        print(f"\\n--- Loading preprocessed data for {test_subject} ---")
        preprocessed_data = loader.load_preprocessed_edf_data(test_subject, preload=False)

        # Get patient info
        print(f"\\n--- Patient info for {test_subject} ---")
        patient_info = loader.get_patient_info(test_subject)
        if patient_info:
            print(f"Age: {patient_info.get('Age', 'N/A')}")
            print(f"Gender: {patient_info.get('Gender', 'N/A')}")
            print(f"Paralysis side: {patient_info.get('ParalysisSide', 'N/A')}")
'''

loader_script_path = os.path.join(organized_dir, 'code', 'data_loader.py')
# Make sure the target folder exists even if this cell is run on its own.
os.makedirs(os.path.dirname(loader_script_path), exist_ok=True)
with open(loader_script_path, 'w', encoding='utf-8') as f:
    f.write(loader_script_content)

print(f"âœ“ Data loader script created at: {loader_script_path}")

