## Data Gathering


In [None]:
# Import Required Libraries
import numpy as np
import matplotlib.pyplot as plt
import requests

# To unzip the edf_dataset
import zipfile
import os

# EDF reading (pyedflib) and EEG preprocessing (MNE) modules
from mne.preprocessing import ICA, create_eog_epochs
import mne
from pyedflib import highlevel
import pyedflib as plib

In [None]:
import requests


def download_file(url, save_path, timeout=60, chunk_size=1 << 20):
    """Download ``url`` to ``save_path`` unless ``save_path`` already exists.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    save_path : str
        Destination file. If it already exists nothing is downloaded.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.
    chunk_size : int
        Number of bytes written per iteration while streaming the body.

    Returns
    -------
    None
        Progress and failures are reported with ``print``; a non-200 status
        is reported and leaves no file behind.
    """
    # Check if the file already exists
    if os.path.exists(save_path):
        print(f"File already exists at '{save_path}'. Skipping download.")
        return  # Exit the function if the file exists

    # The body is streamed into a sibling ".part" file and only moved into
    # place once complete. Because existence of ``save_path`` is what makes
    # later runs skip the download, a truncated file must never get that name.
    partial_path = f"{save_path}.part"

    # stream=True avoids holding the whole archive in memory.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to download file. Status code: {response.status_code}")
            return

        try:
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
            os.replace(partial_path, save_path)
        finally:
            # After a successful os.replace the partial file is gone and this
            # is a no-op; after a failure it removes the truncated download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print(f"File downloaded successfully and saved to '{save_path}'")


# Figshare download endpoint (article 4244171, version 2) for the dataset archive
url = 'https://figshare.com/ndownloader/articles/4244171/versions/2'
# Local path of the downloaded archive; change this to your desired path
save_path = './edf_dataset.zip'

# Fetch the archive (download_file skips the request when save_path already exists)
download_file(url, save_path)

In [None]:
def unzip_file(zip_file_path, extract_to_folder):
    """Extract ``zip_file_path`` into ``extract_to_folder`` once.

    Parameters
    ----------
    zip_file_path : str
        Path of the zip archive to extract.
    extract_to_folder : str
        Directory that receives the archive contents. If it already exists
        the function prints a message and returns without extracting.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError, zipfile.BadZipFile
        Propagated from ``zipfile.ZipFile`` when the archive is missing or
        not a valid zip file. The target directory is not created then.
    """
    import shutil

    # Check if the directory exists; its presence is the "already extracted" marker
    if os.path.exists(extract_to_folder):
        print(f"Directory '{extract_to_folder}' already exists")
        return  # Exit the function if the directory exists

    # Open the archive BEFORE creating the directory: otherwise a missing or
    # corrupt archive would leave an empty directory behind and every later
    # call would skip extraction.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        os.makedirs(extract_to_folder, exist_ok=True)
        try:
            # Extract all the contents into the specified folder
            zip_ref.extractall(extract_to_folder)
        except BaseException:
            # Do not leave a half-extracted directory that looks complete.
            shutil.rmtree(extract_to_folder, ignore_errors=True)
            raise


# Archive written by the download step above
zip_file_path = './edf_dataset.zip'
# Extraction target; change this if needed (the later cells read './edf_dataset_2')
extract_to_folder = './edf_dataset_2'

# Extract the archive (unzip_file returns early when the folder already exists)
unzip_file(zip_file_path, extract_to_folder)


In [None]:
import os
import re

edf_directory = "./edf_dataset_2"


def _normalize_edf_filename(filename):
    """Return the normalised form of one dataset filename.

    Steps, in order: whitespace runs become '_', a single-digit subject
    number gets a leading zero (S1 -> S01), the numeric upload prefixes
    '6931959_' and '6921143_' are removed, and any remaining lone digit is
    zero-padded.
    """
    new_filename = filename

    # Handle spaces in filename
    if ' ' in new_filename:
        new_filename = re.sub(r'\s+', '_', new_filename)

    # Handle subject numbers (S1 -> S01)
    new_filename = re.sub(r'S(\d)(?!\d)', r'S0\1', new_filename)

    # Strip the numeric prefixes present on some of the files
    for upload_prefix in ('6931959_', '6921143_'):
        new_filename = new_filename.replace(upload_prefix, '')

    # Zero-pad other single-digit numbers in filenames
    new_filename = re.sub(r'(?<!\d)(\d)(?!\d)', r'0\1', new_filename)

    return new_filename


for filename in os.listdir(edf_directory):
    new_filename = _normalize_edf_filename(filename)

    # Only rename if the filename has changed
    if new_filename == filename:
        continue

    old_file = os.path.join(edf_directory, filename)
    new_file = os.path.join(edf_directory, new_filename)

    # os.rename may silently replace an existing destination file, so two
    # inputs that normalise to the same name would destroy one recording.
    if os.path.exists(new_file):
        print(f'Skipped: "{filename}" because "{new_filename}" already exists')
        continue

    os.rename(old_file, new_file)
    print(f'Renamed: "{filename}" to "{new_filename}"')

print("Renaming complete.")

## Preprocessing Functions


In [None]:
def extract_task_type(filename):
    """Map a recording filename to its integer task code.

    The filename is searched for the markers '_EC.' (eyes closed -> 0),
    '_EO.' (eyes open -> 1) and '_TASK.' (task -> 2), in that order; the
    first marker found decides the result.

    Raises
    ------
    ValueError
        If none of the markers occurs in ``filename``.
    """
    marker_codes = (
        ('_EC.', 0),    # Eyes Closed
        ('_EO.', 1),    # Eyes Open
        ('_TASK.', 2),  # Task
    )
    for marker, code in marker_codes:
        if marker in filename:
            return code
    raise ValueError(f"Unknown task type in filename: {filename}")

In [None]:
# Standardise channel names and restrict a recording to the expected channel set
def process_channels(raw_data):
    """
    Rename the channels of ``raw_data`` and keep only the 17 expected ones.

    Channels whose name contains '23A-23R', '24A-24R' or 'A2-A1' are dropped.
    The remaining names lose the 'EEG ' and '-LE' fragments, and every channel
    not in the expected list is then dropped as well. A warning is printed
    when the resulting channel count differs from the expected count.

    ``raw_data`` is modified through its ``drop_channels`` /
    ``rename_channels`` methods and the same object is returned.
    """
    print(f"Initial channels: {raw_data.ch_names}")

    excluded_markers = ('23A-23R', '24A-24R', 'A2-A1')

    def is_excluded(label):
        return any(marker in label for marker in excluded_markers)

    # Split the current names into "drop" and "rename" groups
    unwanted = [label for label in raw_data.ch_names if is_excluded(label)]
    rename_map = {
        label: label.replace('EEG ', '').replace('-LE', '')
        for label in raw_data.ch_names
        if not is_excluded(label)
    }

    if unwanted:
        print(f"Dropping channels: {unwanted}")
        raw_data.drop_channels(unwanted)

    raw_data.rename_channels(rename_map)

    print(f"Final channels: {raw_data.ch_names}")

    # The 17 channels every recording is reduced to
    expected_channels = [
        'Fp1', 'F3', 'C3', 'P3', 'O1', 'F7', 'T3', 'Fp2',
        'F4', 'C4', 'P4', 'O2', 'F8', 'T4', 'T6', 'Cz', 'Pz'
    ]
    wanted = set(expected_channels)

    extras = [label for label in raw_data.ch_names if label not in wanted]
    if extras:
        print(
            f"Dropping channels to keep only the expected 17 channels: {extras}")
        raw_data.drop_channels(extras)

    # Report recordings that lack some of the expected channels
    n_expected = len(expected_channels)
    n_present = len(raw_data.ch_names)
    if n_present != n_expected:
        print(
            f"Warning: Expected {n_expected} channels, got {n_present}")
        print(f"Missing: {set(expected_channels) - set(raw_data.ch_names)}")

    return raw_data

In [None]:
# Inventory of the dataset folder, split by recording condition.
# Note: despite the "_path" suffix these lists hold bare filenames (as returned
# by os.listdir), not full paths.
all_edf_files = os.listdir(edf_directory)
ec_file_path = [i for i in all_edf_files if i.endswith('EC.edf')]      # eyes closed
eo_file_path = [i for i in all_edf_files if i.endswith('EO.edf')]      # eyes open
task_file_path = [i for i in all_edf_files if i.endswith('TASK.edf')]  # task

# Counts: all entries, EC, EO, TASK
print(len(all_edf_files), len(ec_file_path), len(eo_file_path), len(task_file_path))

In [None]:
def read_data(file_path):
    """Load one EDF recording and re-reference it.

    The file is read with ``mne.io.read_raw_edf(..., preload=True)`` and
    ``set_eeg_reference()`` is then called with its default arguments
    (presumably MNE's average reference; confirm for the installed version).
    Returns the loaded raw object.
    """
    recording = mne.io.read_raw_edf(file_path, preload=True)
    # NOTE(review): the reference is applied before process_channels drops any
    # channel, so every channel in the file takes part; confirm that is intended.
    recording.set_eeg_reference()
    return recording
    

In [None]:
def bandpass_filter(data, l_freq, h_freq):
    """Band-pass ``data`` between ``l_freq`` and ``h_freq``.

    Delegates to ``data.filter``. The call's result is discarded and nothing
    is returned, so callers rely on ``filter`` modifying ``data`` in place.
    """
    # Adjust the filter parameters as needed
    band_edges = {'l_freq': l_freq, 'h_freq': h_freq}
    data.filter(**band_edges)

In [None]:
def preprocess_ICA(raw, n_components ):
    """Fit an ICA decomposition on ``raw`` and return the fitted object.

    The ICA is created with ``n_components`` components, a fixed
    ``random_state`` of 97 and at most 800 iterations. It is only fitted
    here; applying it to the data is left to the caller.
    """
    print(f"Preprocessing ICA: {raw.filenames}")

    ica_settings = dict(n_components=n_components, random_state=97,
                        max_iter=800)
    decomposition = ICA(**ica_settings)
    decomposition.fit(raw)
    return decomposition

In [None]:
def create_epochs(processed_data, duration=5.0, overlap=1.0):
    """
    Cut continuous EEG data into fixed-length epochs for CNN input

    Parameters:
    -----------
    processed_data : mne.io.Raw
        The raw EEG data
    duration : float
        Duration of each epoch in seconds
    overlap : float
        Overlap between epochs in seconds

    Returns:
    --------
    numpy.ndarray
        The array from ``epochs.get_data()`` with one trailing axis of
        length 1 appended, i.e. (n_epochs, n_channels, n_timepoints, 1)
    """
    windows = mne.make_fixed_length_epochs(
        processed_data, duration=duration, overlap=overlap, preload=True)

    # Remove epochs marked as bad before exporting the array
    windows.drop_bad()

    # (n_epochs, n_channels, n_timepoints) -> (..., 1) for the CNN
    return windows.get_data()[..., np.newaxis]

In [None]:
# Load every .edf file of the dataset folder into memory with read_data().
# Directory containing the EDF files
edf_directory = "./edf_dataset_2"  # Adjust this path to your dataset location

# Initialize lists
# NOTE: despite its name, processed_raw_data holds the recordings as returned
# by read_data(); channel selection, filtering and ICA happen in a later cell.
processed_raw_data = []
# Not updated in this cell; the preprocessing cell re-initialises and fills it.
class_counts = {'Healthy': 0, 'MDD': 0}

# Read all EDF files
for filename in os.listdir(edf_directory):
    if filename.endswith('.edf'):
        file_path = os.path.join(edf_directory, filename)
        try:
            # Read the raw data
            raw_data = read_data(file_path)

            # read_data as written returns the loaded object or raises, so the
            # else branch is presumably unreachable; failures land in except.
            if raw_data is not None:
                processed_raw_data.append(raw_data)
                print(f"Successfully loaded: {filename}")
            else:
                print(f"Failed to load: {filename}")

        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest
            print(f"Error loading {filename}: {str(e)}")
            continue

print(f"\nTotal files loaded: {len(processed_raw_data)}")

## Actual Preprocessing Function


In [None]:
def _split_into_sequences(data, time_steps, samples_per_step):
    """Cut ``data`` (n_channels, n_samples) into non-overlapping sequences.

    Returns an array of shape
    (n_sequences, time_steps, samples_per_step, n_channels, 1); trailing
    samples that do not fill a whole sequence are discarded. When the
    recording is shorter than one sequence the result has zero sequences but
    still the full 5-D shape, so it concatenates cleanly with other files.
    """
    n_channels = data.shape[0]

    # Calculate total samples needed for each complete sequence
    total_samples_per_sequence = time_steps * samples_per_step

    # Determine how many complete sequences we can make
    n_sequences = data.shape[1] // total_samples_per_sequence
    usable = data[:, :n_sequences * total_samples_per_sequence]

    # Split ONLY the time axis: sample index i*T*S + t*S + s of channel c
    # becomes element [c, i, t, s]. Reshaping a (channels, T*S) slice straight
    # to (T, channels, S) would instead interleave channels and time.
    blocks = usable.reshape(n_channels, n_sequences, time_steps, samples_per_step)

    # Reorder to (n_sequences, time_steps, samples_per_step, n_channels)
    sequences = np.transpose(blocks, (1, 2, 3, 0))

    # Materialise a contiguous copy and add the trailing CNN channel axis
    return np.ascontiguousarray(sequences)[..., np.newaxis]


def preprocess_eeg(raw_data, l_freq=0.5, h_freq=50.0, n_components=5, time_steps=5, samples_per_step=256):
    """
    Complete EEG preprocessing pipeline: channel selection -> bandpass -> ICA
    -> reshape for CNN-LSTM

    Parameters
    ----------
    raw_data : mne.io.Raw
        Recording to process; it is modified by the channel, filter and ICA
        steps.
    l_freq, h_freq : float
        Band-pass edges handed to ``bandpass_filter``.
    n_components : int
        Number of ICA components handed to ``preprocess_ICA``.
    time_steps : int
        Number of consecutive windows per sequence.
    samples_per_step : int
        Number of samples per window.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (n_sequences, time_steps, samples_per_step,
        n_channels, 1), or ``None`` when any step raises (the error is
        printed, not propagated).
    """
    try:
        print(f"\nProcessing file: {raw_data.filenames}")

        processed_raw = process_channels(raw_data=raw_data)

        # 1. Bandpass filtering
        print("1. Applying bandpass filter...")
        bandpass_filter(processed_raw, l_freq, h_freq)

        # 2. ICA
        print("2. Applying ICA...")
        ica = preprocess_ICA(processed_raw, n_components)
        # NOTE(review): no component is marked for exclusion before apply(),
        # so this step presumably leaves the signal essentially unchanged;
        # confirm whether artifact components should be selected first.
        ica.apply(processed_raw)

        # 3. Reshape data for CNN-LSTM
        print("3. Reshaping data for CNN-LSTM...")
        final_data = _split_into_sequences(
            processed_raw.get_data(), time_steps, samples_per_step)

        print(f"Reshaping completed. Final data shape: {final_data.shape}")
        return final_data

    except Exception as e:
        # Callers treat None as "this file failed" and continue with the next
        print(f"Preprocessing error: {str(e)}")
        return None


## Actual Preprocessing


In [None]:
# Run the full preprocessing pipeline on every loaded recording and collect
# the per-sequence arrays together with their class and task labels.
X_data = []       # one array of sequences per successfully processed file
y_labels = []     # one class label per sequence (1 = MDD, 0 = Healthy)
task_labels = []  # one task label per sequence (EC = 0, EO = 1, TASK = 2)
class_counts = {'Healthy': 0, 'MDD': 0}

print("\nStarting preprocessing pipeline...")
for raw_data in processed_raw_data:
    filename = os.path.basename(raw_data.filenames[0])
    print(f"\n{'='*50}")
    print(f"Processing: {filename}")
    print(f"Initial data info:")
    print(f"Channels: {raw_data.ch_names}")
    print(f"Sample rate: {raw_data.info['sfreq']} Hz")
    print(f"Duration: {raw_data.n_times / raw_data.info['sfreq']:.2f} seconds")

    try:
        # Derive both labels once, inside the try block, so a file with an
        # unexpected name is reported and skipped instead of aborting the run.
        condition_label = 1 if filename.startswith('MDD') else 0
        task_label = extract_task_type(filename)

        # First, process the channels
        # Make a copy to prevent modifying original
        raw_data = process_channels(raw_data.copy())
        print(f"Channels after processing: {raw_data.ch_names}")

        # Apply complete preprocessing pipeline
        processed_data = preprocess_eeg(raw_data, l_freq=0.5, h_freq=50.0,
                                        n_components=5, time_steps=5, samples_per_step=256)

        if processed_data is not None:
            n_sequences = processed_data.shape[0]
            X_data.append(processed_data)

            # Every sequence of a file inherits that file's labels, which
            # keeps y_labels and task_labels aligned with the rows of X_data.
            y_labels.extend([condition_label] * n_sequences)
            task_labels.extend([task_label] * n_sequences)

            # Update counts with number of sequences
            class_name = 'MDD' if condition_label == 1 else 'Healthy'
            class_counts[class_name] += n_sequences

            print(f"Successfully processed {filename}")
        else:
            print(f"Failed to process {filename}")

    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
        continue

    print(f"{'='*50}\n")


# Print final summary
print("\nProcessing Summary:")
print(f"Total files processed: {len(processed_raw_data)}")
print(f"Total epochs: {len(y_labels)}")
print(f"Successfully processed files: {len(X_data)}")
print(f"Failed files: {len(processed_raw_data) - len(X_data)}")
print("\nClass distribution:")
print(f"MDD epochs: {class_counts['MDD']}")
print(f"Healthy epochs: {class_counts['Healthy']}")
# print(f"\nChannels used: {raw_data.ch_names}")

# # Optional: Print class balance percentage
# total_epochs = class_counts['MDD'] + class_counts['Healthy']
# print("\nClass balance:")
# print(f"MDD: {(class_counts['MDD']/total_epochs)*100:.2f}%")
# print(f"Healthy: {(class_counts['Healthy']/total_epochs)*100:.2f}%")

In [None]:
# Combine the per-file arrays into one dataset and report its composition.
# task_labels is used exactly as built by the preprocessing loop: it already
# holds one entry per sequence, in the same order as X_data / y_labels.
# Rebuilding it here from `processed_data` would reuse the sequence count of
# the LAST processed file for every file and misalign the labels.
if X_data:
    # Print shapes before concatenation
    print("\nArray shapes before concatenation:")
    for i, arr in enumerate(X_data):
        print(f"Array {i}: shape {arr.shape}")

    try:
        # Concatenate data
        X = np.concatenate(X_data, axis=0)
    except ValueError as e:
        print(f"\nError during final processing: {str(e)}")
        print("Checking individual arrays for inconsistencies...")
        # Find arrays with different shapes
        base_shape = X_data[0].shape[1:]
        for i, arr in enumerate(X_data):
            if arr.shape[1:] != base_shape:
                print(
                    f"Mismatch at index {i}: expected {base_shape}, got {arr.shape[1:]}")
    else:
        y = np.array(y_labels)
        tasks = np.array(task_labels)

        # Surface misaligned labels instead of letting a later step hide them
        if not (len(X) == len(y) == len(tasks)):
            print(
                f"\nWarning: length mismatch - X: {len(X)}, y: {len(y)}, tasks: {len(tasks)}")

        # Print final information
        print("\nFinal Dataset Information:")
        print(f"Total samples: {len(X)}")
        print(f"Healthy samples: {class_counts['Healthy']}")
        print(f"MDD samples: {class_counts['MDD']}")
        print(f"Input shape: {X.shape}")
        print(f"Labels shape: {y.shape}")
        print(f"Task labels shape: {tasks.shape}")

        # Print task distribution
        print("\nTask distribution:")
        print(f"Eyes Closed (EC): {np.sum(tasks == 0)}")
        print(f"Eyes Open (EO): {np.sum(tasks == 1)}")
        print(f"Task: {np.sum(tasks == 2)}")
else:
    print("\nNo data was successfully processed!")

In [None]:
# X_data is a list with one array per processed file, so this prints up to
# 100 whole arrays (not 100 individual sequences).
print(X_data[:100])

In [None]:
# First 100 per-sequence class labels (1 = MDD, 0 = Healthy)
print(y_labels[:100])

In [None]:
# First 100 per-sequence task labels (EC = 0, EO = 1, TASK = 2)
print(task_labels[:100])

## Model Definition


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, LSTM, Dropout, Flatten, Reshape, TimeDistributed


def create_2dcnn_lstm_model_with_task(input_shape=(5, 256, 17, 1)):
    """Build the two-input CNN-LSTM classifier (uncompiled).

    Inputs
    ------
    eeg_input : tensor of shape ``input_shape``
        Its first axis is treated as the time-step axis: the convolutional
        stack is applied to every step through ``TimeDistributed``.
    task_input : tensor of shape (1,)
        Task code of the sample (EC, EO, TASK).

    The EEG branch is three Conv2D(3x3, relu, same) + MaxPooling2D(2x2)
    stages with 64, 128 and 256 filters, flattened per time step and fed to
    LSTM(128) -> LSTM(64). The task code goes through Dense(32, relu). Both
    are concatenated and passed through Dense(128) and Dense(64), each
    followed by Dropout(0.5), to a single sigmoid output unit.
    """
    eeg_input = Input(shape=input_shape, name='eeg_input')
    task_input = Input(shape=(1,), name='task_input')

    # Per-time-step convolutional feature extractor
    features = eeg_input
    for n_filters in (64, 128, 256):
        conv = Conv2D(n_filters, (3, 3), activation='relu', padding='same')
        features = TimeDistributed(conv)(features)
        pool = MaxPooling2D((2, 2))
        features = TimeDistributed(pool)(features)

    # One feature vector per time step, then the recurrent layers
    features = TimeDistributed(Flatten())(features)
    features = LSTM(128, return_sequences=True)(features)
    features = LSTM(64)(features)

    # Task branch, merged with the EEG features
    task_embedding = Dense(32, activation='relu')(task_input)
    merged = tf.keras.layers.Concatenate()([features, task_embedding])

    # Classification head
    hidden = merged
    for width in (128, 64):
        hidden = Dense(width, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)
    outputs = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[eeg_input, task_input], outputs=outputs)

## Train Test Split


In [None]:
from sklearn.model_selection import train_test_split


def prepare_data_with_task(X_data, y_labels, task_labels):
    """Build normalised, stratified train/test splits of the dataset.

    Parameters
    ----------
    X_data : list of numpy.ndarray
        Per-file arrays, concatenated along axis 0.
    y_labels : sequence of int
        One class label per row of the concatenated data.
    task_labels : sequence of int
        One task label per row of the concatenated data.

    Returns
    -------
    X_train, X_test, y_train, y_test, task_train, task_test
        80/20 split (``random_state=42``) stratified on the (class, task)
        pair. The EEG arrays are standardised with the mean and standard
        deviation of the TRAINING split only.
    """
    # Combine all data
    X = np.concatenate(X_data, axis=0)
    y = np.array(y_labels)
    tasks = np.array(task_labels)

    # Find the minimum length among all arrays
    min_length = min(X.shape[0], len(y), len(tasks))

    # Trimming is kept for backward compatibility, but a mismatch means the
    # labels were built incorrectly upstream and rows may be mislabelled, so
    # it must not pass silently.
    if not (X.shape[0] == len(y) == len(tasks)):
        print(
            f"\nWarning: length mismatch before trimming - X: {X.shape[0]}, "
            f"y: {len(y)}, tasks: {len(tasks)}; labels may be misaligned")

    # Trim all arrays to the same length
    X = X[:min_length]
    y = y[:min_length]
    tasks = tasks[:min_length]

    print("\nAfter trimming:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"tasks shape: {tasks.shape}")

    # Train-test split while maintaining stratification
    # NOTE(review): the split is over individual sequences, so sequences cut
    # from the same recording can land in both splits; consider a grouped
    # split by recording/subject if that is not intended.
    X_train, X_test, y_train, y_test, task_train, task_test = train_test_split(
        X, y, tasks, test_size=0.2, random_state=42,
        stratify=np.column_stack((y, tasks))
    )

    # Normalize the EEG data with statistics of the training split only, so
    # no information about the test split leaks into preprocessing.
    train_mean = X_train.mean()
    train_std = X_train.std()
    if train_std == 0:
        # Constant data: avoid dividing by zero (values become all zeros)
        train_std = 1.0
    X_train = (X_train - train_mean) / train_std
    X_test = (X_test - train_mean) / train_std

    return X_train, X_test, y_train, y_test, task_train, task_test

In [None]:
def train_model_with_task(model, X_train, task_train, y_train, X_test, task_test, y_test,
                          epochs=25, batch_size=32,
                          early_stopping_patience=5, lr_patience=2):
    """Compile ``model`` and fit it on the training split.

    Parameters
    ----------
    model : keras Model
        Two-input model with inputs named 'eeg_input' and 'task_input'.
    X_train, task_train, y_train
        Training EEG data, task codes and class labels.
    X_test, task_test, y_test
        Data passed to ``fit`` as ``validation_data``.
    epochs, batch_size : int
        Passed to ``model.fit``.
    early_stopping_patience : int
        Epochs without ``val_loss`` improvement before training stops (the
        best weights are restored).
    lr_patience : int
        Epochs without ``val_loss`` improvement before the learning rate is
        halved. It must be smaller than ``early_stopping_patience`` to have
        any effect: with equal values the reduction can only fire in the
        epoch in which training stops anyway (barring sub-threshold
        improvements).

    Returns
    -------
    The ``History`` object returned by ``model.fit``.
    """
    # Compile model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC()]
    )

    # NOTE(review): the test split doubles as validation data, so early
    # stopping and best-weight restoration are tuned on it; consider a
    # separate validation split for unbiased test metrics.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=early_stopping_patience,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=lr_patience, min_lr=1e-6
        )
    ]

    # Train model with multiple inputs
    history = model.fit(
        {'eeg_input': X_train, 'task_input': task_train},
        y_train,
        validation_data=(
            {'eeg_input': X_test, 'task_input': task_test}, y_test),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks
    )

    return history

In [None]:
# Sanity check before splitting. X_data has one entry per FILE while the two
# label lists have one entry per SEQUENCE, so its length is expected to differ
# from theirs; the two label lists should have equal length.
print("Data shape checks:")
print(f"X_data length: {len(X_data)}")
print(f"y_labels length: {len(y_labels)}")
print(f"task_labels length: {len(task_labels)}")

## Model Evaluation


In [None]:
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt


def evaluate_model_with_task(model, X_train, task_train, y_train, X_test, task_test, y_test):
    """Score ``model`` on the train and test splits and plot confusion matrices.

    Predictions are thresholded at 0.5 to obtain class labels. For each split
    accuracy, precision, recall and F1 are computed from the thresholded
    labels and AUC from the raw predictions. The metrics are printed and a
    figure with both confusion matrices is shown.

    Returns
    -------
    dict
        ``{'train': {...}, 'test': {...}}`` with keys 'accuracy',
        'precision', 'recall', 'f1' and 'auc' in each inner dict.
    """
    splits = (
        ('train', X_train, task_train, y_train),
        ('test', X_test, task_test, y_test),
    )

    # Raw scores and thresholded labels, training split first
    scores = {}
    hard_labels = {}
    for split_name, eeg, task, _ in splits:
        scores[split_name] = model.predict({
            'eeg_input': eeg,
            'task_input': task
        })
        hard_labels[split_name] = (scores[split_name] > 0.5).astype(int)

    # Calculate metrics
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {}
    for split_name, _, _, truth in splits:
        split_metrics = {}
        for metric_name, scorer in label_scorers:
            split_metrics[metric_name] = scorer(truth, hard_labels[split_name])
        split_metrics['auc'] = roc_auc_score(truth, scores[split_name])
        metrics[split_name] = split_metrics

    # Print results
    for dataset in ['train', 'test']:
        print(f"\n{dataset.capitalize()} Results:")
        for metric, value in metrics[dataset].items():
            print(f"{metric.capitalize()}: {value:.4f}")

    # Plot confusion matrices side by side (training left, testing right)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    panels = (
        (ax1, y_train, 'train', 'Greens', 'Training Confusion Matrix'),
        (ax2, y_test, 'test', 'Blues', 'Testing Confusion Matrix'),
    )
    for axis, truth, split_name, colormap, title in panels:
        matrix = confusion_matrix(truth, hard_labels[split_name])
        sns.heatmap(matrix, annot=True, fmt='d', cmap=colormap, ax=axis)
        axis.set_title(title)
        axis.set_ylabel('True Label')
        axis.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

    return metrics



In [None]:
# Split the preprocessed data (run after the preprocessing cells above)
X_train, X_test, y_train, y_test, task_train, task_test = prepare_data_with_task(
    X_data, y_labels, task_labels
)

# Create and train model
# The input shape is hard-coded; it must equal X_train.shape[1:], i.e.
# (time_steps, samples_per_step, channels, 1) as produced by preprocess_eeg.
#input_shape = X_train.shape[1:]
input_shape = (5, 256, 17, 1)
model = create_2dcnn_lstm_model_with_task(input_shape)
history = train_model_with_task(
    model, X_train, task_train, y_train, X_test, task_test, y_test
)


In [None]:
# Evaluate the trained task-aware model on both splits; `metrics` is the
# nested dict returned by evaluate_model_with_task ('train' / 'test').
metrics = evaluate_model_with_task(
    model,              # Your trained task-aware model
    X_train,           # Training EEG data
    task_train,        # Training task labels (EC=0, EO=1, TASK=2)
    y_train,           # Training class labels (MDD/Healthy)
    X_test,            # Test EEG data
    task_test,         # Test task labels
    y_test             # Test class labels
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np


def plot_training_history_with_task(history, y_test, X_test, task_test, model):
    """Show the test ROC curve next to the accuracy and loss training curves.

    Parameters
    ----------
    history
        Object whose ``history`` dict provides 'accuracy', 'val_accuracy',
        'loss' and 'val_loss' per epoch (as returned by ``model.fit``).
    y_test, X_test, task_test
        Test labels, EEG data and task codes used for the ROC curve.
    model
        Trained two-input model; ``predict`` is called once on the test data.

    Returns
    -------
    None
        A three-panel figure is displayed with ``plt.show()``.
    """
    # Get predictions for ROC curve
    test_scores = model.predict({
        'eeg_input': X_test,
        'task_input': task_test
    })

    # Create figure with subplots
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    roc_axis, accuracy_axis, loss_axis = axes[0], axes[1], axes[2]

    # Panel 1: ROC curve with the chance diagonal
    fpr, tpr, _ = roc_curve(y_test, test_scores)
    roc_auc = auc(fpr, tpr)
    roc_axis.plot(fpr, tpr, 'b-', label=f'ROC (AUC = {roc_auc:.3f})')
    roc_axis.plot([0, 1], [0, 1], 'r--')
    roc_axis.set_xlabel('False Positive Rate')
    roc_axis.set_ylabel('True Positive Rate')
    roc_axis.set_title('ROC Curve')
    roc_axis.legend(loc='lower right')
    roc_axis.grid(True)

    # Panels 2 and 3: training vs validation curves per epoch
    curve_panels = (
        (accuracy_axis, 'accuracy', 'Accuracy', 'Model Accuracy', 'lower right'),
        (loss_axis, 'loss', 'Loss', 'Model Loss', 'upper right'),
    )
    for axis, key, y_label, title, legend_loc in curve_panels:
        axis.plot(history.history[key], 'b-', label='Training')
        axis.plot(history.history[f'val_{key}'], 'r-', label='Validation')
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.legend(loc=legend_loc)
        axis.grid(True)

    plt.tight_layout()
    plt.show()



In [None]:
# Plot the ROC curve and the accuracy/loss curves of the trained model
plot_training_history_with_task(
    history,              # Training history from model.fit
    y_test,              # Test labels
    X_test,              # Test EEG data
    task_test,           # Test task labels (EC=0, EO=1, TASK=2)
    model                # Your trained task-aware model
)

In [None]:
# Predict with the task-aware model on (at most) the first 100 test samples;
# both inputs are sliced identically so EEG rows and task codes stay paired.
predictions = model.predict({
    'eeg_input': X_test[:100],  # First 100 EEG samples
    'task_input': task_test[:100]  # First 100 task labels
})

# Print the predictions (sigmoid outputs; all of the predicted rows are shown)
print("Predictions shape:", predictions.shape)
print("First few predictions:", predictions[:100])
