In [2]:
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preprocess Data

In [3]:
class EEGSpectrogramPipeline:
    def __init__(self, eeg_dir, spectrogram_dir, train_csv_path):
        self.eeg_dir = eeg_dir
        self.spectrogram_dir = spectrogram_dir
        self.train_csv_path = train_csv_path
        self.soft_labels = {}
        self.id_mapping = {}
        self.train_ids = []
        self.val_ids = []
        self.test_ids = []
        self.label_types = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

    def pad_eeg(self, eeg_sample, target_length=30000):
        current_length = eeg_sample.shape[0]
        if current_length < target_length:
            padding = np.zeros((target_length - current_length, eeg_sample.shape[1]))
            eeg_sample_padded = np.vstack((eeg_sample, padding))
        else:
            eeg_sample_padded = eeg_sample[:target_length, :]  # Truncate if longer
        return eeg_sample_padded

    def pad_spectrogram(self, spectrogram_sample, target_length=5000):
        current_length = spectrogram_sample.shape[0]
        if current_length < target_length:
            padding = np.zeros((target_length - current_length, spectrogram_sample.shape[1]))
            spectrogram_sample_padded = np.vstack((spectrogram_sample, padding))
        else:
            spectrogram_sample_padded = spectrogram_sample[:target_length, :]  # Truncate if longer
        return spectrogram_sample_padded

    def create_soft_labels(self):
        train_df = pd.read_csv(self.train_csv_path)
        # Dictionary to store the soft labels for each eeg_id
        soft_labels = {}
    
        # List of label types
        label_types = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
    
        for eeg_id in train_df['eeg_id'].unique():
            eeg_segments = train_df[train_df['eeg_id'] == eeg_id]
            
            # Initialize a dictionary to store the sum of votes for each label type
            vote_sums = {label: 0 for label in self.label_types}
            total_votes_per_segment = []

            for _, segment in eeg_segments.iterrows():
                total_votes = sum(segment[label] for label in self.label_types)
                total_votes_per_segment.append(total_votes)
                for label in label_types:
                    vote_sums[label] += segment[label]
    
            # Calculate the soft label for each label type
            num_segments = len(eeg_segments)
            total_votes = sum(total_votes_per_segment)
            soft_labels[eeg_id] = {label: (vote_sums[label] / total_votes) if total_votes > 0 else 0 for label in self.label_types}
    
        self.soft_labels = soft_labels

    def labels_to_one_hot(self, label_dicts):
        # Assuming label_dicts is a list of dictionaries like
        # [{'seizure_vote': 0.0, 'lpd_vote': 1.0, ..., 'other_vote': 0.0}, ...]
        num_classes = 6  # Adjust based on your number of classes
        one_hot_labels = np.zeros((len(label_dicts), num_classes))
    
        for i, label_dict in enumerate(label_dicts):
            # Assuming the order of classes is consistent with the order of label_types
            one_hot_labels[i] = np.array([label_dict[label] for label in self.label_types])
    
        return one_hot_labels

    def split_data(self, test_size=0.2, val_size=0.1):
        train_df = pd.read_csv(self.train_csv_path)
        self.id_mapping = dict(zip(train_df['eeg_id'], train_df['spectrogram_id']))

        ids = train_df['eeg_id'].unique()
        self.train_ids, self.test_ids = train_test_split(ids, test_size=test_size)
        self.train_ids, self.val_ids = train_test_split(self.train_ids, test_size=val_size / (1 - test_size))

    def generate_batches(self, data_ids, batch_size=32):
        np.random.shuffle(data_ids)
        
        for i in range(0, len(data_ids), batch_size):
            batch_keys = data_ids[i:i + batch_size]
            batch_eeg = [self.load_process_eeg(eeg_id) for eeg_id in batch_keys]
            batch_spectrogram = [self.load_process_spectrogram(self.id_mapping[eeg_id]) for eeg_id in batch_keys]
            batch_labels = [self.soft_labels[eeg_id] for eeg_id in batch_keys]

            # Convert labels to one-hot encoded format
            batch_labels = self.labels_to_one_hot(batch_labels)
            
            yield np.array(batch_eeg), np.array(batch_spectrogram), np.array(batch_labels)
            
    def load_process_eeg(self, eeg_id):
        eeg_file_path = os.path.join(self.eeg_dir, f'{eeg_id}.parquet')
        eeg_data = pq.read_table(eeg_file_path).to_pandas()
        eeg_data = StandardScaler().fit_transform(eeg_data)
        return self.pad_eeg(eeg_data)

    def load_process_spectrogram(self, spectrogram_id):
        spectrogram_file_path = os.path.join(self.spectrogram_dir, f'{spectrogram_id}.parquet')
        spectrogram_data = pq.read_table(spectrogram_file_path).to_pandas()
        spectrogram_data = StandardScaler().fit_transform(spectrogram_data)
        return self.pad_spectrogram(spectrogram_data)

    def run_pipeline(self):
        self.create_soft_labels()
        self.split_data()

        print("Pipeline execution completed.")

In [4]:
eeg_dir = './train_eegs/'
spectrogram_dir = './train_spectrograms/'
train_csv_path = './train.csv'

pipeline = EEGSpectrogramPipeline(eeg_dir, spectrogram_dir, train_csv_path)
pipeline.run_pipeline()

Pipeline execution completed.


In [None]:
batch_generator = pipeline.generate_batches(pipeline.train_ids, batch_size=32)

# Iterate through a few batches and print useful information
for i, (batch_eeg, batch_spectrogram, batch_labels) in enumerate(batch_generator):
    print(f"Batch {i+1}")
    print(f"Batch EEG Shape: {batch_eeg.shape}")
    print(f"Batch Spectrogram Shape: {batch_spectrogram.shape}")
    print(f"Batch Labels Shape: {batch_labels.shape}")

    # Optionally, print some summary statistics or data examples
    print(f"EEG Data Example (first sample):\n{batch_eeg[0]}")
    print(f"Spectrogram Data Example (first sample):\n{batch_spectrogram[0]}")
    print(f"Labels Example (first sample):\n{batch_labels[0]}")

    if i == 2:  # Stop after 3 batches for this test
        break

# Hybrid Model

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Concatenate
from keras.utils import Progbar




In [5]:
#It will look something like this

# Define input shapes based on your data dimensions
spectrogram_input_shape = (5000, 401, 1)  # Assuming spectrogram data is 2D with 1 channel
eeg_input_shape = (30000, 20)

# CNN branch for spectrogram data
SpectrogramInput = Input(shape=spectrogram_input_shape)
Conv1 = Conv2D(32, kernel_size=(3, 3), activation='relu')(SpectrogramInput)
Pool1 = MaxPooling2D(pool_size=(2, 2))(Conv1)
Conv2 = Conv2D(64, kernel_size=(3, 3), activation='relu')(Pool1)
Pool2 = MaxPooling2D(pool_size=(2, 2))(Conv2)
Flat1 = Flatten()(Pool2)

# LSTM branch for raw EEG data
EEGInput = Input(shape=eeg_input_shape)
LSTM1 = LSTM(64, return_sequences=True)(EEGInput)
LSTM2 = LSTM(64)(LSTM1)

# Concatenate both branches
Concatenated = Concatenate()([Flat1, LSTM2])
Dense1 = Dense(64, activation='relu')(Concatenated)
BatchNorm1 = BatchNormalization()(Dense1)
Dropout1 = Dropout(0.5)(BatchNorm1)
Dense2 = Dense(64, activation='relu')(Dropout1)
Output = Dense(6, activation='softmax')(Dense2)

# Create the model
model = Model(inputs=[SpectrogramInput, EEGInput], outputs=Output)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 5000, 401, 1)]       0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 4998, 399, 32)        320       ['input_1[0][0]']             
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 2499, 199, 32)        0         ['conv2d[0][0]']              
 D)                                                                                               
                                                                                                  
 conv2d_1 (Conv2D)           (None, 2497, 197, 64)        18496     ['max_pooling2d[0][0]']

In [6]:
# Training parameters
epochs = 10
batch_size = 32

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Determine the total number of steps in one epoch
    num_train_samples = len(pipeline.train_ids)
    num_train_steps = np.ceil(num_train_samples / batch_size)

    # Initialize Progbar for the training phase
    progbar = Progbar(target=num_train_steps)

    train_generator = pipeline.generate_batches(pipeline.train_ids, batch_size=batch_size)

    # Train for one epoch
    for step, (batch_eeg, batch_spectrogram, batch_labels) in enumerate(train_generator):
        model.train_on_batch([batch_spectrogram, batch_eeg], batch_labels)
        progbar.update(step + 1)

    # Validation step
    num_val_samples = len(pipeline.val_ids)
    num_val_steps = np.ceil(num_val_samples / batch_size)
    val_loss, val_accuracy = 0, 0

    val_generator = pipeline.generate_batches(pipeline.val_ids, batch_size=batch_size)

    for step, (batch_eeg, batch_spectrogram, batch_labels) in enumerate(val_generator):
        loss, accuracy = model.test_on_batch([batch_spectrogram, batch_eeg], batch_labels)
        val_loss += loss
        val_accuracy += accuracy

    val_loss /= num_val_steps
    val_accuracy /= num_val_steps
    print(f"Validation loss: {val_loss}, Validation accuracy: {val_accuracy}")


# Save the model after training
model.save('./models')

Epoch 1/10


Validation loss: 1.6078387123567086, Validation accuracy: 0.4069622512216921
Epoch 2/10
Validation loss: 1.609930000923298, Validation accuracy: 0.4044248577621248
Epoch 3/10
Validation loss: 1.6111020357520491, Validation accuracy: 0.4035790598502866
Epoch 4/10
 13/374 [>.............................] - ETA: 13:13:27

KeyboardInterrupt: 