In [1]:
import os
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
from tensorflow.keras import backend as K
K.clear_session()

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten, Dropout,BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import LearningRateScheduler

from scipy import signal
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
from tensorflow.keras import backend as K
K.clear_session()


In [4]:
def read_data(data_folder, num_files=None):
    """
    Read EEG and spectrogram data from Parquet files in the specified data folder.

    Parameters:
    - data_folder (str): Path to the main data folder containing 'train' and 'test' subfolders.
    - num_files (int or None): Number of files to read from each subfolder. If None, all files will be read.

    Returns:
    - train (List[Tuple[pd.DataFrame, pd.DataFrame]]): List of tuples containing train EEG data and train spectrogram data.
    - test (List[Tuple[pd.DataFrame, pd.DataFrame]]): List of tuples containing test EEG data and test spectrogram data.
    - train_labels (pd.DataFrame): DataFrame containing train labels.
    - test_labels (pd.DataFrame): DataFrame containing test labels.
    """
    train_eeg_folder = os.path.join(data_folder, 'train_eegs')
    test_eeg_folder = os.path.join(data_folder, 'test_eegs')
    train_spectrograms_folder = os.path.join(data_folder, 'train_spectrograms')
    test_spectrograms_folder = os.path.join(data_folder, 'test_spectrograms')

    def read_parquet_folder(folder_path, n_files=None):
        dataframes = []
        files_to_read = os.listdir(folder_path)[:n_files] if n_files else os.listdir(folder_path)
        for file in files_to_read:
            if file.endswith('.parquet'):
                file_path = os.path.join(folder_path, file)
                dataframe = pq.read_table(file_path).to_pandas()
                dataframes.append(dataframe)
        print(f"Read {len(dataframes)} files from {folder_path}.")
        return dataframes


    # Read EEG data
    train_eeg = read_parquet_folder(train_eeg_folder, num_files)
    test_eeg = read_parquet_folder(test_eeg_folder)

    # Read spectrogram data
#discarded

    # Interpolate NaN values
    train_eeg = [df.fillna(df.mean()) for df in train_eeg]
#dsicarded



    # Check nan values
    print("NaN values in train_eeg:", any(df.isnull().any().any() for df in train_eeg))
    
    print("NaN values in test_eeg:", any(df.isnull().any().any() for df in test_eeg))
   
    # Combine data into tuples

    train_labels = pd.read_csv('train.csv',nrows=num_files)
    test_labels = pd.read_csv('test.csv')


    return train_eeg, test_eeg, train_labels, test_labels

In [5]:
def single_vis(f, visualization_type):
    """
    Visualize single EEG channels or Spectrogram from the provided DataFrame.

    Parameters:
    - f (pd.DataFrame): The DataFrame containing the data to be visualized.
    - visualization_type (str): Specify the type of visualization: 'eeg' for EEG channels or 'spectrogram' for Spectrogram.

    Raises:
    - ValueError: If an invalid `visualization_type` is provided.

    Returns:
    None
    """
    
    if visualization_type == 'eeg':
        channels_to_exclude = []
        title = 'EEG Channels Visualization'
        
        eeg_channels = [column for column in f.columns if column not in channels_to_exclude]
        plt.figure(figsize=(40, 10))
        
        # Calculate the offset for each channel based on the maximum amplitude
        max_amplitude = f[eeg_channels].max().max()
        channel_offset = max_amplitude * 1.2  # Adjust the multiplier as needed
        
        for i, column in enumerate(eeg_channels):
            y_values = f[column] + i * channel_offset
            plt.plot(f.index, y_values, label=column)

        plt.title(title)
        plt.xlabel('Sample')
        plt.ylabel('Amplitude')
        plt.legend()
        plt.show()
        
    elif visualization_type == 'spectrogram':
        channels_to_exclude = ['time']
        title = 'Spectrogram Visualization'
        
        spectrogram_channels = [column for column in f.columns if column not in channels_to_exclude]
        plt.figure(figsize=(40, 10))
        
        combined_spectrogram = np.zeros((len(f), len(spectrogram_channels)))
        for i, column in enumerate(spectrogram_channels):
            combined_spectrogram[:, i] = f[column].values
        
        plt.imshow(combined_spectrogram.T, aspect='auto', cmap='viridis', interpolation='nearest')
        plt.title(title)
        plt.xlabel('Sample')
        plt.ylabel('Channel')
        plt.show()
        
    else:
        raise ValueError("Invalid visualization type. Use 'eeg' or 'spectrogram'.")


In [6]:
desired_length = 100
num_features = 32
desired_length = 100  
num_frequency_bins = 32  
num_classes = 6 

In [7]:
input_shape_eeg[0]

NameError: name 'input_shape_eeg' is not defined

In [27]:

def preprocess_eeg(X_train_eeg, target_shape=(desired_length, num_features)):
    eeg_array = np.array(X_train_eeg)[:desired_length, :num_features].astype(np.float32)
    return eeg_array



def preprocess_spectrogram(spectrogram_df, target_shape=(desired_length, num_frequency_bins)):
    spec_array = np.array(spectrogram_df)[:target_shape[0], :target_shape[1]].astype(np.float32)
    return spec_array

def create_model(input_shape_eeg, num_classes=6):
    """Create a multi-input, multi-output model for
    EEG and Spectrogram data.

    Args:
        input_shape_eeg : shape of one EEG sample
        input_shape_spectrogram : shape of one Spectrogram sample
        num_classes : 6 for seizure, lpd, gpd, lrda, grda, other

    Returns:
        keras model
    """
    
    # EEG branch
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(units=64, input_shape=input_shape_eeg[1:], return_sequences=True),
        BatchNormalization(),
        Dropout(0.4),
        tf.keras.layers.LSTM(units=64),
        BatchNormalization(),
        Dropout(0.4),
        Dense(32, activation='relu'),
        Dense(6, activation='softmax')
        ])
    return model

def lr_schedule(epoch, lr):
    if epoch % 10 == 0 and epoch > 0:
        return lr * 0.9
    return lr

In [9]:
train,test,train_labels,test_labels = read_data('data/data',24)

Read 24 files from data/data\train_eegs.
Read 1 files from data/data\test_eegs.
NaN values in train_eeg: False
NaN values in test_eeg: False


In [10]:

X_train, X_test, y_train, y_test = train_test_split(train, train_labels, test_size=0.2, random_state=42,shuffle=True)

In [11]:
X_train= np.concatenate(X_train, axis=0)

In [35]:
X_test=np.concatenate(X_test,axis=0)

In [42]:
print(X_test.shape)
print(X_train.shape)

(84400, 20)
(245600, 20)


In [12]:
# Define the number of examples you want
num_examples = 19

# Calculate the number of samples per example
samples_per_example = X_train.shape[0] //num_examples

# Initialize a list to store the split examples
X_train_split = []

# Split X_train into 19 examples
for i in range(num_examples):
    start_index = i * samples_per_example
    end_index = (i + 1) * samples_per_example
    example = X_train[start_index:end_index]
    X_train_split.append(example)

# Convert the list of examples into a numpy array
X_train_split = np.array(X_train_split)

In [47]:
# Define the number of examples you want for X_test
num_examples_test = 19

# Calculate the number of samples per example for X_test
samples_per_example_test = X_test.shape[0] // num_examples_test

# Initialize a list to store the split examples for X_test
X_test_split = []

# Split X_test into 19 examples
for i in range(num_examples_test):
    start_index = i * samples_per_example_test
    end_index = (i + 1) * samples_per_example_test
    example = X_test[start_index:end_index]
    X_test_split.append(example)

# Convert the list of examples into a numpy array for X_test
X_test_split = np.array(X_test_split)


In [48]:
X_test_split.shape

(19, 4442, 20)

In [13]:
X_train_split.shape

(19, 12926, 20)

In [14]:
single_vis(X_train, 'eeg')

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [15]:
y_train = np.array(y_train)

In [16]:
labels = y_train[:, 8]

# Encoding labels using LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Convert labels to categorical one-hot encoding
one_hot_labels = to_categorical(encoded_labels, num_classes=num_classes)

# Convert labels to float32
y_train = one_hot_labels.astype('float32')

# Convert other columns to numeric values
for col_index in range(y_train.shape[1]):
    # Skip label column as it has already been encoded
    if col_index == 8:
        continue
    
    # Convert each element in the column to a numeric value
    y_train[:, col_index] = np.array([float(val) if isinstance(val, str) and '.' in val else int(val) for val in y_train[:, col_index]])


In [56]:
y_train

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

In [17]:
X_train_eeg = X_train_split.astype(np.float32)



# Define input shapes
input_shape_eeg = X_train_eeg.shape#Shape of one EEG sample

num_classes = y_train.shape

In [49]:
X_test_eeg = X_test_split.astype(np.float32)



# Define input shapes


In [50]:
X_test_eeg.shape

(19, 4442, 20)

In [46]:
input_shape_eeg

(19, 12926, 20)

In [18]:
input_shape_eeg

(19, 12926, 20)

In [19]:
num_classes=6

In [20]:
for i in range(len(X_train_eeg)):
    if np.any(np.isnan(X_train_eeg[i])):
        print(i)

In [36]:
print(X_train_eeg[14])

[[-2.3203e+02 -1.0091e+02 -2.0190e+01 ... -2.7220e+01  1.7220e+01
  -4.0010e+01]
 [-2.8830e+02 -1.4167e+02 -4.7270e+01 ... -3.5930e+01  5.2470e+01
   1.0533e+02]
 [-2.9833e+02 -1.5655e+02 -5.7660e+01 ... -3.3640e+01  1.0854e+02
   1.4543e+02]
 ...
 [ 3.2100e+01  1.7060e+01  1.4750e+01 ...  7.0000e-02  2.0300e+00
  -3.8380e+01]
 [ 1.4710e+01  1.4200e+01  7.6700e+00 ... -1.3390e+01  6.4730e+01
   5.8760e+01]
 [ 4.8330e+01  3.5810e+01  2.6800e+01 ... -2.4700e+00  7.0240e+01
   2.9320e+01]]


In [17]:
# print(X_train_eeg.dtype)
# print(X_train_spectrogram.dtype)
# print(y_train.dtype)
# print(input_shape_eeg[1:])
# print(input_shape_spectrogram[1:])
# print(np.any(np.isnan(X_train_eeg)))
# print(np.any(np.isnan(X_train_spectrogram)))
# print(np.any(np.isinf(X_train_eeg)))
# print(np.any(np.isinf(X_train_spectrogram)))

float32
float32
float32
(20,)
(32,)
False
False
False
False


In [52]:
desired_shape = (num_examples, 12926, 20)

# Calculate the current shape of X_test_split
current_shape = X_test_split.shape

# Calculate the padding required for each dimension
pad_width = [(0, desired_shape[i] - current_shape[i]) for i in range(len(desired_shape))]

# Pad X_test_split with zeros to match the desired shape
X_test_padded = np.pad(X_test_split, pad_width, mode='constant', constant_values=0)


In [53]:
X_test_padded.shape

(19, 12926, 20)

In [21]:
print("Shape of X_train_eeg:", X_train_eeg.shape)
print("Shape of y_train:", y_train.shape)


Shape of X_train_eeg: (19, 12926, 20)
Shape of y_train: (19, 6)


In [30]:
model = create_model(input_shape_eeg, 6)
optimizer = Adam(learning_rate=0.01)
lr_scheduler = LearningRateScheduler(lr_schedule)

model.compile(optimizer=optimizer, loss = tf.keras.losses.KLDivergence(), metrics=['accuracy'])

history=model.fit(X_train_eeg, y_train, epochs=10, batch_size=5)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 12926, 64)         21760     
                                                                 
 batch_normalization_2 (Bat  (None, 12926, 64)         256       
 chNormalization)                                                
                                                                 
 dropout_2 (Dropout)         (None, 12926, 64)         0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 batch_normalization_3 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_3 (Dropout)         (None, 64)               

In [55]:
y_preds=model.predict(X_test_padded)



In [60]:
from sklearn.preprocessing import LabelEncoder

# Convert all labels to strings
y_test_flat_str = y_test.astype(str).values.ravel()

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode string labels to numeric values
y_test_encoded = label_encoder.fit_transform(y_test_flat_str)

# Now, y_test_encoded contains numeric labels corresponding to the string labels in y_test


In [64]:
# Make predictions using your model
predictions = model.predict(X_test_padded)

# Assuming y_test contains the true labels for X_test
# Calculate the predicted labels by taking the index of the maximum value in each prediction array
predicted_labels = np.argmax(predictions, axis=1)

# Assuming y_test contains the true labels for X_test
# Calculate the true labels by taking the index of the maximum value in each true label array
true_labels = y_test_encoded

# Compute accuracy
accuracy = np.mean(predicted_labels == true_labels)
print("Accuracy:", accuracy)


Accuracy: 0.0


  accuracy = np.mean(predicted_labels == true_labels)


In [65]:
X_test_eeg = np.array([preprocess_eeg(item[0]) for item in X_test])
X_test_spectrogram = np.array([preprocess_spectrogram(item[1]) for item in X_test])
y_pred = model.predict([X_test_eeg, X_test_spectrogram])

eeg_ids_test = [X_test[i][0].index[0] for i in range(len(X_test))]

output_df = pd.DataFrame({
    'eeg_id': eeg_ids_test,
    'seizure_vote': y_pred[:, 0],
    'lpd_vote': y_pred[:, 1],
    'gpd_vote': y_pred[:, 2],
    'lrda_vote': y_pred[:, 3],
    'grda_vote': y_pred[:, 4],
    'other_vote': y_pred[:, 5]
})


IndexError: too many indices for array: array is 0-dimensional, but 2 were indexed

In [None]:
output_df

In [None]:
output_df['predicted_class'] = output_df.iloc[:, 1:].idxmax(axis=1)

In [None]:
print(output_df[['eeg_id', 'predicted_class']])

In [None]:
print("Shape of X_test_eeg:", X_test_eeg.shape)
print("Shape of X_test_spectrogram:", X_test_spectrogram.shape)
print("NaN values in X_test_eeg:", np.isnan(X_test_eeg).any())
print("NaN values in X_test_spectrogram:", np.isnan(X_test_spectrogram).any())


In [None]:
y_test = np.array(y_test)
labelst = y_test[:, 8]

# Encoding labels using LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labelst)

# Convert labels to categorical one-hot encoding
one_hot_labels = to_categorical(encoded_labels, num_classes=num_classes)

# Convert labels to float32
y_test = one_hot_labels.astype('float32')

# Convert other columns to numeric values
for col_index in range(y_test.shape[1]):
    # Skip label column as it has already been encoded
    if col_index == 8:
        continue
    
    # Convert each element in the column to a numeric value
    y_test[:, col_index] = np.array([float(val) if isinstance(val, str) and '.' in val else int(val) for val in y_test[:, col_index]])


In [None]:
test_loss, test_accuracy = model.evaluate([X_test_eeg, X_test_spectrogram], y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
