In [None]:
from typing import Iterator

import numpy as np
import h5py
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

In [None]:
# Data paths and Configuration
DATA_PATH = os.path.abspath("../extracted_zip_in_here/Final Project data/")
INTRA_TRAIN_FOLDER = os.path.join(DATA_PATH, os.path.relpath("./Intra/train/"))
INTRA_TEST_FOLDER = os.path.join(DATA_PATH, os.path.relpath("./Intra/test/"))
DOWNSAMPLE_FACTOR = 1.0

# Function definitions
def get_dataset_name(filename_with_dir):
    """Given a a full file path returns the name of the dataset"""
    filename_without_dir = os.path.basename(filename_with_dir)
    temp = filename_without_dir.split('.')[:-1]
    dataset_name = ''.join(temp)
    temp = dataset_name.split('_')[:-1]
    dataset_name = "_".join(temp)
    return dataset_name

def extract_data_from_folder_by_file(folder_path, shuffle=False):
    """Given a folder path containing h5 files, this function returns the matrices and 
    optionally shuffles the files."""
    files = os.listdir(folder_path)
    if shuffle:
        np.random.shuffle(files)

    for file_name in files:
        
        filename_path = os.path.join(folder_path, file_name)
        
        with h5py.File(filename_path, 'r') as f:
            dataset_name = get_dataset_name(filename_path)
            matrix = f.get(dataset_name)[()]
            yield dataset_name, matrix

# Given a folder 
def learn_minmax_from_all_files(folder_path: str) -> tuple:
    """Given a folder path containing h5 files, this functions returns the minimum and 
    maximum values across all files in the folder."""
    # Placeholders
    min_val = None
    max_val = None

    for (name, data) in extract_data_from_folder_by_file(folder_path):
        data = data.T
        if min_val is None:
            min_val = np.min(data, axis=0)
            max_val = np.max(data, axis=0)
        else:
            # Update min_val and max_val
            min_val = np.minimum(min_val, np.min(data, axis=0))
            max_val = np.maximum(max_val, np.max(data, axis=0))
        
    return min_val, max_val

def generate_label(file_name:str) -> np.ndarray:
    """Returns a vector corresponding to the one-hot encoded label for task type."""
    # Return a one-hot encoded label based on the file name, there are4 classes
    # 0: rest
    if "rest_" in file_name:
        return np.array([1, 0, 0, 0])
    # 1: task_motor
    elif "task_motor_" in file_name:
        return np.array([0, 1, 0, 0])
    # 2: task_story_math
    elif "task_story_math_" in file_name:
        return np.array([0, 0, 1, 0])
    # 3: task_working_memory
    elif "task_working_memory_" in file_name:
        return np.array([0, 0, 0, 1])
    else:
        raise ValueError(f"Unknown file name: {file_name}")

def create_batches(folder, number_of_files_per_batch: int, preprocessing_pipeline: list = None, shuffle_files=False) -> Iterator[tuple]:
    """Given a folder containg '.h5' files and the number of """
    batch_data = []
    batch_labels = []
    for n, (name, data) in enumerate(extract_data_from_folder_by_file(folder, shuffle=shuffle_files)):
        data = data.T
        if preprocessing_pipeline:
            for preprocessing_step in preprocessing_pipeline:
                data = preprocessing_step(data)
        # Add the preprocessed data to the batch
        batch_data.append(data)

        # Generate the label matrix of the length of the data for the current file
        label_vector = generate_label(name)
        batch_labels.append(label_vector)


        # Check if we have reached the desired batch size
        if (n + 1) % number_of_files_per_batch == 0:
            # Stack along the first axis (like a batch dimension)
            yield (batch_data, batch_labels)
            batch_data = []
            batch_labels = []

    # Optional: yield the remainder if not divisible
    if batch_data:
        yield (batch_data, batch_labels)

def evaluate_scores(model, folder_to_evaluate):
    losses = []
    accuracies = []
    for batch_X_list, batch_y_list in create_batches(folder=folder_to_evaluate, number_of_files_per_batch=8, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=False):
        # Convert the list of arrays to a 3D numpy array
        data = np.array(batch_X_list)
        labels = np.array(batch_y_list)
        
        # Evaluate the model
        loss, accuracy = model.evaluate(data, labels)
        losses.append(loss)
        accuracies.append(accuracy)
    return np.mean(losses), np.mean(accuracies)

# Helper functions
def scale_data(data: np.ndarray, min_val: np.ndarray, max_val: np.ndarray) -> np.ndarray:
    """Given a minimal value and maximal value normalises the provided matrix."""
    # Scale the data to the range [0, 1]
    return (data - min_val) / (max_val - min_val)

def downsample(data: np.array, factor: float) -> np.array:
    """
    Downsample time series data by uniformly selecting samples at fixed intervals
    to keep the temporal order intact.

    Args:
        data (np.array): Input time series data (1D or 2D with time dimension as first axis)
        factor (float): Downsampling factor (e.g., 0.5 means keep half the samples)

    Returns:
        np.array: Downsampled data with timesteps reduced by the factor
    """
    num_samples = int(len(data) * factor)
    # Calculate the stride to evenly pick samples
    stride = len(data) / num_samples
    # Use np.floor to avoid going out of bounds and convert to int indices
    indices = (np.floor(np.arange(num_samples) * stride)).astype(int)
    downsampled_data = data[indices]
    return downsampled_data

min_val, max_val = learn_minmax_from_all_files(INTRA_TRAIN_FOLDER)
print(f"Min values: {min_val.shape}, Max values: {max_val.shape}")

preprocessing_pipeline = [
    lambda x: scale_data(x, min_val, max_val), 
    lambda x: downsample(x, DOWNSAMPLE_FACTOR)
]

# # Example usage
# for data_batch, labels_batch in create_batches(folder=INTRA_TRAIN_FOLDER, number_of_files_per_batch=8, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=False):
    
#     for data, label in zip(data_batch, labels_batch):
#         print(f"Data shape: {data.shape}, Label: {label}")

# Actual code
def create_model() -> Sequential:
    lstm_classifier = Sequential([
        LSTM(64, return_sequences=False, input_shape=(3562, 248)),
        Dense(64, activation='relu'),
        Dense(4, activation='softmax')
        ])

    lstm_classifier.compile(
        loss=CategoricalCrossentropy(),  # works directly with one-hot encoded labels
        optimizer=Adam(),
        metrics=['accuracy']
    )

    return lstm_classifier


def train_model(model, epochs=10, batch_size=8, verbose=1):
    for epoch in range(epochs):
        if verbose:
            print(f"Epoch: {epoch}")
        for batch, (batch_X_list, batch_y_list) in enumerate(create_batches(folder=INTRA_TRAIN_FOLDER, number_of_files_per_batch=batch_size, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=True)):
            # Convert the list of arrays to a 3D numpy array
            data = np.array(batch_X_list)
            labels = np.array(batch_y_list)

            # Shuffle the data and labels together
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)
            data = data[indices]
            labels = labels[indices]
            
            # Train the model
            model.fit(data, labels)
            
            # Evaluate the model
            loss, accuracy = model.evaluate(data, labels)
            if verbose:
                print(f"Batch: {batch}, Loss: {loss}, Accuracy: {accuracy}")
    
    return model

lstm_classifier = create_model()
trained_lstm_classifier = train_model(lstm_classifier, epochs=10, batch_size=8, verbose=1)

loss, accuracy = evaluate_scores(trained_lstm_classifier, INTRA_TRAIN_FOLDER)
print(f"Loss: {loss}, Accuracy: {accuracy}")

loss, accuracy = evaluate_scores(trained_lstm_classifier, INTRA_TEST_FOLDER)
print(f"Loss: {loss}, Accuracy: {accuracy}")

In [9]:
from typing import Iterator

import numpy as np
import h5py
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from scipy.signal import butter, filtfilt

# --- Configuration ---
DATA_PATH = os.path.abspath("../extracted_zip_in_here/Final Project data/")
INTRA_TRAIN_FOLDER = os.path.join(DATA_PATH, "Intra/train")
INTRA_TEST_FOLDER = os.path.join(DATA_PATH, "Intra/test")
DOWNSAMPLE_FACTOR = 1.0

# --- File and Data Handling ---
def get_dataset_name(filepath: str) -> str:
    filename = os.path.basename(filepath)
    return "_".join(filename.split('.')[:-1][0].split('_')[:-1])

def extract_data_from_folder(folder_path: str, shuffle: bool = False) -> Iterator[tuple[str, np.ndarray]]:
    files = os.listdir(folder_path)
    if shuffle:
        np.random.shuffle(files)
    for file_name in files:
        with h5py.File(os.path.join(folder_path, file_name), 'r') as f:
            dataset_name = get_dataset_name(file_name)
            yield dataset_name, f[dataset_name][()].T  # transpose once here

def learn_minmax(folder_path: str) -> tuple[np.ndarray, np.ndarray]:
    min_val, max_val = None, None
    for _, data in extract_data_from_folder(folder_path):
        min_data, max_data = np.min(data, axis=0), np.max(data, axis=0)
        min_val = min_data if min_val is None else np.minimum(min_val, min_data)
        max_val = max_data if max_val is None else np.maximum(max_val, max_data)
    return min_val, max_val

def generate_label(name: str) -> np.ndarray:
    classes = ["rest", "task_motor", "task_story_math", "task_working_memory"]
    for i, cls in enumerate(classes):
        if cls + "_" in name:
            label = np.zeros(len(classes))
            label[i] = 1
            return label
    raise ValueError(f"Unknown file name: {name}")

# --- Preprocessing ---
def scale_data(data: np.ndarray, min_val: np.ndarray, max_val: np.ndarray) -> np.ndarray:
    return (data - min_val) / (max_val - min_val)

def downsample(data: np.ndarray, factor: float) -> np.ndarray:
    num_samples = int(len(data) * factor)
    indices = np.floor(np.arange(num_samples) * (len(data) / num_samples)).astype(int)
    return data[indices]

def add_gaussian_noise(data: np.ndarray, stddev: float = 0.01) -> np.ndarray:
    noise = np.random.normal(0, stddev, data.shape)
    return data + noise

def time_shift(data: np.ndarray, shift_max: int = 10) -> np.ndarray:
    shift = np.random.randint(-shift_max, shift_max)
    return np.roll(data, shift, axis=0)

def channel_dropout(data: np.ndarray, dropout_rate: float = 0.1) -> np.ndarray:
    num_channels = data.shape[1]
    mask = np.random.rand(num_channels) > dropout_rate
    return data * mask[np.newaxis, :]

def random_scaling(data: np.ndarray, scale_range=(0.9, 1.1)) -> np.ndarray:
    scale = np.random.uniform(*scale_range)
    return data * scale

def bandpass_filter(data: np.ndarray, lowcut=1.0, highcut=40.0, fs=2034.0, order=5) -> np.ndarray:
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return filtfilt(b, a, data, axis=0)

def zscore_per_channel(data: np.ndarray) -> np.ndarray:
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    std[std == 0] = 1  # Prevent division by zero
    return (data - mean) / std

def baseline_correction(data: np.ndarray, baseline_duration=100) -> np.ndarray:
    if data.shape[0] < baseline_duration:
        return data
    baseline_mean = np.mean(data[:baseline_duration], axis=0)
    return data - baseline_mean

# --- Batching ---
def create_batches(folder: str, batch_size: int, preprocessing: list = None, shuffle: bool = False) -> Iterator[tuple[np.ndarray, np.ndarray]]:
    batch_data, batch_labels = [], []
    for i, (name, data) in enumerate(extract_data_from_folder(folder, shuffle)):
        if preprocessing:
            for fn in preprocessing:
                data = fn(data)
        batch_data.append(data)
        batch_labels.append(generate_label(name))

        if (i + 1) % batch_size == 0:
            yield np.array(batch_data), np.array(batch_labels)
            batch_data, batch_labels = [], []
    if batch_data:
        yield np.array(batch_data), np.array(batch_labels)

# --- Model ---
def create_model(input_shape=(3562, 248)) -> Sequential:
    model = Sequential([
        LSTM(64, return_sequences=False, input_shape=input_shape),
        Dense(64, activation='relu'),
        Dense(4, activation='softmax')
    ])
    model.compile(
        loss=CategoricalCrossentropy(),
        optimizer=Adam(),
        metrics=['accuracy']
    )
    return model

def train_model(model, train_folder: str, epochs: int = 10, batch_size: int = 8, preprocessing: list = None):
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        for batch_X, batch_y in create_batches(train_folder, batch_size, preprocessing, shuffle=True):
            indices = np.random.permutation(len(batch_X))
            model.fit(batch_X[indices], batch_y[indices], verbose=0)
    return model

def evaluate_model(model, folder: str, batch_size: int, preprocessing: list) -> tuple[float, float]:
    losses, accuracies = [], []
    for batch_X, batch_y in create_batches(folder, batch_size, preprocessing, shuffle=False):
        loss, acc = model.evaluate(batch_X, batch_y, verbose=0)
        losses.append(loss)
        accuracies.append(acc)
    return np.mean(losses), np.mean(accuracies)

# --- Run Training and Evaluation ---
min_val, max_val = learn_minmax(INTRA_TRAIN_FOLDER)
print(f"Learned min/max shapes: {min_val.shape}, {max_val.shape}")

preprocessing_pipeline = [
    lambda x: scale_data(x, min_val, max_val),
    lambda x: downsample(x, DOWNSAMPLE_FACTOR),
    add_gaussian_noise, # These are all added for more general performance ()
    # time_shift,
    channel_dropout
    # random_scaling
]

model = create_model()
trained_model = train_model(model, INTRA_TRAIN_FOLDER, epochs=10, batch_size=8, preprocessing=preprocessing_pipeline)

train_loss, train_acc = evaluate_model(trained_model, INTRA_TRAIN_FOLDER, batch_size=8, preprocessing=preprocessing_pipeline)
print(f"Training Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")

test_loss, test_acc = evaluate_model(trained_model, INTRA_TEST_FOLDER, batch_size=8, preprocessing=preprocessing_pipeline)
print(f"Test Loss: {test_loss:.4f}, Accuracy: {test_acc:.4f}")

Learned min/max shapes: (248,), (248,)
Epoch 1/10


  super().__init__(**kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Loss: 0.0899, Accuracy: 1.0000
Test Loss: 0.0797, Accuracy: 1.0000


In [5]:
from sklearn.metrics import classification_report

def detailed_evaluation(model, folder, batch_size, preprocessing):
    y_true, y_pred = [], []
    for batch_X, batch_y in create_batches(folder, batch_size, preprocessing, shuffle=False):
        preds = model.predict(batch_X)
        y_true.extend(np.argmax(batch_y, axis=1))
        y_pred.extend(np.argmax(preds, axis=1))
    print(classification_report(y_true, y_pred))

detailed_evaluation(trained_model, INTRA_TEST_FOLDER, batch_size=8, preprocessing=preprocessing_pipeline)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



# Memory efficient but very long

In [None]:
import os
import h5py
import numpy as np
from tensorflow.keras import models, layers, mixed_precision
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from meg_preprocessing import preprocess_meg
from downsample import downsample
from read_data import get_dataset_name_train, load_split_files, VALID_TASK_TYPES

# Enable mixed precision training
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

class MEGDataGenerator:
    def __init__(self, file_paths, task_types, label_encoder, batch_size=16, downsample_factor=0.5):
        self.file_paths = file_paths
        self.task_types = task_types
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.downsample_factor = downsample_factor
        self.target_length = self._determine_target_length()
        
    def _determine_target_length(self):
        """Sample a few files to determine max sequence length"""
        max_length = 0
        sample_files = min(10, len(self.file_paths))
        
        for i in range(sample_files):
            data_dict = load_split_files(self.file_paths[i])
            for data in data_dict.values():
                processed = preprocess_meg(data)
                if self.downsample_factor < 1.0:
                    processed = downsample(processed, self.downsample_factor)
                if processed.shape[1] > max_length:
                    max_length = processed.shape[1]
        return max_length
    
    def __len__(self):
        return int(np.ceil(len(self.file_paths) / self.batch_size))
    
    def __iter__(self):
        while True:
            for i in range(0, len(self.file_paths), self.batch_size):
                batch_paths = self.file_paths[i:i+self.batch_size]
                batch_types = self.task_types[i:i+self.batch_size]
                batch_data = []
                batch_labels = []
                
                for path, task_type in zip(batch_paths, batch_types):
                    data_dict = load_split_files(path)
                    for data in data_dict.values():
                        processed = preprocess_meg(data)
                        if self.downsample_factor < 1.0:
                            processed = downsample(processed, self.downsample_factor)
                        batch_data.append(processed)
                        batch_labels.append(task_type)
                
                # Efficient padding
                X_batch = np.zeros((len(batch_data), batch_data[0].shape[0], self.target_length))
                for j, d in enumerate(batch_data):
                    length = min(d.shape[1], self.target_length)
                    X_batch[j, :, :length] = d[:, :length]
                
                X_batch = np.expand_dims(X_batch, axis=-1).astype('float32')
                y_batch = self.label_encoder.transform(batch_labels)
                
                yield X_batch, y_batch

def create_efficient_model(input_shape, num_classes):
    """Memory-optimized CNN-LSTM model with mixed precision"""
    model = models.Sequential([
        # First conv block with reduced filters
        layers.Conv1D(32, kernel_size=5, activation='relu', input_shape=input_shape),
        layers.BatchNormalization(),
        layers.MaxPooling1D(pool_size=4),  # Increased pooling
        layers.Dropout(0.3),
        
        # Second conv block with reduced filters
        layers.Conv1D(64, kernel_size=3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.3),
        
        # Single LSTM layer with reduced units
        layers.LSTM(32, return_sequences=False),
        layers.Dropout(0.3),
        
        # Smaller dense layers
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax', dtype='float32')
    ])
    
    model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    
    return model

def train_and_evaluate_optimized(typeData='Cross', batch_size=16, downsample_factor=0.5):
    # Load file paths and labels without loading actual data
    print("Collecting training file paths...")
    train_files = []
    train_labels = []
    
    for task_type in VALID_TASK_TYPES:
        files = get_dataset_name_train(file_name="all", taskType=task_type, typeData=typeData)
        if files:
            train_files.extend(files)
            train_labels.extend([task_type] * len(files))
    
    print("Collecting test file paths...")
    test_files = find_test_files(typeData)
    test_labels = []
    
    for file_path in test_files:
        file_name = os.path.basename(file_path)
        for valid_type in VALID_TASK_TYPES:
            if file_name.startswith(valid_type):
                test_labels.append(valid_type)
                break
    
    # Initialize label encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(train_labels + test_labels)
    num_classes = len(label_encoder.classes_)
    
    # Create data generators
    train_gen = MEGDataGenerator(train_files, train_labels, label_encoder, 
                               batch_size=batch_size, downsample_factor=downsample_factor)
    test_gen = MEGDataGenerator(test_files, test_labels, label_encoder,
                               batch_size=batch_size, downsample_factor=downsample_factor)
    
    # Determine input shape from a sample batch
    sample_X, _ = next(train_gen.__iter__())
    input_shape = sample_X.shape[1:]
    
    print("Creating optimized model...")
    model = create_efficient_model(input_shape, num_classes)
    model.summary()
    
    print("Training model...")
    history = model.fit(train_gen.__iter__(),
                       steps_per_epoch=len(train_gen),
                       epochs=20,
                       validation_data=test_gen.__iter__(),
                       validation_steps=len(test_gen),
                       verbose=1)
    
    # Evaluate on test set
    print("Evaluating model...")
    y_true = []
    y_pred = []
    
    for _ in range(len(test_gen)):
        X_batch, y_batch = next(test_gen.__iter__())
        y_true.extend(y_batch)
        y_pred.extend(np.argmax(model.predict(X_batch, verbose=0), axis=1))
    
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, 
                              target_names=label_encoder.classes_))
    
    return model, history

# Helper functions (keep these from your original code)
def find_test_files(typeData):
    """Find all test files in the test directories"""
    test_files = []
    base_dir = os.path.join("..", "extracted_zip_in_here", "Final Project data", typeData)
    
    for dir_name in os.listdir(base_dir):
        if 'test' in dir_name.lower():
            test_dir = os.path.join(base_dir, dir_name)
            if os.path.isdir(test_dir):
                for file_name in os.listdir(test_dir):
                    if file_name.endswith('.h5'):
                        test_files.append(os.path.join(test_dir, file_name))
    
    return test_files    

In [None]:
model, history = train_and_evaluate_optimized(
        typeData='Cross',
        batch_size=16,
        downsample_factor=0.5
    )