In [1]:
from typing import Iterator
from livelossplot import PlotLossesKeras

import numpy as np
import h5py
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

2025-06-04 13:11:21.472541: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-04 13:11:21.539884: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-04 13:11:21.591527: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749035481.642529   57031 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749035481.660780   57031 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749035481.756885   57031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [None]:
# Data locations and tunable settings.
# All folders are derived from the extracted project archive, resolved to an absolute path.
DATA_PATH = os.path.abspath("../extracted_zip_in_here/Final Project data/")
INTRA_TRAIN_FOLDER = os.path.join(DATA_PATH, "Intra", "train")
INTRA_TEST_FOLDER = os.path.join(DATA_PATH, "Intra", "test")
# Fraction of timesteps kept by the downsampling step of the preprocessing pipeline.
DOWNSAMPLE_FACTOR = 0.5
# FEATURES = 248
# TIMESTEPS = 3562
# CLASSES = 4

# Function definitions
def get_dataset_name(filename_with_dir):
    """Return the dataset name encoded in an h5 file path.

    The directory part and the file extension are removed, then everything from
    the last underscore onwards is dropped, e.g.
    ``/data/rest_105923_1.h5`` -> ``rest_105923``.

    Args:
        filename_with_dir: Path (or bare file name) of the h5 file.

    Returns:
        The dataset name, or an empty string when the stem contains no underscore.
    """
    filename_without_dir = os.path.basename(filename_with_dir)
    # splitext removes only the final extension, so dots inside the stem survive
    # (joining the split pieces with '' used to silently delete them).
    stem, _extension = os.path.splitext(filename_without_dir)
    # rpartition yields '' before the separator when there is no underscore at all.
    dataset_name, _separator, _suffix = stem.rpartition('_')
    return dataset_name

def extract_data_from_folder_by_file(folder_path, shuffle=False):
    """Lazily yield ``(dataset_name, matrix)`` for every h5 file in a folder.

    Files are visited in sorted name order so that runs are reproducible
    (``os.listdir`` order is arbitrary); entries that do not end in ``.h5`` are
    skipped. With ``shuffle=True`` the sorted list is shuffled in place using
    NumPy's global random state.

    Args:
        folder_path: Directory containing the h5 files.
        shuffle: Whether to visit the files in random order.

    Yields:
        Tuples of the dataset name (see ``get_dataset_name``) and the full
        dataset read into memory.

    Raises:
        KeyError: If a file does not contain a dataset with the expected name.
    """
    files = sorted(
        file_name for file_name in os.listdir(folder_path)
        if file_name.lower().endswith('.h5')
    )
    if shuffle:
        np.random.shuffle(files)

    for file_name in files:
        filename_path = os.path.join(folder_path, file_name)
        dataset_name = get_dataset_name(filename_path)

        with h5py.File(filename_path, 'r') as f:
            dataset = f.get(dataset_name)
            if dataset is None:
                raise KeyError(
                    f"Dataset '{dataset_name}' not found in file: {filename_path}"
                )
            # [()] reads the whole dataset into memory, so the file can be closed
            # before the value is handed to the consumer.
            matrix = dataset[()]

        yield dataset_name, matrix

def learn_minmax_from_all_files(folder_path: str) -> tuple:
    """Compute element-wise minima and maxima over every h5 file in a folder.

    Each matrix is transposed and reduced along its first axis; the per-file
    results are then combined element-wise across files.

    Returns:
        ``(min_val, max_val)``; both are ``None`` when the folder yields no files.
    """
    running_min = None
    running_max = None

    for _name, matrix in extract_data_from_folder_by_file(folder_path):
        transposed = matrix.T
        file_min = np.min(transposed, axis=0)
        file_max = np.max(transposed, axis=0)

        if running_min is None:
            # First file: its bounds become the starting point.
            running_min, running_max = file_min, file_max
            continue

        # Later files can only widen the bounds.
        running_min = np.minimum(running_min, file_min)
        running_max = np.maximum(running_max, file_max)

    return running_min, running_max

def generate_label(file_name:str) -> np.ndarray:
    """Return the one-hot label (4 classes) matching a dataset/file name.

    Class order: 0 = rest, 1 = task_motor, 2 = task_story_math,
    3 = task_working_memory. The first marker found in ``file_name`` wins.

    Raises:
        ValueError: If none of the markers occurs in ``file_name``.
    """
    # Order matters: the position of a marker is its class index.
    class_markers = ("rest_", "task_motor_", "task_story_math_", "task_working_memory_")

    for class_index, marker in enumerate(class_markers):
        if marker in file_name:
            one_hot = [0] * len(class_markers)
            one_hot[class_index] = 1
            return np.array(one_hot)

    raise ValueError(f"Unknown file name: {file_name}")

def create_batches(folder, number_of_files_per_batch: int, preprocessing_pipeline: list = None, shuffle_files=False) -> Iterator[tuple]:
    """Group the files of a folder into batches of preprocessed samples and labels.

    Every matrix is transposed, passed through each callable of
    ``preprocessing_pipeline`` in order (if any), and paired with the one-hot
    label derived from its dataset name.

    Yields:
        ``(samples, labels)`` as two plain Python lists (nothing is stacked
        here). A final, smaller batch is yielded when the number of files is not
        a multiple of ``number_of_files_per_batch``.
    """
    steps = preprocessing_pipeline or ()
    samples, labels = [], []

    file_stream = extract_data_from_folder_by_file(folder, shuffle=shuffle_files)
    for files_seen, (name, matrix) in enumerate(file_stream, start=1):
        sample = matrix.T
        for step in steps:
            sample = step(sample)

        samples.append(sample)
        labels.append(generate_label(name))

        # Emit a batch every `number_of_files_per_batch` files and start afresh.
        if files_seen % number_of_files_per_batch == 0:
            yield (samples, labels)
            samples, labels = [], []

    # Leftover files that did not fill a complete batch.
    if samples:
        yield (samples, labels)

def evaluate_scores(model, folder_to_evaluate, batch_size=8, pipeline=None):
    """Evaluate a compiled model on every file of a folder, batch by batch.

    Args:
        model: Object exposing ``evaluate(data, labels)`` that returns
            ``(loss, accuracy)`` for the given batch.
        folder_to_evaluate: Folder of h5 files to score.
        batch_size: Number of files per evaluation batch.
        pipeline: Preprocessing callables applied to each sample; defaults to the
            module-level ``preprocessing_pipeline``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over all samples. Each batch is
        weighted by its size so a smaller final batch does not skew the result.
        Both values are NaN when the folder yields no batches.
    """
    if pipeline is None:
        pipeline = preprocessing_pipeline

    weighted_loss = 0.0
    weighted_accuracy = 0.0
    samples_seen = 0

    for batch_X_list, batch_y_list in create_batches(folder=folder_to_evaluate, number_of_files_per_batch=batch_size, preprocessing_pipeline=pipeline, shuffle_files=False):
        # Convert the list of arrays to a single batch array
        data = np.array(batch_X_list)
        labels = np.array(batch_y_list)

        loss, accuracy = model.evaluate(data, labels)

        batch_count = len(batch_X_list)
        weighted_loss += loss * batch_count
        weighted_accuracy += accuracy * batch_count
        samples_seen += batch_count

    if samples_seen == 0:
        return float('nan'), float('nan')

    return weighted_loss / samples_seen, weighted_accuracy / samples_seen

# Helper functions
def scale_data(data: np.ndarray, min_val: np.ndarray, max_val: np.ndarray) -> np.ndarray:
    """Min-max scale ``data`` using the given bounds.

    Values equal to ``min_val`` map to 0 and values equal to ``max_val`` map to 1;
    data outside the bounds falls outside [0, 1]. Where ``max_val == min_val``
    the divisor is replaced by 1, so those entries become ``data - min_val``
    instead of NaN/inf.
    """
    value_range = np.asarray(max_val - min_val)
    # Guard against division by zero where the bounds coincide.
    safe_range = np.where(value_range == 0, 1, value_range)
    return (data - min_val) / safe_range

def downsample(data: np.ndarray, factor: float) -> np.ndarray:
    """
    Downsample time series data by uniformly selecting samples at fixed intervals
    to keep the temporal order intact.

    Args:
        data (np.ndarray): Input time series data (1D or 2D with time dimension as first axis)
        factor (float): Downsampling factor (e.g., 0.5 means keep half the samples)

    Returns:
        np.ndarray: ``int(len(data) * factor)`` samples taken along the first axis.
        The result is empty when that count is zero.

    Raises:
        ValueError: If ``factor`` is not positive.
    """
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor}")

    total = len(data)
    num_samples = int(total * factor)
    if num_samples == 0:
        # Nothing to keep (empty input or very small factor); this used to
        # divide by zero when computing the stride.
        return data[:0]

    # Integer arithmetic gives floor(i * total / num_samples) exactly, with no
    # floating-point rounding, and never reaches index `total`.
    indices = (np.arange(num_samples) * total) // num_samples
    return data[indices]

# Learn the scaling bounds from the training folder and report their shapes.
min_val, max_val = learn_minmax_from_all_files(INTRA_TRAIN_FOLDER)
print(f"Min values: {min_val.shape}, Max values: {max_val.shape}")


def _scale_step(x):
    """Pipeline step: min-max scale a sample with the bounds learned above."""
    return scale_data(x, min_val, max_val)


def _downsample_step(x):
    """Pipeline step: keep DOWNSAMPLE_FACTOR of the samples along the first axis."""
    return downsample(x, DOWNSAMPLE_FACTOR)


# Steps are applied to every sample in this order.
preprocessing_pipeline = [_scale_step, _downsample_step]

# # Example usage
# for data_batch, labels_batch in create_batches(folder=INTRA_TRAIN_FOLDER, number_of_files_per_batch=8, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=False):
    
#     for data, label in zip(data_batch, labels_batch):
#         print(f"Data shape: {data.shape}, Label: {label}")

# Actual code
def create_model(timesteps=None, features=248, num_classes=4) -> Sequential:
    """Build and compile the LSTM classifier.

    Architecture: LSTM(64) -> Dense(64, relu) -> Dense(num_classes, softmax),
    compiled with categorical cross-entropy (one-hot labels), Adam and accuracy.

    Args:
        timesteps: Sequence length of the inputs. ``None`` (default) accepts any
            length, so the model no longer has to be edited when the
            downsampling factor changes; pass an int (the value used to be
            hard-coded to 3562) to pin the length.
        features: Number of values per timestep.
        num_classes: Size of the softmax output.

    Returns:
        The compiled ``Sequential`` model.
    """
    lstm_classifier = Sequential([
        LSTM(64, return_sequences=False, input_shape=(timesteps, features)),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
        ])

    lstm_classifier.compile(
        loss=CategoricalCrossentropy(),  # works directly with one-hot encoded labels
        optimizer=Adam(),
        metrics=['accuracy']
    )

    return lstm_classifier


def train_model(model, epochs=10, batch_size=8, verbose=1):
    """Train ``model`` file-batch by file-batch on the Intra training folder.

    For every epoch the training files are streamed in shuffled order through
    ``create_batches`` using the module-level ``preprocessing_pipeline``. Each
    batch is shuffled once more, fitted with a fresh ``PlotLossesKeras``
    callback and then re-evaluated on the same batch; the result is printed when
    ``verbose`` is truthy.

    Returns:
        The same ``model`` object after training.
    """
    for epoch in range(epochs):
        if verbose:
            print(f"Epoch: {epoch}")

        batch_stream = create_batches(
            folder=INTRA_TRAIN_FOLDER,
            number_of_files_per_batch=batch_size,
            preprocessing_pipeline=preprocessing_pipeline,
            shuffle_files=True,
        )
        for batch, (batch_X_list, batch_y_list) in enumerate(batch_stream):
            # Turn the per-file lists into batch arrays.
            data, labels = np.array(batch_X_list), np.array(batch_y_list)

            # Apply one random permutation to samples and labels alike.
            order = np.arange(data.shape[0])
            np.random.shuffle(order)
            data, labels = data[order], labels[order]

            model.fit(data, labels, callbacks = [PlotLossesKeras()])

            # Score the batch that was just used for fitting.
            loss, accuracy = model.evaluate(data, labels)
            if verbose:
                print(f"Batch: {batch}, Loss: {loss}, Accuracy: {accuracy}")

    return model

# Build a fresh classifier and train it on the Intra training folder.
lstm_classifier = create_model()
trained_lstm_classifier = train_model(lstm_classifier, epochs=10, batch_size=8, verbose=1)

# Report averaged loss/accuracy: first on the training folder, then on the test folder.
for folder_to_score in (INTRA_TRAIN_FOLDER, INTRA_TEST_FOLDER):
    loss, accuracy = evaluate_scores(trained_lstm_classifier, folder_to_score)
    print(f"Loss: {loss}, Accuracy: {accuracy}")

# Note: this file-by-file approach is memory efficient, but it takes a very long time to run.

In [None]:
import os
import h5py
import numpy as np
from tensorflow.keras import models, layers, mixed_precision
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from meg_preprocessing import preprocess_meg
from downsample import downsample
from read_data import get_dataset_name_train, load_split_files, VALID_TASK_TYPES

# Enable mixed precision training: install 'mixed_float16' as the global Keras dtype
# policy, so layers built after this point use it unless they set their own dtype
# (create_efficient_model below pins its output layer to float32).
# NOTE(review): the policy is global for the kernel session, so it also applies to any
# other model built after this cell runs — confirm that is intended.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

class MEGDataGenerator:
    """Endless batch generator that loads, preprocesses and pads MEG files on demand.

    Iterating yields ``(X_batch, y_batch)`` forever, cycling over ``file_paths``
    in order. ``X_batch`` is float32 with shape
    ``(samples, rows, target_length, 1)``, where ``rows`` is the first dimension
    of the first processed array in the batch and the second dimension is
    zero-padded or truncated to ``target_length``. ``y_batch`` is whatever
    ``label_encoder.transform`` returns for the batch labels.

    Args:
        file_paths: Paths understood by ``load_split_files``.
        task_types: One label per entry of ``file_paths``.
        label_encoder: Object with a ``transform(labels)`` method.
        batch_size: Number of files (not arrays) loaded per batch.
        downsample_factor: Passed to ``downsample`` when smaller than 1.0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or the number of paths
            and labels differ.
    """

    def __init__(self, file_paths, task_types, label_encoder, batch_size=16, downsample_factor=0.5):
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
        if len(file_paths) != len(task_types):
            # zip() would silently drop the surplus and could misalign labels.
            raise ValueError(
                f"Got {len(file_paths)} file paths but {len(task_types)} task types"
            )
        self.file_paths = file_paths
        self.task_types = task_types
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.downsample_factor = downsample_factor
        self.target_length = self._determine_target_length()

    def _load_processed(self, path):
        """Load every array stored for ``path``, preprocess and optionally downsample it."""
        processed_arrays = []
        for data in load_split_files(path).values():
            processed = preprocess_meg(data)
            if self.downsample_factor < 1.0:
                processed = downsample(processed, self.downsample_factor)
            processed_arrays.append(processed)
        return processed_arrays

    def _determine_target_length(self):
        """Return the longest second dimension among the first (up to) 10 files."""
        max_length = 0
        for path in self.file_paths[:10]:
            for processed in self._load_processed(path):
                max_length = max(max_length, processed.shape[1])
        return max_length

    def __len__(self):
        """Number of batches in one pass over the files."""
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def __iter__(self):
        """Yield ``(X_batch, y_batch)`` endlessly.

        Raises:
            ValueError: If a complete pass over the files produces no samples
                (previously this case looped forever without yielding).
        """
        while True:
            produced_any = False
            for start in range(0, len(self.file_paths), self.batch_size):
                batch_paths = self.file_paths[start:start + self.batch_size]
                batch_types = self.task_types[start:start + self.batch_size]
                batch_data = []
                batch_labels = []

                for path, task_type in zip(batch_paths, batch_types):
                    for processed in self._load_processed(path):
                        batch_data.append(processed)
                        batch_labels.append(task_type)

                if not batch_data:
                    # None of the files in this slice contained any data.
                    continue

                # Allocate directly in float32 instead of building a float64
                # array and casting it afterwards.
                X_batch = np.zeros(
                    (len(batch_data), batch_data[0].shape[0], self.target_length),
                    dtype=np.float32,
                )
                for j, d in enumerate(batch_data):
                    # Longer arrays are truncated, shorter ones stay zero-padded.
                    length = min(d.shape[1], self.target_length)
                    X_batch[j, :, :length] = d[:, :length]

                X_batch = np.expand_dims(X_batch, axis=-1)
                y_batch = self.label_encoder.transform(batch_labels)

                produced_any = True
                yield X_batch, y_batch

            if not produced_any:
                raise ValueError("MEGDataGenerator has no data to yield")

def create_efficient_model(input_shape, num_classes):
    """Memory-optimized CNN-LSTM model with mixed precision.

    Two Conv1D/BatchNorm/MaxPooling/Dropout blocks feed a single LSTM and a small
    dense head. The output layer is forced to float32.

    Args:
        input_shape: Per-sample shape without the batch axis. Either
            ``(steps, features)`` or ``(steps, features, 1)``; a trailing
            singleton axis is squeezed away because Conv1D, MaxPooling1D and LSTM
            all operate on rank-3 batches.
        num_classes: Size of the softmax output.

    Returns:
        The compiled model (Adam, sparse categorical cross-entropy, accuracy).

    Raises:
        ValueError: If ``input_shape`` is neither of the two supported forms.

    NOTE(review): MEGDataGenerator in this file yields batches shaped
    (batch, d.shape[0], target_length, 1); after the squeeze Conv1D slides along
    the first of those per-sample axes. Confirm that this is the axis the
    convolution is meant to run over.
    """
    input_shape = tuple(input_shape)

    # Explicit Input layer instead of the deprecated `input_shape=` layer kwarg.
    stack = [layers.Input(shape=input_shape)]
    if len(input_shape) == 3 and input_shape[-1] == 1:
        stack.append(layers.Reshape(input_shape[:-1]))
    elif len(input_shape) != 2:
        raise ValueError(
            "input_shape must be (steps, features) or (steps, features, 1), "
            f"got {input_shape}"
        )

    stack += [
        # First conv block with reduced filters
        layers.Conv1D(32, kernel_size=5, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(pool_size=4),  # Increased pooling
        layers.Dropout(0.3),

        # Second conv block with reduced filters
        layers.Conv1D(64, kernel_size=3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.3),

        # Single LSTM layer with reduced units
        layers.LSTM(32, return_sequences=False),
        layers.Dropout(0.3),

        # Smaller dense layers
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax', dtype='float32')
    ]
    model = models.Sequential(stack)

    model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

    return model

def train_and_evaluate_optimized(typeData='Cross', batch_size=16, downsample_factor=0.5, epochs=20):
    """Train the CNN-LSTM on the training files and report test-set metrics.

    Only file paths and labels are collected up front; the data itself is loaded
    batch by batch through ``MEGDataGenerator``.

    Args:
        typeData: Dataset variant passed to ``get_dataset_name_train`` and
            ``find_test_files``.
        batch_size: Number of files per generator batch.
        downsample_factor: Passed to the data generators.
        epochs: Number of training epochs.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.

    Raises:
        ValueError: If no training files or no labelled test files are found.

    NOTE(review): the test files are also used as validation data during
    training, so the validation curves are not independent of the final report.
    """
    # Load file paths and labels without loading actual data
    print("Collecting training file paths...")
    train_files = []
    train_labels = []

    for task_type in VALID_TASK_TYPES:
        files = get_dataset_name_train(file_name="all", taskType=task_type, typeData=typeData)
        if files:
            train_files.extend(files)
            train_labels.extend([task_type] * len(files))

    if not train_files:
        raise ValueError(f"No training files found for typeData={typeData!r}")

    # The files were collected one task type at a time; mix them so that a batch
    # is not made up of a single class.
    order = np.random.permutation(len(train_files))
    train_files = [train_files[i] for i in order]
    train_labels = [train_labels[i] for i in order]

    print("Collecting test file paths...")
    test_files = []
    test_labels = []

    for file_path in find_test_files(typeData):
        file_name = os.path.basename(file_path)
        task_type = next(
            (valid_type for valid_type in VALID_TASK_TYPES if file_name.startswith(valid_type)),
            None,
        )
        if task_type is None:
            # Keep paths and labels aligned: a file without a label is dropped
            # instead of shifting every following label by one.
            print(f"Skipping test file with unknown task type: {file_path}")
            continue
        test_files.append(file_path)
        test_labels.append(task_type)

    if not test_files:
        raise ValueError(f"No labelled test files found for typeData={typeData!r}")

    # Initialize label encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(train_labels + test_labels)
    num_classes = len(label_encoder.classes_)

    # Create data generators
    train_gen = MEGDataGenerator(train_files, train_labels, label_encoder,
                               batch_size=batch_size, downsample_factor=downsample_factor)
    test_gen = MEGDataGenerator(test_files, test_labels, label_encoder,
                               batch_size=batch_size, downsample_factor=downsample_factor)

    # Determine input shape from a sample batch
    sample_X, _ = next(iter(train_gen))
    input_shape = sample_X.shape[1:]

    print("Creating optimized model...")
    model = create_efficient_model(input_shape, num_classes)
    model.summary()

    print("Training model...")
    history = model.fit(iter(train_gen),
                       steps_per_epoch=len(train_gen),
                       epochs=epochs,
                       validation_data=iter(test_gen),
                       validation_steps=len(test_gen),
                       verbose=1)

    # Evaluate on test set
    print("Evaluating model...")
    y_true = []
    y_pred = []

    # One iterator for the whole pass: creating a new one per step would restart
    # the generator and score the first batch over and over.
    test_batches = iter(test_gen)
    for _ in range(len(test_gen)):
        X_batch, y_batch = next(test_batches)
        y_true.extend(y_batch)
        y_pred.extend(np.argmax(model.predict(X_batch, verbose=0), axis=1))

    print("\nClassification Report:")
    # Passing `labels` keeps the report aligned with `target_names` even when a
    # class does not occur in the test set.
    print(classification_report(y_true, y_pred,
                              labels=np.arange(num_classes),
                              target_names=label_encoder.classes_))

    return model, history

# Helper functions for locating the test files on disk
def find_test_files(typeData, data_root=None):
    """Find all ``.h5`` files inside the test directories of a dataset variant.

    Every sub-directory of ``<data_root>/<typeData>`` whose name contains
    "test" (case-insensitive) is searched. Directories and files are visited in
    sorted order so the result is the same on every run and machine.

    Args:
        typeData: Name of the dataset variant folder (e.g. ``'Cross'``).
        data_root: Folder containing the variant folders; defaults to
            ``../extracted_zip_in_here/Final Project data``.

    Returns:
        List of file paths, ordered by directory name and then file name.
    """
    if data_root is None:
        data_root = os.path.join("..", "extracted_zip_in_here", "Final Project data")
    base_dir = os.path.join(data_root, typeData)

    test_files = []
    for dir_name in sorted(os.listdir(base_dir)):
        test_dir = os.path.join(base_dir, dir_name)
        # Skip entries that are not test directories (e.g. train folders, stray files).
        if 'test' not in dir_name.lower() or not os.path.isdir(test_dir):
            continue
        test_files.extend(
            os.path.join(test_dir, file_name)
            for file_name in sorted(os.listdir(test_dir))
            if file_name.endswith('.h5')
        )

    return test_files

In [None]:
# Run the full pipeline on the 'Cross' data and keep the trained model and its history.
model, history = train_and_evaluate_optimized(typeData='Cross', batch_size=16, downsample_factor=0.5)