The data extraction was based on the provided code, with slight adjustments for where we decided to store the downloaded data:

In [158]:
from typing import Iterator

import numpy as np
import h5py
import os

# Root folder of the extracted project data, resolved against the current working directory.
DATA_PATH = os.path.abspath("../extracted_zip_in_here/Final Project data/")
# The "Intra/train" sub-folder below the data root.
INTRA_TRAIN_FOLDER = os.path.join(DATA_PATH, "Intra", "train")

def get_dataset_name(filename_with_dir):
    """Derive the dataset name from a file path.

    The directory part and the file extension are dropped, then the last
    underscore-separated token is removed as well, e.g.
    ``"some/dir/rest_105923_1.h5"`` becomes ``"rest_105923"``.

    Args:
        filename_with_dir: Path (or bare file name) of the data file.

    Returns:
        str: The derived name; empty if the file name has no dot or the
        remaining stem has no underscore.
    """
    base_name = os.path.basename(filename_with_dir)
    # Keep everything before the last dot and drop any dots that remain.
    stem = base_name.rpartition('.')[0].replace('.', '')
    # Keep everything before the last underscore ('' when there is none).
    return stem.rpartition('_')[0]


def extract_data_from_folder_by_file(folder_path, shuffle=False):
    """Yield ``(dataset_name, matrix)`` for every file in a folder.

    Each file is opened with h5py and the dataset whose name is derived from
    the file name (see ``get_dataset_name``) is read fully into memory.

    Args:
        folder_path: Directory containing the HDF5 files.
        shuffle: If True, visit the files in a random order drawn from
            NumPy's global random state; otherwise visit them in sorted
            file-name order.

    Yields:
        tuple: ``(dataset_name, matrix)`` where ``matrix`` is the dataset's
        content as returned by ``dataset[()]``.

    Raises:
        KeyError: If a file does not contain the dataset its name implies.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # non-shuffled iteration order reproducible across runs and platforms.
    files = sorted(os.listdir(folder_path))
    if shuffle:
        np.random.shuffle(files)

    for file_name in files:
        filename_path = os.path.join(folder_path, file_name)
        dataset_name = get_dataset_name(filename_path)

        with h5py.File(filename_path, 'r') as f:
            dataset = f.get(dataset_name)
            if dataset is None:
                raise KeyError(
                    f"Dataset '{dataset_name}' not found in file '{filename_path}'"
                )
            matrix = dataset[()]

        # Yield only after the file is closed so no handle stays open while
        # the consumer processes the data.
        yield dataset_name, matrix

We first have to scale the data in the same way across all files, so we scan the files to find the per-feature minimum and maximum that are then used for min-max scaling:

In [159]:
def learn_minmax_from_all_files(folder_path: str) -> tuple:
    """Scan every file in a folder and collect the overall column-wise extrema.

    Each file's matrix is transposed first, then its minimum and maximum along
    axis 0 are folded into running element-wise extrema.

    Args:
        folder_path (str): Directory passed on to ``extract_data_from_folder_by_file``.

    Returns:
        tuple: ``(min_val, max_val)``; both are ``None`` if no file was read.
    """
    running_min = running_max = None

    for _, matrix in extract_data_from_folder_by_file(folder_path):
        samples = matrix.T
        file_min = np.min(samples, axis=0)
        file_max = np.max(samples, axis=0)

        if running_min is None:
            # First file: its extrema seed the running values.
            running_min, running_max = file_min, file_max
            continue

        running_min = np.minimum(running_min, file_min)
        running_max = np.maximum(running_max, file_max)

    return running_min, running_max

def scale_data(data: np.ndarray, min_val: np.ndarray, max_val: np.ndarray) -> np.ndarray:
    """Min-max scale ``data`` to the range [0, 1] using the given extrema.

    Args:
        data (np.ndarray): Values to scale.
        min_val (np.ndarray): Minimum values, broadcastable against ``data``.
        max_val (np.ndarray): Maximum values, broadcastable against ``data``.

    Returns:
        np.ndarray: ``(data - min_val) / (max_val - min_val)``. Where
        ``max_val == min_val`` the divisor is replaced by 1, so such entries
        are returned as ``data - min_val`` instead of NaN/inf.
    """
    value_range = np.asarray(max_val - min_val)
    # A zero range would divide by zero; substitute 1 for those entries.
    safe_range = np.where(value_range == 0, 1, value_range)
    return (data - min_val) / safe_range

In [160]:
# Learn the scaling extrema once from the training folder; the preprocessing
# pipeline reads these module-level names later.
min_val, max_val = learn_minmax_from_all_files(INTRA_TRAIN_FOLDER)
# Note: this prints the *shapes* of the two arrays, not the values themselves.
print(f"Min values: {min_val.shape}, Max values: {max_val.shape}")

Min values: (248,), Max values: (248,)


Because the downsampling picks time steps at fixed intervals, each file can be downsampled independently, and files of the same length will have the same time steps dropped:

In [161]:
def downsample(data: np.ndarray, factor: float) -> np.ndarray:
    """
    Downsample time series data by uniformly selecting samples at fixed intervals
    to keep the temporal order intact.

    Args:
        data (np.ndarray): Input time series data (1D or 2D with time dimension as first axis)
        factor (float): Downsampling factor (e.g., 0.5 means keep half the samples)

    Returns:
        np.ndarray: Downsampled data with ``int(len(data) * factor)`` timesteps.
        If that count is zero or negative, an empty slice of ``data`` is
        returned (the trailing dimensions are preserved).
    """
    total = len(data)
    num_samples = int(total * factor)
    if num_samples <= 0:
        # Nothing to keep; also avoids dividing by zero below.
        return data[:0]
    # floor(i * total / num_samples), computed with integer arithmetic so the
    # indices are exact and cannot be shifted by floating-point rounding.
    indices = (np.arange(num_samples) * total) // num_samples
    return data[indices]

Here we set the downsampling factor that is used for all files:

In [162]:
# Factor handed to downsample() by the preprocessing pipeline (0.1 keeps a tenth of the samples).
DOWNSAMPLE_FACTOR = 0.1

Here, we define the preprocessing steps that we apply to all data after reading it from the file:

In [163]:
# Preprocessing steps; create_batches applies them to each file's data in list order.
# The lambdas look up min_val, max_val and DOWNSAMPLE_FACTOR when they are called,
# so those module-level names must be defined before the pipeline is run.
preprocessing_pipeline = [
    lambda x: scale_data(x, min_val, max_val), 
    lambda x: downsample(x, DOWNSAMPLE_FACTOR)
]

We should also create labels based on the file names:

In [164]:
def generate_label(file_name: str) -> np.ndarray:
    """Return the one-hot class label encoded in a file name.

    There are 4 classes, checked in this order (the first marker found
    anywhere in ``file_name`` wins):
    0: rest, 1: task_motor, 2: task_story_math, 3: task_working_memory.

    Args:
        file_name (str): Name to inspect for one of the class markers.

    Returns:
        np.ndarray: Length-4 integer vector with a single 1 at the class index.

    Raises:
        ValueError: If none of the class markers occurs in ``file_name``.
    """
    class_markers = (
        "rest_",
        "task_motor_",
        "task_story_math_",
        "task_working_memory_",
    )
    for class_index, marker in enumerate(class_markers):
        if marker in file_name:
            one_hot = [0] * len(class_markers)
            one_hot[class_index] = 1
            return np.array(one_hot)
    raise ValueError(f"Unknown file name: {file_name}")

To create batches by number of files, we can use a generator like this:

In [165]:
def create_batches(number_of_files_per_batch: int, preprocessing_pipeline: list = None, shuffle_files=False, folder_path: str = None) -> Iterator[tuple]:
    """Yield batches of preprocessed file data together with their labels.

    Args:
        number_of_files_per_batch (int): Number of files grouped into one batch.
        preprocessing_pipeline (list): Optional callables applied, in order, to
            each file's transposed data.
        shuffle_files (bool): Passed on as ``shuffle`` to
            ``extract_data_from_folder_by_file``.
        folder_path (str): Folder to read the files from. Defaults to
            ``INTRA_TRAIN_FOLDER`` when omitted, which is the folder this
            function always used before the parameter existed.

    Yields:
        tuple: ``(batch_data, batch_labels)`` as two parallel Python lists
        (one array and one one-hot label per file). The lists are NOT stacked
        here. The final batch may hold fewer files than requested.
    """
    if folder_path is None:
        folder_path = INTRA_TRAIN_FOLDER
    steps = preprocessing_pipeline or ()

    batch_data = []
    batch_labels = []
    file_stream = extract_data_from_folder_by_file(folder_path, shuffle=shuffle_files)
    for n, (name, data) in enumerate(file_stream):
        # Transpose so the first axis is the one the preprocessing steps index
        # (presumably time; the example output shows (3562, 248) per file).
        data = data.T
        for preprocessing_step in steps:
            data = preprocessing_step(data)
        batch_data.append(data)

        # One one-hot label per file, derived from the dataset name.
        batch_labels.append(generate_label(name))

        # Emit the batch once the requested number of files was collected.
        if (n + 1) % number_of_files_per_batch == 0:
            yield (batch_data, batch_labels)
            batch_data = []
            batch_labels = []

    # Yield the remainder if the file count is not divisible by the batch size.
    if batch_data:
        yield (batch_data, batch_labels)

In [166]:
# Example usage
# Walk the training files in batches of 8 (unshuffled) and print, per file,
# the shape of the preprocessed data and its one-hot label.
for data_batch, labels_batch in create_batches(number_of_files_per_batch=8, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=False):
    
    # data_batch and labels_batch are parallel lists with one entry per file.
    for data, label in zip(data_batch, labels_batch):
        print(f"Data shape: {data.shape}, Label: {label}")



Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [1 0 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 1 0 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0 0 1 0]
Data shape: (3562, 248), Label: [0

## Training

### Hyperparameters

In [167]:
# Placeholder for hyperparameter candidates; still empty and not read by any
# of the cells visible here (train_model receives its epoch count as an argument).
EPOCHS = []
# TODO: which hyperparameters are we going to grid search?

### Model definition

In [168]:
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# Shape constants of the model input/output. They must match the data that the
# preprocessing produces: 248 values per timestep and 3562 timesteps per file
# (the shape printed by the example batch loop), and 4 one-hot classes.
# TIMESTEPS has to be updated if the downsampling factor changes.
FEATURES = 248
TIMESTEPS = 3562
CLASSES = 4


# A single LSTM layer that returns only its last output, followed by a small
# dense head with a softmax over the classes. The input shape is declared
# with an explicit Input layer instead of `input_shape=` on the LSTM, which
# Keras warns about for Sequential models.
lstm_classifier = Sequential([
    Input(shape=(TIMESTEPS, FEATURES)),
    LSTM(64, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(CLASSES, activation='softmax')
])

lstm_classifier.compile(
    loss=CategoricalCrossentropy(),  # works directly with one-hot encoded labels
    optimizer=Adam(),
    metrics=['accuracy']
)




  super().__init__(**kwargs)


### Training loop

In [169]:
def train_model(model, epochs=10, batch_size=8, verbose=1):
    """Train ``model`` on the training files, one file batch at a time.

    For every epoch the files are read in shuffled order through
    ``create_batches``; each batch is stacked into arrays, shuffled, fitted
    with one ``model.fit`` call and then evaluated on the same batch.

    Args:
        model: Compiled Keras model; it is trained in place.
        epochs (int): Number of passes over all files.
        batch_size (int): Number of files per batch.
        verbose: If truthy, print epoch and per-batch progress and let Keras
            show its own progress output; if falsy, both are silenced.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Forward the flag to Keras so that verbose=0 really is quiet; "auto" is
    # the Keras default and keeps the previous output when verbose is on.
    keras_verbose = "auto" if verbose else 0

    for epoch in range(epochs):
        if verbose:
            print(f"Epoch: {epoch}")
        batches = create_batches(number_of_files_per_batch=batch_size, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=True)
        for batch, (batch_X_list, batch_y_list) in enumerate(batches):
            # Convert the lists of per-file arrays/labels to stacked numpy arrays
            data = np.array(batch_X_list)
            labels = np.array(batch_y_list)

            # Shuffle the data and labels together
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)
            data = data[indices]
            labels = labels[indices]

            # Train the model on this batch
            model.fit(data, labels, verbose=keras_verbose)

            # Evaluate on the batch that was just trained on (a training score)
            loss, accuracy = model.evaluate(data, labels, verbose=keras_verbose)
            if verbose:
                print(f"Batch: {batch}, Loss: {loss}, Accuracy: {accuracy}")

    return model

## Training

In [170]:
# train_model returns the model object it was given, so trained_lstm_classifier
# and lstm_classifier refer to the same (now trained) model.
trained_lstm_classifier = train_model(lstm_classifier, epochs=10, batch_size=8, verbose=1)

Epoch: 0
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step - accuracy: 0.1250 - loss: 1.4172
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step - accuracy: 0.6250 - loss: 1.0185
Batch: 0, Loss: 1.0184794664382935, Accuracy: 0.625
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 723ms/step - accuracy: 0.5000 - loss: 1.3216
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 240ms/step - accuracy: 0.6250 - loss: 1.1916
Batch: 1, Loss: 1.1916171312332153, Accuracy: 0.625
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 652ms/step - accuracy: 0.7500 - loss: 1.0934
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 230ms/step - accuracy: 0.8750 - loss: 0.9300
Batch: 2, Loss: 0.9300388693809509, Accuracy: 0.875
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 645ms/step - accuracy: 0.5000 - loss: 1.2694
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 216ms/step - accuracy: 0.5000 

In [174]:
def evaluate_training_scores(model):
    """Evaluate ``model`` on all training files and return the mean scores.

    The files are read unshuffled in batches of 8 through ``create_batches``
    and each batch is passed to ``model.evaluate``. The per-batch results are
    averaged weighted by the number of files in each batch, so a smaller
    final batch does not get the same weight as a full one.

    Args:
        model: Compiled Keras model whose ``evaluate`` returns
            ``(loss, accuracy)``.

    Returns:
        tuple: ``(mean_loss, mean_accuracy)``; both are NaN if no batch was
        produced.
    """
    losses = []
    accuracies = []
    batch_sizes = []
    for batch_X_list, batch_y_list in create_batches(number_of_files_per_batch=8, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=False):
        # Convert the lists of per-file arrays/labels to stacked numpy arrays
        data = np.array(batch_X_list)
        labels = np.array(batch_y_list)

        # Evaluate the model
        loss, accuracy = model.evaluate(data, labels)
        losses.append(loss)
        accuracies.append(accuracy)
        batch_sizes.append(len(labels))

    if not batch_sizes:
        # No data at all: report NaN rather than failing on zero total weight.
        return np.nan, np.nan

    mean_loss = np.average(losses, weights=batch_sizes)
    mean_accuracy = np.average(accuracies, weights=batch_sizes)
    return mean_loss, mean_accuracy

In [175]:
# Score the trained model on the same training files it was fitted on
# (this is a training score, not a held-out test score).
loss, accuracy = evaluate_training_scores(trained_lstm_classifier)
print(f"Loss: {loss}, Accuracy: {accuracy}")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 228ms/step - accuracy: 1.0000 - loss: 0.0123
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 250ms/step - accuracy: 1.0000 - loss: 0.0214
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 249ms/step - accuracy: 1.0000 - loss: 0.0181
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 227ms/step - accuracy: 1.0000 - loss: 0.1226
Loss: 0.043595935218036175, Accuracy: 1.0


## Testing