The data extraction was based on the provided code, with slight adjustments for where we decided to store the downloaded data:

In [69]:
from typing import Iterator

import numpy as np
import h5py
import os

DATA_PATH = os.path.abspath("../extracted_zip_in_here/Final Project data/")
INTRA_TRAIN_FOLDER = os.path.join(DATA_PATH, os.path.relpath("./Intra/train/"))

def get_dataset_name(filename_with_dir):
    filename_without_dir = os.path.basename(filename_with_dir)
    temp = filename_without_dir.split('.')[:-1]
    dataset_name = ''.join(temp)
    temp = dataset_name.split('_')[:-1]
    dataset_name = "_".join(temp)
    return dataset_name


def extract_data_from_folder_by_file(folder_path, shuffle=False):
    files = os.listdir(folder_path)
    if shuffle:
        np.random.shuffle(files)

    for file_name in files:
        
        filename_path = os.path.join(folder_path, file_name)
        
        with h5py.File(filename_path, 'r') as f:
            dataset_name = get_dataset_name(filename_path)
            matrix = f.get(dataset_name)[()]
            yield dataset_name, matrix

We first have to scale the data across different files in the same way, so we have to scan the files and find min max to perform the scaling

In [70]:
def learn_minmax_from_all_files(folder_path: str) -> tuple:
    # Placeholders
    min_val = None
    max_val = None

    for (name, data) in extract_data_from_folder_by_file(folder_path):
        data = data.T
        if min_val is None:
            min_val = np.min(data, axis=0)
            max_val = np.max(data, axis=0)
        else:
            # Update min_val and max_val
            min_val = np.minimum(min_val, np.min(data, axis=0))
            max_val = np.maximum(max_val, np.max(data, axis=0))
        
    return min_val, max_val

def scale_data(data: np.ndarray, min_val: np.ndarray, max_val: np.ndarray) -> np.ndarray:
    # Scale the data to the range [0, 1]
    return (data - min_val) / (max_val - min_val)

In [71]:
min_val, max_val = learn_minmax_from_all_files(INTRA_TRAIN_FOLDER)
print(f"Min values: {min_val.shape}, Max values: {max_val.shape}")

Min values: (248,), Max values: (248,)


Because of independent sampling, we can just sample each file independently and the same dropout should occur:

In [72]:
def downsample(data: np.array, factor: float) -> np.array:
    # Downsample with uniform chance of emission
    
    # Calculate the number of samples to keep
    num_samples = int(len(data) * factor)
    # Generate random indices to select samples
    indices = np.random.choice(len(data), num_samples, replace=False)
    # Select the samples using the random indices
    downsampled_data = data[indices]
    # Sort the indices to maintain the original order
    downsampled_data.sort()
    return downsampled_data

Here we can set the downsample factor for all sampling

In [73]:
DOWNSAMPLE_FACTOR = 0.0001

Here, we define the preprocessing steps that we apply to all data after reading it from the file:

In [74]:
preprocessing_pipeline = [
    lambda x: scale_data(x, min_val, max_val), 
    lambda x: downsample(x, DOWNSAMPLE_FACTOR)
]

We should also create labels based on the file names:

In [75]:
def generate_label(file_name:str) -> np.ndarray:
    # Return a one-hot encoded label based on the file name, there are4 classes
    # 0: rest
    if "rest_" in file_name:
        return np.array([1, 0, 0, 0])
    # 1: task_motor
    elif "task_motor_" in file_name:
        return np.array([0, 1, 0, 0])
    # 2: task_story_math
    elif "task_story_math_" in file_name:
        return np.array([0, 0, 1, 0])
    # 3: task_working_memory
    elif "task_working_memory_" in file_name:
        return np.array([0, 0, 0, 1])
    else:
        raise ValueError(f"Unknown file name: {file_name}")

To create batches by number of files, we can use a generator like this:

In [76]:
def create_batches(number_of_files_per_batch: int, preprocessing_pipeline: list = None, shuffle_files=False) -> Iterator[tuple]:
    batch_data = []
    batch_labels = []
    for n, (name, data) in enumerate(extract_data_from_folder_by_file(INTRA_TRAIN_FOLDER, shuffle=shuffle_files)):
        data = data.T
        if preprocessing_pipeline:
            for preprocessing_step in preprocessing_pipeline:
                data = preprocessing_step(data)
        # Add the preprocessed data to the batch
        batch_data.append(data)

        # Generate the label matrix of the length of the data for the current file
        label_vector = np.tile(generate_label(name), (data.shape[0], 1))
        batch_labels.append(label_vector)


        # Check if we have reached the desired batch size
        if (n + 1) % number_of_files_per_batch == 0:
            # Stack along the first axis (like a batch dimension)
            yield (np.concatenate(batch_data, axis=0), np.concatenate(batch_labels, axis=0))
            batch_data = []
            batch_labels = []

    # Optional: yield the remainder if not divisible
    if batch_data:
        yield (np.concatenate(batch_data, axis=0), np.concatenate(batch_labels, axis=0))

In [78]:
# Example usage
for data, labels in create_batches(number_of_files_per_batch=8, preprocessing_pipeline=preprocessing_pipeline, shuffle_files=False):
    print(data.shape, labels.shape)
    print(data, labels)

(24, 248) (24, 4)
[[0.12451467 0.136548   0.13877756 ... 0.77114324 0.77218858 0.98219193]
 [0.08579206 0.09775649 0.10059086 ... 0.77538048 0.77798728 0.98635244]
 [0.13041652 0.13491226 0.1398753  ... 0.77973122 0.78462014 0.97988468]
 ...
 [0.05455497 0.05735303 0.05835616 ... 0.94892235 0.94942445 0.95048357]
 [0.01126965 0.01480185 0.01596172 ... 0.97989852 0.98088233 0.98591771]
 [0.04490415 0.04547977 0.04612234 ... 0.96460717 0.96749138 0.97164311]] [[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]]
(24, 248) (24, 4)
[[0.05804587 0.06670499 0.06859928 ... 0.92570719 0.92955797 0.94059463]
 [0.04866248 0.05297468 0.05474345 ... 0.92477624 0.93052868 0.94982356]
 [0.05711845 0.06220372 0.06521095 ... 0.90516952 0.9056015  0.91499122]
 ...
 [0.04411501 0.04934978 0.05424

## Training

### Hyperparameters

In [67]:
EPOCHS = []
# TODO: which hyperparameters are we going to grid search?

### Model definition

### Trainingloop

## Testing