In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences # to pad sequences

from sklearn.preprocessing import StandardScaler # to normalise data
from sklearn.metrics import mean_squared_error

2023-05-01 19:40:05.284200: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-01 19:40:05.340895: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Show which directory the kernel is running from. The relative "data/..."
# and "results/..." paths used further down are resolved against it, so it
# should be the repository root (.../EPiC-challenge).
cwd = os.getcwd()
print(f"{cwd}")

# If the printed path is wrong, point DIR at the repository root and
# uncomment the two lines below.
# DIR = "set working directory here"
# os.chdir(DIR)

/home/jc2822/EPiC-challenge


In [None]:
# Print "name==version" for every global object that exposes a truthy
# __version__ attribute (in practice, the imported packages).
versioned_objects = [obj for obj in globals().values() if getattr(obj, '__version__', None)]
print('\n'.join(f'{obj.__name__}=={obj.__version__}' for obj in versioned_objects))

# Versions recorded when this notebook was last run:
# pandas==2.0.0
# numpy==1.23.5
# tensorflow==2.12.0
# keras.api._v2.keras==2.12.0

In [3]:
#========================#
# LOAD DATA              #
#========================#
#
# Variable Names:
# train_data
# test_data
#

# load each fold as dictionary
def load_data_with_folds(scenario_dir_path, dataset_type):
    """Load one split of every fold in a scenario directory.

    For each fold directory, every physiology CSV is paired with the
    annotation CSV of the same file name, the two are outer-merged on the
    ``time`` column, and the merged frames are stacked in sorted file order.

    Parameters
    ----------
    scenario_dir_path : str or Path
        Directory whose sub-directories are the folds.
    dataset_type : str
        Sub-directory of each fold to read (the notebook uses "train" and
        "test"); it must contain ``physiology/`` and ``annotations/``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Fold directory name -> merged frame. A ``subject`` column holds the
        source CSV file name of every row. A fold without files maps to an
        empty DataFrame.

    Raises
    ------
    ValueError
        If a fold has a different number of physiology and annotation files.
    AssertionError
        If the sorted physiology and annotation file names do not pair up.
    """
    def visible_files(directory):
        # sorted so that both listings pair up by position; hidden entries
        # such as .DS_Store and .ipynb_checkpoints are ignored
        return sorted(p for p in directory.iterdir() if not p.name.startswith('.'))

    # make dict to store data frames for each fold
    fold_dfs = {}
    for fold_dir in Path(scenario_dir_path).iterdir():
        # skip hidden entries and stray files that are not fold directories
        if fold_dir.name.startswith('.') or not fold_dir.is_dir():
            continue

        physiology_files = visible_files(Path(fold_dir, f"{dataset_type}/physiology/"))
        annotation_files = visible_files(Path(fold_dir, f"{dataset_type}/annotations/"))

        # zip() would silently drop the unmatched tail, so fail loudly instead
        if len(physiology_files) != len(annotation_files):
            raise ValueError(
                f"{fold_dir.name}/{dataset_type}: {len(physiology_files)} physiology files "
                f"but {len(annotation_files)} annotation files"
            )

        merged_frames = []
        for physiology_file_path, annotations_file_path in zip(physiology_files, annotation_files):
            assert physiology_file_path.name == annotations_file_path.name, "Order mismatch"
            df_physiology = pd.read_csv(physiology_file_path)
            df_annotations = pd.read_csv(annotations_file_path)
            # outer merge keeps every physiology row; rows without an
            # annotation at that time get NaN in the annotation columns
            merged_df = pd.merge(df_physiology, df_annotations, on="time", how="outer")
            merged_df["subject"] = annotations_file_path.name
            merged_frames.append(merged_df)

        # concatenate once per fold instead of re-copying the growing frame per file
        if merged_frames:
            fold_dfs[fold_dir.name] = pd.concat(merged_frames, ignore_index=True)
        else:
            fold_dfs[fold_dir.name] = pd.DataFrame()
    return fold_dfs

# Scenario to process; the path is relative to the working directory.
scenario_dir = "data/scenario_4"

# Training split: one merged DataFrame per fold.
print("Loading train data")
train_data = load_data_with_folds(scenario_dir_path=scenario_dir, dataset_type="train")

# Test split, loaded the same way.
print("Loading test data")
test_data = load_data_with_folds(scenario_dir_path=scenario_dir, dataset_type="test")


Loading train data
Loading test data


In [9]:
# Sanity check: list the fold names found in each split.
train_fold_names = sorted(train_data.keys())
test_fold_names = sorted(test_data.keys())
print(train_fold_names)
print(test_fold_names)

# Show each training fold and the largest time stamp it contains.
for key in train_data:
    df = train_data[key]
    print(f"{key}:\n")
    display(df)
    max_time = df['time'].max()
    print(f"The max time: {max_time}")

['fold_0', 'fold_1']
['fold_0', 'fold_1']
fold_0:



Unnamed: 0,time,ecg,bvp,gsr,rsp,skt,emg_zygo,emg_coru,emg_trap,valence,arousal,subject
0,0,0.856,36.632,7.440,36.041,24.906,5.194,5.399,6.343,5.000,5.000,sub_0_vid_16.csv
1,1,0.853,36.680,7.436,36.070,24.912,5.194,5.399,6.384,,,sub_0_vid_16.csv
2,2,0.866,36.518,7.424,36.070,24.909,5.153,5.399,6.425,,,sub_0_vid_16.csv
3,3,0.859,36.612,7.502,36.051,24.913,5.153,5.440,6.344,,,sub_0_vid_16.csv
4,4,0.817,36.709,7.441,36.098,24.906,5.234,5.481,6.466,,,sub_0_vid_16.csv
...,...,...,...,...,...,...,...,...,...,...,...,...
11583465,80046,0.747,37.184,21.893,43.978,32.517,6.467,5.563,10.204,,,sub_9_vid_3.csv
11583466,80047,0.747,37.368,21.949,43.968,32.517,6.508,5.563,10.409,,,sub_9_vid_3.csv
11583467,80048,0.757,37.553,21.881,43.968,32.513,6.549,5.440,10.532,,,sub_9_vid_3.csv
11583468,80049,0.747,37.640,21.811,43.988,32.517,6.508,5.604,10.820,,,sub_9_vid_3.csv


The max time: 131950
fold_1:



Unnamed: 0,time,ecg,bvp,gsr,rsp,skt,emg_zygo,emg_coru,emg_trap,valence,arousal,subject
0,0,0.794,36.476,9.663,35.490,24.573,4.908,5.360,6.058,5.000,5.00,sub_0_vid_0.csv
1,1,0.787,36.544,9.691,35.508,24.579,5.028,5.521,6.178,,,sub_0_vid_0.csv
2,2,0.797,36.286,9.675,35.509,24.583,5.030,5.563,6.179,,,sub_0_vid_0.csv
3,3,0.790,36.408,9.683,35.509,24.576,4.989,5.523,6.139,,,sub_0_vid_0.csv
4,4,0.793,36.515,9.679,35.518,24.576,4.988,5.482,6.138,,,sub_0_vid_0.csv
...,...,...,...,...,...,...,...,...,...,...,...,...
10903965,95046,0.649,34.850,23.892,42.749,34.431,5.317,5.810,9.095,,,sub_9_vid_9.csv
10903966,95047,0.639,33.959,23.924,42.758,34.441,5.399,5.933,9.177,,,sub_9_vid_9.csv
10903967,95048,0.636,34.211,23.865,42.729,34.431,5.317,5.892,9.136,,,sub_9_vid_9.csv
10903968,95049,0.649,34.453,23.845,42.768,34.434,5.194,5.892,9.054,,,sub_9_vid_9.csv


The max time: 108050


In [10]:
#====================================#
# PAD VIDEO SEQUENCES: TRAINING SET  #
#====================================#
#
# Variable Names:
# train_data_scaled_padded
# test_data_scaled_padded
#

# set maxlen so that it is in increments of 50 to match the affect timescales;
# it must also be at least as long as the longest recording (checked below)
maxlen = 132000

# one entry per fold: {'valence', 'arousal', 'raw_data'} padded to maxlen
train_data_scaled_padded = {}

# loop through each fold
for key, df in train_data.items():

    # every recording restarts at time == 0, so those rows mark where each
    # video sequence begins; the end of the frame closes the last sequence
    idx_zero = np.where(df['time'] == 0)[0]
    boundaries = np.append(idx_zero, len(df))

    # split the fold into per-video sequences
    valence = []
    arousal = []
    raw_data = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        valence.append(df['valence'].iloc[start:stop].to_numpy())
        arousal.append(df['arousal'].iloc[start:stop].to_numpy())
        raw_data.append(df.iloc[start:stop, 1:9].to_numpy())  # columns 1..8 of the frame

    # pad_sequences truncates from the FRONT by default when a sequence is
    # longer than maxlen, which would silently discard data
    longest = max((len(seq) for seq in valence), default=0)
    if longest > maxlen:
        raise ValueError(f"{key}: longest sequence ({longest}) exceeds maxlen ({maxlen})")

    # Pad THIS fold's sequences. Padding must happen inside the fold loop:
    # padding the loop variables afterwards would reuse the last fold's
    # sequences for every fold.
    train_data_scaled_padded[key] = {
        'valence': pad_sequences(valence, dtype="float64", maxlen=maxlen, value=0.0, padding="post"),
        'arousal': pad_sequences(arousal, dtype="float64", maxlen=maxlen, value=0.0, padding="post"),
        'raw_data': pad_sequences(raw_data, dtype="float64", maxlen=maxlen, value=0.0, padding="post"),
    }

# check shape after padding (first fold only)
for key, array in train_data_scaled_padded.items():

    print(array['valence'].shape)
    print(array['arousal'].shape)
    print(array['raw_data'].shape)
    break

# length of the first padded sequence of that fold
for seq in array['valence']:
    length = len(seq)
    print(f"Length of sequence: {length}")
    break

(120, 132000)
(120, 132000)
(120, 132000, 8)
Length of sequence: 132000


In [11]:
# Collapse the video axis so each fold becomes one long time series, the
# layout passed to timeseries_dataset_from_array later on.
for fold_arrays in train_data_scaled_padded.values():
    for target_name in ('valence', 'arousal'):
        fold_arrays[target_name] = fold_arrays[target_name].reshape(-1)
    # keep the 8 signal channels as the trailing axis
    fold_arrays['raw_data'] = fold_arrays['raw_data'].reshape(-1, 8)

# check shape again (first fold only)
for key, array in train_data_scaled_padded.items():
    for name in ('valence', 'arousal', 'raw_data'):
        print(array[name].shape)
    break


(15840000,)
(15840000,)
(15840000, 8)


In [12]:
# Combine the two targets into one array per fold: valence and arousal are
# stacked as rows and then transposed, giving column 0 = valence and
# column 1 = arousal.
for key, array in train_data_scaled_padded.items():
    stacked_rows = np.vstack((array['valence'], array['arousal']))
    array['affect'] = np.transpose(stacked_rows)

In [14]:
#================================#
# PAD VIDEO SEQUENCES: TEST SET  #
#================================#


# pad test data
# one entry per fold: {'raw_data'} flattened to (n_rows, 8)
test_data_scaled_padded = {}

# loop through each fold
for key, df in test_data.items():

    # rows kept from each video: from time == 9900 (inclusive) up to
    # time == 40000 (exclusive)
    idx_start = np.where(df['time']==9900)[0] # only keep buffer of 100ms prior
    idx_end = np.where(df['time']==40000)[0]

    # every start marker needs its own end marker, otherwise the slices
    # below would pair rows from different videos
    if len(idx_start) != len(idx_end):
        raise ValueError(
            f"{key}: found {len(idx_start)} rows with time == 9900 "
            f"but {len(idx_end)} rows with time == 40000"
        )

    # one (rows, 8) array per video, columns 1..8 of the frame
    raw_data = [df.iloc[start:stop, 1:9].to_numpy() for start, stop in zip(idx_start, idx_end)]

    # Pad THIS fold's sequences (padding after the loop would reuse the last
    # fold's data for every fold), then flatten the video axis.
    padded = pad_sequences(raw_data, dtype="float64", maxlen=30100, value=0.0, padding="post")
    test_data_scaled_padded[key] = {
        'raw_data': padded.reshape(-1, 8)
    }

# check shape after padding (first fold only)
for key, array in test_data_scaled_padded.items():

    print(array['raw_data'].shape)
    break


(3612000, 8)


In [15]:
#================================#
# SPLIT INTO TIMESERIES BATCHES  #
#================================#
#
# Variable Names:
# train_dataset
# test_dataset
#

# split into timeseries batches for train, test
# using keras timeseries_dataset_from_array

sequence_length = 100 # 100ms windows
sampling_rate = 1 # sampling rate within a sequence
sequence_stride = 50 # how much to shift window by
delay = sequence_length # target is N+delay

train_dataset = {}
test_dataset = {}

for key, array in train_data_scaled_padded.items():

    # timeseries_dataset_from_array pairs targets[i] with the window that
    # STARTS at data[i]. To predict the value `delay` rows after the window
    # start (i.e. the row right after the window, which is how the test
    # windows are laid out: they begin 100 rows before the first predicted
    # time stamp), the targets are shifted forward by `delay` and the data is
    # trimmed by the same amount.
    train_dataset[key] = keras.utils.timeseries_dataset_from_array(
        data=array['raw_data'][:-delay],
        targets=array['affect'][delay:],
        sequence_length=sequence_length,
        sequence_stride=sequence_stride,
        sampling_rate=sampling_rate,
        batch_size=2640, # number of sequences per batch (132000 rows / stride 50)
        shuffle=False,
        )


for key, array in test_data_scaled_padded.items():

    # NOTE(review): each padded test video spans 30100 rows, so it contributes
    # 30100 / 50 = 602 window starts (the last one runs into the next video).
    # batch_size therefore does not line up with video boundaries; it only
    # controls how many windows are fed to the model at once.
    test_dataset[key] = keras.utils.timeseries_dataset_from_array(
        data=array['raw_data'],
        targets=None, # set to none to generate input sequences only
        sequence_length=sequence_length,
        sequence_stride=sequence_stride,
        sampling_rate=sampling_rate,
        batch_size=610, # from 10000-100 to 40000, every 50
        shuffle=False,
        )


2023-05-01 19:51:22.469789: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38201 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:e3:00.0, compute capability: 8.0


In [16]:
# Folds for which a training dataset was built.
train_fold_keys = sorted(train_dataset)
print(train_fold_keys)

['fold_0', 'fold_1']


In [17]:
#========================#
# MODEL TRAINING: LSTM   #
#========================#

# train one model per fold
for key in sorted(train_dataset.keys()):
    print(f"Training fold: {key}")

    # for reproducibility: re-seed at the start of every fold so a fold's
    # initial weights are not meant to depend on which folds ran before it
    tf.random.set_seed(1)

    # build model: windows of `sequence_length` rows x 8 signal channels in,
    # two regression outputs (valence, arousal) out
    inputs = keras.Input(shape=(sequence_length, 8))
    # rows that are exactly 0 in every channel (the padding value) are masked
    x = layers.Masking(mask_value=0.)(inputs)
    x = layers.LSTM(128, recurrent_dropout=0.25)(x)
    x = layers.BatchNormalization()(x)
    outputs = layers.Dense(2, activation="linear")(x)
    model = keras.Model(inputs, outputs)

    model_name = f"scen4_model_{key}.h5"
    print(f"Training model: {model_name}")

    # compile
    model.compile(optimizer="rmsprop",
                  loss="mse")

    # train; fit() ignores `shuffle` for tf.data inputs, and the datasets
    # were already built with shuffle=False
    model.fit(train_dataset[key],
              epochs=15,
              verbose=1,
             )

    # save
    model.save(model_name)

    # release the Keras global state before building the next fold's model
    tf.keras.backend.clear_session()
    

Training fold: fold_0
Training model: scen4_model_fold_0.h5
Epoch 1/15


2023-05-01 19:52:11.202443: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [15840000,8]
	 [[{{node Placeholder/_0}}]]
2023-05-01 19:52:11.202834: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype double and shape [15840000,2]
	 [[{{node Placeholder/_20}}]]
2023-05-01 19:52:13.707375: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-05-01 19:52:13.947800: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f4e2801b7c0 initialize

Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training fold: fold_1
Training model: scen4_model_fold_1.h5
Epoch 1/15


2023-05-01 19:57:11.298627: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_8' with dtype int32 and shape [316799]
	 [[{{node Placeholder/_8}}]]
2023-05-01 19:57:11.299017: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype int32
	 [[{{node Placeholder/_13}}]]


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [12]:
# # reset
# model.reset_states()
# tf.keras.backend.clear_session()

In [20]:
# # Evaluate model from fold 0 on fold 1 as validation
# model = keras.models.load_model(f'scen4_model_fold_0.h5')
# model.evaluate(train_dataset['fold_1'], verbose=1)

# # Evaluate model from fold 1 on fold 0 as validation
# model = keras.models.load_model(f'scen4_model_fold_1.h5')
# model.evaluate(train_dataset['fold_0'], verbose=1)



1.385783314704895

In [21]:
#========================#
# MODEL PREDICTIONS      #
#========================#

# Define a dictionary to store predictions: fold name -> array with one row
# per test window and two columns (the model's two outputs)
all_preds = {}

# Loop over the test folds that actually exist instead of a hard-coded count
for fold_name in sorted(test_dataset.keys()):
    # Load the model that was trained on the fold of the same name
    model = keras.models.load_model(f'scen4_model_{fold_name}.h5')
    # Get predictions for the test data using the loaded model
    preds = model.predict(test_dataset[fold_name])
    # Add the predictions to the dictionary under the fold name
    all_preds[fold_name] = preds
    





2023-05-01 20:17:06.615274: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [3612000,8]
	 [[{{node Placeholder/_0}}]]
2023-05-01 20:17:06.615576: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [3612000,8]
	 [[{{node Placeholder/_0}}]]




2023-05-01 20:17:09.605443: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype int32
	 [[{{node Placeholder/_13}}]]
2023-05-01 20:17:09.605762: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype int32
	 [[{{node Placeholder/_13}}]]




In [22]:
# generate and save predictions as csv files

num_iterations = 120 # 120 test files
num_preds = 601 # number of predictions per file

# The test windows were cut from the flattened (n_videos * 30100, 8) array with
# a stride of 50 rows, so consecutive videos are 30100 // 50 = 602 predictions
# apart. The first 601 belong to time 10000..40000; the 602nd comes from a
# window that runs into the next video and is skipped.
rows_per_test_video = 30100
window_stride = 50
preds_per_video_block = rows_per_test_video // window_stride

# loop through each fold
for key, preds in sorted(all_preds.items()):
    print(key)
    print(len(preds))

    # split the two model outputs into flat arrays
    valence = preds[:, 0]
    arousal = preds[:, 1]
    print(len(valence))

    # set directory file path
    directory = f"results/scenario_4/{key}/test/annotations/"

    # create directory if not exist
    os.makedirs(directory, exist_ok=True)

    # one file name per test video, in the same order the videos appear in
    # the frame (looked up once instead of once per file)
    fold_df = test_data[key]
    file_names = fold_df.loc[fold_df['time'] == 10000, 'subject'].tolist()

    # loop through each test file and save
    for i in range(num_iterations):

        print(i)
        # get file name and set file path
        file_name = f"{file_names[i]}"
        file_path = os.path.join(directory, file_name)

        # create time column
        df = pd.DataFrame({'time': np.arange(10000, 40000+1, 50)})

        # predictions of video i start at the beginning of its 602-wide block
        start_index = preds_per_video_block * i
        end_index = start_index + num_preds

        # add in predictions
        df['valence'] = valence[start_index:end_index]
        df['arousal'] = arousal[start_index:end_index]

        # save df as csv
        df.to_csv(file_path, index=False)
        print(f"Saved: {file_path}")
    


    

fold_0
72239
72239
0
Saved: results/scenario_4/fold_0/test/annotations/sub_0_vid_0.csv
1
Saved: results/scenario_4/fold_0/test/annotations/sub_0_vid_12.csv
2
Saved: results/scenario_4/fold_0/test/annotations/sub_0_vid_15.csv
3
Saved: results/scenario_4/fold_0/test/annotations/sub_0_vid_9.csv
4
Saved: results/scenario_4/fold_0/test/annotations/sub_10_vid_0.csv
5
Saved: results/scenario_4/fold_0/test/annotations/sub_10_vid_12.csv
6
Saved: results/scenario_4/fold_0/test/annotations/sub_10_vid_15.csv
7
Saved: results/scenario_4/fold_0/test/annotations/sub_10_vid_9.csv
8
Saved: results/scenario_4/fold_0/test/annotations/sub_11_vid_0.csv
9
Saved: results/scenario_4/fold_0/test/annotations/sub_11_vid_12.csv
10
Saved: results/scenario_4/fold_0/test/annotations/sub_11_vid_15.csv
11
Saved: results/scenario_4/fold_0/test/annotations/sub_11_vid_9.csv
12
Saved: results/scenario_4/fold_0/test/annotations/sub_12_vid_0.csv
13
Saved: results/scenario_4/fold_0/test/annotations/sub_12_vid_12.csv
14
Saved

Saved: results/scenario_4/fold_1/test/annotations/sub_11_vid_20.csv
11
Saved: results/scenario_4/fold_1/test/annotations/sub_11_vid_3.csv
12
Saved: results/scenario_4/fold_1/test/annotations/sub_12_vid_16.csv
13
Saved: results/scenario_4/fold_1/test/annotations/sub_12_vid_19.csv
14
Saved: results/scenario_4/fold_1/test/annotations/sub_12_vid_20.csv
15
Saved: results/scenario_4/fold_1/test/annotations/sub_12_vid_3.csv
16
Saved: results/scenario_4/fold_1/test/annotations/sub_16_vid_16.csv
17
Saved: results/scenario_4/fold_1/test/annotations/sub_16_vid_19.csv
18
Saved: results/scenario_4/fold_1/test/annotations/sub_16_vid_20.csv
19
Saved: results/scenario_4/fold_1/test/annotations/sub_16_vid_3.csv
20
Saved: results/scenario_4/fold_1/test/annotations/sub_18_vid_16.csv
21
Saved: results/scenario_4/fold_1/test/annotations/sub_18_vid_19.csv
22
Saved: results/scenario_4/fold_1/test/annotations/sub_18_vid_20.csv
23
Saved: results/scenario_4/fold_1/test/annotations/sub_18_vid_3.csv
24
Saved: res