In [73]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from datetime import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import matplotlib.dates as mdates
import os

In [74]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Set the base path to the desired directory on Google Drive
base_path = '/content/drive/MyDrive/Study_1_Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
def read_csv(file_path):
    data = pd.read_csv(file_path)
    return data

In [76]:
def process_data(data, columns_to_remove):
    processed_data = data.drop(columns=columns_to_remove).values
    return processed_data

In [77]:

def construct_3d_array(base_dir, participants, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye):
    """
    Construct 3D array from CSV files.
    """
    num_rows = 180  # Define number of rows to keep (last 180 rows)
    arrays_3d = []

    for participant in participants:
        participant_id = f"{int(participant):02d}"  # Format participant number to two digits

        valid_simulations = []

        for simulation in simulations:
            hr_file_path = os.path.join(base_dir, participant_id, simulation, f'HR{simulation.capitalize()}.csv')
            gsr_file_path = os.path.join(base_dir, participant_id, simulation, f'EDA{simulation.capitalize()}_downsampled.csv')
            head_file_path = os.path.join(base_dir, participant_id, simulation, 'head_tracking_downsampled.csv')
            eye_file_path = os.path.join(base_dir, participant_id, simulation, 'eye_tracking_downsampled.csv')

            # Check if all files exist
            if all(os.path.exists(file) for file in [hr_file_path, gsr_file_path, head_file_path, eye_file_path]):
                valid_simulations.append(simulation)

        num_valid_simulations = len(valid_simulations)
        if num_valid_simulations == 0:
            continue  # Skip this participant if no valid simulations are found

        array_3d = np.zeros((num_valid_simulations, num_rows, 47)) # hr=1, gsr=1, head=15-3, eye=41-8 total columns after removing columns= 48

        for s_idx, simulation in enumerate(valid_simulations):
            # Process hr data
            hr_file_path = os.path.join(base_dir, participant_id, simulation, f'HR{simulation.capitalize()}.csv')
            hr_data = read_csv(hr_file_path)
            processed_hr_data = process_data(hr_data, columns_to_remove_hr)
            processed_hr_data = processed_hr_data[-num_rows:]  # Keep only the last 180 rows

            # Process gsr data
            gsr_file_path = os.path.join(base_dir, participant_id, simulation, f'EDA{simulation.capitalize()}_downsampled.csv')
            gsr_data = read_csv(gsr_file_path)
            processed_gsr_data = process_data(gsr_data, columns_to_remove_gsr)
            processed_gsr_data = processed_gsr_data[-num_rows:]  # Keep only the last 180 rows

            # Process head data
            head_file_path = os.path.join(base_dir, participant_id, simulation, 'head_tracking_downsampled.csv')
            head_data = read_csv(head_file_path)
            processed_head_data = process_data(head_data, columns_to_remove_head)
            processed_head_data = processed_head_data[-num_rows:]  # Keep only the last 180 rows

            # Process eye data
            eye_file_path = os.path.join(base_dir, participant_id, simulation, 'eye_tracking_downsampled.csv')
            eye_data = read_csv(eye_file_path)
            processed_eye_data = process_data(eye_data, columns_to_remove_eye)
            processed_eye_data = processed_eye_data[-num_rows:]  # Keep only the last 180 rows

            # Combine processed data
            combined_data = np.concatenate((processed_hr_data, processed_gsr_data, processed_head_data, processed_eye_data), axis=1)



            array_3d[s_idx, :, :] = combined_data

            arrays_3d.append(array_3d)
    return arrays_3d


In [78]:
sample_size=60
# simulations_train = ['noise','bumps']
# simulations_test=['flat']
# val_indices = [4, 10, 11, 26, 28, 31, 33, 37] # for flat
# train_indices = [0, 1, 2, 3, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 30, 32, 34, 35, 36, 38, 39, 40, 41] # for flat


# simulations_test=['noise']
# simulations_train = ['flat','bumps']
# val_indices = [7, 15, 17, 19, 28, 31, 32, 42, 44, 48] # for noise
# train_indices = [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 45, 46, 47] # for noise

simulations_test=['bumps']
simulations_train = ['flat','noise']
val_indices = [1, 12, 16, 18, 22, 26, 28, 37, 41] # for speedbumps
train_indices = [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 19, 20, 21, 23, 24, 25, 27, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 42, 43, 44] # for speedbumps

In [79]:
participants = [str(i) for i in range(1, 27)]  # Participants 101 to 123
columns_to_remove_hr = []
columns_to_remove_gsr = []
columns_to_remove_eye = ['#Frame','Time', 'Unnamed: 40','ConvergenceValid','Left_Eye_Closed','Right_Eye_Closed','LocalGazeValid','WorldGazeValid']
columns_to_remove_head = ['#Frame','Time', 'Unnamed: 14']

In [80]:
arrays_train = construct_3d_array(base_path, participants, simulations_train, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)
arrays_test = construct_3d_array(base_path, participants, simulations_test, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)

In [81]:
# Concatenate arrays along the first axis
input_train = np.concatenate(arrays_train, axis=0)
input_test = np.concatenate(arrays_test, axis=0)

# Display the shape of the final concatenated 3D array
print(f"Shape of the final concatenated 3D array: {input_train.shape}")
print(f"Shape of the final concatenated 3D array: {input_test.shape}")

Shape of the final concatenated 3D array: (83, 180, 47)
Shape of the final concatenated 3D array: (23, 180, 47)


In [82]:
def calculate_total_ssq(csv_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    n_columns = [0, 5, 6, 7, 8, 14, 15]
    o_columns = [0, 1, 2, 3, 4, 8, 10]
    d_columns = [4, 7, 9, 10, 11, 12, 13]

    # Calculate sum for each specified set of columns
    n_val = df.iloc[:, n_columns].sum(axis=1)
    o_val = df.iloc[:, o_columns].sum(axis=1)
    d_val = df.iloc[:, d_columns].sum(axis=1)

    return n_val,o_val,d_val

In [83]:
def merge_ssq_column(conditions,participants):
  directories = []
  total_ssq_values = []
  for participant in participants:
      participant = f"{int(participant):02d}"
      for condition in conditions:
          directory = os.path.join(base_path, participant, condition)
          directories.append(directory)

  # Loop through each directory
  for directory in directories:
      # Check if the directory exists
      if not os.path.exists(directory):
          continue

      # Get all CSV files in the directory that are named 'ssq.csv'
      csv_files = [file for file in os.listdir(directory) if file == 'ssq.csv']

      # Loop through each CSV file
      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          df = pd.read_csv(csv_path)
          n_val,o_val,d_val = calculate_total_ssq(csv_path)
          total_ssq_values.append([n_val, o_val, d_val])
          #ssq_values_participant = df.iloc[:, 0:17].values.flatten()   # Assuming SSQ values are in columns 1 to 16
          #total_ssq_values.append(ssq_values_participant)
  ssq_array = np.array(total_ssq_values)
  return ssq_array

def merge_total_ssq(conditions,participants):
  directories = []
  total_ssq_values = []
  for participant in participants:
      participant = f"{int(participant):02d}"
      for condition in conditions:
          directory = os.path.join(base_path, participant, condition)
          directories.append(directory)

  # Loop through each directory
  for directory in directories:
      # Check if the directory exists
      if not os.path.exists(directory):
          continue

      # Get all CSV files in the directory that are named 'ssq.csv'
      csv_files = [file for file in os.listdir(directory) if file == 'ssq.csv']

      # Loop through each CSV file
      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          n_val,o_val,d_val = calculate_total_ssq(csv_path)
          total_ssq = (n_val+o_val+d_val) * 3.74
          df = pd.read_csv(csv_path)
          df["total-ssq"] = total_ssq
          #print("csv_path: ",csv_path,"   ",total_ssq)
          total_ssq_values.append(total_ssq)
  # Create a DataFrame from the list of total SSQ values
  df_total_ssq = pd.DataFrame(total_ssq_values, columns=["total-ssq"])
  # Convert the list of total SSQ values to a NumPy array
  total_ssq_array = np.array(total_ssq_values)
  return total_ssq_array



In [84]:
output_train=merge_ssq_column(simulations_train,participants)
output_train = np.squeeze(output_train)
output_test=merge_ssq_column(simulations_test,participants)
output_test = np.squeeze(output_test)
output_train_total_ssq=merge_total_ssq(simulations_train,participants)
output_test_total_ssq=merge_total_ssq(simulations_test,participants)
output_train_total_ssq=output_train_total_ssq.reshape(-1, 1)
output_test_total_ssq=output_test_total_ssq.reshape(-1, 1)
print(output_train.shape,output_test.shape,output_train_total_ssq.shape,output_test_total_ssq.shape)
# print(output_train)
# print(output_train_total_ssq)


(45, 3) (23, 3) (45, 1) (23, 1)


In [85]:
input_train.shape

(83, 180, 47)

In [86]:
def scale_input_data(input_train, input_test):
    # Get the shape of the input data
    num_samples_train, time_steps_train, num_features = input_train.shape
    num_samples_test, time_steps_test, _ = input_test.shape

    # Reshape the input data into 2D arrays
    flattened_train_data = input_train.reshape(-1, num_features)
    flattened_test_data = input_test.reshape(-1, num_features)

    # Initialize a MinMaxScaler object
    scaler = MinMaxScaler()

    # Fit the scaler on the training data and transform both train and test data
    scaled_train_data = scaler.fit_transform(flattened_train_data)
    scaled_test_data = scaler.transform(flattened_test_data)

    # Reshape the scaled data back to its original shape
    scaled_train_data = scaled_train_data.reshape(num_samples_train, time_steps_train, num_features)
    scaled_test_data = scaled_test_data.reshape(num_samples_test, time_steps_test, num_features)

    return scaled_train_data, scaled_test_data

def scale_target_var(target_data):
    min_val, max_val = np.min(target_data, axis=0), np.max(target_data, axis=0)
    target_data = (target_data-min_val)/(max_val-min_val)

    return target_data, min_val, max_val

In [87]:
# import numpy as np


# new_data = np.zeros((38, 3))

# # Assigning original serial numbers to the first column
# new_data[:, 0] = np.arange(1, 39)

# # Assigning original values to the second column
# new_data[:, 1] = output_train_total_ssq[:, 0]

# # Sorting the array based on the values (second column)
# sorted_indices = np.argsort(new_data[:, 1])
# sorted_data = new_data[sorted_indices]

# # Assigning new serial numbers to the third column
# current_val = sorted_data[0][1]
# start_idx = 0
# for i, row in enumerate(sorted_data):
#     if row[1] != current_val:
#         sorted_data[start_idx:i, 2] = np.arange(start_idx + 1, i + 1)
#         start_idx = i
#         current_val = row[1]

# # Assigning new serial numbers for the last group
# sorted_data[start_idx:, 2] = np.arange(start_idx + 1, len(sorted_data) + 1)

# #print(sorted_data)


In [88]:
input_train, input_test= scale_input_data(input_train[:, (60-sample_size):(180-sample_size), :], input_test[:, (60-sample_size):(180-sample_size), :])
output_train, min_val, max_val = scale_target_var(output_train)

input_val = input_train[val_indices]
input_train = input_train[train_indices]
output_val = output_train_total_ssq[val_indices]
output_train = output_train[train_indices]


In [89]:
print("input_val :",input_val.shape)
print("input_train :",input_train.shape)
print("output_val :",output_val.shape)
print("output_train :",output_train.shape)
print("output_test :",output_test.shape)
print("input_test :",input_test.shape)

input_val : (9, 120, 47)
input_train : (36, 120, 47)
output_val : (9, 1)
output_train : (36, 3)
output_test : (23, 3)
input_test : (23, 120, 47)


In [90]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D
from keras.optimizers import Adam
import numpy as np
import sklearn

rmse_n=[]
rmse_o=[]
rmse_d=[]

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    return x + res

def transformer_decoder(inputs, enc_outputs, head_size, num_heads, ff_dim, dropout=0):
    # Self attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    # Cross attention
    x = LayerNormalization(epsilon=1e-6)(res)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, enc_outputs)
    x = Dropout(dropout)(x)
    res = x + res

    # Feed forward
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    return x + res

def get_hard_shared_model(input_shape1, input_shape2, output_shape):
    # Encoder input
    enc_inputs = Input(shape=(input_shape1, input_shape2))

    # Encoder
    enc_outputs = transformer_encoder(enc_inputs, head_size=64, num_heads=4, ff_dim=256, dropout=0.2)

    # Decoder input
    dec_inputs = Input(shape=(output_shape[1], input_shape2))

    # Decoder
    dec_outputs = transformer_decoder(dec_inputs, enc_outputs, head_size=64, num_heads=4, ff_dim=256, dropout=0.2)

    # Global pooling
    x = GlobalAveragePooling1D()(dec_outputs)

    # Shared dense layer
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)

    # Task-specific output layers
    outputs = []
    for i in range(output_shape[1]):
        output = Dense(1, name=f'output_{i}')(x)
        outputs.append(output)

    return Model([enc_inputs, dec_inputs], outputs)

for iteration in range(5):
    # Reshape inputs
    train_input_reshaped = input_train.reshape((input_train.shape[0], input_train.shape[1], input_train.shape[2]))
    test_input_reshaped = input_test.reshape((input_test.shape[0], input_test.shape[1], input_test.shape[2]))
    val_input_reshaped = input_val.reshape((input_val.shape[0], input_val.shape[1], input_val.shape[2]))

    # Create decoder inputs
    train_dec_input = np.zeros((train_input_reshaped.shape[0], output_train.shape[1], train_input_reshaped.shape[2]))
    val_dec_input = np.zeros((val_input_reshaped.shape[0], output_train.shape[1], val_input_reshaped.shape[2]))
    test_dec_input = np.zeros((test_input_reshaped.shape[0], output_test.shape[1], test_input_reshaped.shape[2]))

    # Create the hard parameter sharing model
    model = get_hard_shared_model(input_train.shape[1], input_train.shape[2], output_train.shape)

    # Compile and train the model
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=[['mse'] for _ in range(output_train.shape[1])])
    best_val = 1000000
    patience = 0
    best_model = None

    for k in range(200):
        # Train the model
        model.fit([train_input_reshaped, train_dec_input],
                  [output_train[:, i] for i in range(output_train.shape[1])],
                  epochs=1, batch_size=32, verbose=1)

        # Predict validation data
        pred_val = np.array(model.predict([val_input_reshaped, val_dec_input]))
        pred_val = np.transpose(pred_val, (1, 0, 2)).squeeze()
        print("k:", k, "patience:", patience)

        # Evaluate the model
        losses = []
        for i in range(pred_val.shape[0]):
            total_ssq = 0
            for j in range(3):
                total_ssq = np.sum(pred_val[i,j] * (max_val[j] - min_val[j]) + min_val[j]) + total_ssq
            total_ssq = total_ssq * 3.74
            output_val_ssq = output_val[i,0]
            loss = sklearn.metrics.mean_squared_error([total_ssq], [output_val_ssq], squared=False)
            losses.append(loss)
        tmp_val_loss = np.mean(losses)
        if tmp_val_loss <= best_val:
            best_val = tmp_val_loss
            patience = 0
            best_model = model
        else:
            patience += 1
            if patience > 10:
                break

    # Predict test data
    pred_test = np.array(best_model.predict([test_input_reshaped, test_dec_input]))
    pred_test = np.transpose(pred_test, (1, 0, 2)).squeeze()
    pred_test_n = np.zeros((pred_test.shape[0], 1))
    pred_test_o = np.zeros((pred_test.shape[0], 1))
    pred_test_d = np.zeros((pred_test.shape[0], 1))
    pred_test_final = np.empty((output_test.shape[0], 0))
    for m in range(pred_test.shape[0]):
      pred_test_n[m,0] = pred_test[m,0]*(max_val[0]-min_val[0]) + min_val[0]
      pred_test_o[m,0] = pred_test[m,1]*(max_val[1]-min_val[1]) + min_val[1]
      pred_test_d[m,0] = pred_test[m,2]*(max_val[2]-min_val[2]) + min_val[2]

    pred_test_final=np.hstack((pred_test_final, pred_test_n))
    pred_test_final=np.hstack((pred_test_final, pred_test_o))
    pred_test_final=np.hstack((pred_test_final, pred_test_d))

    # Overall Test Loss
    loss_n = sklearn.metrics.mean_squared_error(pred_test_final[:,0], output_test[:,0], squared = False)
    rmse_n.append(loss_n)
    loss_o = sklearn.metrics.mean_squared_error(pred_test_final[:,1], output_test[:,1], squared = False)
    rmse_o.append(loss_o)
    loss_d= sklearn.metrics.mean_squared_error(pred_test_final[:,2], output_test[:,2], squared = False)
    rmse_d.append(loss_d)
rmse_combined = np.column_stack((rmse_n, rmse_o, rmse_d))
rmse_n_loss = sum(rmse_n) / len(rmse_n)
rmse_o_loss = sum(rmse_o) / len(rmse_o)
rmse_d_loss = sum(rmse_d) / len(rmse_d)
average_rmse = np.array([rmse_n_loss, rmse_o_loss, rmse_d_loss])

print(rmse_n_loss,rmse_o_loss,rmse_d_loss)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 68ms/step - loss: 0.4850 - output_0_mse: 0.1108 - output_1_mse: 0.1661 - output_2_mse: 0.2081
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 434ms/step
k: 0 patience: 0
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.3149 - output_0_mse: 0.1757 - output_1_mse: 0.0369 - output_2_mse: 0.1023 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
k: 1 patience: 0
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.2735 - output_0_mse: 0.0946 - output_1_mse: 0.0452 - output_2_mse: 0.1337 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
k: 2 patience: 1
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - loss: 0.2009 - output_0_mse: 0.1041 - output_1_mse: 0.0517 - output_2_mse: 0.0452
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
k: 3 patience: 2
[1m2/2[0m [32