In [14]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from datetime import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import matplotlib.dates as mdates
import os

In [15]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Set the base path to the desired directory on Google Drive
base_path = '/content/drive/MyDrive/Study_1_Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
def read_csv(file_path):
    data = pd.read_csv(file_path)
    return data

In [17]:
def process_data(data, columns_to_remove):
    processed_data = data.drop(columns=columns_to_remove).values
    return processed_data

In [18]:
def construct_3d_array(base_dir, participants, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye):
    """
    Construct 3D array from CSV files.
    """
    num_rows = 180  # Define number of rows to keep (last 180 rows)
    arrays_3d = []

    for participant in participants:
        participant_id = f"{int(participant):02d}"  # Format participant number to two digits

        valid_simulations = []

        for simulation in simulations:
            hr_file_path = os.path.join(base_dir, participant_id, simulation, f'HR{simulation.capitalize()}.csv')
            gsr_file_path = os.path.join(base_dir, participant_id, simulation, f'EDA{simulation.capitalize()}_downsampled.csv')
            head_file_path = os.path.join(base_dir, participant_id, simulation, 'head_tracking_downsampled.csv')
            eye_file_path = os.path.join(base_dir, participant_id, simulation, 'eye_tracking_downsampled.csv')

            # Check if all files exist
            if all(os.path.exists(file) for file in [hr_file_path, gsr_file_path, head_file_path, eye_file_path]):
                valid_simulations.append(simulation)

        num_valid_simulations = len(valid_simulations)
        if num_valid_simulations == 0:
            continue  # Skip this participant if no valid simulations are found

        array_3d = np.zeros((num_valid_simulations, num_rows, 47)) # hr=1, gsr=1, head=15-3, eye=41-8 total columns after removing columns= 48

        for s_idx, simulation in enumerate(valid_simulations):
            # Process hr data
            hr_file_path = os.path.join(base_dir, participant_id, simulation, f'HR{simulation.capitalize()}.csv')
            hr_data = read_csv(hr_file_path)
            processed_hr_data = process_data(hr_data, columns_to_remove_hr)
            processed_hr_data = processed_hr_data[-num_rows:]  # Keep only the last 180 rows

            # Process gsr data
            gsr_file_path = os.path.join(base_dir, participant_id, simulation, f'EDA{simulation.capitalize()}_downsampled.csv')
            gsr_data = read_csv(gsr_file_path)
            processed_gsr_data = process_data(gsr_data, columns_to_remove_gsr)
            processed_gsr_data = processed_gsr_data[-num_rows:]  # Keep only the last 180 rows

            # Process head data
            head_file_path = os.path.join(base_dir, participant_id, simulation, 'head_tracking_downsampled.csv')
            head_data = read_csv(head_file_path)
            processed_head_data = process_data(head_data, columns_to_remove_head)
            processed_head_data = processed_head_data[-num_rows:]  # Keep only the last 180 rows

            # Process eye data
            eye_file_path = os.path.join(base_dir, participant_id, simulation, 'eye_tracking_downsampled.csv')
            eye_data = read_csv(eye_file_path)
            processed_eye_data = process_data(eye_data, columns_to_remove_eye)
            processed_eye_data = processed_eye_data[-num_rows:]  # Keep only the last 180 rows

            # Combine processed data
            combined_data = np.concatenate((processed_hr_data, processed_gsr_data, processed_head_data, processed_eye_data), axis=1)

            array_3d[s_idx, :, :] = combined_data

        arrays_3d.append(array_3d)

    return arrays_3d


In [19]:
sample_size=60
simulations = ['flat','noise','bumps']
participants = [str(i) for i in range(1, 27)]  # Participants 101 to 127
columns_to_remove_hr = []
columns_to_remove_gsr = []
columns_to_remove_eye = ['#Frame','Time', 'Unnamed: 40','ConvergenceValid','Left_Eye_Closed','Right_Eye_Closed','LocalGazeValid','WorldGazeValid']
columns_to_remove_head = ['#Frame','Time', 'Unnamed: 14']

In [20]:
def calculate_total_ssq(csv_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    n_columns = [0, 5, 6, 7, 8, 14, 15]
    o_columns = [0, 1, 2, 3, 4, 8, 10]
    d_columns = [4, 7, 9, 10, 11, 12, 13]

    # Calculate sum for each specified set of columns
    n_val = df.iloc[0, n_columns].sum()
    o_val = df.iloc[0, o_columns].sum()
    d_val = df.iloc[0, d_columns].sum()

    return n_val, o_val, d_val

In [21]:
def merge_ssq_column(conditions,participants):
  directories = []
  total_ssq_values = []
  for participant in participants:
      participant = f"{int(participant):02d}"
      for condition in conditions:
          directory = os.path.join(base_path, participant, condition)
          directories.append(directory)

  # Loop through each directory
  for directory in directories:
      # Check if the directory exists
      if not os.path.exists(directory):
          continue

      # Get all CSV files in the directory that are named 'ssq.csv'
      csv_files = [file for file in os.listdir(directory) if file == 'ssq.csv']

      # Loop through each CSV file
      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          df = pd.read_csv(csv_path)
          # n_val,o_val,d_val = calculate_total_ssq(csv_path)
          # total_ssq_values.append([n_val, o_val, d_val])
          ssq_values_participant = df.iloc[:, 0:17].values.flatten()   # Assuming SSQ values are in columns 1 to 16
          total_ssq_values.append(ssq_values_participant)
  ssq_array = np.array(total_ssq_values)
  return ssq_array

def merge_total_ssq(conditions,participants):
  directories = []
  total_ssq_values = []
  for participant in participants:
      participant = f"{int(participant):02d}"
      for condition in conditions:
          directory = os.path.join(base_path, participant, condition)
          directories.append(directory)

  # Loop through each directory
  for directory in directories:
      # Check if the directory exists
      if not os.path.exists(directory):
          continue

      # Get all CSV files in the directory that are named 'ssq.csv'
      csv_files = [file for file in os.listdir(directory) if file == 'ssq.csv']

      # Loop through each CSV file
      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          n_val,o_val,d_val = calculate_total_ssq(csv_path)
          total_ssq = (n_val+o_val+d_val) * 3.74
          df = pd.read_csv(csv_path)
          df["total-ssq"] = total_ssq
          #print("csv_path: ",csv_path,"   ",total_ssq)
          total_ssq_values.append(total_ssq)
  # Create a DataFrame from the list of total SSQ values
  df_total_ssq = pd.DataFrame(total_ssq_values, columns=["total-ssq"])
  # Convert the list of total SSQ values to a NumPy array
  total_ssq_array = np.array(total_ssq_values)
  return total_ssq_array



In [22]:
participants_group_1 = [1,3,4,11,25]
participants_group_2 = [2,7,8,9,17]
participants_group_3 = [10,12,13,22,23]
participants_group_4 = [5,14,18,20,21]
participants_group_5 = [6,15,16,19,24,26]

arrays_group_1 = construct_3d_array(base_path, participants_group_1, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)
arrays_group_2 = construct_3d_array(base_path, participants_group_2, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)
arrays_group_3 = construct_3d_array(base_path, participants_group_3, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)
arrays_group_4 = construct_3d_array(base_path, participants_group_4, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)
arrays_group_5 = construct_3d_array(base_path, participants_group_5, simulations, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)

In [23]:
# Concatenate arrays along the first axis
input_group_1 = np.concatenate(arrays_group_1, axis=0)
input_group_2 = np.concatenate(arrays_group_2, axis=0)
input_group_3 = np.concatenate(arrays_group_3, axis=0)
input_group_4 = np.concatenate(arrays_group_4, axis=0)
input_group_5 = np.concatenate(arrays_group_5, axis=0)


In [24]:
output_group_1=merge_ssq_column(simulations,participants_group_1)
output_group_2=merge_ssq_column(simulations,participants_group_2)
output_group_3=merge_ssq_column(simulations,participants_group_3)
output_group_4=merge_ssq_column(simulations,participants_group_4)
output_group_5=merge_ssq_column(simulations,participants_group_5)

output_group_1 = np.squeeze(output_group_1)
output_group_2 = np.squeeze(output_group_2)
output_group_3 = np.squeeze(output_group_3)
output_group_4 = np.squeeze(output_group_4)
output_group_5 = np.squeeze(output_group_5)


output_total_ssq_group_1=merge_total_ssq(simulations,participants_group_1)
output_total_ssq_group_2=merge_total_ssq(simulations,participants_group_2)
output_total_ssq_group_3=merge_total_ssq(simulations,participants_group_3)
output_total_ssq_group_4=merge_total_ssq(simulations,participants_group_4)
output_total_ssq_group_5=merge_total_ssq(simulations,participants_group_5)

output_total_ssq_group_1=output_total_ssq_group_1.reshape(-1, 1)
output_total_ssq_group_2=output_total_ssq_group_2.reshape(-1, 1)
output_total_ssq_group_3=output_total_ssq_group_3.reshape(-1, 1)
output_total_ssq_group_4=output_total_ssq_group_4.reshape(-1, 1)
output_total_ssq_group_5=output_total_ssq_group_5.reshape(-1, 1)



In [25]:
def scale_input_data(input_train, input_test):
    # Get the shape of the input data
    num_samples_train, time_steps_train, num_features = input_train.shape
    num_samples_test, time_steps_test, _ = input_test.shape

    # Reshape the input data into 2D arrays
    flattened_train_data = input_train.reshape(-1, num_features)
    flattened_test_data = input_test.reshape(-1, num_features)

    # Initialize a MinMaxScaler object
    scaler = MinMaxScaler()

    # Fit the scaler on the training data and transform both train and test data
    scaled_train_data = scaler.fit_transform(flattened_train_data)
    scaled_test_data = scaler.transform(flattened_test_data)

    # Reshape the scaled data back to its original shape
    scaled_train_data = scaled_train_data.reshape(num_samples_train, time_steps_train, num_features)
    scaled_test_data = scaled_test_data.reshape(num_samples_test, time_steps_test, num_features)

    return scaled_train_data, scaled_test_data

def scale_target_var(target_data):
    min_val, max_val = np.min(target_data, axis=0), np.max(target_data, axis=0)
    target_data = (target_data-min_val)/(max_val-min_val)

    return target_data, min_val, max_val

In [26]:
from keras.models import Sequential
from keras.layers import Input, GRU, Dense, Dropout
from keras.models import Model
import numpy as np
import sklearn

total_losses=[]
def get_shared_lstm(input_shape1, input_shape2):
    # Define shared GRU model
    input_layer = Input(shape=(input_shape1, input_shape2))
    x = LSTM(64, return_sequences=False)(input_layer)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    shared_model = Model(inputs=input_layer, outputs=x)
    return shared_model




def get_output_model(shared_lstm_output, output_shape):
    # Define separate output model for each column
    output_models = []
    for i in range(output_shape[1]):
        output_model = Sequential()
        output_model.add(Dense(256, activation='relu'))
        output_model.add(Dropout(0.2))
        output_model.add(Dense(1))  # Output shape is (None, 1) for each column
        output_model_output = output_model(shared_lstm_output)
        output_models.append(output_model_output)
    return output_models
input_groups = [input_group_1, input_group_2, input_group_3, input_group_4, input_group_5]
output_groups = [output_group_1, output_group_2, output_group_3, output_group_4, output_group_5]
ssq_groups = [output_total_ssq_group_1, output_total_ssq_group_2, output_total_ssq_group_3, output_total_ssq_group_4, output_total_ssq_group_5]


# Specify the number of samples to select for each group in each iteration
samples_per_iteration = [
    [3, 3, 3, 3, 2],  # For input_group_1
    [3, 3, 3, 3, 2],  # For input_group_2
    [3, 3, 3, 2, 2],  # For input_group_3
    [2, 2, 2, 2, 3],  # For input_group_4
    [3, 3, 3, 3, 4]   # For input_group_5
]

# Initialize a list of global indices arrays, one for each group
global_indices = [[] for _ in range(len(input_groups))]
print("global_indices",global_indices)

# Outer loop to repeat the sampling process for 5 iterations
for iteration in range(5):
  X_train, X_val, X_test = [], [], []
  y_train, y_val, y_test = [], [], []
  ssq_train, ssq_val, ssq_test = [], [], []
  print(f"Iteration {iteration + 1}")
  print("global_indices",global_indices)
  # Loop over each group
  for i, (input_group, output_group, ssq_group) in enumerate(zip(input_groups, output_groups, ssq_groups)):
      num_samples = samples_per_iteration[i][iteration]  # Number of samples to select for the current group and iteration

      # Create a set of available indices that haven't been selected yet for the current group
      available_indices = list(set(range(len(input_group))) - set(global_indices[i]))

      # Check if there are fewer available indices than needed
      if len(available_indices) < num_samples:
          print(f"Not enough indices left in group {i + 1} to select {num_samples} new samples.")
          num_samples = len(available_indices)  # Adjust to take whatever is left

      # Select the required number of samples from the available indices for the current group
      selected_indices = np.random.choice(available_indices, num_samples, replace=False)
      global_indices[i].extend(selected_indices)  # Add these indices to the group's global list

      # Remove these selected samples from the input, output, and SSQ groups
      X_test_temp = input_group[selected_indices]
      y_test_temp = output_group[selected_indices]
      ssq_test_temp = ssq_group[selected_indices]

      X_temp = np.delete(input_group, selected_indices, axis=0)
      y_temp = np.delete(output_group, selected_indices, axis=0)
      ssq_temp = np.delete(ssq_group, selected_indices, axis=0)

      # Split the remaining data into a training set (60%) and a validation set (40%)
      X_train_temp, X_val_temp, y_train_temp, y_val_temp, ssq_train_temp, ssq_val_temp = train_test_split(
          X_temp, y_temp, ssq_temp, test_size=0.2)

      # Append the results to the corresponding lists
      X_train.append(X_train_temp)
      X_val.append(X_val_temp)
      X_test.append(X_test_temp)

      y_train.append(y_train_temp)
      y_val.append(y_val_temp)
      y_test.append(y_test_temp)

      ssq_train.append(ssq_train_temp)
      ssq_val.append(ssq_val_temp)
      ssq_test.append(ssq_test_temp)

  # After the loop, concatenate the data for all groups if needed
  input_train = np.concatenate(X_train, axis=0)
  input_val = np.concatenate(X_val, axis=0)
  input_test = np.concatenate(X_test, axis=0)

  output_train = np.concatenate(y_train, axis=0)
  output_val = np.concatenate(y_val, axis=0)
  output_test = np.concatenate(y_test, axis=0)

  output_test_total_ssq = np.concatenate(ssq_test, axis=0)


  #  this section for scaling both train and validation set simultaniously
  # Step 1: Combine the training and validation sets
  combined_input = np.concatenate([input_train, input_val], axis=0)
  combined_output = np.concatenate([output_train, output_val], axis=0)

  # Step 2: Scale the combined input data
  # Assuming scale_input_data scales the data based on the combined dataset
  combined_input, input_test = scale_input_data(
      combined_input[:, (60-sample_size):(180-sample_size), :],
      input_test[:, (60-sample_size):(180-sample_size), :]
  )

  # Step 3: Scale the combined output data
  # Assuming scale_target_var scales the data and returns min_val, max_val
  combined_output, min_val, max_val = scale_target_var(combined_output)

  # Step 4: Split the combined data back into training and validation sets
  # Use the original shapes of input_train and input_val to slice the combined arrays
  input_train = combined_input[:input_train.shape[0], :, :]
  input_val = combined_input[input_train.shape[0]:, :, :]

  output_train = combined_output[:output_train.shape[0], :]
  output_val = combined_output[output_train.shape[0]:, :]



  print("input_train :", input_train.shape)
  print("output_train :", output_train.shape)
  print("input_val :", input_val.shape)
  print("output_val :", output_val.shape)
  print("input_test :", input_test.shape)
  print("output_test :", output_test.shape)






  # Reshape train and test inputs to match GRU input shape
  train_input_reshaped = input_train.reshape((input_train.shape[0], input_train.shape[1], input_train.shape[2]))
  test_input_reshaped = input_test.reshape((input_test.shape[0], input_test.shape[1], input_test.shape[2]))
  val_input_reshaped = input_val.reshape((input_val.shape[0], input_val.shape[1], input_val.shape[2]))
  # Get shared LSTM model
  shared_lstm = get_shared_lstm(input_train.shape[1], input_train.shape[2])

  # Create separate output models for each column
  output_models = get_output_model(shared_lstm.output, output_train.shape)

  # Create combined model
  model = Model(inputs=shared_lstm.input, outputs=output_models)

  # Compile and train the model
  model.compile(loss='mse', optimizer='adam', metrics=[['mse'] for _ in range(output_train.shape[1])])  # Using MSE as loss and metric
  best_val=1000000
  patience=0
  best_model = None

  for k in range(200):
      # Predict test data
      model.fit(train_input_reshaped, [output_train[:, i] for i in range(output_train.shape[1])], epochs=1, batch_size=32)
      pred_val = np.array(model.predict(val_input_reshaped))
      pred_val = np.transpose(pred_val.squeeze(), (1, 0))
      print("k:", k, "patience:", patience)
      # Evaluate the model
      losses = []
      for i in range(pred_val.shape[0]):
        total_ssq=0
        for j in [0,5,6,7,8,14,15]:
          total_ssq=np.sum(pred_val[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

        for j in [0,1,2,3,4,8,10]:
          total_ssq=np.sum(pred_val[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

        for j in [4,7,9,10,11,12,13]:
          total_ssq=np.sum(pred_val[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq
        total_ssq=total_ssq*3.74
        output_val_ssq= output_val[i,0]
        #print("total_ssq",total_ssq)
        #print("output_val_ssq",output_val_ssq)
        loss = sklearn.metrics.mean_squared_error([total_ssq], [output_val_ssq], squared=False)
        losses.append(loss)
      tmp_val_loss = np.mean(losses)
      if tmp_val_loss <= best_val:
          best_val = tmp_val_loss
          patience = 0
          best_model = model
      else:
          patience +=1
          if patience > 10:
            break

      # Predict test data
      pred_test = np.array(best_model.predict(test_input_reshaped))
      pred_test = np.transpose(pred_test.squeeze(), (1, 0))
      # Evaluate the model
      pred_total_ssq = []
      #losses=[]
      for i in range(pred_test.shape[0]):
          total_ssq=0
          for j in [0,5,6,7,8,14,15]:
            total_ssq=np.sum(pred_test[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

          for j in [0,1,2,3,4,8,10]:
            total_ssq=np.sum(pred_test[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

          for j in [4,7,9,10,11,12,13]:
            total_ssq=np.sum(pred_test[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq
          total_ssq=total_ssq*3.74

          pred_total_ssq.append(total_ssq)


      # Overall Test Loss
      loss = sklearn.metrics.mean_squared_error(pred_total_ssq, output_test_total_ssq, squared = False)
      print("Test Loss no ",iteration,":" ,loss)
      total_losses.append(loss)

average_loss = sum(total_losses) / len(total_losses)
total_losses.append(average_loss)
print("average_loss:", average_loss)

global_indices [[], [], [], [], []]
Iteration 1
global_indices [[], [], [], [], []]
input_train : (41, 120, 47)
output_train : (41, 16)
input_val : (13, 120, 47)
output_val : (13, 16)
input_test : (14, 120, 47)
output_test : (14, 16)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 87ms/step - loss: 1.5336 - sequential_80_mse: 0.1683 - sequential_81_mse: 0.1997 - sequential_82_mse: 0.0619 - sequential_83_mse: 0.1037 - sequential_84_mse: 0.0321 - sequential_85_mse: 0.0718 - sequential_86_mse: 0.1385 - sequential_87_mse: 0.1844 - sequential_88_mse: 0.0450 - sequential_89_mse: 0.0536 - sequential_90_mse: 0.0422 - sequential_91_mse: 0.0855 - sequential_92_mse: 0.0992 - sequential_93_mse: 0.0902 - sequential_94_mse: 0.1120 - sequential_95_mse: 0.0457
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 522ms/step
k: 0 patience: 0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 508ms/step
Test Loss no  0 : 26.03496664470233
[1m2/2[0m [32m━━━━━━━━━━━━━━