In [33]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from datetime import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import matplotlib.dates as mdates
import os

In [34]:
from google.colab import drive

drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/Study1WSSQ/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
def read_csv(file_path):
    data = pd.read_csv(file_path)
    return data

In [36]:
def process_data(data, columns_to_remove):
    processed_data = data.drop(columns=columns_to_remove).values
    return processed_data

In [37]:
def construct_3d_array(base_dir, participants, simulations, columns_to_remove_hr,columns_to_remove_gsr, columns_to_remove_head,columns_to_remove_eye):
    """
    Construct 3D array from CSV files.
    """
    num_rows = None  # Define number of rows (can be inferred from data)

    num_simulations = len(simulations)

    arrays_3d = []

    for participant in participants:
        array_3d = np.zeros((num_simulations, 300, 48)) # hr=5-4,gsr=5-4,head=17-4,eye=38-5 total columns after removing column= 48

        for s_idx, simulation in enumerate(simulations):
            # Process hr data
            hr_file_path = os.path.join(base_dir, participant, simulation, 'hr.csv')
            hr_data = read_csv(hr_file_path)
            processed_hr_data = process_data(hr_data, columns_to_remove_hr)


            # Process gsr data
            gsr_file_path = os.path.join(base_dir, participant, simulation, 'gsr.csv')
            gsr_data = read_csv(gsr_file_path)
            processed_gsr_data = process_data(hr_data, columns_to_remove_gsr)

            # Process head data
            head_file_path = os.path.join(base_dir, participant, simulation, 'head.csv')
            head_data = read_csv(head_file_path)
            processed_head_data = process_data(head_data, columns_to_remove_head)

            # Process eye data
            eye_file_path = os.path.join(base_dir, participant, simulation, 'eye.csv')
            eye_data = read_csv(eye_file_path)
            processed_eye_data = process_data(eye_data, columns_to_remove_eye)
            # Combine processed data
            combined_data = np.concatenate((processed_hr_data,processed_gsr_data, processed_head_data,processed_eye_data ), axis=1)

            array_3d[s_idx, :, :] = combined_data

        arrays_3d.append(array_3d)
    return arrays_3d


In [38]:
participants = [str(i) for i in range(101, 120)]
test_simulation="flat"
simulations_train = ['noise',"speedbumps"]
simulations_test=[test_simulation]
columns_to_remove_hr = ['simulation', 'fms', 'Time', 'individual']
columns_to_remove_gsr = ['simulation', 'fms', 'Time', 'individual']
columns_to_remove_head = ['simulation', 'fms', 'Time', 'Unnamed: 14']
columns_to_remove_eye = ['simulation', 'fms', 'Time', 'individual','Unnamed: 40']

In [39]:
arrays_train = construct_3d_array(base_path, participants, simulations_train, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)
arrays_test = construct_3d_array(base_path, participants, simulations_test, columns_to_remove_hr, columns_to_remove_gsr, columns_to_remove_head, columns_to_remove_eye)

In [40]:
# Concatenate arrays along the first axis
input_train = np.concatenate(arrays_train, axis=0)
input_test = np.concatenate(arrays_test, axis=0)

# Display the shape of the final concatenated 3D array
print(f"Shape of the final concatenated 3D array: {input_train.shape}")
print(f"Shape of the final concatenated 3D array: {input_test.shape}")

Shape of the final concatenated 3D array: (38, 300, 48)
Shape of the final concatenated 3D array: (19, 300, 48)


In [41]:
def calculate_total_ssq(csv_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    n_columns = [4, 7, 9, 10, 11, 12, 13]
    o_columns = [0, 1, 2, 3, 4, 8, 10]
    d_columns = [4, 7, 9, 10, 11, 12, 13]

    # Calculate sum for each specified set of columns
    n_val = df.iloc[:, n_columns].sum(axis=1)
    o_val = df.iloc[:, o_columns].sum(axis=1)
    d_val = df.iloc[:, d_columns].sum(axis=1)

    total_ssq = (n_val+o_val+d_val) * 3.74
    return total_ssq

In [42]:
def merge_ssq_column(conditions,participants):
  directories = []
  total_ssq_values = []
  for participant in participants:
      for condition in conditions:
          directory = os.path.join(base_path, f"{participant}/{condition}/")
          directories.append(directory)

  for directory in directories:
      csv_files = [file for file in os.listdir(directory) if file.endswith("-ssq.csv")]

      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          df = pd.read_csv(csv_path)

          ssq_values_participant = df.iloc[:, 0:17].values.flatten()   # Assuming SSQ values are in columns 1 to 16
          total_ssq_values.append(ssq_values_participant)
  ssq_array = np.array(total_ssq_values)
  return ssq_array

def merge_total_ssq(conditions,participants):
  directories = []
  total_ssq_values = []
  for participant in participants:
      for condition in conditions:
          directory = os.path.join(base_path, f"{participant}/{condition}/")
          directories.append(directory)
  for directory in directories:
      csv_files = [file for file in os.listdir(directory) if file.endswith("-ssq.csv")]

      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          total_ssq = calculate_total_ssq(csv_path)
          df = pd.read_csv(csv_path)
          df["total-ssq"] = total_ssq
          #print("csv_path: ",csv_path,"   ",total_ssq)
          total_ssq_values.append(total_ssq)
  # Create a DataFrame from the list of total SSQ values
  df_total_ssq = pd.DataFrame(total_ssq_values, columns=["total-ssq"])
  # Convert the list of total SSQ values to a NumPy array
  total_ssq_array = np.array(total_ssq_values)
  return total_ssq_array

def sorted_merged_total_ssq(conditions,participants):
  directories = []
  total_ssq_values = []
  ssq_dict = {}
  for participant in participants:
      for condition in conditions:
          directory = os.path.join(base_path, f"{participant}/{condition}/")
          directories.append(directory)
  for directory in directories:
      csv_files = [file for file in os.listdir(directory) if file.endswith("-ssq.csv")]

      for csv_file in csv_files:
          csv_path = os.path.join(directory, csv_file)
          total_ssq = calculate_total_ssq(csv_path)
          #print(total_ssq.tolist())
          # Split the CSV path by '/'
          path_parts = csv_path.split('/')

          # Extract the relevant part from the path
          participant_key = path_parts[-3] + path_parts[-2]
          ssq_dict[participant_key] = total_ssq.tolist()
  sorted_ssq_dict = dict(sorted(ssq_dict.items(), key=lambda item: item[1]))
  return sorted_ssq_dict

In [43]:
import numpy as np
import matplotlib.pyplot as plt

def show_column_distribution(array, name):
    plt.figure(figsize=(10, 6))  # Adjust figure size if needed

    # Calculate the frequency of each value (0, 1, 2, 3) for each column
    column_counts = np.apply_along_axis(lambda x: np.bincount(x, minlength=4), axis=0, arr=array)

    # Plot the histogram
    x = np.arange(16)  # Assuming there are 16 columns
    width = 0.2  # Width of each bar
    for i in range(4):  # Iterate over the values 0, 1, 2, 3
        plt.bar(x + i * width, column_counts[i], width=width, label=f'Value {i}')

    # Add numerical data alongside the plot
    for i in range(16):  # Iterate over each column
        for j in range(4):  # Iterate over each value (0, 1, 2, 3)
            plt.text(x[i] + width / 2 + j * width, column_counts[j, i] + 0.5, str(int(column_counts[j, i])), ha='center', va='bottom')

    plt.xlabel('Column')  # Label for x-axis
    plt.ylabel('Frequency')  # Label for y-axis
    plt.title(name)  # Title of the plot
    plt.xticks(x + 0.3, range(1, 17))  # Adjust x-axis ticks
    plt.legend()
    plt.grid(True)

    plt.show()



In [44]:

output_train=merge_ssq_column(simulations_train,participants)
output_test=merge_ssq_column(simulations_test,participants)
#show_column_distribution(output_train,"train distribution")
#show_column_distribution(output_test,"test distribution")
output_train_total_ssq=merge_total_ssq(simulations_train,participants)
sorted_output_train_ssq=sorted_merged_total_ssq(simulations_train,participants)
#print(sorted_output_train_ssq)
output_test_total_ssq=merge_total_ssq(simulations_test,participants)
output_train_total_ssq=output_train_total_ssq.reshape(-1, 1)
output_test_total_ssq=output_test_total_ssq.reshape(-1, 1)
#print(output_train.shape,output_test.shape,output_train_total_ssq.shape,output_test_total_ssq.shape)


In [45]:
def scale_input_data(input_data, input_test):
    # Get the shape of the input data
    num_samples, time_steps, num_features = input_data.shape
    num_samples_test, _, _ = input_test.shape

    # Reshape the input data into a 2D array
    flattened_data = input_data.reshape(-1, num_features)
    flattened_test_data = input_test.reshape(-1, num_features)

    # Initialize a StandardScaler object
    scaler = MinMaxScaler()

    # Apply StandardScaler to the flattened data
    scaled_data = scaler.fit_transform(flattened_data)
    scaled_test_data = scaler.transform(flattened_test_data)

    # Reshape the scaled data back to its original shape
    scaled_data = scaled_data.reshape(num_samples, time_steps, num_features)
    scaled_test_data = scaled_test_data.reshape(num_samples_test, time_steps, num_features)

    return scaled_data, scaled_test_data

def scale_target_var(target_data):
    min_val, max_val = np.min(target_data, axis=0), np.max(target_data, axis=0)
    target_data = (target_data-min_val)/(max_val-min_val)

    return target_data, min_val, max_val


In [46]:
import numpy as np


new_data = np.zeros((38, 3))

# Assigning original serial numbers to the first column
new_data[:, 0] = np.arange(1, 39)

# Assigning original values to the second column
new_data[:, 1] = output_train_total_ssq[:, 0]

# Sorting the array based on the values (second column)
sorted_indices = np.argsort(new_data[:, 1])
sorted_data = new_data[sorted_indices]

# Assigning new serial numbers to the third column
current_val = sorted_data[0][1]
start_idx = 0
for i, row in enumerate(sorted_data):
    if row[1] != current_val:
        sorted_data[start_idx:i, 2] = np.arange(start_idx + 1, i + 1)
        start_idx = i
        current_val = row[1]

# Assigning new serial numbers for the last group
sorted_data[start_idx:, 2] = np.arange(start_idx + 1, len(sorted_data) + 1)

#print(sorted_data)


In [47]:
sample_size=270
input_train, input_test= scale_input_data(input_train[:, :sample_size, :], input_test[:, :sample_size, :])
output_train, min_val, max_val = scale_target_var(output_train)

val_indices = [1,15,16,21,24,26,30,37]
train_indices = [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 22, 23, 25, 27, 28, 29, 31, 32, 33, 34, 35, 36]
input_val = input_train[val_indices]
input_train = input_train[train_indices]
output_val = output_train_total_ssq[val_indices]
output_train = output_train[train_indices]

print("val input :",input_val.shape)
print("train input :",input_train.shape)
print("val output :",output_val.shape)
print("train output :",output_train.shape)

val input : (8, 270, 48)
train input : (30, 270, 48)
val output : (8, 1)
train output : (30, 16)


In [48]:
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, Dropout
from keras.models import Model
import numpy as np
import sklearn

total_losses=[]
for iteration in range(10):
  def get_shared_lstm(input_shape1, input_shape2):
      # Define shared LSTM model
      shared_model = Sequential()
      shared_model.add(LSTM(64, input_shape=(input_shape1, input_shape2), return_sequences=False))
      shared_model.add(Dense(256, activation='relu'))
      shared_model.add(Dropout(0.2))
      return shared_model

  def get_output_model(shared_lstm_output, output_shape):
      # Define separate output model for each column
      output_models = []
      for i in range(output_shape[1]):
          output_model = Sequential()
          output_model.add(Dense(256, activation='relu'))
          output_model.add(Dropout(0.2))
          output_model.add(Dense(1))  # Output shape is (None, 1) for each column
          output_model_output = output_model(shared_lstm_output)
          output_models.append(output_model_output)
      return output_models

  # Assuming train_input, train_output, test_input, test_output are numpy arrays

  # Reshape train and test inputs to match LSTM input shape
  train_input_reshaped = input_train.reshape((input_train.shape[0], input_train.shape[1], input_train.shape[2]))
  test_input_reshaped = input_test.reshape((input_test.shape[0], input_test.shape[1], input_test.shape[2]))
  val_input_reshaped = input_val.reshape((input_val.shape[0], input_val.shape[1], input_val.shape[2]))

  # Get shared LSTM model
  shared_lstm = get_shared_lstm(input_train.shape[1], input_train.shape[2])

  # Create separate output models for each column
  output_models = get_output_model(shared_lstm.output, output_train.shape)

  # Create combined model
  model = Model(inputs=shared_lstm.input, outputs=output_models)

  # Compile and train the model
  model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])  # Using MSE as loss and metric
  best_val=1000000;
  patience=0;
  best_model = None


  for k in range(200):
    # Predict test data
    model.fit(train_input_reshaped, [output_train[:, i] for i in range(output_train.shape[1])], epochs=1, batch_size=32)
    pred_val = np.array(model.predict(val_input_reshaped))
    pred_val = np.transpose(pred_val.squeeze(), (1, 0))
    # Evaluate the model
    losses = []
    for i in range(pred_val.shape[0]):
      total_ssq=0
      for j in [0,5,6,7,8,14,15]:
        total_ssq=np.sum(pred_val[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

      for j in [0,1,2,3,4,8,10]:
        total_ssq=np.sum(pred_val[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

      for j in [4,7,9,10,11,12,13]:
        total_ssq=np.sum(pred_val[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq
      total_ssq=total_ssq*3.74
      output_val_ssq= output_val[i,0]
      #print("total_ssq",total_ssq)
      #print("output_val_ssq",output_val_ssq)
      loss = sklearn.metrics.mean_squared_error([total_ssq], [output_val_ssq], squared=False)
      losses.append(loss)
    tmp_val_loss = np.mean(losses)
    if tmp_val_loss <= best_val:
        best_val = tmp_val_loss
        patience = 0
        best_model = model
    else:
        patience +=1
        if patience > 10:
          break

  # Predict test data
  pred_test = np.array(best_model.predict(test_input_reshaped))
  pred_test = np.transpose(pred_test.squeeze(), (1, 0))
  np.savetxt("pred_test_mtl_{}_{}_{}.csv".format(test_simulation,sample_size,iteration), pred_test, delimiter=",", header="", fmt='%.2f')
  # Evaluate the model
  pred_total_ssq = []
  #losses=[]
  for i in range(pred_test.shape[0]):
      total_ssq=0
      for j in [0,5,6,7,8,14,15]:
        total_ssq=np.sum(pred_test[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

      for j in [0,1,2,3,4,8,10]:
        total_ssq=np.sum(pred_test[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq

      for j in [4,7,9,10,11,12,13]:
        total_ssq=np.sum(pred_test[i,j]*(max_val[j]-min_val[j]) + min_val[j])+total_ssq
      total_ssq=total_ssq*3.74

      pred_total_ssq.append(total_ssq)


  # Overall Test Loss
  loss = sklearn.metrics.mean_squared_error(pred_total_ssq, output_test_total_ssq, squared = False)
  print("Test Loss no ",iteration,":" ,loss)
  total_losses.append(loss)
average_loss = sum(total_losses) / len(total_losses)
total_losses.append(average_loss)
np.savetxt("pred_test_mtl_{}_{}_losses.csv".format(test_simulation,sample_size), total_losses, delimiter=",", header="", fmt='%.2f')
print("average_loss:",average_loss)
import zipfile
with zipfile.ZipFile("pred_test_mtl_{}_{}.zip".format(test_simulation,sample_size), "w") as zipf:
    for iteration in range(10):
        zipf.write("pred_test_mtl_{}_{}_{}.csv".format(test_simulation,sample_size,iteration))
    zipf.write("pred_test_mtl_{}_{}_losses.csv".format(test_simulation,sample_size))


Test Loss no  0 : 24.835398330127497
Test Loss no  1 : 22.317633899075364
Test Loss no  2 : 27.101056889132302
Test Loss no  3 : 24.767312611677834
Test Loss no  4 : 21.36382498354954
Test Loss no  5 : 25.505320718861245
Test Loss no  6 : 22.331103339536913
Test Loss no  7 : 22.876214758697646
Test Loss no  8 : 24.82463087674469
Test Loss no  9 : 24.972302966725344
average_loss: 24.08947993741284
