<a href="https://colab.research.google.com/github/BRomans/IdMind/blob/main/autoencoder_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
drive.mount("/content/drive")

# Location of the source CSV (right hand task subset, 9 channels) on the mounted drive.
filepath = (
    "/content/drive/MyDrive/ml2-eeg-biometrics/"
    "eeg_dataset_right_hand_task_subset_9channels.csv"
)

In [None]:
# Load the full dataset (the right hand task subset with 9 channels) and drop
# every column that contains no values at all. The shape is printed before and
# after so the number of removed columns is visible.
df = pd.read_csv(filepath)
print(df.shape)
df = df.dropna(axis='columns', how='all')
print(df.shape)

In [None]:
# Participant S1, Date 20200724, Run2, Trial 10 has incomplete data, so its rows
# are removed. The index is rebuilt afterwards so it stays contiguous.
is_incomplete_trial = (
    (df['Participant'] == 'S1')
    & (df['Date'] == 20200724)
    & (df['Run'] == 'Run2')
    & (df['Trial'] == 10)
)
df = df.loc[~is_incomplete_trial].reset_index(drop=True)

df.shape

In [None]:
# Create a train-validation-test split.
# For testing we want to take a separate person entirely (let's take S12) - this constitutes 5% of the total dataset.
# For validation we want to keep it equal across participants and sessions. We'll take 1 run per session for each of the participants. 
# # This is one sixth of the remaining data which equates to 15.8% of the total dataset. Thus, we keep 79.2% for training data.

# Convert Participant ID to an integer column 'Target' so that the NN can handle it.
# df['Target'] = df['Participant'].astype('category').cat.codes.values

# test_arr = np.array(df[df['Participant']=='S12'])
# print("test_arr shape:", test_arr.shape)

# Run5 looks most complete in the data.
# valid_arr = np.array(df[(df['Run']=='Run5') & (df['Participant']!='S12')])
# print("valid_arr shape:", valid_arr.shape)




In [None]:
def reshape_data(arr, n_timepoints=2500, filename=None):
  """ 
  Function to split the ID, target & feature columns and reshape the data into the 
    required format for the AE.
  Input:
      arr           2-D np.ndarray whose columns are, in order: the 5 ID columns
                    ['Participant','Date', 'Run', 'Task', 'Trial'] (columns 0-4), the
                    channel measurements (columns 5-13) and the integer target (column 14).
                    Every consecutive group of n_timepoints rows is treated as one sample.
      n_timepoints  (Optional) positive integer specifying the number of timepoints per sample.
      filename      (Optional) if specified, then the data will be written to file in train-test-data with filename included in the name of the written file.
  Output: 
    [All np.ndarrays]
      x_arr   The feature values in shape [n_samples, timepoints, features]
      y_arr   The (integer) target values in shape [n_samples, 1, 1]
      id_arr  The categorical identifiers (Date, Run, etc.) in shape [n_samples, 1, 5]
  Raises:
      ValueError  if n_timepoints is not positive, if the number of rows is not a whole
                  multiple of n_timepoints, or if the target value is not constant
                  within a sample.
  """
  if n_timepoints <= 0:
    raise ValueError("n_timepoints must be a positive integer, got {}.".format(n_timepoints))

  # Validate up front: without this check a trailing partial sample only surfaces
  # later as an opaque "cannot reshape array" error from numpy.
  n_rows = len(arr)
  if n_rows % n_timepoints != 0:
    raise ValueError(
        "The number of rows ({}) is not a multiple of n_timepoints ({}), so the data "
        "cannot be split into whole samples.".format(n_rows, n_timepoints))
  n_samples = n_rows // n_timepoints # Integer division avoids a float round-trip.

  x_arr = arr[:,5:14] # Exclude the first 5 ID columns and the 15th column (Target)
  y_arr = arr[:,14] # 15th column - target (participant ID as integer)
  id_arr = arr[:,:5] # ['Participant','Date', 'Run', 'Task', 'Trial']

  # Reshape the three arrays to [n_samples, n_timepoints, n_columns].
  x_arr = x_arr.reshape((n_samples, n_timepoints, x_arr.shape[1]))
  y_arr = y_arr.reshape((n_samples, n_timepoints, 1))
  id_arr = id_arr.reshape((n_samples, n_timepoints, id_arr.shape[1]))

  # We do not need the target values / ID values to be replicated n_timepoints for each sample.
  # Reduce these to just 1 value per sample.
  y_arr_reduced = np.amin(y_arr, axis=1, keepdims=True) # Take the minimum along the second dimension.
  # If min and max differ, a sample spans more than one target, i.e. the rows are misaligned.
  if not np.all(y_arr_reduced==np.amax(y_arr, axis=1, keepdims=True)):
    raise ValueError("The target value for each sample does not look to be consistent.")

  # NOTE: the minimum is taken per ID column independently and, unlike the target,
  # the ID columns are not checked for consistency within a sample.
  id_arr_reduced = np.amin(id_arr, axis=1, keepdims=True)

  print("x_arr shape: {} \ny_arr shape: {} \nid_arr shape: {}".format(
      x_arr.shape, y_arr_reduced.shape, id_arr_reduced.shape))

  if filename is not None:
    filepath = "/content/drive/MyDrive/ml2-eeg-biometrics/train-test-data/"
    print("Writing to file...") 
    np.save(filepath + "x_" + filename + ".npy", x_arr)
    np.save(filepath + "y_" + filename + ".npy", y_arr_reduced)
    np.save(filepath + "id_" + filename + ".npy", id_arr_reduced)
    print("Writing done.")

  return x_arr, y_arr_reduced, id_arr_reduced

In [None]:
# Sanity check of reshape_data on a toy array: 6 rows x 15 columns, which is
# two samples of 3 timepoints each.
b = np.arange(6 * 15).reshape((6, 15))
# Make the target column (index 14) constant within each group of three rows,
# matching the n_timepoints=3 used in the call below.
b[:3, 14] = 10
b[3:, 14] = 20
print(b)
# With n_timepoints=2 the middle sample would mix targets 10 and 20, so
# reshape_data(b, n_timepoints=2) raises a ValueError instead.
reshape_data(b, n_timepoints=3)

In [None]:
# x_valid, y_valid, id_valid = reshape_data(valid_arr, filename='valid')
# x_test, y_test, id_test = reshape_data(test_arr, filename='test')

In [None]:
## Have to do it piece by piece for training data since it was crashing RAM

## # *** Crashed RAM when running everything all at once, so wrote to file, restarted, and read it back in. ***
## train_df = df[(df['Run']!='Run5') & (df['Participant']!='S12')]
## train_df.to_csv("/content/drive/MyDrive/ml2-eeg-biometrics/train_df_tmp.csv", index=False)
## train_arr = np.genfromtxt("/content/drive/MyDrive/ml2-eeg-biometrics/train_df_tmp.csv", delimiter=',', skip_header=1) # Doesn't work (RAM)
## train_arr = pd.read_csv("/content/drive/MyDrive/ml2-eeg-biometrics/train_df_tmp.csv").to_numpy() # Doesn't work (RAM)

# train_id_df = pd.read_csv("/content/drive/MyDrive/ml2-eeg-biometrics/train_df_tmp.csv", usecols=['Participant','Date', 'Run', 'Task', 'Trial'])
# train_id_arr = train_id_df.to_numpy()

# train_y_df = pd.read_csv("/content/drive/MyDrive/ml2-eeg-biometrics/train_df_tmp.csv", usecols=['Target'])
# train_y_arr = train_y_df.to_numpy()

# cols_to_skip=['Participant','Date', 'Run', 'Task', 'Trial','Target']
# train_x_df = pd.read_csv("/content/drive/MyDrive/ml2-eeg-biometrics/train_df_tmp.csv", usecols=lambda x: x not in cols_to_skip)
# train_x_arr = train_x_df.to_numpy()


# Directory prefix used by the (commented-out) np.save calls below. Note that this
# rebinds `filepath`, which was first assigned the path of the dataset CSV.
filepath = "/content/drive/MyDrive/ml2-eeg-biometrics/train-test-data/"

# n_samples = int(len(train_id_arr)/2500)
# train_id_arr = train_id_arr.reshape((n_samples, 2500, 5))
# train_id_arr_reduced = np.amin(train_id_arr, axis=1, keepdims=True)
# np.save(filepath + "id_train.npy", train_id_arr_reduced)
# print(train_id_arr_reduced.shape)

# n_samples = int(len(train_y_arr)/2500)
# train_y_arr = train_y_arr.reshape((n_samples, 2500, 1))
# train_y_arr_reduced = np.amin(train_y_arr, axis=1, keepdims=True)
# np.save(filepath + "y_train.npy", train_y_arr_reduced)
# print(train_y_arr_reduced.shape)

# n_samples = int(len(train_x_arr)/2500)
# train_x_arr = train_x_arr.reshape((n_samples, 2500, train_x_arr.shape[1]))
# np.save(filepath + "x_train.npy", train_x_arr)
# print(train_x_arr.shape)

After running the relevant code, we have now created the following files in the train-test-data/ folder:
*   x_train.npy
*   x_test.npy
*   x_valid.npy
*   y_train.npy
*   y_test.npy
*   y_valid.npy
*   id_train.npy
*   id_test.npy
*   id_valid.npy