In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so that the dataset
# files under '/content/drive/My Drive/...' can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# make participant list: subject numbers 1 to 17, skipping 1 and 12
# NOTE(review): presumably S1 and S12 are not available in the dataset - confirm
participants = ["S" + str(number) for number in range(1, 18) if number not in (1, 12)]

print(participants)

# numeric part of every name as a string ("S2" -> "2")
participants_id = [name[1:] for name in participants]
print(participants_id)

['S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S13', 'S14', 'S15', 'S16', 'S17']
['2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '13', '14', '15', '16', '17']


In [None]:
def getSplitLabels(participant):
  """Function that reads the split labels (labels_split.csv) stored in the folder
  of the given participant. Function returns pandas dataframe.
  participant: number that is substituted into the S{} folder name.
  """
  template = '/content/drive/My Drive/stress_data_unzipped/WESAD/S{}/labels_split.csv'
  csv_path = template.format(participant)
  split_labels = pd.read_csv(csv_path)
  return split_labels

In [None]:
# Function to access labels
def getLabel(participant):
  """Function that loads the label file of a given participant. The label files are
  named S#_quest.csv and live in the participant's S# folder. Function returns pandas dataframe.
  participant: number that identifies both the folder and the file in the dataset.
  """
  template = '/content/drive/My Drive/stress_data_unzipped/WESAD/S{}/S{}_quest.csv'
  return pd.read_csv(template.format(participant, participant))

In [None]:
def getTimeStamps(participant):
  """ Function that accesses the labels of the given participant and returns a dictionary that
  contains the type of activity, spelled as in the label file (e.g. Base, TSST, Medi 1, Fun,
  Medi 2), as the key and a tuple of start and end time as a value.

  The start and end times are returned as the raw strings found in the file.
  Only the time segments up to and including "Medi 2" are kept.

  participant: number that references a participant in the dataset.
  Raises ValueError if "Medi 2" is not present in the order row.
  """

  label = getLabel(participant)
  # first line of label is the order to time segments,
  # second line is start times, third line is end times.
  # Each of these rows holds one ";"-separated string in its first column.

  # access the rows and split into list
  # (.iloc[row, 0] instead of .iloc[row][0]: integer keys on a label-indexed
  # Series are deprecated as positional lookups in recent pandas versions)
  order = label.iloc[0, 0].split(";")
  start = label.iloc[1, 0].split(";")
  end = label.iloc[2, 0].split(";")

  # find index of "Medi 2" in the order list, which is the last time segment I want to keep
  index = order.index("Medi 2")

  # cut all the lists based on index, and remove first value
  order = order[1:index+1]
  start = start[1:index+1]
  end = end[1:index+1]

  # make dictionary: activity name -> (start, end)
  return {name: (start[i], end[i]) for i, name in enumerate(order)}

In [None]:
# helper function to convert minute.second format to second

def getSeconds(timestamp):
  """Convert a time stamp in minute.second format into a total number of seconds.

  timestamp: string such as "7" (whole minutes, no seconds), "7.08" or "50.3".
  Returns the total number of seconds as an int.
  Raises ValueError if the minute or second part is not an integer.

  The part after the "." is a two-digit seconds field whose trailing zero is
  dropped in the data, in the same way that whole minutes are written "7" and
  NOT "7.00". "50.3" therefore means 50 min 30 s, not 50 min 3 s, while a
  leading zero is always present ("7.08" is 7 min 8 s).
  NOTE(review): confirm the single-digit cases against the raw S#_quest.csv files.
  """
  parts = str(timestamp).strip().split('.')
  minutes = int(parts[0])

  # if there is no "." in the time stamp then assume no seconds
  seconds_text = parts[1] if len(parts) > 1 else ''

  # restore the dropped trailing zero: "3" -> "30", "" -> "00";
  # two-digit values such as "08" or "32" are left unchanged
  seconds = int(seconds_text.ljust(2, '0'))

  return minutes*60 + seconds

In [None]:
# function that creates pandas dataframe of labeled time stamps
# labels will only either be stressed/not stressed

def makeLabelsDF(participant):
  """Build a dataframe with one row per time segment of the given participant.

  Columns:
    label: 1 (stressed) for the "TSST" segment, 0 (not stressed) for every other segment
    name: 'S' followed by the participant
    start_time, end_time: segment boundaries converted to seconds with getSeconds
    label_note: name of the segment
  Rows are indexed from 1, in the order of the dictionary returned by getTimeStamps.
  """
  segments = getTimeStamps(participant)

  column_names = ["label","name","start_time","end_time","label_note"]
  df = pd.DataFrame(columns = column_names)

  for row_number, (segment, stamps) in enumerate(segments.items(), start=1):
    begin_seconds = getSeconds(stamps[0])
    finish_seconds = getSeconds(stamps[1])

    # only the TSST segment counts as stress
    stressed = 1 if segment == "TSST" else 0

    df.loc[row_number] = [stressed, 'S{}'.format(participant), begin_seconds, finish_seconds, segment]

  return df

In [None]:
def getNormalizedData(participant):
  """function that loads the watch data that has been resampled AND normalized.

  participant: number that references a participant in the dataset.
  Returns a dictionary that maps every field name (ACC_x, ACC_y, ACC_z, BVP, EDA, TEMP)
  to the pandas dataframe read from the .csv file of that name in the participant's
  sync_data/normalized/ folder.
  """
  folder = '/content/drive/My Drive/stress_data_unzipped/WESAD/S{}/sync_data/normalized/'.format(participant)

  field_names = ('ACC_x', 'ACC_y', 'ACC_z', 'BVP', 'EDA', 'TEMP')

  # one .csv file per field, read in the order listed above
  return {name: pd.read_csv(folder + name + '.csv') for name in field_names}

In [None]:
# preview the split labels stored for participant 2
df = getSplitLabels(2)
display(df)

Unnamed: 0,label,name,start_time,end_time,label_note
0,0,S2,428,488,Base
1,0,S2,488,548,Base
2,0,S2,548,608,Base
3,0,S2,608,668,Base
4,0,S2,668,728,Base
5,0,S2,728,788,Base
6,0,S2,788,848,Base
7,0,S2,848,908,Base
8,0,S2,908,968,Base
9,0,S2,968,1028,Base


In [None]:
# load the normalized signals of participant 2 and keep the ACC_x dataframe for inspection
data_test = getNormalizedData(2)
data_x = data_test['ACC_x']

In [None]:
# show the normalized ACC_x dataframe of participant 2
display(data_x)

Unnamed: 0,0
0,0.311424
1,0.323504
2,0.336840
3,0.343857
4,0.336152
...,...
778107,0.250123
778108,0.274332
778109,0.290021
778110,0.298289


In [None]:
# spot-check the rows at which splitData starts a segment (start_time*128 - 1)
# for the first two split labels of participant 2:
# start_time 488 -> 488*128 - 1 = 62463, start_time 428 -> 428*128 - 1 = 54783
print(data_x.iloc[62463])
print(data_x.iloc[54784-1])

0    0.248318
Name: 62463, dtype: float64
0    0.230579
Name: 54783, dtype: float64


In [None]:
# build the label dataframe of participant 2 (one row per time segment)
label = makeLabelsDF(2)
display(label)

# keep only the first three rows
label_subset = label.iloc[0:3]
display(label_subset)

Unnamed: 0,label,name,start_time,end_time,label_note
1,0,S2,428,1592,Base
2,1,S2,2395,3003,TSST
3,0,S2,4219,4621,Medi 1
4,0,S2,4885,5267,Fun
5,0,S2,5618,6015,Medi 2


Unnamed: 0,label,name,start_time,end_time,label_note
1,0,S2,428,1592,Base
2,1,S2,2395,3003,TSST
3,0,S2,4219,4621,Medi 1


In [None]:
# print every row of the label dataframe, one Series at a time
for i in range(len(label)):
  print(label.iloc[i])

label            0
name            S2
start_time     428
end_time      1592
label_note    Base
Name: 1, dtype: object
label            1
name            S2
start_time    2395
end_time      3003
label_note    TSST
Name: 2, dtype: object
label              0
name              S2
start_time      4219
end_time        4621
label_note    Medi 1
Name: 3, dtype: object
label            0
name            S2
start_time    4885
end_time      5267
label_note     Fun
Name: 4, dtype: object
label              0
name              S2
start_time      5618
end_time        6015
label_note    Medi 2
Name: 5, dtype: object


In [None]:
def timeToRow(seconds, freq):
  """Helper that converts a time in seconds into the row number of a dataframe
  that the time corresponds to, given the frequency of data sampling.
  seconds: time in seconds.
  freq: number of samples per second.
  Ex: 60 seconds at 128Hz gives row 128*60 = 7680.
  """
  row_number = seconds*freq
  return row_number

In [None]:
def splitData(participant, field, freq=128):
  """This function splits the data of the specified field (ACC_x, BVP, etc)
  into one .csv file per segment.
  These segments are based off of the segmented labels which have already been
  created in the participant folder (see getSplitLabels). Segment number i is
  saved as segment_{i}.csv in the participant's segments/<field>/ folder.

  participant: number that references a participant in the dataset.
  field: key of the signal in the dictionary returned by getNormalizedData.
  freq: sampling frequency used to convert times into rows (default 128).
  """

  # first get split labels
  split_labels = getSplitLabels(participant)
  if len(split_labels) == 0:
    # nothing to split: do not load the signal or create the folder
    return

  # get signal data: it is the same for every segment, so it is loaded once
  # instead of re-reading the .csv files on every iteration
  signal = getNormalizedData(participant)[field]

  # make directory
  out_dir = '/content/drive/My Drive/stress_data_unzipped/WESAD/S{}/segments/{}/'.format(participant, field)
  os.makedirs(out_dir, exist_ok=True)

  # iterate through rows of labels
  for i in range(len(split_labels)):
    label = split_labels.iloc[i]

    # the row of a time is taken to be time*freq - 1;
    # clamp at 0 because iloc would count a negative bound from the end of the signal
    start_row = max(timeToRow(label['start_time'], freq) - 1, 0)
    end_row = max(timeToRow(label['end_time'], freq) - 1, 0)

    # make df
    split_signal = signal.iloc[start_row:end_row]

    # save segment
    split_signal.to_csv(os.path.join(out_dir, 'segment_{}.csv'.format(i)), index=False)



In [None]:
# split the ACC_x signal of participant 2 into segment files
splitData(2,'ACC_x')

In [None]:
def getSegment(participant, field, num):
  """Function that reads one saved segment back. Function returns pandas dataframe.
  participant: number that references a participant in the dataset.
  field: name of the signal folder (ACC_x, BVP, etc).
  num: number of the segment file (segment_{num}.csv).
  """
  template = '/content/drive/My Drive/stress_data_unzipped/WESAD/S{}/segments/{}/segment_{}.csv'
  segment_path = template.format(participant, field, num)
  return pd.read_csv(segment_path)
  

In [None]:
# read back segment 1 of the ACC_x signal of participant 2
seg = getSegment(2,"ACC_x",1)

In [None]:
# show the segment that was read back
display(seg)

Unnamed: 0,0
0,0.248318
1,0.249604
2,0.250946
3,0.251601
4,0.251135
...,...
7675,0.266377
7676,0.264336
7677,0.261968
7678,0.259992


In [None]:
# split all data: write the segment files of every field for every participant
field_list = ['ACC_x', 'ACC_y', 'ACC_z', 'BVP', 'EDA','TEMP']

for participant in participants_id:
  for field in field_list:
    splitData(participant,field)