In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# One independent, initially empty list per signal type; each list is later
# filled with one entry per student (ACC stays empty in this notebook).
ACC, BVP, EDA, HR, IBI, tags, TEMP = ([] for _ in range(7))

## Loading The CSV files

In [None]:
# This code block loads the CSV files for every student.
# The index of every list corresponds to one particular student,
# e.g. BVP[0] contains the BVP values of student 1, BVP[1] those of student 2.
# All the other lists follow the same pattern.
# The ACC files are not loaded because hand movement is not used for the stress prediction.
#
# The lists are re-created here (rather than relying on the lists created in an
# earlier cell) so that re-running this cell does not append a second copy of
# every student's data.
BVP, EDA, HR, IBI, tags, TEMP = [], [], [], [], [], []

for i in range(1, 36):
    student_id = '%02d' % i                   # zero-padded folder suffix: 01 .. 35
    student_dir = '/Raw_data/S' + student_id

    # header=None: the files have no header row; later cells read rows 0 and 1
    # of each signal as start time and sampling frequency.
    BVP.append(pd.read_csv(student_dir + '/BVP.csv', header=None))
    EDA.append(pd.read_csv(student_dir + '/EDA.csv', header=None))
    HR.append(pd.read_csv(student_dir + '/HR.csv', header=None))
    IBI.append(pd.read_csv(student_dir + '/IBI.csv', header=None))
    tags.append(pd.read_csv(student_dir + '/tags_S' + student_id + '.csv', header=None))
    TEMP.append(pd.read_csv(student_dir + '/TEMP.csv', header=None))

## Making Time Stamps

In [None]:
def _sample_timestamps(signal):
    """Return one datetime per data sample of a raw signal frame.

    Row 0 of the frame holds the start time (UNIX seconds) and row 1 the
    sampling frequency; the remaining rows are the samples themselves.
    """
    t0 = signal.iloc[0, 0]              # start time, UNIX format
    fs = signal.iloc[1, 0]              # sampling frequency
    n_samples = len(signal) - 2         # the two leading rows are not samples
    return pd.to_datetime(np.arange(n_samples) * (1 / fs) + t0, unit='s')


# Time values in standard date-time format for every signal of every student,
# derived from the start time and sampling frequency stored in each file.
time_values_bvp = [_sample_timestamps(BVP[k]) for k in range(35)]
time_values_eda = [_sample_timestamps(EDA[k]) for k in range(35)]
time_values_hr = [_sample_timestamps(HR[k]) for k in range(35)]
time_values_temp = [_sample_timestamps(TEMP[k]) for k in range(35)]

In [None]:
# Print the start time of each signal for all students to check whether the
# signals of a student start at the same time or not.

labelled_times = (
    ('BVP Start:', time_values_bvp),
    ('EDA Start:', time_values_eda),
    ('HR Start:', time_values_hr),
    ('TEMP Start:', time_values_temp),
)

for k in range(35):
    parts = []
    for label, per_student in labelled_times:
        # First timestamp of this signal for student k, time-of-day only.
        parts.extend((label, per_student[k][0].time()))
    print(*parts)

BVP Start: 09:27:41 EDA Start: 09:27:41 HR Start: 09:27:51 TEMP Start: 09:27:41
BVP Start: 09:52:54 EDA Start: 09:52:54 HR Start: 09:53:04 TEMP Start: 09:52:54
BVP Start: 10:56:12 EDA Start: 10:56:12 HR Start: 10:56:22 TEMP Start: 10:56:12
BVP Start: 11:29:49 EDA Start: 11:29:49 HR Start: 11:29:59 TEMP Start: 11:29:49
BVP Start: 09:12:05 EDA Start: 09:12:05 HR Start: 09:12:15 TEMP Start: 09:12:05
BVP Start: 09:45:00 EDA Start: 09:45:00 HR Start: 09:45:10 TEMP Start: 09:45:00
BVP Start: 10:30:34 EDA Start: 10:30:34 HR Start: 10:30:44 TEMP Start: 10:30:34
BVP Start: 12:12:24 EDA Start: 12:12:24 HR Start: 12:12:34 TEMP Start: 12:12:24
BVP Start: 12:38:09 EDA Start: 12:38:09 HR Start: 12:38:19 TEMP Start: 12:38:09
BVP Start: 13:14:19 EDA Start: 13:14:19 HR Start: 13:14:29 TEMP Start: 13:14:19
BVP Start: 15:20:28 EDA Start: 15:20:28 HR Start: 15:20:38 TEMP Start: 15:20:28
BVP Start: 09:15:08 EDA Start: 09:15:08 HR Start: 09:15:18 TEMP Start: 09:15:08
BVP Start: 11:24:07 EDA Start: 11:24:07 

In [None]:
# Lists to store the synchronized signals (one Series per student)
BVP_sync, EDA_sync, HR_sync, TEMP_sync = [], [], [], []

for student in range(35):
    raw_signals = (BVP[student], EDA[student], HR[student], TEMP[student])

    # Row 0 of each file is the start time, row 1 the sampling frequency.
    start_times = [frame.iloc[0, 0] for frame in raw_signals]
    sampling_rates = [frame.iloc[1, 0] for frame in raw_signals]

    # Align everything to the signal that starts last and to the slowest signal.
    latest_start = max(start_times)
    slowest_rate = min(sampling_rates)

    aligned = []
    for frame, t0, fs in zip(raw_signals, start_times, sampling_rates):
        # Samples recorded before the common start, plus the two leading rows.
        rows_to_skip = int((latest_start - t0) * fs) + 2
        # Keep every `stride`-th sample so the signal matches the slowest rate.
        stride = int(fs / slowest_rate)
        aligned.append(frame.iloc[rows_to_skip:, 0].iloc[::stride])

    # Make the end of the signals the same by cutting to the shortest one.
    common_length = min(len(series) for series in aligned)
    bvp_cut, eda_cut, hr_cut, temp_cut = (series.iloc[:common_length] for series in aligned)

    BVP_sync.append(bvp_cut)
    EDA_sync.append(eda_cut)
    HR_sync.append(hr_cut)
    TEMP_sync.append(temp_cut)

# Now BVP_sync, EDA_sync, HR_sync and TEMP_sync contain the synchronized signals for each student


## Combining the BVP, EDA, HR and TEMP signals for each student.

In [None]:
# This code block combines the synchronized BVP, EDA, HR and TEMP values of
# each student into one dictionary of plain Python lists (one dict per student).

S = [
    {
        "BVP": BVP_sync[k].tolist(),
        "EDA": EDA_sync[k].tolist(),
        "HR": HR_sync[k].tolist(),
        "TEMP": TEMP_sync[k].tolist(),
    }
    for k in range(35)
]

## Reading the Time Log for Students

In [None]:
# Load the time log of the experiment. The labelling cell below reads row i+1
# for student index i and treats columns 8/9, 12/13, 16/17 and 20/21 as the
# start/end times of the stress periods.
# NOTE(review): row 0 is never read -- presumably a sub-header row; confirm against the sheet.
time_log = pd.read_excel('/Processed_data/Time_logs.xlsx')

## Defining stressed and non-Stressed periods

In [None]:
# This block of code marks the time periods of the Stroop Test, Interview,
# Hyperventilation test and Questionnaire as the stressful periods.

# Label column represents the stress level as 0 or 1.
# 0 = No stress
# 1 = Stress

# NOTE: this rebinds the notebook-level name `datetime` from the module
# (imported in the first cell) to the class; kept so existing state is unchanged.
from datetime import datetime, time

for i in range(35):
    n_samples = len(S[i]['BVP'])

    # Latest start among the four signals, as a time of day.
    last_start_simple = max(time_values_bvp[i][0].time(), time_values_eda[i][0].time(),
                            time_values_hr[i][0].time(), time_values_temp[i][0].time())

    # Convert to a 12-hour clock so it can be compared with the time log.
    # NOTE(review): assumes the time log stores 12-hour times without AM/PM -- confirm.
    hours_12 = last_start_simple.hour % 12 if last_start_simple.hour % 12 != 0 else 12
    last_start_simple = time(hours_12, last_start_simple.minute, last_start_simple.second)
    recording_start_s = (last_start_simple.hour * 3600 + last_start_simple.minute * 60
                         + last_start_simple.second)

    S[i]["Label"] = [0] * n_samples
    S[i]['Student_ID'] = [i+1] * n_samples

    # Columns j / j+1 hold the start / end time of one stress period.
    for j in [8, 12, 16, 20]:
        period_start = time_log.iloc[i+1, j]
        period_end = time_log.iloc[i+1, j+1]

        # A logged time that is not later than the recording start is taken to
        # lie in the next 12-hour half of the day, so 12 hours are added.
        half_day_shift = 0 if period_start > last_start_simple else 12 * 3600

        # Offsets in seconds from the recording start, used directly as sample
        # indices (only hours and minutes of the logged times are used).
        # NOTE(review): this presumes one synchronized sample per second -- confirm.
        stress_start = (period_start.hour * 3600 + half_day_shift + period_start.minute * 60) - recording_start_s
        stress_end = (period_end.hour * 3600 + half_day_shift + period_end.minute * 60) - recording_start_s

        # Clamp to the recording. An out-of-range slice assignment would insert
        # or append elements, changing the length of "Label" so that it no
        # longer matches the signal lists when the DataFrame is built.
        first = max(stress_start, 0)
        last = min(stress_end, n_samples)
        if last > first:
            S[i]["Label"][first:last] = [1] * (last - first)

## Converting the dictionary of every student into a separate dataframe.

In [None]:
# One DataFrame per student, built from that student's dictionary of
# equal-length columns.
S_df = [pd.DataFrame(S[k]) for k in range(0, 35)]

In [None]:
# Preview the combined frame of the first student (list index 0, Student_ID 1).
S_df[0]

Unnamed: 0,BVP,EDA,HR,TEMP,Label,Student_ID
0,-26.61,0.169126,83.00,28.39,0,1
1,44.90,0.172969,83.00,28.43,0,1
2,17.57,0.172969,72.67,28.43,0,1
3,91.18,0.176813,87.00,28.47,0,1
4,-136.83,0.169126,80.00,28.49,0,1
...,...,...,...,...,...,...
3243,201.46,0.020500,83.12,28.31,0,1
3244,-153.97,0.062781,83.28,28.33,0,1
3245,-23.22,0.029469,83.35,28.29,0,1
3246,-33.29,0.019219,83.33,28.31,0,1


## Combining the per-student dataframes into training and evaluation dataframes.

In [None]:
# Keeping 1st 30 students for training and validation.
Training_Students = pd.concat([S_df[k] for k in range(30)])
Training_Students.reset_index(drop=True)

# Keeping last 5 students completely separate to evaluate
Evaluation_Students = pd.concat([S_df[k] for k in range(30,35)])
Evaluation_Students.reset_index(drop=True)

Unnamed: 0,BVP,EDA,HR,TEMP,Label,Student_ID
0,46.10,0.124282,88.00,32.09,0,31
1,-6.76,0.124282,79.50,32.11,0,31
2,38.16,0.126844,72.33,32.11,0,31
3,27.49,0.125563,71.75,32.11,0,31
4,-39.24,0.125563,67.00,32.11,0,31
...,...,...,...,...,...,...
16785,47.66,0.209124,65.28,32.16,0,35
16786,-115.06,0.205280,65.37,32.18,0,35
16787,-245.07,0.210405,65.42,32.18,0,35
16788,17.59,0.212968,65.45,32.18,0,35


## Exporting the final DataFrames to CSV files

In [None]:
# Write the training and the evaluation frame to their own CSV file
# (the frame index is written as the first column).
for frame, destination in (
    (Training_Students, "/Combined_train.csv"),
    (Evaluation_Students, "/Combined_evaluate.csv"),
):
    frame.to_csv(destination)