In [1]:
import os
import pandas as pd
import re

# Load The Data

## Eye Tracking Data

In [2]:
# Set the directory path where the CSV files are located
directory_path = '../data/raw/eyetracking'

# Create an empty list to store dataframes
dataframes = []

# Define a regular expression to extract name, date, category, and random number from filenames
filename_pattern = re.compile(r'(?P<name>[\w\-]+)-(?P<date>\d{4}-\d{2}-\d{2})_(?P<category>\w+)_(?P<random_number>\d+)\.csv')

# Iterate through each file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):
        # Match the pattern to extract metadata from the filename
        match = filename_pattern.match(filename)
        if match:
            # Extract the components from the filename
            name = match.group('name')
            date = match.group('date')
            category = match.group('category')
            random_number = match.group('random_number')
            
            # Load the CSV file into a dataframe
            file_path = os.path.join(directory_path, filename)
            df = pd.read_csv(file_path)
            
            # Add the extracted components as constant columns
            df['name'] = name
            df['date'] = date
            df['category'] = category
            df['id'] = filename.replace('.csv', '')
            df['path_name'] = filename.replace('.csv', '')
            
            # Append the dataframe to the list
            dataframes.append(df)

# Concatenate all dataframes into one
df_eyetracking = pd.concat(dataframes, ignore_index=True)


In [3]:
df_eyetracking.head()

Unnamed: 0,Timestamp,MovingTarget_X,MovingTarget_Y,EyeTracker_X,EyeTracker_Y,name,date,category,id,path_name
0,172949213876766,1003.046493,549.421743,773.780904,436.736583,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081
1,172949213967954,960.215109,629.71283,964.622779,672.15798,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081
2,172949213972125,957.686528,633.044023,962.812944,680.02506,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081
3,172949213976253,955.119897,636.32937,960.909494,681.762546,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081
4,172949213980347,952.517312,639.567137,958.592232,682.5897,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081


Split SP and SEM data

In [4]:
df_eyetracking_sp = df_eyetracking[df_eyetracking['category']=='SP']

df_eyetracking_sem = df_eyetracking[df_eyetracking['category']=='SEM']

## Heartrate Data

In [5]:
df_heartrate = pd.read_csv('../data/raw/heartrate/00_HR-data.csv')

df_heartrate.head()

Unnamed: 0,row_id,SP_result,SEM_result,HR_before,HR_after,age,height,weight,sex
0,1,ar-2024-10-09_SP_1728433242,ar-2024-10-09_SEM_1728433242,72,120.0,21,164,50,1
1,2,han-2024-10-09_SP_1728433863,han-2024-10-09_SEM_1728433863,83,140.0,21,178,80,1
2,3,ar-2024-10-10_SP_1728533147,ar-2024-10-10_SEM_1728533147,84,134.0,21,164,50,1
3,4,han-2024-10-10_SP_1728534050,han-2024-10-10_SEM_1728534050,86,141.0,21,178,80,1
4,5,ar-2024-10-12_SP_1728718590,ar-2024-10-12_SEM_1728718590,94,146.0,21,164,50,1


In [6]:
len(df_heartrate.row_id.unique())

31

Extract the data from id

In [7]:
# # Define a regular expression to extract name, date, category, and random number
# pattern = re.compile(r'(?P<name>[\w\-]+)-(?P<date>\d{4}-\d{2}-\d{2})_(?P<category>\w+)_(?P<random_number>\d+)')

# # Use str.extract with the regex pattern to create new columns
# df_heartrate[['name', 'date', 'category', 'random_number']] = df['SP_result'].str.extract(pattern)

# Merge both data into one dataframe

In [8]:
len(df_eyetracking_sp.id.unique())

28

In [9]:
df_eyetracking_sp = pd.merge(df_eyetracking_sp,df_heartrate, left_on='path_name', right_on='SP_result', how='left')

df_eyetracking_sem = pd.merge(df_eyetracking_sem,df_heartrate, left_on='path_name', right_on='SEM_result', how='left')

In [10]:
df_eyetracking_sp.head()

Unnamed: 0,Timestamp,MovingTarget_X,MovingTarget_Y,EyeTracker_X,EyeTracker_Y,name,date,category,id,path_name,row_id,SP_result,SEM_result,HR_before,HR_after,age,height,weight,sex
0,172949213876766,1003.046493,549.421743,773.780904,436.736583,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081,26.0,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SEM_1729492081,94.0,141.0,18.0,160.0,60.0,0.0
1,172949213967954,960.215109,629.71283,964.622779,672.15798,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081,26.0,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SEM_1729492081,94.0,141.0,18.0,160.0,60.0,0.0
2,172949213972125,957.686528,633.044023,962.812944,680.02506,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081,26.0,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SEM_1729492081,94.0,141.0,18.0,160.0,60.0,0.0
3,172949213976253,955.119897,636.32937,960.909494,681.762546,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081,26.0,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SEM_1729492081,94.0,141.0,18.0,160.0,60.0,0.0
4,172949213980347,952.517312,639.567137,958.592232,682.5897,aaliyah,2024-10-21,SP,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SP_1729492081,26.0,aaliyah-2024-10-21_SP_1729492081,aaliyah-2024-10-21_SEM_1729492081,94.0,141.0,18.0,160.0,60.0,0.0


In [11]:
df_eyetracking_sem.head()

Unnamed: 0,Timestamp,MovingTarget_X,MovingTarget_Y,EyeTracker_X,EyeTracker_Y,name,date,category,id,path_name,row_id,SP_result,SEM_result,HR_before,HR_after,age,height,weight,sex
0,172871978069683,611.447637,729.671923,589.245264,674.948394,bian,2024-10-12,SEM,bian-2024-10-12_SEM_1728719688,bian-2024-10-12_SEM_1728719688,8.0,bian-2024-10-12_SP_1728719688,bian-2024-10-12_SEM_1728719688,96.0,143.0,21.0,160.0,60.0,1.0
1,172871978073803,609.704707,728.989154,580.7628,676.906245,bian,2024-10-12,SEM,bian-2024-10-12_SEM_1728719688,bian-2024-10-12_SEM_1728719688,8.0,bian-2024-10-12_SP_1728719688,bian-2024-10-12_SEM_1728719688,96.0,143.0,21.0,160.0,60.0,1.0
2,172871978078009,607.992587,728.306197,575.323315,678.27564,bian,2024-10-12,SEM,bian-2024-10-12_SEM_1728719688,bian-2024-10-12_SEM_1728719688,8.0,bian-2024-10-12_SP_1728719688,bian-2024-10-12_SEM_1728719688,96.0,143.0,21.0,160.0,60.0,1.0
3,172871978082189,606.31196,727.623899,564.281712,680.405454,bian,2024-10-12,SEM,bian-2024-10-12_SEM_1728719688,bian-2024-10-12_SEM_1728719688,8.0,bian-2024-10-12_SP_1728719688,bian-2024-10-12_SEM_1728719688,96.0,143.0,21.0,160.0,60.0,1.0
4,172871978086339,604.663485,726.943114,559.621195,681.28281,bian,2024-10-12,SEM,bian-2024-10-12_SEM_1728719688,bian-2024-10-12_SEM_1728719688,8.0,bian-2024-10-12_SP_1728719688,bian-2024-10-12_SEM_1728719688,96.0,143.0,21.0,160.0,60.0,1.0


## Drop Unnecesarry Feature

In [12]:
df_eyetracking_sp = df_eyetracking_sp.drop(['path_name','SP_result','SEM_result'], axis =1)
df_eyetracking_sem = df_eyetracking_sem.drop(['path_name','SP_result','SEM_result'], axis =1)

## Save dataframe into csv

In [13]:
# df_eyetracking_sem[df_eyetracking_sem['id']='rindu-2024-10-21_SP_1729490930']

In [14]:
df_eyetracking_sp.to_csv('../data/interim/1_0_data_sp.csv', index=False)
df_eyetracking_sem.to_csv('../data/interim/1_1_data_sem.csv', index=False)