#### Import packages for setting up the essential tools and libraries

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDClassifier, LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_validate
import xgboost as xgb
import lightgbm as lgb

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning, module="my_module")
import warnings
warnings.simplefilter('ignore')


#### Read data and data preprocessing

In [None]:
df = pd.read_csv('your_data.csv')# If you need data, please email haoyan.jiang@mail.utoronto.ca for data

# Integer codes for the eye-movement labels. The feature functions below
# compare against these codes directly (1 is treated as a fixation).
replacement_dict = {'Saccade': 0, 'Fixation': 1, 'Unclassified': 2, 'EyesNotFound': 3}

# Replace the values in the column
df['Eye movement type'] = df['Eye movement type'].replace(replacement_dict)

# Sensor/gaze columns that are imputed and later aggregated per window.
feature_names = ['Gaze event duration', 'Fixation point X',
       'Fixation point Y', 'Eye movement type', 'Accelerometer X',
       'Accelerometer Y', 'Accelerometer Z', 'Gyro X', 'Gyro Y', 'Gyro Z',
       'Gaze point X', 'Gaze point Y', 'Gaze point 3D X', 'Gaze point 3D Y',
       'Gaze point 3D Z', 'Gaze direction left X', 'Gaze direction left Y',
       'Gaze direction left Z', 'Gaze direction right X',
       'Gaze direction right Y', 'Gaze direction right Z',
       'Pupil position left X', 'Pupil position left Y',
       'Pupil position left Z', 'Pupil position right X',
       'Pupil position right Y', 'Pupil position right Z',
       'Pupil diameter left', 'Pupil diameter right']
X = df[feature_names]

# Fill missing feature values with SimpleImputer's default settings.
# NOTE(review): this also imputes the integer-coded 'Eye movement type'
# column, which can produce non-integer codes - confirm that is intended.
imputer = SimpleImputer()
X_filled = imputer.fit_transform(X)
X_filled = pd.DataFrame(X_filled, columns=feature_names)

# Carry over the (non-imputed) annotation/metadata columns unchanged.
for meta_column in ['Time Window', 'is_distraction', 'videoNum',
                    'Participant ID', 'order', 'interest_level']:
    X_filled[meta_column] = df[meta_column]

# Drop rows without a time window, then renumber the rows: parse_data uses the
# index labels returned by iterrows() as positions for .iloc slicing, which is
# only correct when the index is a gap-free 0..n-1 range.
X_filled = X_filled[X_filled['Time Window'].notna()].reset_index(drop=True)

#### Parse and extract data windows

In [None]:
def parse_data(df, window_size, include = True):
  """Cut the recording into annotated windows and un-annotated (negative) windows.

  The rows of ``df`` are scanned from the last row to the first. A row counts
  as annotated when its 'is_distraction' value is not the string 'None'.
  'Time Window' is read as a string of the form '[start, end]'.

  Parameters:
    df: frame with at least 'is_distraction', 'videoNum' and 'Time Window'.
        NOTE(review): index labels from iterrows() are used as positions in
        .iloc slices, so this assumes a default 0..n-1 index - confirm.
    window_size: window length, in the same unit as the 'Time Window' values
        (presumably seconds - confirm).
    include: when True, also emit the rows between two annotated rows that
        lie within ``window_size`` of each other (and more than 3 rows apart).

  Returns:
    (all_windows, finite_all_windows_neg):
      1) windows that end at an annotated (distraction or attention) row;
      2) windows that contain neither distraction nor attention and do not
         overlap with 1), cut to roughly ``window_size``.
    The two list lengths are also printed.
  """


  all_windows = [] # windows for verbalization
  all_windows_neg = []  # raw un-annotated stretches, trimmed further below
  tracked = False  # True while scanning backwards from an annotated row
  last_time = None  # end time of the most recent annotated/boundary row
  last_index = -1  # index of the most recent boundary row
  videoN = -1  # videoNum of the most recent boundary row
  for index, row in df.iloc[::-1].iterrows():
    if tracked ==True:
      # Another annotated row, or a row belonging to a different video.
      if row['is_distraction']!='None' or row['videoNum']!=videoN:
        res = row['Time Window'].strip('][').split(', ')
        # The previous annotation is close enough: optionally keep the rows
        # in between as a window of their own.
        if ((last_time - float(res[0])) <= window_size):
          if include and (last_index-index>3):
            all_windows.append(df.iloc[index+1:last_index+2].copy())
        # This row becomes the new boundary.
        last_time = float(res[1])
        last_index = index
        # A video change on an un-annotated row ends the tracking.
        if row['videoNum']!=videoN and row['is_distraction']=='None':
          tracked=False
        videoN = row['videoNum']
      elif row['is_distraction']=='None':
        res = row['Time Window'].strip('][').split(', ')
        # NOTE(review): this branch is only reached when videoNum == videoN
        # (otherwise the `if` above would have matched), so the check below
        # looks unreachable - confirm before relying on it.
        if row['videoNum']!=videoN:
          tracked = False
          last_index = index
          videoN = row['videoNum']
        # Far enough back in time: emit the rows after this one up to and
        # including the boundary row, then stop tracking.
        if (last_time - float(res[0])) > window_size:
          all_windows.append(df.iloc[index+1:last_index+1].copy())
          tracked = False
          last_index = index
          videoN = row['videoNum']

    else:
      if row['is_distraction']!='None':
        # First annotated row after an un-annotated stretch: start tracking
        # and store the stretch between this row and the previous boundary.
        res = row['Time Window'].strip('][').split(', ')
        tracked = True
        last_time = float(res[1])
        # With the initial last_index of -1 this slice stops before the
        # final row of df.
        all_windows_neg.append(df.iloc[index+1:last_index])
        last_index = index
        videoN = row['videoNum']

  ## negation of window data
  # Trim every non-empty un-annotated stretch to the rows that come before
  # the first row whose end time is at least window_size after the stretch's
  # start; stretches that never reach window_size are dropped.
  finite_all_windows_neg = []
  for wind in all_windows_neg:
    if wind.shape[0]>0:
      start_time = float(wind.iloc[0]['Time Window'].strip('][').split(', ')[0])
      for index, row in wind.iterrows():
        t = float(row['Time Window'].strip('][').split(', ')[1])
        if (t-start_time)>=window_size:
          # Label-based slice ending at the row before the current one.
          finite_all_windows_neg.append(wind.loc[0:index-1].copy())
          break
  print(len(all_windows), len(finite_all_windows_neg))
  return all_windows, finite_all_windows_neg

#### Feature Engineering and Aggregation functions

In [None]:
#std
def get_std_df(df_list, column_names):
    """Return one row per window holding the standard deviation of each column.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        column_names: columns to summarise; also the output column names.

    Returns:
        DataFrame with ``len(df_list)`` rows and ``column_names`` as columns,
        each cell being ``np.std`` of that column within that window.
    """
    per_window = [
        [np.std(window[name]) for name in column_names]
        for window in df_list
    ]
    return pd.DataFrame(per_window, columns=column_names)


In [None]:
#sum
def calculate_avg_a(df_list, column_names):
    """Return the summed step length of a 3-D trajectory for each window.

    For every window the Euclidean distance between consecutive rows is
    computed over the first three entries of ``column_names`` and the
    distances are added up. Despite the name and the output column 'avg_a',
    the value is a sum, not a mean.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        column_names: sequence whose first three entries name the X, Y and Z
            columns of the trajectory.

    Returns:
        DataFrame with a single column 'avg_a', one row per window.
    """
    first, second, third = column_names[0], column_names[1], column_names[2]
    totals = []
    for window in df_list:
        # Row-to-row differences along each axis.
        step_1 = np.diff(window[first].to_numpy())
        step_2 = np.diff(window[second].to_numpy())
        step_3 = np.diff(window[third].to_numpy())
        step_lengths = np.sqrt(np.square(step_1) + np.square(step_2) + np.square(step_3))
        totals.append(np.sum(step_lengths))
    return pd.DataFrame({'avg_a': totals})

In [None]:
def Saccade_fixation_ratio(df_list):
    """Return the share of fixation rows and of saccade rows in each window.

    'Eye movement type' is integer coded by the preprocessing step
    (Saccade -> 0, Fixation -> 1). The previous version counted code 0 as
    fixation and code 1 as saccade, so the two output columns were swapped.

    Parameters:
        df_list: iterable of DataFrames (one per window) with an
            'Eye movement type' column.

    Returns:
        DataFrame with columns 'Fixation count' and 'Saccade count'; each
        value is the number of matching rows divided by the window length
        (NaN for an empty window).
    """
    fixation_share = []
    saccade_share = []

    for window in df_list:
        n_rows = len(window)
        if n_rows == 0:
            # No rows: the proportion is undefined.
            fixation_share.append(np.nan)
            saccade_share.append(np.nan)
            continue

        movement = window['Eye movement type']
        fixation_share.append((movement == 1).sum() / n_rows)
        saccade_share.append((movement == 0).sum() / n_rows)

    return pd.DataFrame({'Fixation count': fixation_share, 'Saccade count': saccade_share})

In [None]:
def absolute_changes(df_list, col_name):
    """Return the value range (max - min) of ``col_name`` for each window.

    The previous version started both running extremes at 0, so a window
    whose values were all positive (or all negative) reported its distance
    from 0 instead of its own range. It also shadowed the builtins ``max``
    and ``min``.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        col_name: column to summarise.

    Returns:
        DataFrame with a single column 'abs_change', one row per window.
        Missing values are ignored; a window without any value yields 0.
    """
    spans = []
    for window in df_list:
        values = window[col_name].dropna()
        if len(values) == 0:
            spans.append(0)
        else:
            spans.append(abs(values.max() - values.min()))

    return pd.DataFrame({'abs_change': spans})

In [None]:
def avg_col(df_list, col_name):
    """Return the absolute value of the mean of ``col_name`` for each window.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        col_name: column to average.

    Returns:
        DataFrame with a single column 'avg1', one row per window.
    """
    # The column is turned into a plain list first so that np.mean averages
    # the raw values exactly as they are.
    magnitudes = [abs(np.mean(list(window[col_name]))) for window in df_list]
    return pd.DataFrame({'avg1': magnitudes})

In [None]:
def con_df(df1, df2, df3):
    """Join three feature frames side by side on their index.

    Parameters:
        df1: left frame; its index drives the join.
        df2, df3: frames whose columns are appended to ``df1``.

    Returns:
        The result of ``df1.join([df2, df3])``.
    """
    others = [df2, df3]
    return df1.join(others)

In [None]:
def longest_fixation(df_list):
    """Return the duration of the longest fixation run in each window.

    A fixation run is a stretch of consecutive rows whose
    'Eye movement type' equals 1. A run ends at the start time of the next
    non-fixation row, or at the end time of the window's last row when the
    run reaches the end of the window.

    Compared with the previous version this always emits exactly one value
    per window (windows ending on a non-fixation row used to produce no row
    at all, misaligning the result with ``df_list``), and a fixation that
    starts at time 0 is no longer mistaken for "no run in progress".

    Parameters:
        df_list: iterable of DataFrames (one per window) with the columns
            'Eye movement type' and 'Time Window' ('[start, end]' strings).

    Returns:
        DataFrame with a single column 'Longest Fixation Duration', one row
        per window; 0 for a window without any fixation.
    """
    fixation_durations = [_longest_fixation_in_window(window) for window in df_list]

    # Create a DataFrame with fixation durations
    return pd.DataFrame({'Longest Fixation Duration': fixation_durations})


def _longest_fixation_in_window(window):
    """Return the longest fixation-run duration within one window (0 if none)."""
    durations = []
    run_start = None  # start time of the run in progress, None when idle
    last_position = len(window) - 1

    rows = zip(window['Eye movement type'], window['Time Window'])
    for position, (movement, time_window) in enumerate(rows):
        bounds = [float(t) for t in time_window.strip('][').split(', ')]
        start, end = bounds[0], bounds[1]

        if movement == 1:
            if run_start is None:
                run_start = start
            if position == last_position:
                # The run reaches the end of the window: close it at the
                # end time of the final row.
                durations.append(end - run_start)
        elif run_start is not None:
            # A non-fixation row closes the run at its own start time.
            durations.append(start - run_start)
            run_start = None

    return max(durations, default=0)


In [None]:
def verbalization_lag(df_list):
    """Return the verbalization lag of each window.

    For every window the fixation runs (consecutive rows with
    'Eye movement type' == 1) are collected. The lag is

        start of the window's last row
        - end of the longest fixation run
        + duration of the longest fixation run

    The previous version appended two values per window (the longest
    fixation duration, left over from ``longest_fixation``, and then the
    lag), so the result had twice as many rows as ``df_list``. It also
    raised on windows without any fixation. Now exactly one value per window
    is produced.

    Parameters:
        df_list: iterable of DataFrames (one per window) with the columns
            'Eye movement type' and 'Time Window' ('[start, end]' strings).

    Returns:
        DataFrame with a single column 'Verbalization Lag Duration', one row
        per window; 0 for a window without any fixation.
    """
    lag_durations = []  # List to store verbalization lag durations

    for window in df_list:
        durations, end_times = _fixation_runs(window)
        if not durations:
            # No fixation in this window: there is no run to measure from.
            lag_durations.append(0)
            continue

        longest = max(durations)
        longest_end = end_times[durations.index(longest)]
        last_bounds = window['Time Window'].iloc[-1].strip('][').split(', ')
        start_point = float(last_bounds[0])
        lag_durations.append(start_point - longest_end + longest)

    return pd.DataFrame({'Verbalization Lag Duration': lag_durations})


def _fixation_runs(window):
    """Return (durations, end_times) of every fixation run in one window."""
    durations = []
    end_times = []
    run_start = None  # start time of the run in progress, None when idle
    last_position = len(window) - 1

    rows = zip(window['Eye movement type'], window['Time Window'])
    for position, (movement, time_window) in enumerate(rows):
        bounds = [float(t) for t in time_window.strip('][').split(', ')]
        start, end = bounds[0], bounds[1]

        if movement == 1:
            if run_start is None:
                run_start = start
            if position == last_position:
                # The run reaches the end of the window.
                durations.append(end - run_start)
                end_times.append(end)
        elif run_start is not None:
            # A non-fixation row closes the run at its own start time.
            durations.append(start - run_start)
            end_times.append(start)
            run_start = None

    return durations, end_times


In [None]:
def Temporal_movement(df_list, col_names):
    """Assemble the temporal/movement features of every window.

    Two defects of the previous version are fixed:
    * the local variable ``verbalization_lag`` shadowed the function of the
      same name, so calling it raised UnboundLocalError;
    * five columns were concatenated (the ratio helper returns two) but only
      four column names were assigned, which raises a length mismatch.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        col_names: the three trajectory columns passed to calculate_avg_a.

    Returns:
        DataFrame with one row per window and the columns 'Fixation Ratio',
        'Saccade Ratio', 'Average "a"', 'Longest Fixation Range' and
        'Verbalization Lag'.
    """
    movement_ratios = Saccade_fixation_ratio(df_list).reset_index(drop=True)
    avg_a = calculate_avg_a(df_list, col_names).reset_index(drop=True)
    longest_fixation_range = longest_fixation(df_list).reset_index(drop=True)
    lag = verbalization_lag(df_list).reset_index(drop=True)

    result = pd.concat([movement_ratios, avg_a, longest_fixation_range, lag], axis=1, ignore_index=True)
    # Saccade_fixation_ratio contributes two columns (fixation, then saccade).
    result.columns = ['Fixation Ratio', 'Saccade Ratio', 'Average "a"',
                      'Longest Fixation Range', 'Verbalization Lag']
    return result



def Gaze_pupil(df_list, col_names_1):
    """Assemble the gaze and pupil features of every window.

    For each signal group the per-window standard deviations are followed by
    the per-window averages; the 3-D gaze point additionally gets its value
    range ("Abs change") between the two.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        col_names_1: columns used for the 3-D gaze standard deviations. The
            output labels for them are fixed to 'Gaze point 3D X/Y/Z', so
            three columns are expected.

    Returns:
        DataFrame with one row per window and 45 labelled feature columns.
    """
    def sd_frames(columns):
        return [get_std_df(df_list, columns).reset_index(drop=True)]

    def avg_frames(columns):
        return [avg_col(df_list, name).reset_index(drop=True) for name in columns]

    def change_frames(columns):
        return [absolute_changes(df_list, name).reset_index(drop=True) for name in columns]

    def labelled(prefix, columns):
        return [prefix + name for name in columns]

    fixation_point = ['Fixation point X', 'Fixation point Y']
    gaze_3d = ['Gaze point 3D X', 'Gaze point 3D Y', 'Gaze point 3D Z']
    # Groups that only get "SD" followed by "Avg", in output order.
    sd_avg_groups = [
        ['Gaze point X', 'Gaze point Y'],
        ['Gaze direction left X', 'Gaze direction left Y', 'Gaze direction left Z'],
        ['Gaze direction right X', 'Gaze direction right Y', 'Gaze direction right Z'],
        ['Pupil position left X', 'Pupil position left Y', 'Pupil position left Z'],
        ['Pupil position right X', 'Pupil position right Y', 'Pupil position right Z'],
        ['Pupil diameter left', 'Pupil diameter right'],
    ]

    # Fixation point: SD, then averages.
    frames = sd_frames(fixation_point) + avg_frames(fixation_point)
    new_name = labelled('SD of ', fixation_point) + labelled('Avg of ', fixation_point)

    # 3-D gaze point: SD (over col_names_1), value range, then averages.
    frames += sd_frames(col_names_1) + change_frames(gaze_3d) + avg_frames(gaze_3d)
    new_name += (labelled('SD of ', gaze_3d)
                 + labelled('Abs change of ', gaze_3d)
                 + labelled('Avg of ', gaze_3d))

    for columns in sd_avg_groups:
        frames += sd_frames(columns) + avg_frames(columns)
        new_name += labelled('SD of ', columns) + labelled('Avg of ', columns)

    # Concatenate all dataframes and assign the meaningful column names
    result = pd.concat(frames, axis=1, ignore_index=True)
    result.columns = new_name

    return result


def headmovement_orientation(df_list, col_names):
    """Assemble the head-movement (gyroscope/accelerometer) features per window.

    Parameters:
        df_list: iterable of DataFrames (one per window).
        col_names: columns used for the gyroscope standard deviations and the
            gyroscope trajectory length. The output labels for them are fixed
            to 'SD_Gyro_X/Y/Z', so three columns are expected.

    Returns:
        DataFrame with one row per window and 17 labelled feature columns.
    """
    gyro_axes = ['Gyro X', 'Gyro Y', 'Gyro Z']
    accelerometer_axes = ['Accelerometer X', 'Accelerometer Y', 'Accelerometer Z']

    def fresh(frame):
        return frame.reset_index(drop=True)

    # Gyroscope: value range and average per axis.
    frames = [fresh(absolute_changes(df_list, axis)) for axis in gyro_axes]
    frames += [fresh(avg_col(df_list, axis)) for axis in gyro_axes]

    # Gyroscope: standard deviations and summed step length of the trajectory
    # (calculate_avg_a adds up the distances between consecutive samples).
    frames.append(fresh(get_std_df(df_list, col_names)))
    frames.append(fresh(calculate_avg_a(df_list, col_names)))

    # Accelerometer: standard deviations, averages and trajectory length.
    frames.append(fresh(get_std_df(df_list, accelerometer_axes)))
    frames += [fresh(avg_col(df_list, axis)) for axis in accelerometer_axes]
    frames.append(fresh(calculate_avg_a(df_list, accelerometer_axes)))

    result = pd.concat(frames, axis=1, ignore_index=True)

    # Assign meaningful column names to the result dataframe
    result.columns = [
        'Abs_Gyro_X', 'Abs_Gyro_Y', 'Abs_Gyro_Z',
        'Avg_Gyro_X', 'Avg_Gyro_Y', 'Avg_Gyro_Z',
        'SD_Gyro_X', 'SD_Gyro_Y', 'SD_Gyro_Z', 'Trajectory_Length_Gyro',
        'SD_Accelerometer_X', 'SD_Accelerometer_Y', 'SD_Accelerometer_Z',
        'Avg_Accelerometer_X', 'Avg_Accelerometer_Y', 'Avg_Accelerometer_Z',
        'Trajectory_Length_Accelerometer',
    ]

    return result


#### Remove useless windows (windows without any saccade or fixation) automatically

In [None]:
def remove_useless_window(window_list, neg_window_list):
    """Drop the windows whose longest-fixation value is 0 from both lists.

    NOTE(review): ``find_longest_fixation`` and its 'ColumnA' output column
    are not defined in the visible part of this file - confirm where they
    come from (the visible ``longest_fixation`` returns a column named
    'Longest Fixation Duration').

    Parameters:
        window_list: list of annotated windows.
        neg_window_list: list of negative windows.

    Returns:
        (useful_windows, useful_neg_windows): the two input lists without
        the windows at positions where 'ColumnA' equals 0.
    """
    longest_fixation_window = find_longest_fixation(window_list)
    longest_fixation_neg_window = find_longest_fixation(neg_window_list)

    def zero_positions(fixation_frame):
        column = fixation_frame['ColumnA']
        return [i for i in range(len(column)) if column[i] == 0]

    def without(windows, positions):
        return [window for i, window in enumerate(windows) if i not in positions]

    useless_window_indexes = zero_positions(longest_fixation_window)
    useless_neg_window_indexes = zero_positions(longest_fixation_neg_window)

    return (without(window_list, useless_window_indexes),
            without(neg_window_list, useless_neg_window_indexes))

In [None]:
# Build annotated and negative windows for each window size (1-5 and 0.5).
# include=False skips the extra windows parse_data can emit between two
# nearby annotated rows. Each call prints the two resulting list lengths.
window_1s, neg_window_1s = parse_data(X_filled, 1, include=False)
window_2s, neg_window_2s = parse_data(X_filled, 2, include=False)
window_3s, neg_window_3s = parse_data(X_filled, 3, include=False)
window_4s, neg_window_4s = parse_data(X_filled, 4, include=False)
window_5s, neg_window_5s = parse_data(X_filled, 5, include=False)
window_0_5s, neg_window_0_5s = parse_data(X_filled, 0.5, include=False)

2806 2515
2397 1456
1809 881
1328 597
978 432
2865 2923


In [None]:
# Manually remove some useless windows (windows that do not contain any
# saccade or fixation) from the 0.5 negative windows.
# NOTE(review): the slice positions are hard-coded for one particular
# parse_data result - re-check them whenever the data or the parsing changes.
del neg_window_0_5s[2810:2833]

In [None]:
# Filter both window lists of every window size with remove_useless_window;
# the new_* lists are the ones used to build X and y below.
new_window_1s, new_neg_window_1s = remove_useless_window(window_1s, neg_window_1s)
new_window_2s, new_neg_window_2s = remove_useless_window(window_2s, neg_window_2s)
new_window_3s, new_neg_window_3s = remove_useless_window(window_3s, neg_window_3s)
new_window_4s, new_neg_window_4s = remove_useless_window(window_4s, neg_window_4s)
new_window_5s, new_neg_window_5s = remove_useless_window(window_5s, neg_window_5s)
new_window_0_5s, new_neg_window_0_5s = remove_useless_window(window_0_5s, neg_window_0_5s)

#### Define response variable

Think-aloud (based on the 'interest_level' column)

negative window (no think-aloud annotation) : 0

when interest_level == 0 : 1

when interest_level >= 1 : 2

In [None]:
def response_cta(df_list1, df_list2):
    """Build the think-aloud response column.

    Every 'interest_level' value found in the windows of ``df_list1`` is
    encoded as 1 (value equal to 0) or 2 (value of at least 1); any other
    value, including NaN, contributes no label. One 0 label per window of
    ``df_list2`` is appended afterwards.

    Parameters:
        df_list1: annotated windows with an 'interest_level' column.
        df_list2: negative windows; only their number is used.

    Returns:
        DataFrame with a single column 'Response' and a fresh 0..n-1 index.
    """
    def encode(level):
        if level == 0:
            return 1
        if level >= 1:
            return 2
        return None

    encoded = [encode(level) for window in df_list1 for level in window['interest_level']]
    positive_labels = [label for label in encoded if label is not None]

    positives = pd.DataFrame(positive_labels, columns=['Response'])
    negatives = pd.DataFrame(0, index=range(len(df_list2)), columns=['Response'])
    return pd.concat([positives, negatives]).reset_index(drop=True)

Distraction (based on the 'is_distraction' column)


negative window (rows where is_distraction is 'None') : 0

when is_distraction == 'True' : 1

when is_distraction == 'False' : 2

In [None]:
def response_dis(df_list1, df_list2):
    """Build the distraction/attention response column.

    Every 'is_distraction' value found in the windows of ``df_list1`` is
    encoded as 1 (the string 'True') or 2 (the string 'False'); any other
    value contributes no label. One 0 label per window of ``df_list2`` is
    appended afterwards.

    Parameters:
        df_list1: annotated windows with an 'is_distraction' column holding
            string values.
        df_list2: negative windows; only their number is used.

    Returns:
        DataFrame with a single column 'Response' and a fresh 0..n-1 index.
    """
    positive_labels = []
    for window in df_list1:
        for flag in window['is_distraction']:
            if flag == 'True':
                positive_labels.append(1)
            elif flag == 'False':
                positive_labels.append(2)

    positives = pd.DataFrame(positive_labels, columns=['Response'])
    negatives = pd.DataFrame(0, index=range(len(df_list2)), columns=['Response'])
    return pd.concat([positives, negatives]).reset_index(drop=True)

### Derive X & y

In [None]:
def get_x_y_c(df1, df2):
    """Build the feature matrix and think-aloud labels for two window lists.

    Parameters:
        df1: list of annotated windows.
        df2: list of negative windows.

    Returns:
        (X, y): X stacks the features of ``df1`` on top of those of ``df2``
        with a fresh index; y comes from ``response_cta(df1, df2)``.
    """
    gaze_3d_columns = ['Gaze point 3D X', 'Gaze point 3D Y', 'Gaze point 3D Z']
    gyro_columns = ['Gyro X', 'Gyro Y', 'Gyro Z']

    def build_features(windows):
        # Temporal, gaze/pupil and head-movement features, joined column-wise.
        temporal = Temporal_movement(windows, gaze_3d_columns)
        gaze_pupil = Gaze_pupil(windows, gaze_3d_columns)
        head_movement = headmovement_orientation(windows, gyro_columns)
        return con_df(temporal, gaze_pupil, head_movement)

    # Concatenate both datasets
    X = pd.concat([build_features(df1), build_features(df2)]).reset_index(drop=True)

    # Calculate the response variable
    y = response_cta(df1, df2)

    return X, y


In [None]:
def get_x_y_d(df1, df2):
    """Build the feature matrix and distraction labels for two window lists.

    Parameters:
        df1: list of annotated windows.
        df2: list of negative windows.

    Returns:
        (X, y): X stacks the features of ``df1`` on top of those of ``df2``
        with a fresh index; y comes from ``response_dis(df1, df2)``.
    """
    gaze_3d_columns = ['Gaze point 3D X', 'Gaze point 3D Y', 'Gaze point 3D Z']
    gyro_columns = ['Gyro X', 'Gyro Y', 'Gyro Z']

    def build_features(windows):
        # Temporal, gaze/pupil and head-movement features, joined column-wise.
        temporal = Temporal_movement(windows, gaze_3d_columns)
        gaze_pupil = Gaze_pupil(windows, gaze_3d_columns)
        head_movement = headmovement_orientation(windows, gyro_columns)
        return con_df(temporal, gaze_pupil, head_movement)

    # Concatenate both datasets
    X = pd.concat([build_features(df1), build_features(df2)]).reset_index(drop=True)

    # Calculate the response variable
    y = response_dis(df1, df2)

    return X, y


##### X, y for think-aloud with different window size

In [None]:
# Feature matrix X and think-aloud labels y for every window size, built from
# the filtered window lists. The commented-out lines are window sizes that
# are currently not generated.
#X_0_2s, y_0_2s = get_x_y(window_0_2s, neg_window_0_2s)
X_0_5s_c, y_0_5s_c = get_x_y_c(new_window_0_5s, new_neg_window_0_5s)
X_1s_c, y_1s_c = get_x_y_c(new_window_1s, new_neg_window_1s)
#X_1_5s, y_1_5s = get_x_y(window_1_5s, neg_window_1_5s)
X_2s_c, y_2s_c = get_x_y_c(new_window_2s, new_neg_window_2s)
#X_2_5s, y_2_5s = get_x_y(window_2_5s, neg_window_2_5s)
X_3s_c, y_3s_c = get_x_y_c(new_window_3s, new_neg_window_3s)
#X_3_5s, y_3_5s = get_x_y(window_3_5s, neg_window_3_5s)
X_4s_c, y_4s_c = get_x_y_c(new_window_4s, new_neg_window_4s)
X_5s_c, y_5s_c = get_x_y_c(new_window_5s, new_neg_window_5s)

##### X, y for Distraction/Attention with different window size

In [None]:
# Feature matrix X and distraction/attention labels y for every window size,
# built from the filtered window lists. The commented-out lines are window
# sizes that are currently not generated.
#X_0_2s, y_0_2s = get_x_y(window_0_2s, neg_window_0_2s)
X_0_5s_d, y_0_5s_d = get_x_y_d(new_window_0_5s, new_neg_window_0_5s)
X_1s_d, y_1s_d = get_x_y_d(new_window_1s, new_neg_window_1s)
#X_1_5s, y_1_5s = get_x_y(window_1_5s, neg_window_1_5s)
X_2s_d, y_2s_d = get_x_y_d(new_window_2s, new_neg_window_2s)
#X_2_5s, y_2_5s = get_x_y(window_2_5s, neg_window_2_5s)
X_3s_d, y_3s_d = get_x_y_d(new_window_3s, new_neg_window_3s)
#X_3_5s, y_3_5s = get_x_y(window_3_5s, neg_window_3_5s)
X_4s_d, y_4s_d = get_x_y_d(new_window_4s, new_neg_window_4s)
X_5s_d, y_5s_d = get_x_y_d(new_window_5s, new_neg_window_5s)

In [None]:
# Define a function to save X and y as CSV files
def save_x_y_as_csv(X, y, X_filename, y_filename):
    """Write X and y to two CSV files without the index column.

    Parameters:
        X: feature data; anything accepted by ``pd.DataFrame``.
        y: response data; anything accepted by ``pd.DataFrame``.
        X_filename: destination path for X.
        y_filename: destination path for y.
    """
    # Both frames are built before anything is written.
    frames = [pd.DataFrame(X), pd.DataFrame(y)]
    for frame, filename in zip(frames, (X_filename, y_filename)):
        frame.to_csv(filename, index=False)


# Think-aloud datasets (suffix _c), one X/y file pair per window size.
save_x_y_as_csv(X_0_5s_c, y_0_5s_c, 'X_0_5s_c.csv', 'y_0_5s_c.csv')
save_x_y_as_csv(X_1s_c, y_1s_c, 'X_1s_c.csv', 'y_1s_c.csv')
save_x_y_as_csv(X_2s_c, y_2s_c, 'X_2s_c.csv', 'y_2s_c.csv')
save_x_y_as_csv(X_3s_c, y_3s_c, 'X_3s_c.csv', 'y_3s_c.csv')
save_x_y_as_csv(X_4s_c, y_4s_c, 'X_4s_c.csv', 'y_4s_c.csv')
save_x_y_as_csv(X_5s_c, y_5s_c, 'X_5s_c.csv', 'y_5s_c.csv')

# Distraction/attention datasets (suffix _d), one X/y file pair per window size.
save_x_y_as_csv(X_0_5s_d, y_0_5s_d, 'X_0_5s_d.csv', 'y_0_5s_d.csv')
save_x_y_as_csv(X_1s_d, y_1s_d, 'X_1s_d.csv', 'y_1s_d.csv')
save_x_y_as_csv(X_2s_d, y_2s_d, 'X_2s_d.csv', 'y_2s_d.csv')
save_x_y_as_csv(X_3s_d, y_3s_d, 'X_3s_d.csv', 'y_3s_d.csv')
save_x_y_as_csv(X_4s_d, y_4s_d, 'X_4s_d.csv', 'y_4s_d.csv')
save_x_y_as_csv(X_5s_d, y_5s_d, 'X_5s_d.csv', 'y_5s_d.csv')