In [2]:
import pandas as pd
import numpy as np
import sys
import os
import json

project_root = "/home/gatemrou/uds/thesis/Thesis-Project/"
# desired data shape:
path = "analysis/data/preprocessed_data/final_experiment.csv"

In [7]:
path_gaze = "analysis/data/gaze_simulations/gaze_simulation_gaze.csv"
path_resp = "analysis/data/gaze_simulations/gaze_simulation_responses.csv"
df_gaze = pd.read_csv(os.path.join(project_root, path_gaze), index_col=0)
df_resp = pd.read_csv(os.path.join(project_root, path_resp), index_col=0)
# create unique IDs for each simulated participant
# descriptive features: 
desc_features = ['Rneg', 'Rpos', 'patience', 'utility_1', 'utility_2', 'utility_3', 'spreading_activation']
# each unique combination of these features is a subject
# aggregate the data by these features
df_resp['subj_desc'] = df_resp[desc_features].apply(lambda x: '_'.join(x.astype(str)), axis=1)

df_gaze['subj_desc'] = df_gaze[desc_features].apply(lambda x: '_'.join(x.astype(str)), axis=1)

# mapper = df_resp['subject_id']
# create a unique ID for each subject
df_resp['subject_id'] = df_resp['subj_desc'].astype('category').cat.codes
subj_desc_to_id = df_resp.set_index('subj_desc')['subject_id'].to_dict()
# make sure the subject_ids agree in both dataframes
df_resp['subject_id'] = df_resp['subj_desc'].map(subj_desc_to_id)
df_gaze['subject_id'] = df_gaze['subj_desc'].map(subj_desc_to_id)

# check if the ids match
assert(df_resp['subject_id'].nunique() == df_gaze['subject_id'].nunique()), "Number of unique subjects in gaze and response data do not match."

df_resp['RT'] = df_resp['RT'] * 1000
df_resp['RT'] = df_resp['RT'].astype(int)
df_gaze['timeInTrial'] = df_gaze['timeInTrial'] * 1000
df_gaze['timeInTrial'] = df_gaze['timeInTrial'].astype(int)


In [8]:
# replace missing values in gaze data with last valid value
df_gaze['region'] = df_gaze['region'].ffill()

def time_on_regions(timing_seq, region_seq):
    # sort sequences by time
    timing_seq, region_seq = zip(*sorted(zip(timing_seq, region_seq)))
    cur_region = 'message'
    regions = ['message', 'target', 'competitor', 'distractor', 'available_msgs']
    cur_time = 0
    total_time = {reg: 0 for reg in regions}
    for time, region in zip(timing_seq, region_seq):
        total_time[cur_region] += time - cur_time
        cur_time = time
        cur_region = region
    assert all(val >= 0 for val in total_time.values()), "Total time on regions should be positive."
    return total_time

def toggles_av_msgs(timing_seq, region_seq):
    toggles = 0
    cur_region = 'message'
    for time, region in zip(timing_seq, region_seq):
        if region == 'available_msgs' and cur_region != 'available_msgs':
            toggles += 1
        elif region != 'available_msgs' and cur_region == 'available_msgs':
            toggles += 1
        cur_region = region
    return toggles

time_on = df_gaze.groupby(['subject_id', 'trial'])[['timeInTrial', 'region']].apply(lambda x: time_on_regions(x['timeInTrial'], x['region'])).reset_index()
toggles = df_gaze.groupby(['subject_id', 'trial'])[['timeInTrial', 'region']].apply(lambda x: toggles_av_msgs(x['timeInTrial'], x['region'])).reset_index()
# merge the time on regions and toggles with the response data
df_resp = df_resp.merge(time_on, on=['subject_id', 'trial'], how='left')
df_resp = df_resp.merge(toggles, on=['subject_id', 'trial'], how='left')


In [None]:
save_path = "analysis/data/gaze_simulations/preprocessed_simulation_responses.csv"
df_resp = df_resp.rename(columns={'0_x': 'timeOn', '0_y': 'togglesAvMsgs'})
df_resp['timeOn'] = df_resp['timeOn'].apply(json.dumps)
df_resp.to_csv(os.path.join(project_root, save_path), index=False)

In [3]:
save_path = "analysis/data/gaze_simulations/preprocessed_simulation_responses.csv"
df = pd.read_csv(os.path.join(project_root, save_path))
df['timeOn'] = df['timeOn'].apply(json.loads)
aois = ['SentMsg', 'Trgt', 'Comp', 'Dist', 'AvailableMsgs']
regions = ['message', 'target', 'competitor', 'distractor', 'available_msgs']
for aoi, reg in zip(aois, regions):
    df[f'PropTimeOn{aoi}'] = df['timeOn'].map(lambda x: x[reg] if isinstance(x, dict) else 0) / df['RT']
df['RateTogglingAvailableMsgs'] = df['togglesAvMsgs'] / df['RT']


In [17]:
# get some statistics for average of the whole dataset, grouped by condition
df_stats = df.groupby('condition').agg({
    'RT': ['mean', 'median', 'std', 'min', 'max'],
    'PropTimeOnSentMsg': ['mean', 'median', 'std', 'min', 'max'],
    'PropTimeOnTrgt': ['mean', 'median', 'std', 'min', 'max'],
    'PropTimeOnComp': ['mean', 'median', 'std', 'min', 'max'],
    'PropTimeOnDist': ['mean', 'median', 'std', 'min', 'max'],
    'PropTimeOnAvailableMsgs': ['mean', 'median', 'std', 'min', 'max'],
    'RateTogglingAvailableMsgs': ['mean', 'median', 'std', 'min', 'max']
}).reset_index()
for stat in df_stats.columns.levels[0]:
    if stat == 'condition':
        continue
    print(f"Statistics for {stat}:")
    print(df_stats[[stat, 'condition']].head())
# compute correlation between isCorrect and PropTimeOnAvailableMsgs
correlation = df['isCorrect'].corr(df['PropTimeOnAvailableMsgs'])
print(f"Correlation between isCorrect and PropTimeOnAvailableMsgs: {correlation}")

Statistics for RT:
            RT                                     condition
          mean  median          std   min   max             
0  3485.128177  4059.0  1168.601616  1390  7345      complex
1  3508.710271  4057.0  1200.576902  1390  7316       simple
2  1387.464458  1440.0    71.585607  1290  1445  unambiguous
Statistics for PropTimeOnSentMsg:
  PropTimeOnSentMsg                                            condition
               mean    median       std       min       max             
0          0.191406  0.140325  0.085874  0.077468  0.410072      complex
1          0.191135  0.140394  0.086730  0.077911  0.410072       simple
2          0.411566  0.395833  0.021961  0.394044  0.441860  unambiguous
Statistics for PropTimeOnTrgt:
  PropTimeOnTrgt                                            condition
            mean    median       std       min       max             
0       0.357860  0.326006  0.213666  0.000479  1.879870      complex
1       0.359427  0.326411  0.213660

In [28]:
# Condition + TrgtPos + Trial +  PropTimeOnTrgt +
#     PropTimeOnComp + PropTimeOnDist + PropTimeOnSentMsg +
#     PropTimeOnAvailableMsgs + 
#     RateTogglingAvailableMsgs +
#     MsgType + AnswerTime
df.head()
df_correct = df.rename(columns={
    'condition': 'Condition',
    'trial': 'Trial',
    'RT': 'AnswerTime',
    'isCorrect': 'Correct',
    'subject_id': 'Subject',
})
df_correct = df_correct.drop(columns = desc_features + ['timeOn'])
df_correct.to_csv(os.path.join(project_root, "analysis/data/gaze_simulations/preprocessed_simulation_responses_correct.csv"), index=False)

Unnamed: 0,Rneg,Rpos,patience,utility_1,utility_2,utility_3,spreading_activation,trial,item,condition,...,subj_desc,subject_id,timeOn,togglesAvMsgs,propTimeOnSentMsg,propTimeOnTrgt,propTimeOnComp,propTimeOnDist,propTimeOnAvailableMsgs,rateTogglingAvailableMsgs
0,-10.0,5,24,0.0,-2.5,-5000,1,0,30,unambiguous,...,-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307,"{'message': 570, 'target': 370, 'competitor': ...",0,0.44186,0.286822,0.30155,0.424806,0.0,0.0
1,-10.0,5,24,0.0,-2.5,-5000,1,1,29,unambiguous,...,-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307,"{'message': 569, 'target': 370, 'competitor': ...",0,0.441085,0.286822,0.000775,0.29845,0.0,0.0
2,-10.0,5,24,0.0,-2.5,-5000,1,2,13,complex,...,-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307,"{'message': 569, 'target': 769, 'competitor': ...",0,0.369481,0.499351,0.0,0.24026,0.0,0.0
3,-10.0,5,24,0.0,-2.5,-5000,1,3,16,complex,...,-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307,"{'message': 570, 'target': 1640, 'competitor':...",2,0.135779,0.390662,0.067651,0.166508,0.188185,0.000476
4,-10.0,5,24,0.0,-2.5,-5000,1,4,9,simple,...,-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307,"{'message': 570, 'target': 569, 'competitor': ...",0,0.410072,0.409353,0.133094,0.000719,0.0,0.0
