In [1]:
import pandas as pd
import numpy as np
import sys
import os
import json

# Root directory of the thesis project. The default is the original checkout
# location; set the THESIS_PROJECT_ROOT environment variable to run the
# notebook from a different machine or directory without editing this cell.
project_root = os.environ.get("THESIS_PROJECT_ROOT", "/home/gatemrou/uds/thesis/Thesis-Project/")
# desired data shape:
# (project-relative CSV path; presumably the layout the simulated data should
# end up matching -- TODO confirm, it is not read in the cells shown here)
path = "analysis/data/preprocessed_data/final_experiment.csv"

In [2]:
path_gaze = "analysis/data/gaze_simulations/gaze_simulation_gaze.csv"
path_resp = "analysis/data/gaze_simulations/gaze_simulation_responses.csv"
df_gaze = pd.read_csv(os.path.join(project_root, path_gaze))
df_resp = pd.read_csv(os.path.join(project_root, path_resp))

# Create unique IDs for each simulated participant.
# Each unique combination of these descriptive features is one subject.
desc_features = ['subjID', 'Rneg', 'Rpos', 'patience', 'utility_1', 'utility_2', 'utility_3', 'spreading_activation']
# Build one string key per row by joining the feature values with '_'.
# The same expression is used for both frames so the keys are comparable.
df_resp['subj_desc'] = df_resp[desc_features].apply(lambda x: '_'.join(x.astype(str)), axis=1)
df_gaze['subj_desc'] = df_gaze[desc_features].apply(lambda x: '_'.join(x.astype(str)), axis=1)

# Number the subjects based on the response data ...
df_resp['subject_id'] = df_resp['subj_desc'].astype('category').cat.codes
subj_desc_to_id = df_resp.set_index('subj_desc')['subject_id'].to_dict()
# ... and push both frames through the same key -> id mapping so the ids agree.
df_resp['subject_id'] = df_resp['subj_desc'].map(subj_desc_to_id)
df_gaze['subject_id'] = df_gaze['subj_desc'].map(subj_desc_to_id)

# Check that the ids match. A gaze row whose key does not occur in the response
# data maps to NaN, and nunique() ignores NaN, so test for unmapped rows
# explicitly before comparing the subject counts.
assert df_gaze['subject_id'].notna().all(), "Gaze data contains subjects that are missing from the response data."
assert(df_resp['subject_id'].nunique() == df_gaze['subject_id'].nunique()), "Number of unique subjects in gaze and response data do not match."

# Scale the time columns by 1000 and store them as integers (presumably
# seconds -> milliseconds; TODO confirm the unit of the raw CSVs).
# Round before casting: astype(int) truncates, so a floating-point product such
# as 754.9999999 would otherwise be stored as 754 instead of 755.
df_resp['RT'] = (df_resp['RT'] * 1000).round().astype(int)
df_gaze['timeInTrial'] = (df_gaze['timeInTrial'] * 1000).round().astype(int)


In [7]:
# Replace missing values in the gaze data with the last valid region *of the
# same trial*. Filling over the whole frame would let the first sample of a
# trial inherit the last region of the previous trial (or previous subject).
# Samples that are missing before any valid region was seen in their trial fall
# back to 'message', the start region that time_on_regions and toggles_av_msgs
# assume for every trial.
df_gaze['region'] = (
    df_gaze.groupby(['subject_id', 'trial'])['region']
    .ffill()
    .fillna('message')
)

def time_on_regions(timing_seq, region_seq):
    """Accumulate the time spent on each gaze region within one trial.

    The gaze is taken to start on 'message' at time 0. Every sample
    (time, region) closes the interval since the previous sample, which is
    credited to the region that was active before the sample, and then makes
    `region` the active one. The interval after the last sample is not counted
    because no end time is passed in.

    Parameters
    ----------
    timing_seq : iterable of numbers
        Sample timestamps; they do not need to be sorted.
    region_seq : iterable
        Region label of each sample, aligned with `timing_seq`. Labels are
        expected to be one of 'message', 'target', 'competitor', 'distractor',
        'available_msgs'.

    Returns
    -------
    dict
        Time per region plus 'total_time', the sum over all regions. For empty
        input every value is 0.

    Raises
    ------
    KeyError
        If a label that is not a known region becomes the active region and a
        later sample follows it.
    AssertionError
        If any accumulated time is negative (e.g. a negative first timestamp).
    """
    regions = ['message', 'target', 'competitor', 'distractor', 'available_msgs', 'total_time']
    total_time = {reg: 0 for reg in regions}
    # Sort by timestamp only. sorted() is stable, so samples sharing a timestamp
    # keep their recorded order, and region labels are never compared with each
    # other (comparing a str label with a NaN label would raise TypeError).
    # Unlike unpacking zip(*sorted(...)), this also works for empty input.
    samples = sorted(zip(timing_seq, region_seq), key=lambda sample: sample[0])
    cur_region = 'message'
    cur_time = 0
    for time, region in samples:
        total_time[cur_region] += time - cur_time
        cur_time = time
        cur_region = region
    assert all(val >= 0 for val in total_time.values()), "Total time on regions should be positive."
    # 'total_time' is still 0 here, so it does not inflate its own sum.
    total_time['total_time'] = sum(total_time.values())
    return total_time

def toggles_av_msgs(timing_seq, region_seq):
    """Count how often the gaze moves onto or off the 'available_msgs' region.

    The gaze is taken to start on 'message'. Samples are processed in timestamp
    order (the same ordering time_on_regions uses), and every change between
    "on available_msgs" and "not on available_msgs" counts as one toggle, so
    one visit that is entered and left again contributes 2.

    Parameters
    ----------
    timing_seq : iterable of numbers
        Sample timestamps; they do not need to be sorted.
    region_seq : iterable
        Region label of each sample, aligned with `timing_seq`.

    Returns
    -------
    int
        Number of toggles; 0 for empty input.
    """
    # Stable sort on the timestamp only: simultaneous samples keep their
    # recorded order and region labels are never compared with each other.
    samples = sorted(zip(timing_seq, region_seq), key=lambda sample: sample[0])
    toggles = 0
    on_av_msgs = False  # the start region 'message' is not 'available_msgs'
    for _, region in samples:
        now_on_av_msgs = region == 'available_msgs'
        if now_on_av_msgs != on_av_msgs:
            toggles += 1
        on_av_msgs = now_on_av_msgs
    return toggles

# Compute the per-trial gaze features. The result columns are named explicitly
# instead of relying on the default column 0 and on the '0_x' / '0_y' suffixes
# that the merges would otherwise generate.
trial_keys = ['subject_id', 'trial']
gaze_by_trial = df_gaze.groupby(trial_keys)[['timeInTrial', 'region']]
time_on = gaze_by_trial.apply(lambda x: time_on_regions(x['timeInTrial'], x['region'])).reset_index(name='timeOn')
toggles = gaze_by_trial.apply(lambda x: toggles_av_msgs(x['timeInTrial'], x['region'])).reset_index(name='togglesAvMsgs')
# merge the time on regions and toggles with the response data
# (drop results of an earlier run first so re-running the cell does not pile up
# suffixed duplicate columns)
df_resp = df_resp.drop(columns=['timeOn', 'togglesAvMsgs'], errors='ignore')
df_resp = df_resp.merge(time_on, on=trial_keys, how='left')
df_resp = df_resp.merge(toggles, on=trial_keys, how='left')


def determine_strtgy(subject_df):
    """Derive a strategy code (0, 1 or 2) from one subject's response rows.

    Looks at the rows of the subject's last 'complex' trial and last 'simple'
    trial (the highest `trial` value within each condition):

    * 2 -- no row of the last complex trial has a truthy `guessed`
    * 1 -- otherwise, no row of the last simple trial has a truthy `guessed`
    * 0 -- otherwise

    NOTE(review): if a subject has no rows for a condition, the selection is
    empty and counts as "not guessed" -- confirm this is intended.

    `subject_df` must provide the columns 'condition', 'trial' and 'guessed'.
    """
    def _guessed_on_last_trial(condition):
        # Rows of this condition, narrowed down to its highest trial number.
        rows = subject_df[subject_df['condition'] == condition]
        final_rows = rows[rows['trial'] == rows['trial'].max()]
        return final_rows['guessed'].any()

    if not _guessed_on_last_trial('complex'):
        return 2
    if not _guessed_on_last_trial('simple'):
        return 1
    return 0
# Compute one strategy code per subject and turn the resulting series into a
# dataframe with a column named 'strategy'.
# Only the columns determine_strtgy reads are passed to apply, as in the gaze
# groupby earlier in this cell; this keeps the grouping column out of the
# applied frame, which pandas deprecates.
strategy_df = (
    df_resp.groupby('subject_id')[['condition', 'trial', 'guessed']]
    .apply(determine_strtgy)
    .reset_index()
)
strategy_df.columns = ['subject_id', 'strategy']
# Drop a 'strategy' column left over from an earlier run so re-running the cell
# does not create 'strategy_x' / 'strategy_y'.
df_resp = df_resp.drop(columns=['strategy'], errors='ignore')
df_resp = df_resp.merge(strategy_df, on='subject_id', how='left')

  strategy_df = df_resp.groupby('subject_id').apply(determine_strtgy).reset_index()


In [9]:
# Persist the preprocessed responses. The per-trial dwell-time dict is stored as
# JSON text so it can be decoded again after reading the CSV back.
save_path = "analysis/data/gaze_simulations/preprocessed_simulation_responses.csv"
merged_column_names = {'0_x': 'timeOn', '0_y': 'togglesAvMsgs'}
df_resp = df_resp.rename(columns=merged_column_names)
df_resp['timeOn'] = df_resp['timeOn'].apply(json.dumps)
responses_file = os.path.join(project_root, save_path)
df_resp.to_csv(responses_file, index=False)

In [7]:
save_path = "analysis/data/gaze_simulations/preprocessed_simulation_responses.csv"
df = pd.read_csv(os.path.join(project_root, save_path))
# 'timeOn' holds JSON text. Missing entries come back from read_csv as NaN
# (a float), which json.loads rejects with a TypeError, so only decode strings
# and leave everything else for the isinstance(x, dict) guards below.
df['timeOn'] = df['timeOn'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
# Output column suffixes and the region keys they are computed from, pairwise.
aois = ['SentMsg', 'Trgt', 'Comp', 'Dist', 'AvailableMsgs']
regions = ['message', 'target', 'competitor', 'distractor', 'available_msgs']
for aoi, reg in zip(aois, regions):
    # Share of the trial's total time spent on this region. Rows without a
    # timeOn dict, or with a total time of 0 (which would divide by zero), get 0.
    df[f'PropTimeOn{aoi}'] = df['timeOn'].map(
        lambda x: x[reg] / x['total_time'] if isinstance(x, dict) and x['total_time'] else 0
    )
# Toggles per unit of total time; rows without a timeOn dict are divided by 1.
df['RateTogglingAvailableMsgs'] = df['togglesAvMsgs'] / df['timeOn'].map(lambda x: x['total_time'] if isinstance(x, dict) else 1)


In [12]:
# Preview the first five rows of the reloaded frame with the derived columns.
df.head(n=5)

Unnamed: 0,subjID,Rneg,Rpos,patience,utility_1,utility_2,utility_3,spreading_activation,trial,item,...,subject_id,timeOn,togglesAvMsgs,strategy,PropTimeOnSentMsg,PropTimeOnTrgt,PropTimeOnComp,PropTimeOnDist,PropTimeOnAvailableMsgs,RateTogglingAvailableMsgs
0,1,-10.0,5,24,0.0,-2.5,-5000,1,0,30,...,307,"{'message': 570, 'target': 185, 'competitor': ...",0,2,0.465306,0.15102,0.15102,0.232653,0.0,0.0
1,1,-10.0,5,24,0.0,-2.5,-5000,1,1,29,...,307,"{'message': 570, 'target': 185, 'competitor': ...",0,2,0.465306,0.15102,0.15102,0.232653,0.0,0.0
2,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,...,307,"{'message': 570, 'target': 385, 'competitor': ...",0,2,0.430189,0.290566,0.140377,0.138868,0.0,0.0
3,1,-10.0,5,24,0.0,-2.5,-5000,1,3,16,...,307,"{'message': 570, 'target': 1785, 'competitor':...",2,2,0.143108,0.448155,0.117499,0.092895,0.198343,0.000502
4,1,-10.0,5,24,0.0,-2.5,-5000,1,4,9,...,307,"{'message': 570, 'target': 184, 'competitor': ...",0,2,0.430189,0.138868,0.291321,0.139623,0.0,0.0


In [17]:
# Summary statistics (mean / median / max) of the reaction time and the gaze
# measures for every combination of condition, correctness and strategy.
group_keys = ['condition', 'isCorrect', 'strategy']
summary_fns = ['mean', 'median', 'max']
measure_cols = [
    'RT',
    'PropTimeOnSentMsg',
    'PropTimeOnTrgt',
    'PropTimeOnComp',
    'PropTimeOnDist',
    'PropTimeOnAvailableMsgs',
    'RateTogglingAvailableMsgs',
]
df_stats = (
    df.groupby(group_keys)
    .agg({col: list(summary_fns) for col in measure_cols})
    .reset_index()
)
# Print one table per measure; the grouping keys are also top-level column
# labels after reset_index, so they are skipped here.
for stat in df_stats.columns.levels[0]:
    if stat in group_keys:
        continue
    print(f"Statistics for {stat}:")
    print(df_stats[[stat] + group_keys])
# compute correlation between isCorrect and PropTimeOnAvailableMsgs
prop_available_msgs = df['PropTimeOnAvailableMsgs']
correlation = df['isCorrect'].corr(prop_available_msgs)
print(f"Correlation between isCorrect and PropTimeOnAvailableMsgs: {correlation}")

Statistics for RT:
             RT                  condition isCorrect strategy
           mean  median   max                                
0   2985.567792  2635.5  6159      complex     False        0
1   3084.949000  3872.0  5992      complex     False        1
2   3528.933052  4038.0  5967      complex     False        2
3   2996.779622  2650.0  7321      complex      True        0
4   3181.719893  3928.0  5945      complex      True        1
5   4273.338782  4523.0  7345      complex      True        2
6   2760.219258  2160.0  6632       simple     False        0
7   2723.188105  2112.0  5846       simple     False        1
8   3203.398775  4019.0  5902       simple     False        2
9   3149.490228  3803.0  6435       simple      True        0
10  3465.190356  3960.0  5854       simple      True        1
11  4317.937899  4581.0  7316       simple      True        2
12  1387.840020  1440.0  1444  unambiguous      True        0
13  1387.973993  1440.0  1442  unambiguous      Tru

In [13]:
# Columns wanted in the exported file:
#     Condition, TrgtPos, Trial, PropTimeOnTrgt, PropTimeOnComp, PropTimeOnDist,
#     PropTimeOnSentMsg, PropTimeOnAvailableMsgs, RateTogglingAvailableMsgs,
#     MsgType, AnswerTime
# Rename the columns that exist under a different name, drop the subject
# descriptors and the raw dwell-time dict, and write the result to disk.
column_renames = {
    'condition': 'Condition',
    'trial': 'Trial',
    'RT': 'AnswerTime',
    'isCorrect': 'Correct',
    'subject_id': 'Subject',
    'strategy': 'StrategyLabel',
}
columns_to_drop = desc_features + ['timeOn']
df_correct = df.rename(columns=column_renames).drop(columns=columns_to_drop)
correct_file = os.path.join(project_root, "analysis/data/gaze_simulations/preprocessed_simulation_responses_correct.csv")
df_correct.to_csv(correct_file, index=False)

In [6]:
# Inspect the gaze samples of one simulated subject in the complex condition.
# Alternative filter kept for reference:
# (df_gaze[(df_gaze['condition'] == 'simple') & (df_gaze['currentStrategy'] == 'match_without_other_messages')]).iloc[0:10]
is_subject_307 = df_gaze['subject_id'] == 307
is_complex = df_gaze['condition'] == 'complex'
df_gaze[is_subject_307 & is_complex]

Unnamed: 0,subjID,Rneg,Rpos,patience,utility_1,utility_2,utility_3,spreading_activation,trial,item,condition,timeInTrial,currentStrategy,gaze_x,gaze_y,region,subj_desc,subject_id
10,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,complex,385,match,300,0,message,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
11,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,complex,570,match,100,200,distractor,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
12,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,complex,754,match,300,200,competitor,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
13,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,complex,940,match,500,200,target,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
14,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,complex,1325,match,1,1,,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254,1,-10.0,5,24,0.0,-2.5,-5000,1,34,19,complex,2421,match_without_better_messages,400,400,available_msgs,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
255,1,-10.0,5,24,0.0,-2.5,-5000,1,34,19,complex,2656,match_without_better_messages,100,200,target,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
256,1,-10.0,5,24,0.0,-2.5,-5000,1,34,19,complex,2841,match_without_better_messages,300,200,distractor,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
257,1,-10.0,5,24,0.0,-2.5,-5000,1,34,19,complex,3026,match_without_better_messages,500,200,competitor,1.0_-10.0_5.0_24.0_0.0_-2.5_-5000.0_1.0,307
