In [1]:
import pandas as pd
import numpy as np
import sys
import os
import json

project_root = "/home/gatemrou/uds/thesis/Thesis-Project/"
# desired data shape:
path = "analysis/data/preprocessed_data/final_experiment.csv"

In [4]:
path_gaze = "analysis/data/gaze_simulations/gaze_simulation_gaze.csv"
path_resp = "analysis/data/gaze_simulations/gaze_simulation_responses.csv"
df_gaze = pd.read_csv(os.path.join(project_root, path_gaze))
df_resp = pd.read_csv(os.path.join(project_root, path_resp))
# create unique IDs for each simulated participant
# descriptive features: 
desc_features = ['subjID', 'Rneg', 'Rpos', 'patience', 'utility_1', 'utility_2', 'utility_3', 'spreading_activation']
# each unique combination of these features is a subject
# aggregate the data by these features
df_resp['subj_desc'] = df_resp[desc_features].apply(lambda x: '_'.join(x.astype(str)), axis=1)

df_gaze['subj_desc'] = df_gaze[desc_features].apply(lambda x: '_'.join(x.astype(str)), axis=1)

# mapper = df_resp['subject_id']
# create a unique ID for each subject
df_resp['subject_id'] = df_resp['subj_desc'].astype('category').cat.codes
subj_desc_to_id = df_resp.set_index('subj_desc')['subject_id'].to_dict()
# make sure the subject_ids agree in both dataframes
df_resp['subject_id'] = df_resp['subj_desc'].map(subj_desc_to_id)
df_gaze['subject_id'] = df_gaze['subj_desc'].map(subj_desc_to_id)

# check if the ids match
assert(df_resp['subject_id'].nunique() == df_gaze['subject_id'].nunique()), "Number of unique subjects in gaze and response data do not match."

df_resp['RT'] = df_resp['RT'] * 1000
df_resp['RT'] = df_resp['RT'].astype(int)
df_gaze['timeInTrial'] = df_gaze['timeInTrial'] * 1000
df_gaze['timeInTrial'] = df_gaze['timeInTrial'].astype(int)


In [7]:
# replace missing values in gaze data with last valid value
df_gaze['region'] = df_gaze['region'].ffill()

def time_on_regions(timing_seq, region_seq):
    # sort sequences by time
    timing_seq, region_seq = zip(*sorted(zip(timing_seq, region_seq)))
    cur_region = 'message'
    regions = ['message', 'target', 'competitor', 'distractor', 'available_msgs', 'total_time']
    cur_time = 0
    total_time = {reg: 0 for reg in regions}
    for time, region in zip(timing_seq, region_seq):
        total_time[cur_region] += time - cur_time
        cur_time = time
        cur_region = region
    assert all(val >= 0 for val in total_time.values()), "Total time on regions should be positive."
    total_time['total_time'] = sum(total_time.values())
    return total_time

def toggles_av_msgs(timing_seq, region_seq):
    toggles = 0
    cur_region = 'message'
    for time, region in zip(timing_seq, region_seq):
        if region == 'available_msgs' and cur_region != 'available_msgs':
            toggles += 1
        elif region != 'available_msgs' and cur_region == 'available_msgs':
            toggles += 1
        cur_region = region
    return toggles

time_on = df_gaze.groupby(['subject_id', 'trial'])[['timeInTrial', 'region']].apply(lambda x: time_on_regions(x['timeInTrial'], x['region'])).reset_index()
toggles = df_gaze.groupby(['subject_id', 'trial'])[['timeInTrial', 'region']].apply(lambda x: toggles_av_msgs(x['timeInTrial'], x['region'])).reset_index()
# merge the time on regions and toggles with the response data
df_resp = df_resp.merge(time_on, on=['subject_id', 'trial'], how='left')
df_resp = df_resp.merge(toggles, on=['subject_id', 'trial'], how='left')


def determine_strtgy(subject_df):
    complex_df = subject_df[subject_df['condition'] == 'complex']
    simple_df = subject_df[subject_df['condition'] == 'simple']
    last_complex = complex_df[complex_df['trial'] == complex_df['trial'].max()]
    last_simple = simple_df[simple_df['trial'] == simple_df['trial'].max()]
    if not last_complex['guessed'].any():
        return 2
    if not last_simple['guessed'].any():
        return 1
    return 0
# turn the series into a dataframe with a name 'strategy'
strategy_df = df_resp.groupby('subject_id').apply(determine_strtgy).reset_index()
strategy_df.columns = ['subject_id', 'strategy']
df_resp = df_resp.merge(strategy_df, on='subject_id', how='left')

  strategy_df = df_resp.groupby('subject_id').apply(determine_strtgy).reset_index()


In [9]:
save_path = "analysis/data/gaze_simulations/preprocessed_simulation_responses.csv"
df_resp = df_resp.rename(columns={'0_x': 'timeOn', '0_y': 'togglesAvMsgs'})
df_resp['timeOn'] = df_resp['timeOn'].apply(json.dumps)
df_resp.to_csv(os.path.join(project_root, save_path), index=False)

In [10]:
save_path = "analysis/data/gaze_simulations/preprocessed_simulation_responses.csv"
df = pd.read_csv(os.path.join(project_root, save_path))
df['timeOn'] = df['timeOn'].apply(json.loads)
aois = ['SentMsg', 'Trgt', 'Comp', 'Dist', 'AvailableMsgs']
regions = ['message', 'target', 'competitor', 'distractor', 'available_msgs']
for aoi, reg in zip(aois, regions):
    df[f'PropTimeOn{aoi}'] = df['timeOn'].map(lambda x: x[reg]/x['total_time'] if isinstance(x, dict) else 0)
df['RateTogglingAvailableMsgs'] = df['togglesAvMsgs'] / df['timeOn'].map(lambda x: x['total_time'] if isinstance(x, dict) else 1)


In [11]:
df.head()

Unnamed: 0,subjID,Rneg,Rpos,patience,utility_1,utility_2,utility_3,spreading_activation,trial,item,...,subject_id,timeOn,togglesAvMsgs,strategy,PropTimeOnSentMsg,PropTimeOnTrgt,PropTimeOnComp,PropTimeOnDist,PropTimeOnAvailableMsgs,RateTogglingAvailableMsgs
0,1,-10.0,5,24,0.0,-2.5,-5000,1,0,30,...,307,"{'message': 570, 'target': 185, 'competitor': ...",0,2,0.465306,0.15102,0.15102,0.232653,0.0,0.0
1,1,-10.0,5,24,0.0,-2.5,-5000,1,1,29,...,307,"{'message': 570, 'target': 185, 'competitor': ...",0,2,0.465306,0.15102,0.15102,0.232653,0.0,0.0
2,1,-10.0,5,24,0.0,-2.5,-5000,1,2,13,...,307,"{'message': 570, 'target': 385, 'competitor': ...",0,2,0.430189,0.290566,0.140377,0.138868,0.0,0.0
3,1,-10.0,5,24,0.0,-2.5,-5000,1,3,16,...,307,"{'message': 570, 'target': 1785, 'competitor':...",2,2,0.143108,0.448155,0.117499,0.092895,0.198343,0.000502
4,1,-10.0,5,24,0.0,-2.5,-5000,1,4,9,...,307,"{'message': 570, 'target': 184, 'competitor': ...",0,2,0.430189,0.138868,0.291321,0.139623,0.0,0.0


In [12]:
# get some statistics for average of the whole dataset, grouped by condition
df_stats = df.groupby(['condition', 'isCorrect']).agg({
    'RT': ['mean', 'std', 'min', 'max'],
    'PropTimeOnSentMsg': ['mean', 'std', 'min', 'max'],
    'PropTimeOnTrgt': ['mean', 'std', 'min', 'max'],
    'PropTimeOnComp': ['mean', 'std', 'min', 'max'],
    'PropTimeOnDist': ['mean', 'std', 'min', 'max'],
    'PropTimeOnAvailableMsgs': ['mean', 'std', 'min', 'max'],
    'RateTogglingAvailableMsgs': ['mean', 'std', 'min', 'max']
}).reset_index()
for stat in df_stats.columns.levels[0]:
    if stat == 'condition':
        continue
    print(f"Statistics for {stat}:")
    print(df_stats[[stat, 'condition', 'isCorrect']].head())
# compute correlation between isCorrect and PropTimeOnAvailableMsgs
correlation = df['isCorrect'].corr(df['PropTimeOnAvailableMsgs'])
print(f"Correlation between isCorrect and PropTimeOnAvailableMsgs: {correlation}")

Statistics for RT:
            RT                             condition isCorrect
          mean          std   min   max                       
0  3134.508425  1136.523957  1390  6159      complex     False
1  3697.951004  1136.054301  1390  7345      complex      True
2  2856.036183  1163.337808  1390  6632       simple     False
3  3778.163581  1108.726381  1390  7316       simple      True
4  1387.464458    71.585607  1290  1445  unambiguous      True
Statistics for PropTimeOnSentMsg:
  PropTimeOnSentMsg                                  condition isCorrect
               mean       std       min       max                       
0          0.229108  0.101150  0.095895  0.430189      complex     False
1          0.190088  0.092867  0.078297  0.430189      complex      True
2          0.254522  0.103554  0.088827  0.430189       simple     False
3          0.183970  0.089103  0.080270  0.430189       simple      True
4          0.465085  0.000361  0.463355  0.465306  unambiguous      

In [None]:
# Condition + TrgtPos + Trial +  PropTimeOnTrgt +
#     PropTimeOnComp + PropTimeOnDist + PropTimeOnSentMsg +
#     PropTimeOnAvailableMsgs + 
#     RateTogglingAvailableMsgs +
#     MsgType + AnswerTime
df.head()
df_correct = df.rename(columns={
    'condition': 'Condition',
    'trial': 'Trial',
    'RT': 'AnswerTime',
    'isCorrect': 'Correct',
    'subject_id': 'Subject',
    'strategy': 'StrategyLabel',
})
df_correct = df_correct.drop(columns = desc_features + ['timeOn'])
df_correct.to_csv(os.path.join(project_root, "analysis/data/gaze_simulations/preprocessed_simulation_responses_correct.csv"), index=False)