# Imports

In [1]:
import pandas as pd; import os; from analysis_helpers import *; import warnings
warnings.filterwarnings('ignore')

# Data Compile
Organize behavioral data from each participant into one dataframe for the whole experiment.

In [12]:
# Compile data from all experiments.
#
# For every subject directory in every entry of `data_dirs`:
#   1. load the behavioral csv's and add attention labels to the memory stimuli
#   2. group 1: apply the fixed timing corrections (safest estimates) and, for the
#      variable experiment, repair mislabeled cue validity
#   3. group 2: take stimulus on/off times and attention RTs from the log files and
#      shift them onto the clock written in each run's "current time" log line
#   4. add trial numbers, build the presentation gaze dataframe and write it to
#      <subject dir>/df_gaze_data.csv
# Finally, write one aggregate behavioral csv per entry of `data_dirs`.

# NOTE: these two lists are never filled in this cell; gaze is written per subject
# and behavioral data per group instead.
all_data = []
all_gaze = []

a,b = '../sustained_attention_experiment/data/','../variable_attention_experiment/data/'
# Only the variable experiment, group 2, is processed here. The full set would be
# [a+'group1', a+'group2', b+'group1', b+'group2'].
data_dirs = [b+'group2']
filenames = [x+'_aggregate.csv' for x in data_dirs]

# First UniqueID handed out by this cell.
# NOTE(review): presumably 90 continues the IDs of groups processed in earlier runs
# of this cell -- confirm before changing data_dirs.
sub_count = 90

# for each group (1 & 2) in each experiment (sustained and variable)
for data,file in zip(data_dirs, filenames):

    # behavioral dataframes of every subject in this directory
    sub_list = []

    # for each subject in this directory
    for sub_dir in os.listdir(data):
        print(sub_dir)
        if sub_dir in ('.DS_Store', 'README.md'):
            continue

        sub_path = data + '/' + sub_dir

        # add attention labels to memory stim
        subject = add_level(sum_pd(sub_path))

        # If Group 1 (Sustained or Variable)
        if data[-1] == '1':

            # TIMING CORRECTIONS (for gaze)

            # github Issue #83, Check B: times in behavioral csv's (Group1) imprecise by fractions of a second
            # correct for those timing issues (for gaze analysis), using safest estimates:
            # the larger of the two per-experiment offsets is applied to both experiments.
            is_pres = subject['Trial Type'] == 'Presentation'
            is_mem  = subject['Trial Type'] == 'Memory'

            # SUSTAINED: cued composite starts .0178 seconds earlier; VAR : .0167 earlier
            # SUSTAINED: composites disappear 0.0322 secs later; VARIABLE: .0359 secs
            subject.loc[is_pres, 'Stimulus Onset'] = subject.loc[is_pres, 'Stimulus Onset'] - .0178
            subject.loc[is_pres, 'Stimulus End']   = subject.loc[is_pres, 'Stimulus End']   + .0359

            # SUST: mem images display 0.0179 earlier; VAR: .0142
            # SUST: mem images disappear .259 secs later; VAR: .137
            subject.loc[is_mem, 'Stimulus Onset'] = subject.loc[is_mem, 'Stimulus Onset'] - .0179
            subject.loc[is_mem, 'Stimulus End']   = subject.loc[is_mem, 'Stimulus End']   + .259

            # NOTE: no correction for button press timing differences, as they averaged .001 seconds or less

            if data == '../variable_attention_experiment/data/group1':

                # EXP 2 cue corrections ---------------
                # github Issue #83, Check E: some Group1, Variable Experiment valid cues marked as invalid cues

                # Within each run, rows labeled invalid (Cue Validity == 0) whose cued side
                # differs from the run's first presentation cue are relabeled as valid.
                for run in subject['Run'].unique():
                    in_run = subject['Run'] == run
                    first_cue = subject.loc[in_run & (subject['Trial Type'] == 'Presentation'), 'Cued Side'].iloc[0]
                    mislabeled = in_run & (subject['Cue Validity'] == 0) & (subject['Cued Side'] != first_cue)
                    if mislabeled.any():
                        subject.loc[mislabeled, 'Cue Validity'] = 1

        # if Group 2 (Sustained or Variable)
        if data[-1] == '2':

            # load this subject's log files; both frames are ordered by run so that the
            # positional assignments below line up (log events in time order within a run)
            subject_log = list_logs(sub_path + '/')
            subject_log['Subject'] = pd.to_numeric(subject_log['Subject'])
            subject_log = subject_log.sort_values(by=['Subject','Run','TIME'])
            subject = subject.sort_values(by=['Subject','Run'])

            # column 0 holds the log line text that the event markers are searched in
            events  = subject_log[0]
            is_flip = events.str.contains('FLIP')

            # now, extract desired stim on and off times from log files
            composite_onsets  = list(subject_log.loc[events.str.contains('COMPOSITES ON'),  'TIME'])
            composite_offsets = list(subject_log.loc[events.str.contains('COMPOSITES OFF'), 'TIME'])
            memory_onsets     = list(subject_log.loc[events.str.contains('MEMORY ON')  & is_flip, 'TIME'])
            memory_offsets    = list(subject_log.loc[events.str.contains('MEMORY OFF') & is_flip, 'TIME'])

            # add ON and OFF stim times for group 2 (plain lists => assigned by position;
            # pandas raises if the number of log events and trials differ)
            is_pres = subject['Trial Type'] == 'Presentation'
            is_mem  = subject['Trial Type'] == 'Memory'
            subject.loc[is_pres, 'Stimulus Onset'] = composite_onsets
            subject.loc[is_pres, 'Stimulus End'  ] = composite_offsets
            subject.loc[is_mem,  'Stimulus Onset'] = memory_onsets
            subject.loc[is_mem,  'Stimulus End'  ] = memory_offsets

            # Pull attention RT's from log file
            # (the csv-based RT is not used: it is replaced wholesale by the log-based RT below)

            # find every probe display, then the first 'Keypress: 1' / 'Keypress: 3'
            # event logged after it
            probe_time_indices = subject_log[events.str.contains('ATTN')].index
            last_label = subject_log.index.max()
            key_press_indices = []
            for probe_label in probe_time_indices:
                label = probe_label + 1
                while True:
                    if label > last_label:
                        raise ValueError('no Keypress 1/3 found after probe at log row '
                                         + str(probe_label) + ' for ' + sub_dir)
                    line = str(events.loc[label])
                    if 'Keypress: 1' in line or 'Keypress: 3' in line:
                        break
                    label += 1
                # then stop and collect the row of the button press
                key_press_indices.append(label)

            probe_start = subject_log.loc[probe_time_indices, 'TIME'].astype('float64').to_numpy()
            key_press   = subject_log.loc[key_press_indices,  'TIME'].astype('float64').to_numpy()
            log_file_rt = key_press - probe_start

            # Assign by POSITION, like the onset/offset columns above. Assigning a pandas
            # Series here would align on index labels (0..N-1 of a freshly built frame)
            # instead of trial order and silently mis-assign or blank the RTs.
            subject.loc[is_pres, 'Attention Response Time (s)'] = log_file_rt

            # Convert all times to be eyetribe compatible
            for r in subject['Run'].unique():
                run_log    = subject_log[subject_log['Run'] == r]
                clock_rows = run_log[run_log[0].str.contains('urrent time')]
                if len(clock_rows) != 1:
                    raise ValueError('expected exactly one "current time" log line for run '
                                     + str(r) + ' of ' + sub_dir + ', found ' + str(len(clock_rows)))

                # log-relative time of the line vs. the clock value printed at its end
                log_time  = float(clock_rows['TIME'].iloc[0])
                curr_time = float(clock_rows[0].iloc[0].split(' ')[-1])
                diff = curr_time - log_time

                # convert times for each run
                in_run = subject['Run'] == r
                subject.loc[in_run, 'Stimulus Onset'] = subject.loc[in_run, 'Stimulus Onset'] + diff
                subject.loc[in_run, 'Stimulus End']   = subject.loc[in_run, 'Stimulus End'] + diff

            subject = subject.rename(columns={'Attention Response Time (s)': 'Attention Reaction Time (s)'})

        # add trial numbers to behavioral data
        # NOTE(review): hard-codes 8 runs of 40 memory / 10 presentation trials and relies on
        # row order; only group 2 is explicitly sorted by run above -- confirm for group 1.
        subject['Trial'] = np.nan
        subject.loc[subject['Trial Type']=='Memory','Trial']       = list(range(0,40))*8
        subject.loc[subject['Trial Type']=='Presentation','Trial'] = list(range(0,10))*8

        # Gaze data
        gaze = eye_initial(sub_path + '/eye_data/')

        # data[2:10] is the 8-character slice after '..' ('/sustain' or '/variabl');
        # later cells filter on exactly these labels, so it is kept as is.
        gaze['Subject']  = sub_dir.split('_')[0]
        gaze['UniqueID'] = sub_count
        gaze['Group'] = int(data[-1])
        gaze['Experiment'] = data[2:10]

        # KZ : need to update pres_gaze so that it pulls times from new df and not from data files
        pres_gaze = pres_gaze_from_df(subject, gaze)
        print('pres_gaze')

        # Give every subj unique ID, label group & experiment
        subject['UniqueID'] = sub_count
        subject['Group'] = int(data[-1])
        subject['Experiment'] = data[2:10]

        print(sub_count)
        print()
        sub_count += 1
        sub_list.append(subject)
        pres_gaze.to_csv(sub_path + '/df_gaze_data.csv')
        print('gaze_out')

    # one aggregate behavioral csv per group directory
    exp_raw = pd.concat(sub_list)
    exp_raw.to_csv(file)


56_2020_Feb_21
pres_gaze
90

gaze_out
17_2019_Nov_18
pres_gaze
91

gaze_out
.DS_Store
12_2019_Nov_17
pres_gaze
92

gaze_out
20_2019_Nov_19
pres_gaze
93

gaze_out
30_2020_Jan_13
pres_gaze
94

gaze_out
25_2020_Jan_24
pres_gaze
95

gaze_out
11_2019_Nov_17
pres_gaze
96

gaze_out
18_2019_Nov_19
pres_gaze
97

gaze_out
9_2019_Nov_16
pres_gaze
98

gaze_out
29_2020_Jan_13
pres_gaze
99

gaze_out
14_2019_Nov_17
pres_gaze
100

gaze_out
21_2019_Nov_19
pres_gaze
101

gaze_out
27_2020_Jan_15
pres_gaze
102

gaze_out
6_2019_Nov_15
pres_gaze
103

gaze_out
16_2019_Nov_18
pres_gaze
104

gaze_out
26_2020_Jan_16
pres_gaze
105

gaze_out
28_2020_Jan_13
pres_gaze
106

gaze_out
8_2019_Nov_16
pres_gaze
107

gaze_out
15_2019_Nov_18
pres_gaze
108

gaze_out
5_2019_Nov_15
pres_gaze
109

gaze_out
19_2019_Nov_19
pres_gaze
110

gaze_out
10_2019_Nov_16
pres_gaze
111

gaze_out
0_2020_Feb_07
pres_gaze
112

gaze_out


In [14]:
# Display the log dataframe left in scope by the compile cell, i.e. the log of the
# last subject that cell processed. The trailing comment is the per-run
# "current time" lookup used there, kept for ad-hoc debugging.
subject_log #[subject_log['Run']==r].loc[subject_log[subject_log['Run']==r][0].str.contains('urrent time')]['TIME']

Unnamed: 0,index,0,Subject,Run,TIME,WARNING,MESSAGE
0,0,0.0164 \tWARNING \twarning,0,0,0.0164,WARNING,warning
1,1,0.0165 \tEXP \texperiment,0,0,0.0165,EXP,experiment
2,2,0.0165 \tDATA \tdata,0,0,0.0165,DATA,data
3,3,0.0165 \tINFO \tinfo,0,0,0.0165,INFO,info
4,4,0.0165 \tINFO \tcurrent time: 1581099021.09,0,0,0.0165,INFO,current time: 1581099021.09
...,...,...,...,...,...,...,...
1535,1535,387.0650 \tDEBUG \thandleLineEditChange: input...,0,7,387.0650,DEBUG,handleLineEditChange: inputFieldName=5. What s...
1536,1536,387.2249 \tDEBUG \thandleLineEditChange: input...,0,7,387.2249,DEBUG,handleLineEditChange: inputFieldName=5. What s...
1537,1537,387.3610 \tDEBUG \thandleLineEditChange: input...,0,7,387.3610,DEBUG,handleLineEditChange: inputFieldName=5. What s...
1538,1538,387.5853 \tDEBUG \thandleLineEditChange: input...,0,7,387.5853,DEBUG,handleLineEditChange: inputFieldName=5. What s...


# Data Save

Save a single compiled dataframe containing the data from every participant in the study (n=120).

In [15]:
# Stack the per-group aggregate csv's (groups 1 and 2 of the sustained and the
# variable experiment) into one behavioral dataframe and save it.
exps  = ['sustained_attention_experiment/data/', 'variable_attention_experiment/data/']
files = ['group1_aggregate.csv', 'group2_aggregate.csv']

# experiment-major order: sustained g1, sustained g2, variable g1, variable g2
full_four = [pd.read_csv('../' + exp + f) for exp in exps for f in files]

full_behavioral = pd.concat(full_four)
full_behavioral.to_csv('../parsed_data/full_behavioral.csv')

# parsed_data/full_behavioral.csv now has ALL behavioral data from ALL participants!

In [22]:
# Compile the per-subject gaze dataframes (df_gaze_data.csv, written by the compile
# cell) from both groups of both experiments into one dataframe and save it.
paths  = ['../sustained_attention_experiment/data/', '../variable_attention_experiment/data/']
groups = ['group1', 'group2']
gaze   = []

for exp in paths:
    for group in groups:
        print(exp)
        print(group)
        print()
        group_dir = exp + group
        subjects = os.listdir(group_dir)
        print(subjects)
        for s in subjects:
            sub_path = group_dir + '/' + s

            # Only subject folders can hold gaze data. Skipping every non-directory entry
            # covers '.DS_Store' as well as 'README.md' (which the compile cell also
            # skips) instead of crashing when trying to list a plain file.
            if not os.path.isdir(sub_path):
                continue

            gaze_file = sub_path + '/df_gaze_data.csv'
            has_gaze  = os.path.isfile(gaze_file)
            print(has_gaze)
            if has_gaze:
                print(s)
                gaze.append(pd.read_csv(gaze_file))

# one row per gaze sample, all subjects stacked
gaze_df = pd.concat(gaze)
gaze_df.to_csv('../parsed_data/full_gaze.csv')

../sustained_attention_experiment/data/
group1

['10_2018_Oct_03', '13_2018_Oct_10', '02_2018_Sep_26', '.DS_Store', '31_2018_Oct_21', '09_2018_Oct_03', '24_2018_Oct_17', '20_2018_Oct_14', '16_2018_Oct_10', '36_2018_Oct_24', '15_2018_Oct_10', '23_2018_Oct_14', '27_2018_Oct_17', '32_2018_Oct_21', '17_2018_Oct_10', '34_2018_Oct_24', '25_2018_Oct_17', '08_2018_Oct_03', '21_2018_Oct_14', '30_2018_Oct_21', '06_2018_Sep_28', '11_2018_Oct_03', '12_2018_Oct_10', '07_2018_Oct_03', '18_2018_Oct_10', '33_2018_Oct_21', '22_2018_Oct_14', '0_2018_Sep_25', '26_2018_Oct_17', '19_2018_Oct_12', '14_2018_Oct_10']
True
10_2018_Oct_03
True
13_2018_Oct_10
True
02_2018_Sep_26
True
31_2018_Oct_21
True
09_2018_Oct_03
True
24_2018_Oct_17
True
20_2018_Oct_14
True
16_2018_Oct_10
True
36_2018_Oct_24
True
15_2018_Oct_10
True
23_2018_Oct_14
True
27_2018_Oct_17
True
32_2018_Oct_21
True
17_2018_Oct_10
True
34_2018_Oct_24
True
25_2018_Oct_17
True
08_2018_Oct_03
True
21_2018_Oct_14
True
30_2018_Oct_21
True
06_2018_Sep_28

In [23]:
# Display the gaze rows of the variable experiment, group 2.
# '/variabl' is the Experiment label the compile cell stores: it slices the data
# path with data[2:10], which yields the 8 characters following the leading '..'.
gaze_df[(gaze_df['Experiment']=='/variabl') & (gaze_df['Group']==2)]

Unnamed: 0.1,Unnamed: 0,avg,fix,lefteye,raw,righteye,state,time,timestamp,xRaw_righteye,...,xRaw_lefteye,yRaw_lefteye,av_x_coord,av_y_coord,Subject,UniqueID,Group,Experiment,Trial,Run
0,37384,"{'x': 981.7921, 'y': 440.784}",True,"{'avg': {'x': 998.1666, 'y': 431.7654}, 'pcent...","{'x': 989.0936, 'y': 435.7714}","{'avg': {'x': 966.489, 'y': 448.1826}, 'pcente...",7,695631495,1.582317e+09,985.3455,...,992.8417,425.7119,989.09360,435.77140,56,90,2,/variabl,0.0,0
1,37385,"{'x': 982.4951, 'y': 440.2157}",True,"{'avg': {'x': 997.8373, 'y': 431.3694}, 'pcent...","{'x': 984.9148, 'y': 436.0869}","{'avg': {'x': 967.1797, 'y': 447.7227}, 'pcent...",7,695631539,1.582317e+09,972.9595,...,996.8701,429.9779,984.91480,436.08685,56,90,2,/variabl,0.0,0
2,37386,"{'x': 982.7675, 'y': 439.9826}",True,"{'avg': {'x': 998.5808, 'y': 430.4707}, 'pcent...","{'x': 991.4553, 'y': 443.4114}","{'avg': {'x': 967.4844, 'y': 448.6855}, 'pcent...",7,695631573,1.582317e+09,970.0112,...,1012.8995,421.0729,991.45535,443.41135,56,90,2,/variabl,0.0,0
3,37387,"{'x': 983.8894, 'y': 439.2862}",True,"{'avg': {'x': 998.7445, 'y': 430.5535}, 'pcent...","{'x': 994.5328, 'y': 447.4511}","{'avg': {'x': 969.0383, 'y': 449.487}, 'pcente...",7,695631608,1.582317e+09,987.1200,...,1001.9456,433.8564,994.53280,447.45110,56,90,2,/variabl,0.0,0
4,37388,"{'x': 983.2207, 'y': 439.792}",True,"{'avg': {'x': 997.9229, 'y': 430.761}, 'pcente...","{'x': 983.3511, 'y': 440.2393}","{'avg': {'x': 970.467, 'y': 449.3812}, 'pcente...",7,695631643,1.582317e+09,984.3114,...,982.3907,435.9619,983.35105,440.23930,56,90,2,/variabl,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6027,33197,"{'x': 1208.8556, 'y': 528.8167}",True,"{'avg': {'x': 1207.9839, 'y': 523.3488}, 'pcen...","{'x': 1204.7251, 'y': 524.0274}","{'avg': {'x': 1209.3267, 'y': 527.0235}, 'pcen...",7,553245092,1.581101e+09,1202.3119,...,1207.1383,534.0266,1204.72510,524.02740,0,112,2,/variabl,9.0,7
6028,33198,"{'x': 1208.8379, 'y': 528.4561}",True,"{'avg': {'x': 1208.0197, 'y': 524.4816}, 'pcen...","{'x': 1213.271, 'y': 521.1824}","{'avg': {'x': 1209.8717, 'y': 525.1141}, 'pcen...",7,553245126,1.581101e+09,1217.3524,...,1209.1897,543.5931,1213.27105,521.18235,0,112,2,/variabl,9.0,7
6029,33199,"{'x': 1208.8469, 'y': 528.1111}",True,"{'avg': {'x': 1207.2369, 'y': 524.196}, 'pcent...","{'x': 1207.1382, 'y': 520.8585}","{'avg': {'x': 1210.4349, 'y': 524.6412}, 'pcen...",7,553245159,1.581101e+09,1218.0865,...,1196.1897,521.5787,1207.13810,520.85850,0,112,2,/variabl,9.0,7
6030,33200,"{'x': 1209.1949, 'y': 527.767}",True,"{'avg': {'x': 1212.0786, 'y': 526.6959}, 'pcen...","{'x': 1285.6401, 'y': 552.8955}","{'avg': {'x': 1215.6586, 'y': 525.2877}, 'pcen...",7,553245203,1.581101e+09,1288.4972,...,1282.7831,569.5120,1285.64015,552.89555,0,112,2,/variabl,9.0,7


In [21]:
# pd.read_csv(exp+group+'/'+s+'/df_gaze_data.csv')