In [21]:
import os

import json, csv
from collections import defaultdict

import math , pywt , numpy as np, pandas as pd
import scipy.stats as stats, scipy.signal as signal

from IPython.display import display, HTML
import matplotlib,matplotlib.pyplot as plt


from utils import *
from constants import *


## Config & functions

In [22]:
## Config *** TO UPDATE ***
COMPUTE_EMOTIONS = False
#

EYE_IMOTIONSFEATURES_AND_AOIS_FILE = DATA_DIR + r'\eventsDataWithAois.csv' 
PERCEIVED_DIFFICULTY_DATA = DATA_DIR + r'\perceivedDifficulty.csv'
EYEMIND_FULL_DATA = DATA_DIR + r'\allDataEyeMind.csv'
PUPIL_FILTERED_FILE = DATA_DIR + r'\PupilData_filtered.csv'
ANSWERS_DATA = DATA_DIR + r'\answers.csv'

QUESTION_FILE_PARSED = ORIG_DIR + r'\QuestionSetParsed.csv'

DEMOGRAPHIC_DATA = DATA_DIR + r'\surveys.csv'
DEMOGRAPHIC_CATALOG_DATA = DATA_DIR + r'\surveys_catalog.json'
DEMOGRAPHIC_RCATALOG_DATA = DATA_DIR + r'\surveys_rcatalog.json'


## Load data

In [31]:
#---------------------------------------------------
# Read EYE_IMOTIONSFEATURES_AND_AOIS_FILE
eifaData = pd.read_csv(EYE_IMOTIONSFEATURES_AND_AOIS_FILE) #EIFA stands for eifa
#---------------------------------------------------

#---------------------------------------------------
# Read PERCEIVED_DIFFICULTY_DATA
perceivedDifficultyData = pd.read_csv(PERCEIVED_DIFFICULTY_DATA) 
#---------------------------------------------------

#---------------------------------------------------
# Read ANSWERS_DATA
answersData = pd.read_csv(ANSWERS_DATA) 
#---------------------------------------------------

#---------------------------------------------------
# Read QUESTION_FILE_PARSED
questionsFileParsed = pd.read_csv(QUESTION_FILE_PARSED)
#---------------------------------------------------


#---------------------------------------------------
# Read EYEMIND_DATA
eyeMindFullData = pd.read_csv(EYEMIND_FULL_DATA) 
#---------------------------------------------------


#---------------------------------------------------
# Read PUPIL_FILTERED_FILE 
pupiFiltered = pd.read_csv(PUPIL_FILTERED_FILE)
#---------------------------------------------------


  eyeMindFullData = pd.read_csv(EYEMIND_FULL_DATA)


In [32]:
##### Add additional question attributes to eifaData, pupiFiltered ###

In [33]:
eifaData = eifaData.merge(questionsFileParsed, left_on='currentQuestion', right_on='id', how='left')
pupiFiltered = pupiFiltered.merge(questionsFileParsed, left_on='currentQuestion', right_on='id', how='left')

## Tests
Several tests on the collected data

### Tests 0: number of tasks & participants

In [34]:
# Tests 0: partIDs & taskIDs
#
partIDs = list(eifaData['participant'].unique())
taskIDs = list(questionsFileParsed['id'].values)

print(f'{len(partIDs)=} retrieved from files (for {len(PARTICIPANTS)=} in total).')
print(f'{len(taskIDs)=} tasks found after parsing questions file.')

len(partIDs)=25 retrieved from files (for len(PARTICIPANTS)=50 in total).
len(taskIDs)=30 tasks found after parsing questions file.


### Tests 1: data size
For each task (of each participant), a sufficient amount of data must be selected for the computation.

In [35]:
# Tests 1 - measures: perceived difficulty, performance, count of fixation and LHIPA
from warnings import warn

# 1. Pereceived difficulty
data = perceivedDifficultyData.loc[
    perceivedDifficultyData['Type1'] == 'Coarse',  # Type3: acc/ess, Type4: Compl/Simpl
    ['participant', 'MainQuestionID']].copy(deep=True)
# For each task (MainQUestionID), is there enough participant responses?
sum_response = defaultdict(int)
sum_response = {k: data.loc[data['MainQuestionID']==k].size for k in data['MainQuestionID']}
print(f'{data.size=}, count (qID: #q)={sum_response}')
del data

# 2. Performance
# Nothing to do

# 3. Count of fixations
data = eifaData.loc[(eifaData['Type2'] == 'MainQuestion') &
    (~eifaData['FixID'].isna()) & (~eifaData['currentQuestion'].isna()),
    ['participant', 'currentQuestion', 
     'FixID', 'Fixation Start', 'Fixation End', 'Fixation Duration', 
     'tabName', 'element', 
     'Timestamp']].copy(deep=True)
taskMainQuestionIDs = list(data['currentQuestion'].unique())

# For each task, compute the duration (of the task) and the count of (all) fixations
testCountFixations = pd.DataFrame()
for pID in partIDs:
    for tID in taskMainQuestionIDs: # instead of taskIDs:
        df_filtering_mask = (data['participant' ]== pID) & (data['currentQuestion'] == tID)
        taskData = data.loc[df_filtering_mask, ['Timestamp', 'FixID', 'Fixation Duration']]

        fixationCount = len(taskData['FixID'].unique())
        task_duration = 0
        if fixationCount > 0:
             task_duration = taskData['Timestamp'].iloc[-1] - taskData['Timestamp'].iloc[0]
        else: warn(f'Task{tID} of part. {pID} has no fixations.')

        testCountFixations = pd.concat([testCountFixations, pd.DataFrame({
            'participant': pID, 'MainQuestionID': tID,
            'TaskDuratuion': task_duration,
            'FixationCount': fixationCount}, index=[0])]).reset_index(drop=True)

# All tasks should have fixations
# assert (testCountFixations['FixationCount'] > 0).all(), 'One or more tasks have no fixations'
display(testCountFixations.head())

# 4. LHIPA - time support
data = pupiFiltered.loc[(pupiFiltered['Type2'] == 'MainQuestion') &
    (~pupiFiltered['currentQuestion'].isna()),
    ['participant', 'currentQuestion', 
     'pupilSize', 
     'tabName', 'element', 
     'Timestamp']].copy(deep=True)
taskMainQuestionIDs = list(data['currentQuestion'].unique())

# For each task, compute the duration (of the task) and the count of pupilSize samples
testTimeSupport = pd.DataFrame()
for pID in partIDs:
    for tID in taskMainQuestionIDs: # instead of taskIDs:
        df_filtering_mask = (data['participant' ]== pID) & (data['currentQuestion'] == tID)
        taskData = data.loc[df_filtering_mask, ['Timestamp', 'pupilSize']]
        
        sampleCount = taskData['pupilSize'].count() # count of non-na values
        task_duration = 0
        sample_freq = None
        if sampleCount > 0:
            task_duration = taskData['Timestamp'].iloc[-1] - taskData['Timestamp'].iloc[0]
            sample_freq = sampleCount * 1000/task_duration

            if sampleCount < 5 * sample_freq: # 5[s] min time support (pupillography should be max @40Hz)
                warn(f'Task{tID} of part. {pID} dosen\'t reach the minimum time support.')

        else: warn(f'Task{tID} of part. {pID} has no pupil size sample.')

        testTimeSupport = pd.concat([testTimeSupport, pd.DataFrame({
            'participant': pID,'MainQuestionID': tID,
            'TaskDuratuion': task_duration,
            'SampleCount': sampleCount,
            'SampleFreq': sample_freq}, index=[0])]).reset_index(drop=True)

# All tasks should have pupil size samples
# assert (testTimeSupport['SampleCount'] > 0).all(), 'One or more tasks have no fixations'        
display(testTimeSupport.head())

data.size=300, count (qID: #q)={9.0: 50, 21.0: 50, 5.0: 50, 19.0: 50, 3.0: 50, 7.0: 50}


Unnamed: 0,participant,MainQuestionID,TaskDuratuion,FixationCount
0,P12,1.0,82157.396,248
1,P12,25.0,59168.374,202
2,P12,27.0,96380.841,324
3,P12,9.0,37837.394,139
4,P12,21.0,87598.463,305


  warn(f'Task{tID} of part. {pID} dosen\'t reach the minimum time support.')


Unnamed: 0,participant,MainQuestionID,TaskDuratuion,SampleCount,SampleFreq
0,P12,1.0,82315.721,2503,30.407314
1,P12,25.0,59318.326,1793,30.226746
2,P12,27.0,96414.168,2713,28.139018
3,P12,9.0,38295.69,1319,34.442518
4,P12,21.0,87715.084,2927,33.369403


In [36]:
print('questionsFileParsed:')
df = questionsFileParsed
display(df.head())
print(f'data size:{len(df.index)}')
display(df['id'].unique())

print('\neifaData:')
df = eifaData
display(df.head())
print(f'data size:{len(df.index)}')
display(df['currentQuestion'].unique())

print('\nperceivedDifficultyData:')
df = perceivedDifficultyData
display(df.head())
print(f'data size:{len(df.index)}')
display(df['MainQuestionID'].unique())

print('\nanswersData:')
df = answersData
display(df.head())
print(f'data size:{len(df.index)}')
display(df['questionID'].unique())

print('\neyeMindFullData:')
df = eyeMindFullData
display(df.head())
print(f'data size:{len(df.index)}')
display(df['questionID'].unique())

print('\npupiFiltered:')
df = pupiFiltered
display(df.head())
print(f'data size:{len(df.index)}')
display(df['currentQuestion'].unique())

df = None

questionsFileParsed:


Unnamed: 0,id,question,type,options,TaskId,Type1,Type2,Type3,Type4,ComparisionGroup,DifficultyQuestionID,GroundTruthAnswer
0,1,This process has an execution where activity A...,multiple-choice,Yes;No;I don't know,1,,MainQuestion,,,,,yes
1,2,How difficult was the task on this model? [Thi...,multiple-choice,Very difficult;Difficult;Neutral;Easy;Very easy,1,,DifficultyQuestion,,,,1.0,
2,3,"In this process, if activity BA is executed, a...",multiple-choice,Yes;No;I don't know,2,Coarse,MainQuestion,essentialComlexity,Complex,1.0,,no
3,4,How difficult was the task on this model? [In ...,multiple-choice,Very difficult;Difficult;Neutral;Easy;Very easy,2,Coarse,DifficultyQuestion,essentialComlexity,DiffComplex,1.0,1.0,
4,5,This process has an execution where activity B...,multiple-choice,Yes;No;I don't know,3,Coarse,MainQuestion,essentialComlexity,Complex,2.0,,yes


data size:30


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], dtype=int64)


eifaData:


Unnamed: 0,participant,FixID,Fixation X,Fixation Y,Fixation Start,Fixation End,Fixation Duration,Fixation Dispersion,SacID,Saccade Start,...,type,options,TaskId,Type1,Type2,Type3,Type4,ComparisionGroup,DifficultyQuestionID,GroundTruthAnswer
0,P12,1.0,922.1019,350.4255,2803678.0,2803836.1,158.3245,0.346,,,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
1,P12,,,,,,,,1.0,2803836.1,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
2,P12,2.0,980.6897,360.7931,2803853.0,2804094.373,241.6455,0.1919,,,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
3,P12,,,,,,,,2.0,2804094.373,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
4,P12,3.0,706.2981,54.0288,2804161.0,2804377.682,216.6395,0.282,,,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes


data size:449537


array([ 1.,  2., nan, 25., 26., 27., 28.,  9., 10., 21., 22., 13., 14.,
       15., 16.,  5.,  6., 23., 24., 29., 30., 19., 20.,  3.,  4.,  7.,
        8., 11., 12., 17., 18.])


perceivedDifficultyData:


Unnamed: 0.1,Unnamed: 0,participant,questionTimestamp,questionEventType,questionText,questionAnswer,difficultyScore,questionPosition,questionID,MainQuestionID,eventSource,Type1,Type3,Type4,ComparisionGroup
0,0,P12,1698669000000.0,questionOffset,How difficult was the task on this model? [Thi...,Neutral,2,1.0,2.0,1.0,questionnaire,,,,
1,1,P12,1698670000000.0,questionOffset,How difficult was the task on this model? [In ...,Easy,1,3.0,26.0,25.0,questionnaire,Fine,accidentalComlexity,Simple,8.0
2,2,P12,1698670000000.0,questionOffset,How difficult was the task on this model? [In ...,Neutral,2,5.0,28.0,27.0,questionnaire,Fine,accidentalComlexity,Complex,7.0
3,3,P12,1698670000000.0,questionOffset,How difficult was the task on this model? [Thi...,Very easy,0,7.0,10.0,9.0,questionnaire,Coarse,essentialComlexity,Simple,2.0
4,4,P12,1698670000000.0,questionOffset,How difficult was the task on this model? [Thi...,Neutral,2,9.0,22.0,21.0,questionnaire,Coarse,accidentalComlexity,Complex,2.0


data size:375


array([ 1., 25., 27.,  9., 21., 13., 15.,  5., 23., 29., 19.,  3.,  7.,
       11., 17.])


answersData:


Unnamed: 0.1,Unnamed: 0,participant,questionTimestamp,questionEventType,questionText,questionAnswer,currentQuestion,questionID,eventSource,Type1,Type2,Type3,Type4,GroundTruthAnswer,accuracy,ComparisionGroup
0,0,P12,1698669000000.0,questionOffset,This process has an execution where activity A...,Yes,,1.0,questionnaire,,MainQuestion,,,yes,1,
1,2,P12,1698670000000.0,questionOffset,"In this process, activity AR is executed in pa...",No,,25.0,questionnaire,Fine,MainQuestion,accidentalComlexity,Simple,no,1,8.0
2,4,P12,1698670000000.0,questionOffset,"In every execution of this process, there is a...",Yes,,27.0,questionnaire,Fine,MainQuestion,accidentalComlexity,Complex,yes,1,7.0
3,6,P12,1698670000000.0,questionOffset,This process has an execution where activity A...,Yes,,9.0,questionnaire,Coarse,MainQuestion,essentialComlexity,Simple,yes,1,2.0
4,8,P12,1698670000000.0,questionOffset,This process has an execution where activity A...,Yes,,21.0,questionnaire,Coarse,MainQuestion,accidentalComlexity,Complex,yes,1,2.0


data size:375


array([ 1., 25., 27.,  9., 21., 13., 15.,  5., 23., 29., 19.,  3.,  7.,
       11., 17.])


eyeMindFullData:


Unnamed: 0.1,Unnamed: 0,questionTimestamp,questionEventType,questionText,questionAnswer,questionPosition,questionID,eventSource,Timestamp,validLeft,...,leftY,rightX,rightY,leftDistance,rightDistance,x,y,tabName,element,participant
0,0,1698669000000.0,questionOnset,This process has an execution where activity A...,,0.0,1.0,questionnaire,,,...,,,,,,,,,,P12
1,1,,,,,,,eye-tracker,2803665.274,0.0,...,,,,,,,,,,P12
2,2,,,,,,,eye-tracker,2803673.606,1.0,...,368.0,943.0,357.0,642.949097,630.195435,929.5,362.5,model-g1.bpmn,Activity_08pnog0,P12
3,3,,,,,,,eye-tracker,2803681.945,1.0,...,357.0,942.0,352.0,642.937439,630.076782,929.0,354.5,model-g1.bpmn,Activity_08pnog0,P12
4,4,,,,,,,eye-tracker,2803690.245,1.0,...,357.0,942.0,353.0,643.449402,630.30719,929.0,355.0,model-g1.bpmn,Activity_08pnog0,P12


data size:5117734


array([ 1., nan,  2., 25., 26., 27., 28.,  9., 10., 21., 22., 13., 14.,
       15., 16.,  5.,  6., 23., 24., 29., 30., 19., 20.,  3.,  4.,  7.,
        8., 11., 12., 17., 18.])


pupiFiltered:


Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,eventSource,Timestamp,validLeft,validRight,leftPupilValidity,rightPupilValidity,snapshotId,...,type,options,TaskId,Type1,Type2,Type3,Type4,ComparisionGroup,DifficultyQuestionID,GroundTruthAnswer
0,0,0,3,eye-tracker,2803690.245,1,1,1,1,0,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
1,1,1,6,eye-tracker,2803715.241,1,1,1,1,0,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
2,2,2,9,eye-tracker,2803740.24,1,1,1,1,0,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
3,3,3,12,eye-tracker,2803765.235,1,1,1,1,0,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes
4,4,4,15,eye-tracker,2803790.258,1,1,1,1,0,...,multiple-choice,Yes;No;I don't know,1.0,,MainQuestion,,,,,yes


data size:1374291


array([ 1.,  2., nan, 25., 26., 27., 28.,  9., 10., 21., 22., 13., 14.,
       15., 16.,  5.,  6., 23., 24., 29., 30., 19., 20.,  3.,  4.,  7.,
        8., 11., 12., 17., 18.])

In [37]:
## Data filtering

# Completely remove participants
#  /!\ Participants are no longer part of the contingent /!\
#  See hereafter for removing tasks (all if needed) from participants
#  -> Nothing to do
# Completely remove tasks
#  -> Nothing to do


# task kept for the analysis
keepTID = set([3, 5, 7, 9, 19, 21]) # tuple are hashable (immutable)

# Remove tasks from participants
removedElmt = {
    'P32': set([9]) # Task not recorded (press next button twice)
}

# Look at the set of participants (in constants.py) for participants
# in the set of rejected eye-tracking in order to remove their tasks
for pID in PARTICIPANT_SETS['et-rejected']:
    if not(pID in removedElmt.keys()): removedElmt[pID] = set()
    [removedElmt[pID].add(x) for x in keepTID]




# Remove tasks with NaN values (if exists)
def setFilterNA(df, field, colName = 'DataSuitabilityFilter'):
    if colName not in df.columns:
        df[colName] = False
    l = sum(df[colName].values)
    
    df[colName] |= df[field].isna()

    return sum(df[colName].values) - l
# Remove particular tasks in the list of removedElmt
def setFilterElmt(df, pField, tField, colName = 'DataSuitabilityFilter'):
    if colName not in df.columns:
        df[colName] = False
    l = sum(df[colName].values)

    for pID, tIDs in removedElmt.items():
        df[colName] |= (df[pField] == pID) & (df[tField].isin(tIDs))

    return sum(df[colName].values) - l
# Remove particular tasks for all participants
def setFilter(df, field, values, colName = 'DataSuitabilityFilter'):
    if colName not in df.columns:
        df[colName] = False
    l = sum(df[colName].values)
    
    df[colName] |= df[field].isin(values)

    return sum(df[colName].values) - l



# Use EYE_IMOTIONSFEATURES_AND_AOIS_FILE
l = eifaData.index.size
l -= setFilterNA(eifaData, 'currentQuestion')
l -= setFilterElmt(eifaData, 'participant', 'currentQuestion')
l -= setFilter(eifaData, 'currentQuestion', [_ for _ in list(taskIDs) if _ not in keepTID])
eifaData = eifaData.loc[~eifaData['DataSuitabilityFilter']]
assert eifaData.index.size == l

# Use PERCEIVED_DIFFICULTY_DATA
# setFilterElmt(perceivedDifficultyData, 'participant', 'MainQuestionID')
# perceivedDifficultyData = perceivedDifficultyData[~perceivedDifficultyData['DataSuitabilityFilter']]

# Use ANSWERS_DATA
# setFilterElmt(answersData, 'participant', 'questionID')
# answersData = answersData.loc[~answersData['DataSuitabilityFilter']]

# Use QUESTION_FILE_PARSED
# (nothing to do for participants)
# questionsFileParsed = questionsFileParsed.loc[~questionsFileParsed['DataSuitabilityFilter']]

# Use EYEMIND_DATA
setFilterNA(eyeMindFullData, 'questionID')
setFilterElmt(eyeMindFullData, 'participant', 'questionID')
setFilter(eyeMindFullData, 'questionID', [_ for _ in list(taskIDs) if _ not in keepTID])
eyeMindFullData = eyeMindFullData.loc[~eyeMindFullData['DataSuitabilityFilter']]

# Use PUPIL_FILTERED_FILE 
setFilterNA(pupiFiltered, 'currentQuestion')
setFilterElmt(pupiFiltered, 'participant', 'currentQuestion')
setFilter(pupiFiltered, 'currentQuestion', [_ for _ in list(taskIDs) if _ not in keepTID])
pupiFiltered = pupiFiltered.loc[~pupiFiltered['DataSuitabilityFilter']]



print(f'{len(partIDs)=} after filtering.')
print(f'{len(taskIDs)=} after filtering.')
print()

print_size = lambda label, df: print(f'{label}\t\t\tdata size:{len(df.index)}')
print_size('eifaData', eifaData)
print_size('perceivedDifficultyData', perceivedDifficultyData)
print_size('answersData', answersData)
print_size('questionsFileParsed', questionsFileParsed)
print_size('eyeMindFullData', eyeMindFullData)
print_size('pupiFiltered', pupiFiltered)

len(partIDs)=25 after filtering.
len(taskIDs)=30 after filtering.

eifaData			data size:158169
perceivedDifficultyData			data size:375
answersData			data size:375
questionsFileParsed			data size:30
eyeMindFullData			data size:298
pupiFiltered			data size:468151


In [38]:
## Copy 'simple' task data
# For Coarse grained analysis, copy 'essentialComlexity' 'Simple' as 'AccidentalComplexity' 'Simple' since it is the same artifact/question used for both
# Note: after this operation the order of the data is lost, but this is expected not be a problem, for the analysis in this notebook, Please check!

datasets = [eifaData, perceivedDifficultyData, answersData, pupiFiltered]

for i in range(len(datasets)):
    # Selecting rows where Type3='essentialComlexity' and Type4='simple' and 'Type1' == 'Coarse'
    subset = datasets[i][(datasets[i]['Type3'] == 'essentialComlexity') & (datasets[i]['Type4'] == 'Simple') & (datasets[i]['Type1'] == 'Coarse')].copy()
    
    # Changing 'Type3' in the subset to 'accidentalComlexity'
    subset['Type3'] = 'accidentalComlexity'

    # Appending the modified subset to the original dataframe
    datasets[i] = pd.concat([datasets[i], subset], ignore_index=True)
    
#workaround to ensure that the original datasets are modified. TO FIX
eifaData = datasets[0] 
perceivedDifficultyData = datasets[1] 
answersData =  datasets[2] 
pupiFiltered =  datasets[3] 

In [39]:
# Store DataFrames - Pickling
if not os.path.isdir(DATA_DIR + 'pkl'):
    os.mkdir(DATA_DIR + 'pkl')

eifaData.to_pickle(DATA_DIR + 'pkl' + r'\eifaData.pkl')
perceivedDifficultyData.to_pickle(DATA_DIR + 'pkl' + r'\perceivedDifficultyData.pkl')
answersData.to_pickle(DATA_DIR + 'pkl' + r'\answersData.pkl')
questionsFileParsed.to_pickle(DATA_DIR + 'pkl' + r'\questionsFileParsed.pkl')
eyeMindFullData.to_pickle(DATA_DIR + 'pkl' + r'\eyeMindFullData.pkl')
pupiFiltered.to_pickle(DATA_DIR + 'pkl' + r'\pupiFiltered.pkl')

In [40]:
## Complete the datasets with demographic data
#
# Load demographic data (and catalogs)
demogData = pd.read_csv(DEMOGRAPHIC_DATA)
with open(DEMOGRAPHIC_CATALOG_DATA) as f:
    demogCatalog = json.load(f)
    f.close()

with open(DEMOGRAPHIC_RCATALOG_DATA) as f:
    demogRCatalog = json.load(f)
    f.close()
demogData_cols = ['ParticipantID', 'Familiarized']


## Store DataFrames - Pickling
demogData.to_pickle(DATA_DIR + 'pkl' + r'\demogData.pkl')

eifaData2 = eifaData.merge(demogData, left_on='participant', right_on='ParticipantID', how='left')
eifaData2.to_pickle(DATA_DIR + 'pkl' + r'\eifaData2.pkl')

pupiFiltered2 = pupiFiltered.merge(demogData, left_on='participant', right_on='ParticipantID', how='left')
pupiFiltered2.to_pickle(DATA_DIR + 'pkl' + r'\pupiFiltered2.pkl')

perceivedDifficultyData2 = perceivedDifficultyData.merge(demogData, left_on='participant', right_on='ParticipantID', how='left')
perceivedDifficultyData2.to_pickle(DATA_DIR + 'pkl' + r'\perceivedDifficultyData2.pkl')

answersData2 = answersData.merge(demogData, left_on='participant', right_on='ParticipantID', how='left')
answersData2.to_pickle(DATA_DIR + 'pkl' + r'\answersData2.pkl')


pupiFiltered2.head()

Unnamed: 0,Unnamed: 0.1,index,Unnamed: 0_x,eventSource,Timestamp,validLeft,validRight,leftPupilValidity,rightPupilValidity,snapshotId,...,StrategyChangedReason,StrategySecond,StrategySecondOther,Familiarized,Continue,ProgKnownLanguageProcedural,ProgKnownLanguageLogic,ProgKnownLanguageOO,ProgKnownLanguageVisual,ProgKnownLanguageFunctional
0,11964,11964,48138,eye-tracker,3204770.173,1,1,1,1,46,...,1.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
1,11965,11965,48141,eye-tracker,3204795.121,1,1,1,1,47,...,1.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
2,11966,11966,48144,eye-tracker,3204820.164,1,1,1,1,47,...,1.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
3,11967,11967,48147,eye-tracker,3204845.151,1,0,1,0,47,...,1.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
4,11968,11968,48150,eye-tracker,3204870.115,1,1,1,1,47,...,1.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
