In [1]:
import pandas as pd

In [32]:
# Read the trimmed joint dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="joint_data_trimmed.csv", index_col=0)

In [21]:
# following the dataset description, some columns shouldn't be used; the gaze measures that are kept are defined below

- Skipping: a binary index of whether the word was skipped, i.e., never fixated during the entire reading of the text [and not only during the first pass] (1 = skipped, 0 = fixated at least once).
- First Fixation: the duration of the first fixation landing on the word.
- Gaze Duration: the summed duration of fixations on the word in the first pass, i.e., before the gaze leaves it for the first time.
- Total Fixation Duration: the summed duration of all fixations on the word.
- First-run Number of Fixations: the number of fixations on a word during the first pass.
- Total Number of Fixations: number of fixations on a word overall.
- Regression: a binary index of whether the gaze returned to the word after inspecting further textual material.
- Rereading: a binary index of whether the word elicited fixations after the first pass.


In [35]:
# Keep only a subset of the gaze measures, following a paper cited on the MECO website.
# NOTE(review): the feature glossary in this notebook lists "Regression", while the
# column selected here is `refix` -- confirm which of the two is intended.
gaze_features = [
    "skip",
    "firstfix.dur",
    "firstrun.dur",
    "dur",
    "firstrun.nfix",
    "nfix",
    "refix",
    "reread",
]

# Columns that locate a word: trial / sentence / interest-area fields, language and uniform_id.
basic_features = [
    "trialid",
    "sentnum",
    "ianum",
    "ia",
    "lang",
    "uniform_id",
]

# Reduce the frame to the identifying columns followed by the gaze measures.
df = df.loc[:, basic_features + gaze_features]

In [36]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,trialid,sentnum,ianum,ia,lang,uniform_id,skip,firstfix.dur,firstrun.dur,dur,firstrun.nfix,nfix,refix,reread
1,1.0,1.0,1.0,Janus,du,du_1,0.0,154.0,154.0,400.0,1.0,2.0,0.0,1.0
2,1.0,1.0,2.0,is,du,du_1,1.0,,,,,,,
3,1.0,1.0,3.0,in,du,du_1,0.0,551.0,551.0,551.0,1.0,1.0,0.0,0.0
4,1.0,1.0,4.0,de,du,du_1,1.0,,,,,,,
5,1.0,1.0,5.0,oude,du,du_1,0.0,189.0,189.0,439.0,1.0,2.0,0.0,1.0


In [37]:
# Show dtypes and per-column non-null counts to see how much of each gaze measure is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 855123 entries, 1 to 855123
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   trialid        855122 non-null  float64
 1   sentnum        855122 non-null  float64
 2   ianum          855122 non-null  float64
 3   ia             854741 non-null  object 
 4   lang           855122 non-null  object 
 5   uniform_id     855123 non-null  object 
 6   skip           855122 non-null  float64
 7   firstfix.dur   639530 non-null  float64
 8   firstrun.dur   639530 non-null  float64
 9   dur            639530 non-null  float64
 10  firstrun.nfix  639530 non-null  float64
 11  nfix           639530 non-null  float64
 12  refix          639454 non-null  float64
 13  reread         639530 non-null  float64
dtypes: float64(11), object(3)
memory usage: 97.9+ MB


In [40]:
# Summary statistics for the numeric columns (describe() leaves NaN entries out of each column's stats).
# NOTE(review): the maxima of the duration and fixation-count columns sit far above
# their 75th percentiles, so outlier handling may be needed -- confirm.
df.describe()

Unnamed: 0,trialid,sentnum,ianum,skip,firstfix.dur,firstrun.dur,dur,firstrun.nfix,nfix,refix,reread
count,855122.0,855122.0,855122.0,855122.0,639530.0,639530.0,639530.0,639530.0,639530.0,639454.0,639530.0
mean,6.319812,5.100584,84.710652,0.252118,214.771812,274.000635,396.190598,1.291295,1.870305,0.270565,0.315846
std,3.44021,2.697842,51.443266,0.434229,94.834265,181.464901,332.095123,0.666067,1.378493,0.444252,0.464852
min,1.0,1.0,1.0,0.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0
25%,3.0,3.0,41.0,0.0,156.0,171.0,199.0,1.0,1.0,0.0,0.0
50%,6.0,5.0,82.0,0.0,200.0,229.0,297.0,1.0,1.0,0.0,0.0
75%,9.0,7.0,124.0,1.0,255.0,324.0,478.0,1.0,2.0,1.0,1.0
max,12.0,16.0,243.0,1.0,12688.0,12688.0,15579.0,44.0,50.0,1.0,1.0


In [41]:
# List the distinct language codes; unique() also returns NaN when some rows have no language.
df["lang"].unique()

array(['du', 'ee', 'fi', 'ge', 'gr', 'he', 'it', 'ko', 'en', 'no', nan,
       'ru', 'sp', 'tr'], dtype=object)

In [42]:
# Language codes (as they appear in the `lang` column) that the project needs.
supported_languages = [
    "ge",
    "it",
    "ru",
    "en",
    "sp",
]

In [45]:
# Keep only rows whose language is supported; rows with a missing `lang`
# are dropped as well, because NaN is not among the supported codes.
df = df.loc[df["lang"].isin(supported_languages)]

In [46]:
# Pairwise Pearson correlations between the gaze measures.
# `skip` comes out as NaN against every other measure: the fixation measures are
# missing whenever a word is skipped, so within the pairwise-complete rows
# `skip` is constant and its correlation is undefined.
df.loc[:, gaze_features].corr(method="pearson")

Unnamed: 0,skip,firstfix.dur,firstrun.dur,dur,firstrun.nfix,nfix,refix,reread
skip,1.0,,,,,,,
firstfix.dur,,1.0,0.545658,0.294891,-0.047132,-0.028587,-0.044375,-0.011931
firstrun.dur,,0.545658,1.0,0.575459,0.741006,0.380757,0.506017,0.009664
dur,,0.294891,0.575459,1.0,0.436837,0.890073,0.496736,0.566731
firstrun.nfix,,-0.047132,0.741006,0.436837,1.0,0.509649,0.729273,0.014519
nfix,,-0.028587,0.380757,0.890073,0.509649,1.0,0.597431,0.641414
refix,,-0.044375,0.506017,0.496736,0.729273,0.597431,1.0,0.169941
reread,,-0.011931,0.009664,0.566731,0.014519,0.641414,0.169941,1.0
