In [None]:
import sys
import importlib

import numpy as np
import pandas as pd
import os.path

# Import the model we are using
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# run this command only the first time to download the BTSSlayers library from gitbub
!wget https://github.com/Critt-Kent/Behavioral-Translation-Style-Space/blob/main/BTSSlayers.py 

# then import as btss
import BTSSlayers as btss


In [None]:
# Dataframe visualisation options: lift the pandas limits on displayed columns and rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Key-Gaze analysis
- Read AU and FD data
- Compute Gaze measures
- Correlate with PUB and KBI 

In [None]:
# A small set of sessions, listed as (study, session) pairs
sample_sessions = (
    ('ACS08', 'P03_T1'),
    ('ACS08', 'P08_T1'),
    ('ACS08', 'P06_T4'),
    ('ACS08', 'P11_T2'),
    ('AR22', 'P12_T6'),
    ('AR22', 'P12_T5'),
)
L = [f"{study}/Tables/{session}" for study, session in sample_sessions]

# Load the Activity Unit (au1) and fixation data (fd) layers for these sessions
BTSS1 = btss.readBTSSsessions(L, layers=['au1', 'fd'], verbose=0)

# Short aliases for the two layers
AUdf, FDdf = BTSS1['au1'], BTSS1['fd']
print("AUs:", AUdf.shape, "FDs:", FDdf.shape)

In [None]:
# Session list for the large run: 491 translation sessions from the TPR-DB
# (tab-separated file; read_table uses "\t" as its default separator)
GD = pd.read_table('sorted.gaze.clean.txt')

# Load the au1 and fd tables of every listed session from the TPR-DB
session_ids = GD['Study-Session']
BTSS1 = btss.readBTSSsessions(session_ids, layers=['au1', 'fd'], verbose=0)

# Short aliases for the two layers
AUdf, FDdf = BTSS1['au1'], BTSS1['fd']
print("AUs:", AUdf.shape, "FDs:", FDdf.shape)

In [None]:
# Preview the first five fixation rows
FDdf.head(n=5)

In [None]:
# Extract one row per session.
# The '-' separator keeps Study and Session distinguishable inside the key and
# matches the StudySession key that is built for the fixation data further down.
AUdf['StudySession'] = AUdf['Study'] + '-' + AUdf['Session']

# drop_duplicates keeps the first Activity Unit row of every session
A = AUdf.drop_duplicates(subset=['StudySession'])
A.shape

In [None]:
# Preview the first five Activity Unit rows
AUdf.head(n=5)

In [None]:
# Preview the first five fixation rows
FDdf.head(n=5)

In [None]:
# Preview the first rows of PUdf.
# NOTE(review): no cell shown above assigns PUdf; on a fresh kernel this raises
# NameError unless the frame is created elsewhere. Confirm where it is loaded.
PUdf.head()

In [None]:
# Preview the first rows of HOFdf.
# NOTE(review): no cell shown above assigns HOFdf; on a fresh kernel this raises
# NameError unless the frame is created elsewhere. Confirm where it is loaded.
HOFdf.head()

In [None]:
# Preview the first rows of POLdf.
# NOTE(review): no cell shown above assigns POLdf; on a fresh kernel this raises
# NameError unless the frame is created elsewhere. Confirm where it is loaded.
POLdf.head()

In [None]:
# Preview the first rows of PHdf.
# NOTE(review): no cell shown above assigns PHdf; on a fresh kernel this raises
# NameError unless the frame is created elsewhere. Confirm where it is loaded.
PHdf.head()

## Distribution of log HOF values

- log duration
- log insertions and log deletions
- log Linear reading, Regressive reading, scattered reading

In [None]:

HOF_order = ["H", "O", "R", "F"]
palette = ["red", "blue", "green", "black"]

# log(x + 1) keeps rows with zero insertions / deletions finite (they map to 0)
HOFdf['LogIns'] = np.log(HOFdf['Ins'] + 1)
HOFdf['LogDel'] = np.log(HOFdf['Del'] + 1)

# (row filter, column to plot) for every distribution, in display order
hof_distributions = [
    (HOFdf.Dur > 1, "LogDur"),
    (HOFdf.LogIns > 0, "LogIns"),
    (HOFdf.LogDel > 0, "LogDel"),
    (HOFdf.LogDur_L > 1, "LogDur_L"),
    (HOFdf.LogDur_R > 1, "LogDur_R"),
    (HOFdf.LogDur_S > 1, "LogDur_S"),
]

# One figure per distribution: unfilled histogram plus KDE curve per HOF state.
# Every figure, including the last one, is closed with plt.show() so that the
# cell does not end by echoing an Axes repr.
for keep_rows, column in hof_distributions:
    AU1 = HOFdf[keep_rows]
    sns.histplot(data=AU1, x=column, bins=100, alpha=0.3, hue='HOF', fill=False, kde=True,
                 stat="probability", palette=palette, hue_order=HOF_order)
    plt.show()


## Distribution of Gaze Duration

In [None]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid")

desired_order = ["H", "O", "R", "F"]

# (column, lower cut-off, histogram statistic): one figure per entry, in display order
gaze_panels = (
    ("LogDur_R", 1, "density"),
    ("LogDur_L", 1, "probability"),
    ("LogDur_S", 1, "probability"),
    ("RelDur_R", 0, "density"),
    ("RelDur_L", 0, "probability"),
    ("RelDur_S", 0, "probability"),
)

for column, cutoff, stat_kind in gaze_panels:
    # keep only rows strictly above the cut-off for this column
    H = HOFdf[HOFdf[column] > cutoff]
    sns.histplot(data=H, x=column, bins=30, hue='HOF', alpha=0.2, stat=stat_kind,
                 kde=True, hue_order=desired_order)
    plt.show()



## LogOdds of ST / TT Fixations per HOF state

The fixation odds are ${\frac {p}{1-p}}$, where $p$ is the proportion of fixations that land on a different word ($p = fixRel$ below), so $1-p$ is the refixation proportion. 
$p = 1$ if every fixation in a HOF state is on a different word (refixation chance = 0). 

logOdds is the logarithm of the fixation odds, i.e., a measure of how often a word is refixated:
- logOdds = 0: 50% refixation chance (every word is on average fixated twice)
- logOdds > 0: refixation chance on a word < 50%
- logOdds < 0: refixation chance on a word > 50%


it's computed as:

- $fixLogOdds = \mathrm{logit} (fixRel)= \log (\frac {fixRel}{1-fixRel})$

where:
- $fixRel$: proportion of fixations that fall on different ST or TT words, $fixRel = \frac{fixDiff}{fixTot}$

and
- $fixDiff$: number of different ST / TT words fixated
- $fixTot$: total number of ST / TT fixations per HOF state


In [None]:
desired_order = ["H", "O", "R", "F"]

# SfixTot:     total number of ST fixations per HOF state
# SfixDiff:    number of different ST words fixated
# SfixRel:     SfixDiff / SfixTot
# SfixLogOdds: log(SfixRel / (1 - SfixRel))
#
# TfixTot, TfixDiff, TfixRel, TfixLogOdds: the same measures for TT fixations

for side in ("S", "T"):
    fix_rel = HOFdf[f'{side}fixDiff'] / HOFdf[f'{side}fixTot']
    HOFdf[f'{side}fixRel'] = fix_rel

    # fix_rel == 1 divides by zero (+inf) and fix_rel == 0 takes log(0) (-inf);
    # silence the resulting numpy warnings and handle those rows explicitly below
    with np.errstate(divide='ignore', invalid='ignore'):
        log_odds = np.log(fix_rel / (1 - fix_rel))

    # Store NaN instead of +/-inf: infinite values cannot be drawn and would
    # distort the quartiles of the boxplots
    HOFdf[f'{side}fixLogOdds'] = log_odds.replace([np.inf, -np.inf], np.nan)

for column in ('SfixLogOdds', 'TfixLogOdds'):
    sns.boxplot(data=HOFdf, y=column, hue='HOF', hue_order=desired_order)
    plt.show()


## LogOdds of reading patterns:
    - Dur_L: linear reading
    - Dur_R: regressive reading
    - Dur_S: scattered gaze data
    - Dur_N: no fixation data recorded

In [None]:
# Log odds log(p / (1 - p)) of each RelDur_* column
for pattern in ("L", "R", "S", "N"):
    rel_dur = HOFdf[f'RelDur_{pattern}']

    # RelDur values of exactly 0 or 1 give -inf / +inf; silence the numpy
    # warnings here and handle those rows explicitly below
    with np.errstate(divide='ignore', invalid='ignore'):
        log_odds = np.log(rel_dur / (1 - rel_dur))

    # Store NaN instead of +/-inf so the boxplot quartiles stay finite
    HOFdf[f'LogOddsDur_{pattern}'] = log_odds.replace([np.inf, -np.inf], np.nan)

# One boxplot per reading pattern; desired_order is the HOF order set in the previous cell
for pattern in ("R", "L", "S", "N"):
    sns.boxplot(data=HOFdf, y=f'LogOddsDur_{pattern}', hue='HOF', hue_order=desired_order)
    plt.show()



In [None]:
# Normality check of the log measures with scipy.stats.normaltest
import numpy as np
from scipy import stats

# LogInEff = log((Ins + Del + 1) / (Del + 1))
POLdf['LogInEff'] = np.log((POLdf['Ins'] + POLdf['Del'] + 1) / (POLdf['Del'] + 1))

PL1 = POLdf
T = ['LogOdur', 'LogHdur', 'LogRdur', 'LogFdur', 'LogInEff']

# Common significance level for rejecting H0 (the sample is normally distributed)
alpha = 0.05
verdicts = {
    True: "Normal Data: The sample likely comes from a normal distribution (fail to reject H0).",
    False: "Normal Data: The sample likely does not come from a normal distribution (reject H0).",
}

for t in T:
    # only strictly positive values of the measure enter the test
    positive = PL1.loc[PL1[t] > 0, t]
    test_stat, p_value = stats.normaltest(positive)

    print(t, f"Mean:{positive.mean()} Std:{positive.std()} Statistic: {test_stat:.4f}, P-value: {p_value:.8f}")
    print(t, verdicts[bool(p_value > alpha)])

    positive.hist(bins=50)
    plt.show()



In [None]:
# Frequency table of the DOHRFI policy labels: absolute counts and relative shares
label_counts = POLdf.DOHRFI.value_counts()
label_shares = POLdf.DOHRFI.value_counts(normalize=True)
H = pd.concat([label_counts, label_shares], axis=1, keys=["Count", "%"]).reset_index()

# Running totals over the labels, most frequent label first
H['CumSum'] = H['Count'].cumsum()
H['CumSum%'] = H['%'].cumsum()

print("Number of different Policy labels:", H.shape)
H.head(30)

In [None]:
# Types of policies: mean of every measure per HOF label.
# Each column is listed once ('RelOdur' appeared twice in the former agg dict).
mean_columns = [
    'Dur', 'Odur', 'Ins', 'Del', 'InEff',
    'RelOdur', 'RelDur_L', 'RelDur_R', 'RelDur_S', 'RelDur_N',
    'RelHdur', 'RelRdur', 'RelFdur',
    'LogOdur', 'LogHdur', 'LogRdur', 'LogFdur',
]
X = POLdf.groupby('HOF')[mean_columns].mean().reset_index()

# InEff1: mean deletions divided by mean insertions plus mean deletions
X['InEff1'] = X['Del'] / (X['Ins'] + X['Del'])

# Frequency of every HOF label. rename_axis pins the label column to 'HOF':
# older pandas versions leave the value_counts index unnamed, so reset_index
# would otherwise produce a column called 'index' and the merge below would
# find no common key.
H = pd.concat([POLdf.HOF.value_counts(),
               POLdf.HOF.value_counts(normalize=True)],
              axis=1, keys=["Count", "%"]).rename_axis('HOF').reset_index()
H['CumSum'] = H['Count'].cumsum()
H['CumSum%'] = H['%'].cumsum()

# Join frequencies and means on the explicit HOF key
POLtype = pd.merge(H, X, on='HOF')
POLtype.head(20)

In [None]:
print(POLtype[['InEff', 'InEff1']].corr())

# HOF labels for an optional restriction of the plots; not applied at present
PT50 = ['OF', 'OR', 'OH']

# All policy types are plotted. The former filtered assignment was overwritten on
# the next line and never took effect; to restrict the plots to PT50 use:
#   PT1 = POLtype[POLtype.HOF.isin(PT50)]
PT1 = POLtype

# InEff against InEff1 and against the relative durations, one figure each
for y_column in ('InEff1', 'RelOdur', 'RelHdur', 'RelRdur', 'RelFdur'):
    sns.scatterplot(data=PT1, x="InEff", y=y_column)
    plt.show()




In [None]:
# Correlation between the relative durations and InEff for selected policy types
C = ['RelOdur', 'RelDur_L', 'RelDur_R', 'RelDur_S', 'RelDur_N', 'InEff']

# HOF labels to include. Only the last of the former three assignments took
# effect; the other selections tried were ['OF', 'OR', 'OH'] and ['OF'].
PLO75 = ['OR']

POLdf.loc[POLdf.HOF.isin(PLO75), C].corr()


## Log duration of Fixations

In [None]:
# Log duration of fixation: log(1 + Dur)
FDdf['LogDur'] = np.log1p(FDdf['Dur'])

# Boxplot per window
box_ax = sns.boxplot(data=FDdf, y="LogDur", hue='Win')
box_ax.set(title='Fixation Duration ST (Win1) and TT (win2)')
plt.show()

# Histogram per window
sns.histplot(data=FDdf, x="LogDur", hue='Win')
plt.show()



In [None]:
# Log duration of first fixation on ST and TT words
FDdf['StudySession'] = FDdf['Study'] + '-' + FDdf['Session']

# Key per session, window and SGid. It is built from FDdf itself: FDst is only
# created below, so referencing it here raised NameError on a fresh kernel (and
# on a re-run left NaN keys for every row that is not in FDst).
FDdf['StudySessionWinFirst'] = (FDdf['StudySession'] + '-' + FDdf['Win'].astype(str)
                                + '.' + FDdf['SGid'].astype(str))

# Split by window: Win == 1 is plotted as ST and Win == 2 as TT below
FDst = FDdf[FDdf.Win == 1].copy()
FDtt = FDdf[FDdf.Win == 2].copy()

# The '.' separator prevents two different (session, SGid) pairs from
# concatenating to the same key.
# NOTE(review): the TT frame is keyed by SGid as well - confirm this is intended.
FDst['StudySessionFirst'] = FDst['StudySession'] + '.' + FDst['SGid'].astype(str)
FDtt['StudySessionFirst'] = FDtt['StudySession'] + '.' + FDtt['SGid'].astype(str)

# keep='first' retains the first row of every key in frame order
FDst_first = FDst.drop_duplicates(subset=['StudySessionFirst'], keep='first')
FDtt_first = FDtt.drop_duplicates(subset=['StudySessionFirst'], keep='first')

sns.histplot(data=FDst_first, x="LogDur", hue='StudySession', alpha=0.3, bins=80)
plt.show()

sns.histplot(data=FDtt_first, x="LogDur", hue='StudySession', alpha=0.3, bins=80)
plt.show()


In [None]:
# Boxplots per session: all fixations and first fixations, for ST and then TT
boxplot_panels = (
    (FDst, 'Fixation Duration ST'),
    (FDst_first, 'First Fixation Duration ST'),
    (FDtt, 'Fixation Duration TT'),
    (FDtt_first, 'First Fixation Duration TT'),
)

for fixations, panel_title in boxplot_panels:
    sns.boxplot(data=fixations, y="LogDur", hue="StudySession").set(title=panel_title)
    plt.show()



In [None]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid")

desired_order = ["H", "O", "R", "F"]

# Activity Units with LogDur > 1
AU1 = AUdf[AUdf['LogDur'] > 1]

# Share of each HOF state within every phase (normalize='index': each row sums to 1)
phase_by_hof = pd.crosstab(AU1['Phase'], AU1['HOF'], normalize='index')
phase_by_hof.plot.bar()

plt.legend(bbox_to_anchor=(1, 1), title="HOF states", loc="upper left")
plt.show()


#pd.crosstab(AUdf['Phase'],AUdf['LabelT'], normalize='index').plot.bar()
#plt.show()


In [None]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid")

desired_order = ["H", "O", "R", "F"]

# Share of each translation style within every phase (each row sums to 1).
# AU1 is the LogDur > 1 subset of AUdf built in the previous cell.
phase_by_style = pd.crosstab(AU1['Phase'], AU1['Tstyles'], normalize='index')
phase_by_style.plot.bar()

plt.legend(bbox_to_anchor=(1, 1), title="Translation Styles", loc="upper left")
plt.show()

# LogDur histogram of the Type 1 units: unfilled bars plus KDE curve per HOF state
type1_units = AU1[AU1.Type == 1]
sns.histplot(data=type1_units, x="LogDur", bins=50, alpha=0.4, hue='HOF', fill=False, kde=True)
plt.show()


#pd.crosstab(AUdf['Phase'],AUdf['LabelT'], normalize='index').plot.bar()
#plt.show()


In [None]:
Phase_order = ['O', 'D', 'R']
phase_colors = {'O': 'blue', 'D': 'red', 'R': 'green'}

# Units with LogDur_L > 1. The stricter three-column filter that used to precede
# this line was overwritten immediately and never took effect.
AUdf1 = AUdf[(AUdf.LogDur_L > 1)]

# LogDur of Type 1 units, one overlaid histogram + KDE per phase. Each curve is
# labelled explicitly so the legend identifies the phases. The former hue_order
# argument is dropped: without a hue variable it has no effect.
for phase in Phase_order:
    sns.histplot(data=AUdf1[(AUdf1.Type == 1) & (AUdf1.Phase == phase)], x="LogDur",
                 stat="probability", fill=False, kde=True,
                 color=phase_colors[phase], label=phase)
plt.legend(title="Translation Phases")
plt.show()

# LogDur by Type for units with 3 < Dur < 30000
AUdf1 = AUdf[(AUdf.Dur > 3) & (AUdf.Dur < 30000)]
sns.histplot(data=AUdf1, x="LogDur", bins=50, alpha=1, hue='Type', stat="density",
             fill=False, kde=True, palette='deep')
plt.show()

# LogDur_L of Type 1 units per phase, on the Dur-filtered subset
for phase in Phase_order:
    sns.histplot(data=AUdf1[(AUdf1.Type == 1) & (AUdf1.Phase == phase)], x="LogDur_L",
                 stat="probability", fill=False, kde=True,
                 color=phase_colors[phase], label=phase)
plt.legend(title="Translation Phases")
plt.show()


## Distribution of Phases 

In [None]:
AUdf1 = AUdf[(AUdf.LogDur_L > 1)]

# Overall distribution as unlabelled background bars (not listed in the legend)
sns.histplot(data=AUdf1, x="LogDur_L", bins=25, alpha=0.3, stat="probability")

# One polygon per phase. Each is labelled at creation so the legend entry is
# bound to its own artist instead of relying on the order in which artists
# were added to the axes.
desired_order = ["O", "D", "R"]
for phase in desired_order:
    sns.histplot(data=AUdf1[(AUdf1.Phase == phase)], x="LogDur_L", stat="probability",
                 element="poly", alpha=0.1, kde=False, label=phase)

plt.legend(title="Translation Phases", loc="upper right")
plt.show()


In [None]:
AUdf1 = AUdf[(AUdf.LogDur_S > 0)]

# Per-phase histogram settings (bin count and element type differ per phase)
phase_styles = {
    "O": dict(bins=30, element="poly"),
    "D": dict(bins=40),
    "R": dict(bins=40),
}

# Each histogram is labelled at creation. plt.legend(labels=[...]) pairs the
# labels with the first artists on the axes, and with kde=True those include the
# KDE lines, so the labels "D" and "R" ended up on the wrong artists.
desired_order = ["O", "D", "R"]
for phase in desired_order:
    sns.histplot(data=AUdf1[(AUdf1.Phase == phase)], x="LogDur_S", alpha=0.1,
                 stat="probability", kde=True, label=phase, **phase_styles[phase])

plt.legend(title="Translation Phases", loc="upper right")
plt.show()
