In [1]:
import pandas as pd # for manipulating data frames
import pingouin as pg # for running statistics
#import matplotlib.pyplot as pyplot
import plotly.express as px

In [2]:
# Load the merged experiment logs (comma-separated, pandas' default delimiter) and show the frame.
data = pd.read_csv('logs/logs_merged.csv')
data

Unnamed: 0,DesignName,ParticipantID,TrialID,Block1,Block2,VV,OC,visualSearchTime,ErrorCount
0,Preattentive Variables,1,1,1,1,Shadow,Medium,1552,0
1,Preattentive Variables,1,2,1,2,Shadow,Low,1134,0
2,Preattentive Variables,1,3,1,3,Shadow,High,1140,0
3,Preattentive Variables,1,4,1,4,Shadow,Low,768,0
4,Preattentive Variables,1,5,1,5,Shadow,Medium,904,0
...,...,...,...,...,...,...,...,...,...
265,Preattentive Variables,6,266,3,11,Shadow,Low,858,0
266,Preattentive Variables,6,267,3,12,Shadow,Medium,743,0
267,Preattentive Variables,6,268,3,13,Shadow,Low,648,0
268,Preattentive Variables,6,269,3,14,Shadow,High,676,0


In [3]:
# Summary statistics for every column; include='all' also reports the non-numeric columns
# (unique / top / freq) next to the numeric ones (mean / std / quartiles).
data.describe(include = 'all')

Unnamed: 0,DesignName,ParticipantID,TrialID,Block1,Block2,VV,OC,visualSearchTime,ErrorCount
count,270,270.0,270.0,270.0,270.0,270,270,270.0,270.0
unique,1,,,,,3,3,,
top,Preattentive Variables,,,,,Shadow,Medium,,
freq,270,,,,,90,90,,
mean,,3.5,135.5,2.0,8.0,,,3073.048148,0.051852
std,,1.710997,78.086491,0.818013,4.328517,,,4067.195605,0.238288
min,,1.0,1.0,1.0,1.0,,,558.0,0.0
25%,,2.0,68.25,1.0,4.0,,,986.25,0.0
50%,,3.5,135.5,2.0,8.0,,,1504.0,0.0
75%,,5.0,202.75,3.0,12.0,,,3420.5,0.0


In [4]:
# Inspect a single trial: the row at position 2 (i.e. the third row), returned as a Series.
data.iloc[2]

DesignName          Preattentive Variables
ParticipantID                            1
TrialID                                  3
Block1                                   1
Block2                                   3
VV                                  Shadow
OC                                    High
visualSearchTime                      1140
ErrorCount                               0
Name: 2, dtype: object

In [5]:
# Check which dtype read_csv assigned to each column.
data.dtypes

DesignName          object
ParticipantID        int64
TrialID              int64
Block1               int64
Block2               int64
VV                  object
OC                  object
visualSearchTime     int64
ErrorCount           int64
dtype: object

In [6]:
# ParticipantID is an identifier, not a quantity: store it as a string so that it is
# handled as a categorical label (e.g. as a discrete color in the plots below).
data['ParticipantID'] = data['ParticipantID'].astype('str')
data.dtypes

DesignName          object
ParticipantID       object
TrialID              int64
Block1               int64
Block2               int64
VV                  object
OC                  object
visualSearchTime     int64
ErrorCount           int64
dtype: object

In [7]:
### Applying an aggregating function (mean, sum, ...) to a single column gives a single scalar value
data.visualSearchTime.mean()

3073.0481481481484

In [8]:
### We can get a breakdown by condition using the groupby function
# the result is a Series indexed by the levels of VV
data.groupby('VV').visualSearchTime.mean()

VV
Both      6437.455556
Motion    1511.677778
Shadow    1270.011111
Name: visualSearchTime, dtype: float64

In [9]:
### Applying those aggregating functions to a grouped dataframe gives a result of type dataframe
# numeric_only=True restricts the mean to the numeric columns. Relying on pandas to drop the
# text columns (DesignName, OC, ...) silently is deprecated and raises a TypeError in pandas >= 2.0.
data.groupby('VV').mean(numeric_only=True) # result is a dataframe

  data.groupby('VV').mean() # result is a dataframe


Unnamed: 0_level_0,TrialID,Block1,Block2,visualSearchTime,ErrorCount
VV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Both,135.5,2.0,8.0,6437.455556,0.122222
Motion,135.5,2.0,8.0,1511.677778,0.011111
Shadow,135.5,2.0,8.0,1270.011111,0.022222


In [10]:
### aggregate is a more elaborate aggregate function
# data.groupby('VV').aggregate('mean') would be equivalent to data.groupby('VV').mean(),
# but here aggregate receives a dict that specifies how to aggregate each selected column
data.groupby('VV').aggregate({'TrialID': 'sum', 'visualSearchTime': 'mean'})

Unnamed: 0_level_0,TrialID,visualSearchTime
VV,Unnamed: 1_level_1,Unnamed: 2_level_1
Both,12195,6437.455556
Motion,12195,1511.677778
Shadow,12195,1270.011111


In [11]:
# Build a combined condition label "<OC>, <VV>" for every row.
# astype('str') already returns a new Series, so the source columns are left untouched.
OCAsStr = data['OC'].astype('str')
VVAsStr = data['VV'].astype('str')
# Series.str.cat joins the two string columns element-wise, separated by ", "
data['Condition: OC, VV'] = OCAsStr.str.cat(others=VVAsStr, sep=", ")
data

Unnamed: 0,DesignName,ParticipantID,TrialID,Block1,Block2,VV,OC,visualSearchTime,ErrorCount,"Condition: OC, VV"
0,Preattentive Variables,1,1,1,1,Shadow,Medium,1552,0,"Medium, Shadow"
1,Preattentive Variables,1,2,1,2,Shadow,Low,1134,0,"Low, Shadow"
2,Preattentive Variables,1,3,1,3,Shadow,High,1140,0,"High, Shadow"
3,Preattentive Variables,1,4,1,4,Shadow,Low,768,0,"Low, Shadow"
4,Preattentive Variables,1,5,1,5,Shadow,Medium,904,0,"Medium, Shadow"
...,...,...,...,...,...,...,...,...,...,...
265,Preattentive Variables,6,266,3,11,Shadow,Low,858,0,"Low, Shadow"
266,Preattentive Variables,6,267,3,12,Shadow,Medium,743,0,"Medium, Shadow"
267,Preattentive Variables,6,268,3,13,Shadow,Low,648,0,"Low, Shadow"
268,Preattentive Variables,6,269,3,14,Shadow,High,676,0,"High, Shadow"


In [12]:
# Number of non-null values in each column per participant
# (a quick check that every participant contributed the same number of rows).
data.groupby('ParticipantID').count()

Unnamed: 0_level_0,DesignName,TrialID,Block1,Block2,VV,OC,visualSearchTime,ErrorCount,"Condition: OC, VV"
ParticipantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,45,45,45,45,45,45,45,45,45
2,45,45,45,45,45,45,45,45,45
3,45,45,45,45,45,45,45,45,45
4,45,45,45,45,45,45,45,45,45
5,45,45,45,45,45,45,45,45,45
6,45,45,45,45,45,45,45,45,45


In [13]:
# Number of rows per combined condition, colored by participant.
fig = px.histogram(data, x='Condition: OC, VV', color='ParticipantID')
fig.show()

In [14]:
# Distribution of visualSearchTime over all rows.
fig = px.histogram(data, x='visualSearchTime')
fig.show()

In [15]:
# Same distribution, with a box plot drawn in the margin above the histogram.
fig = px.histogram(data, x='visualSearchTime', marginal='box')
fig.show()


In [16]:
# Distribution of visualSearchTime split (colored) by OC level, with marginal box plots.
fig = px.histogram(data, x='visualSearchTime', color='OC', marginal='box')
fig.show()

In [17]:
# Distribution of visualSearchTime split (colored) by VV level, with marginal box plots.
fig = px.histogram(data, x='visualSearchTime', color='VV', marginal='box')
fig.show()

In [18]:
# Distribution of ErrorCount, colored by OC level.
fig = px.histogram(data, x='ErrorCount', color='OC')
fig.show()

In [19]:
# Distribution of ErrorCount, colored by VV level.
fig = px.histogram(data, x='ErrorCount', color='VV')
fig.show()

In [20]:
# visualSearchTime per OC level again, this time on a logarithmic x axis (y axis kept linear).
fig = px.histogram(data, x='visualSearchTime', color='OC', marginal='box', log_x=True, log_y=False)
fig.show()
# TODO: the y axis looks different from the linear-x version of this plot; the reason is not yet understood.

Inferential statistics

In [26]:
# One subset per visual-variable (VV) level, so that each can be analysed on its own.
bothData = data.loc[data['VV'] == 'Both']
shadowData = data.loc[data['VV'] == 'Shadow']
motionData = data.loc[data['VV'] == 'Motion']

In [67]:
# Pairwise correlations within the 'Both' subset.
# NOTE(review): 'OC' holds text labels (Low / Medium / High) and does not appear in the
# resulting table; the output pairs visualSearchTime with the other numeric columns instead.
# If a correlation with OC is intended, OC presumably has to be encoded as numbers first — TODO confirm.
correlation_table3 = pg.pairwise_corr(bothData, columns=['OC','visualSearchTime'])
correlation_table3

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,BF10,power
0,visualSearchTime,TrialID,pearson,two-sided,90,-0.062944,"[-0.27, 0.15]",0.555608,0.156,0.090642
1,visualSearchTime,Block1,pearson,two-sided,90,-0.111893,"[-0.31, 0.1]",0.293729,0.227,0.18334
2,visualSearchTime,Block2,pearson,two-sided,90,-0.017323,"[-0.22, 0.19]",0.871265,0.133,0.052847
3,visualSearchTime,ErrorCount,pearson,two-sided,90,-0.128493,"[-0.33, 0.08]",0.227447,0.27,0.227507


In [None]:
# Squared correlation coefficient (r squared) for every pair in the correlation table.
# The table computed in this notebook is named correlation_table3; the bare name
# correlation_table is never defined, so it raised a NameError on a fresh kernel.
r2 = correlation_table3['r'] ** 2
r2
#run anova test for each hypothesis

In [27]:
# One-way repeated-measures ANOVA on the Shadow subset: effect of OC on visualSearchTime.
anShadow = pg.rm_anova(
    data=shadowData,
    dv='visualSearchTime',
    within='OC',
    subject='ParticipantID',
)
anShadow

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,ng2,eps
0,OC,2,10,1.492423,0.270904,0.135666,0.879256


In [30]:
# Post-hoc pairwise comparisons between OC levels on the Shadow subset
# (parametric tests, 'fdr_bh' p-value adjustment, Hedges' g as effect size).
posthocS = pg.pairwise_tests(
    data=shadowData,
    dv='visualSearchTime',
    within=['OC'],
    subject='ParticipantID',
    parametric=True,
    padjust='fdr_bh',
    effsize='hedges',
)
posthocS

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,OC,High,Low,True,True,1.470935,5.0,two-sided,0.201274,0.45472,fdr_bh,0.788,0.881953
1,OC,High,Medium,True,True,0.810051,5.0,two-sided,0.45472,0.45472,fdr_bh,0.483,0.284157
2,OC,Low,Medium,True,True,-1.083285,5.0,two-sided,0.328133,0.45472,fdr_bh,0.58,-0.54943


In [28]:
# One-way repeated-measures ANOVA on the Motion subset: effect of OC on visualSearchTime.
anMotion = pg.rm_anova(
    data=motionData,
    dv='visualSearchTime',
    within='OC',
    subject='ParticipantID',
)
anMotion

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,ng2,eps
0,OC,2,10,0.324534,0.7302,0.024104,0.57851


In [31]:
# Post-hoc pairwise comparisons between OC levels on the Motion subset
# (parametric tests, 'fdr_bh' p-value adjustment, Hedges' g as effect size).
posthocM = pg.pairwise_tests(
    data=motionData,
    dv='visualSearchTime',
    within=['OC'],
    subject='ParticipantID',
    parametric=True,
    padjust='fdr_bh',
    effsize='hedges',
)
posthocM

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,OC,High,Low,True,True,0.305158,5.0,two-sided,0.77254,0.77254,fdr_bh,0.388,0.15372
1,OC,High,Medium,True,True,0.862186,5.0,two-sided,0.427991,0.77254,fdr_bh,0.499,0.402543
2,OC,Low,Medium,True,True,0.68824,5.0,two-sided,0.521924,0.77254,fdr_bh,0.451,0.145611


In [29]:
# One-way repeated-measures ANOVA on the Both subset: effect of OC on visualSearchTime.
anBoth = pg.rm_anova(
    data=bothData,
    dv='visualSearchTime',
    within='OC',
    subject='ParticipantID',
)
anBoth

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,ng2,eps
0,OC,2,10,28.690857,7.2e-05,0.744987,0.567905


In [32]:
# Post-hoc pairwise comparisons between OC levels on the Both subset
# (parametric tests, 'fdr_bh' p-value adjustment, Hedges' g as effect size).
posthocB = pg.pairwise_tests(
    data=bothData,
    dv='visualSearchTime',
    within=['OC'],
    subject='ParticipantID',
    parametric=True,
    padjust='fdr_bh',
    effsize='hedges',
)
posthocB

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,OC,High,Low,True,True,6.756309,5.0,two-sided,0.001079,0.003236,fdr_bh,37.735,2.888427
1,OC,High,Medium,True,True,4.01908,5.0,two-sided,0.01013,0.01013,fdr_bh,6.833,2.014092
2,OC,Low,Medium,True,True,-5.505134,5.0,two-sided,0.002704,0.004056,fdr_bh,18.625,-2.563288


In [28]:
# One-way repeated-measures ANOVA on the full data set: effect of VV on visualSearchTime.
aovrm1way = pg.rm_anova(
    data=data,
    dv='visualSearchTime',
    within='VV',
    subject='ParticipantID',
)
aovrm1way

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,ng2,eps
0,VV,2,10,84.868675,5.330996e-07,0.901544,0.648091


In [30]:
# Post-hoc pairwise comparisons between VV levels on the full data set
# (parametric tests, 'fdr_bh' p-value adjustment, Hedges' g as effect size).
posthoc = pg.pairwise_tests(
    data=data,
    dv='visualSearchTime',
    within=['VV'],
    subject='ParticipantID',
    parametric=True,
    padjust='fdr_bh',
    effsize='hedges',
)
posthoc

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,VV,Both,Motion,True,True,9.220657,5.0,two-sided,0.000252,0.000378,fdr_bh,116.7,4.411975
1,VV,Both,Shadow,True,True,10.085631,5.0,two-sided,0.000164,0.000378,fdr_bh,163.166,4.764147
2,VV,Motion,Shadow,True,True,1.04656,5.0,two-sided,0.343235,0.343235,fdr_bh,0.565,0.550199


In [31]:
import math

def summarizeDF(df, factors, measure):
    """Summarize one measure for every combination of the given factors.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to summarize. Only this frame is read; no global is used.
    factors : str or list of str
        Name(s) of the column(s) to group by.
    measure : str
        Name of the column to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the factor columns followed by 'Mean',
        'Count' (non-null values), 'Std' (sample standard deviation),
        'ci95_hi' and 'ci95_lo'. The interval is Mean +/- 1.96 * Std / sqrt(Count),
        i.e. a normal approximation that treats each row as one observation.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(factors, str):
        factors = [factors]

    # Group the frame that was passed in (the previous version grouped the global
    # `data` and ignored `df`). Named aggregation is used because dict-based
    # renaming on a single selected column is rejected by pandas >= 2.0.
    summary = df.groupby(list(factors), as_index=False).agg(
        Mean=(measure, 'mean'),
        Count=(measure, 'count'),
        Std=(measure, 'std'),
    )

    # Vectorized half-width of the interval; a group with Count == 0 yields NaN
    # instead of raising ZeroDivisionError.
    half_width = 1.96 * summary['Std'] / summary['Count'] ** 0.5
    summary['ci95_hi'] = summary['Mean'] + half_width
    summary['ci95_lo'] = summary['Mean'] - half_width
    return summary

In [33]:
# Mean, count, standard deviation and 95% interval bounds of visualSearchTime per VV level.
stats = summarizeDF(df=data, factors=['VV'], measure='visualSearchTime')
stats

Unnamed: 0,VV,Mean,Count,Std,ci95_hi,ci95_lo
0,Both,6437.455556,90,5532.468111,7580.475309,5294.435802
1,Motion,1511.677778,90,1273.581677,1774.802478,1248.553077
2,Shadow,1270.011111,90,754.055544,1425.800601,1114.221622


In [34]:
nice_color_palette = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854']

# Error-bar lengths are stored as columns so that plotly express hands each trace its own
# values. color='VV' creates one trace per VV level; passing the full arrays through
# update_traces gave every trace the same array, so each bar showed the first row's interval.
stats_plot = stats.assign(
    err_hi=stats['ci95_hi'] - stats['Mean'],
    err_lo=stats['Mean'] - stats['ci95_lo'],
)

fig = px.bar(
    stats_plot,
    x='VV',
    y='Mean',
    color='VV',
    color_discrete_sequence=nice_color_palette,
    error_y='err_hi',        # upper bar length
    error_y_minus='err_lo',  # lower bar length
)
fig.update_layout({
    'plot_bgcolor' : 'rgba(0,0,0,0)'
})
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()

In [35]:
# Two-way repeated-measures ANOVA: effects of VV, OC and their interaction on visualSearchTime.
aovrm2way = pg.rm_anova(
    data=data,
    dv='visualSearchTime',
    within=['VV', 'OC'],
    subject='ParticipantID',
)
aovrm2way


Epsilon values might be innaccurate in two-way repeated measures design where each  factor has more than 2 levels. Please  double-check your results.



Unnamed: 0,Source,SS,ddof1,ddof2,MS,F,p-unc,p-GG-corr,ng2,eps
0,VV,306145000.0,2,10,153072500.0,84.868675,5.330996e-07,4e-05,0.820884,0.648091
1,OC,64945440.0,2,10,32472720.0,41.369273,1.457802e-05,0.000128,0.492959,0.758404
2,VV * OC,105329700.0,4,20,26332440.0,20.638712,7.200757e-07,0.014565,0.611918,0.173146


In [36]:
# Post-hoc pairwise comparisons for VV, OC and the VV * OC interaction
# (parametric tests, 'holm' p-value adjustment, Cohen's d as effect size).
posthoc = pg.pairwise_tests(
    data=data,
    dv='visualSearchTime',
    within=['VV', 'OC'],
    subject='ParticipantID',
    parametric=True,
    padjust='holm',
    effsize='cohen',
)
posthoc


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,Contrast,VV,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,cohen
0,VV,-,Both,Motion,True,True,9.220657,5.0,two-sided,0.000252,0.000504,holm,116.7,4.779639
1,VV,-,Both,Shadow,True,True,10.085631,5.0,two-sided,0.000164,0.000492,holm,163.166,5.161159
2,VV,-,Motion,Shadow,True,True,1.04656,5.0,two-sided,0.343235,0.343235,holm,0.565,0.596049
3,OC,-,High,Low,True,True,8.246697,5.0,two-sided,0.000427,0.001282,holm,77.306,3.219035
4,OC,-,High,Medium,True,True,5.250736,5.0,two-sided,0.003324,0.006648,holm,15.907,2.258871
5,OC,-,Low,Medium,True,True,-3.971034,5.0,two-sided,0.010625,0.010625,holm,6.592,-1.636554
6,VV * OC,Both,High,Low,True,True,6.756309,5.0,two-sided,0.001079,0.009708,holm,37.735,3.129129
7,VV * OC,Both,High,Medium,True,True,4.01908,5.0,two-sided,0.01013,0.07091,holm,6.833,2.181933
8,VV * OC,Both,Low,Medium,True,True,-5.505134,5.0,two-sided,0.002704,0.021632,holm,18.625,-2.776895
9,VV * OC,Motion,High,Low,True,True,0.305158,5.0,two-sided,0.77254,1.0,holm,0.388,0.16653


In [38]:
# Mean, count, standard deviation and 95% interval bounds of visualSearchTime per (VV, OC) cell.
stats = summarizeDF(df=data, factors=['VV', 'OC'], measure='visualSearchTime')
stats

Unnamed: 0,VV,OC,Mean,Count,Std,ci95_hi,ci95_lo
0,Both,High,10584.033333,30,7523.658801,13276.340058,7891.726608
1,Both,Low,3250.833333,30,1562.358586,3809.916173,2691.750494
2,Both,Medium,5477.5,30,2375.730969,6327.644409,4627.355591
3,Motion,High,1617.866667,30,805.523635,1906.119597,1329.613736
4,Motion,Low,1510.633333,30,1782.810441,2148.603856,872.66281
5,Motion,Medium,1406.533333,30,1061.842567,1786.508816,1026.557851
6,Shadow,High,1464.966667,30,792.72696,1748.640366,1181.292967
7,Shadow,Low,1050.733333,30,666.856131,1289.364735,812.101932
8,Shadow,Medium,1294.333333,30,763.039265,1567.383427,1021.283239


In [39]:
nice_color_palette = ['#f1eef6', '#bdc9e1', '#74a9cf', '#2b8cbe', '#045a8d']
stats['OC'] = stats['OC'].astype('str')

# Error-bar lengths are stored as columns so that plotly express slices them per trace.
# color='OC' creates one trace per OC level; passing the full arrays through update_traces
# gave every trace the same array, so the bars were drawn with intervals belonging to other rows.
stats_plot = stats.assign(
    err_hi=stats['ci95_hi'] - stats['Mean'],
    err_lo=stats['Mean'] - stats['ci95_lo'],
)

fig = px.bar(
    stats_plot,
    x='VV',
    y='Mean',
    color='OC',
    barmode='group',
    color_discrete_sequence=nice_color_palette,
    error_y='err_hi',        # upper bar length
    error_y_minus='err_lo',  # lower bar length
)
fig.update_layout({
    'plot_bgcolor' : 'rgba(0,0,0,0)'
})
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()