In [None]:
import pandas as pd # for manipulating data frames
import pingouin as pg # for running statistics
#import matplotlib.pyplot as pyplot
import plotly.express as px

### First we check if our data is correct

In [None]:
# Load the merged experiment logs (comma-separated) and display the frame.
data = pd.read_csv(
    'logs/logs_merged.csv',
    sep=',',
)
data

In [None]:
# Summary statistics for every column, not only the numeric ones.
data.describe(include='all')

In [None]:
# Inspect a single record: the row at integer position 2 (i.e. the third row).
data.iloc[2]

#### We check the types of our columns

In [None]:
# List the dtype of each column.
data.dtypes

In [None]:
# Convert ParticipantID to string type, then display the dtypes again
# to check the result of the conversion.
data['ParticipantID'] = data['ParticipantID'].astype(str)
data.dtypes

In [None]:
### Applying an aggregating function such as mean() to a single column gives a single value
data['visualSearchTime'].mean()

In [None]:
### groupby breaks the same aggregate down by condition: one mean per VV level (a series)
data.groupby('VV')['visualSearchTime'].mean()

In [None]:
### Applying those aggregating functions to a grouped dataframe gives a result of type dataframe
# numeric_only=True is required here: the frame contains string columns
# (ParticipantID was cast to str above), and recent pandas versions raise a
# TypeError instead of silently skipping columns that cannot be averaged.
data.groupby('VV').mean(numeric_only=True) # result is a dataframe

In [None]:
### aggregate (alias: agg) is a more flexible aggregating function
# given a single function name it matches the shortcut method, e.g.
# data.groupby('VV').aggregate('mean') corresponds to data.groupby('VV').mean()
# here a dict is used to pick a different aggregation for each column
per_column_aggregations = {'TrialID': 'sum', 'visualSearchTime': 'mean'}
data.groupby('VV').agg(per_column_aggregations)

In [None]:
# Cast both factor columns to strings; astype returns a new Series,
# so the columns stored in `data` are left as they are.
OCAsStr = data['OC'].astype('str')
VVAsStr = data['VV'].astype('str')
# With two string Series we can join them element-wise using str.cat,
# which yields one combined condition label per row.
data['Condition: OC, VV'] = OCAsStr.str.cat(others=VVAsStr, sep=", ")
data

## We check that every participant has done the same number of trials and we plot it to be sure

In [None]:
# Count the non-missing values of every column, per participant.
data.groupby('ParticipantID').count()

In [None]:
# Number of rows per condition, coloured by participant.
fig = px.histogram(
    data,
    x='Condition: OC, VV',
    color='ParticipantID',
)
fig.show()

## We plot the visual search time in a histogram to check the distribution of measures

In [None]:
# Distribution of visualSearchTime, with a box plot in the margin.
fig = px.histogram(
    data,
    x='visualSearchTime',
    marginal='box',
)
fig.show()


## We do the same, this time splitting the values by object count

In [None]:
# Distribution of visualSearchTime, one colour per OC level
# (levels listed in the order Low, Medium, High).
fig = px.histogram(
    data,
    x='visualSearchTime',
    color='OC',
    marginal='box',
    category_orders={"OC": ["Low", "Medium", "High"]},
)
fig.show()

## This time we group the values by visual variable

In [None]:
# Distribution of visualSearchTime, one colour per VV level.
fig = px.histogram(
    data,
    x='visualSearchTime',
    color='VV',
    marginal='box',
)
fig.show()

## We check the error count depending on the object count

In [None]:
# Distribution of ErrorCount, one colour per OC level
# (levels listed in the order Low, Medium, High).
fig = px.histogram(
    data,
    x='ErrorCount',
    color='OC',
    category_orders={"OC": ["Low", "Medium", "High"]},
)
fig.show()

## and once again but depending on visual variable

In [None]:
# Distribution of ErrorCount, one colour per VV level.
fig = px.histogram(
    data,
    x='ErrorCount',
    color='VV',
)
fig.show()

## Inferential statistics

In [None]:
import math

def summarizeDF(df, factors, measure):
    """Summarise ``measure`` for every combination of ``factors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.
    factors : str or list of str
        Name(s) of the column(s) to group by. A single name may be given
        as a plain string.
    measure : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per group. The factor columns come first, followed by
        ``Mean``, ``Count``, ``Std`` and the bounds ``ci95_hi`` / ``ci95_lo``
        computed as ``Mean +/- 1.96 * Std / sqrt(Count)``. Groups with a
        single observation have an undefined ``Std`` and therefore NaN bounds.
    """
    # A bare string would otherwise be treated as a sequence of characters.
    if isinstance(factors, str):
        factors = [factors]

    # Named aggregation gives the output columns their names directly.
    # Renaming through a dict passed to SeriesGroupBy.agg is deprecated.
    summary = (
        df.groupby(list(factors))[measure]
        .agg(Mean='mean', Count='count', Std='std')
        .reset_index()
    )

    # Half-width of the interval, computed for all groups at once.
    half_width = 1.96 * summary['Std'] / summary['Count'] ** 0.5
    summary['ci95_hi'] = summary['Mean'] + half_width
    summary['ci95_lo'] = summary['Mean'] - half_width
    return summary

## We define the color palette and split our data into separate tables, one per visual variable

In [None]:
# One sub-table per visual variable: the rows whose VV equals that level.
bothData = data.loc[data['VV'] == 'Both']
shadowData = data.loc[data['VV'] == 'Shadow']
motionData = data.loc[data['VV'] == 'Motion']

# Colour sequence passed to the bar charts below.
nice_color_palette = [
    '#66c2a5',
    '#fc8d62',
    '#8da0cb',
    '#e78ac3',
    '#a6d854',
]

In [None]:
# Pairwise correlation between OC and visualSearchTime on the "Both" subset.
# NOTE(review): OC is plotted elsewhere with the labels Low/Medium/High; if the
# column holds those strings rather than numbers, confirm that pairwise_corr
# can actually use it (it may need an ordinal numeric encoding first).
correlation_table3 = pg.pairwise_corr(bothData, columns=['OC','visualSearchTime'])
correlation_table3

In [None]:
#r2 = correlation_table['r'] * correlation_table['r']
#r2
#run anova test for each hypothesis

# One-way repeated-measures ANOVA for the Shadow visual variable
## visualSearchTime depending on ObjectCount

In [None]:
# Repeated-measures ANOVA on the Shadow rows: OC is the within-subject
# factor and ParticipantID identifies the subject.
anShadow = pg.rm_anova(
    data=shadowData,
    dv='visualSearchTime',
    within='OC',
    subject='ParticipantID',
)
anShadow

#### We found no significant effect of object count on visual search time with the shadow visual variable (p > 0.25)

In [None]:
shadowStats = summarizeDF(shadowData, ['OC'], 'visualSearchTime')

# The error-bar lengths are passed as columns so that Plotly Express splits
# them per trace together with the bars. color='OC' creates one trace per OC
# level; giving every trace the full array through update_traces() makes each
# trace read it from index 0, so bars would show another group's interval.
fig = px.bar(
    shadowStats.assign(
        ciUpper=shadowStats['ci95_hi'] - shadowStats['Mean'],
        ciLower=shadowStats['Mean'] - shadowStats['ci95_lo'],
    ),
    x='OC',
    y='Mean',
    color='OC',
    error_y='ciUpper',
    error_y_minus='ciLower',
    color_discrete_sequence=nice_color_palette,
    category_orders={"OC": ["Low", "Medium", "High"]},
)
# Transparent plot background with light horizontal grid lines.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()

# One-way repeated-measures ANOVA for the Motion visual variable
## visualSearchTime depending on ObjectCount

In [None]:
# Repeated-measures ANOVA on the Motion rows: OC is the within-subject
# factor and ParticipantID identifies the subject.
anMotion = pg.rm_anova(
    data=motionData,
    dv='visualSearchTime',
    within='OC',
    subject='ParticipantID',
)
anMotion

#### We found no significant effect of object count on visual search time with the motion visual variable (p > 0.7)

In [None]:
motionStats = summarizeDF(motionData, ['OC'], 'visualSearchTime')

# The error-bar lengths are passed as columns so that Plotly Express splits
# them per trace together with the bars. color='OC' creates one trace per OC
# level; giving every trace the full array through update_traces() makes each
# trace read it from index 0, so bars would show another group's interval.
fig = px.bar(
    motionStats.assign(
        ciUpper=motionStats['ci95_hi'] - motionStats['Mean'],
        ciLower=motionStats['Mean'] - motionStats['ci95_lo'],
    ),
    x='OC',
    y='Mean',
    color='OC',
    error_y='ciUpper',
    error_y_minus='ciLower',
    color_discrete_sequence=nice_color_palette,
    category_orders={"OC": ["Low", "Medium", "High"]},
)
# Transparent plot background with light horizontal grid lines.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()

# One-way repeated-measures ANOVA for the Both visual variable
## visualSearchTime depending on ObjectCount

In [None]:
# Repeated-measures ANOVA on the Both rows: OC is the within-subject
# factor and ParticipantID identifies the subject.
anBoth = pg.rm_anova(
    data=bothData,
    dv='visualSearchTime',
    within='OC',
    subject='ParticipantID',
)
anBoth

#### We found a significant effect of object count on visual search time when both visual variables are combined (F(2,10) = 28.7, p < 0.001, ng2 = 0.74).

In [None]:
# Pairwise parametric comparisons between OC levels on the Both rows,
# with 'fdr_bh' p-value adjustment and Hedges' effect size.
posthocB = pg.pairwise_tests(
    data=bothData,
    dv='visualSearchTime',
    within=['OC'],
    subject='ParticipantID',
    parametric=True,
    padjust='fdr_bh',
    effsize='hedges',
)
posthocB

#### All pairs significantly differ (all p's < 0.05)

In [None]:
bothStats = summarizeDF(bothData, ['OC'], 'visualSearchTime')

# The error-bar lengths are passed as columns so that Plotly Express splits
# them per trace together with the bars. color='OC' creates one trace per OC
# level; giving every trace the full array through update_traces() makes each
# trace read it from index 0, so bars would show another group's interval.
fig = px.bar(
    bothStats.assign(
        ciUpper=bothStats['ci95_hi'] - bothStats['Mean'],
        ciLower=bothStats['Mean'] - bothStats['ci95_lo'],
    ),
    x='OC',
    y='Mean',
    color='OC',
    error_y='ciUpper',
    error_y_minus='ciLower',
    color_discrete_sequence=nice_color_palette,
    category_orders={"OC": ["Low", "Medium", "High"]},
)
# Transparent plot background with light horizontal grid lines.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()

# One-way repeated-measures ANOVA across visual variables
## visualSearchTime depending on VisualVariable

In [None]:
# Repeated-measures ANOVA on the full table: VV is the within-subject
# factor and ParticipantID identifies the subject.
aovrm1way = pg.rm_anova(
    data=data,
    dv='visualSearchTime',
    within='VV',
    subject='ParticipantID',
)
aovrm1way

#### We found a significant effect of visual variable on visual search time (F(2,10) = 84.9, p < 0.001, ng2 = 0.90).

In [None]:
# Pairwise parametric comparisons between VV levels on the full table,
# with 'fdr_bh' p-value adjustment and Hedges' effect size.
posthoc = pg.pairwise_tests(
    data=data,
    dv='visualSearchTime',
    within=['VV'],
    subject='ParticipantID',
    parametric=True,
    padjust='fdr_bh',
    effsize='hedges',
)
posthoc

#### All pairs of visual variables that involve "Both" differ significantly (all p's < 0.05). Motion and Shadow do not differ significantly (p = 0.34).

In [None]:
stats = summarizeDF(data, ['VV'], 'visualSearchTime')

# The error-bar lengths are passed as columns so that Plotly Express splits
# them per trace together with the bars. color='VV' creates one trace per VV
# level; giving every trace the full array through update_traces() makes each
# trace read it from index 0, so bars would show another group's interval.
fig = px.bar(
    stats.assign(
        ciUpper=stats['ci95_hi'] - stats['Mean'],
        ciLower=stats['Mean'] - stats['ci95_lo'],
    ),
    x='VV',
    y='Mean',
    color='VV',
    error_y='ciUpper',
    error_y_minus='ciLower',
    color_discrete_sequence=nice_color_palette,
)
# Transparent plot background with light horizontal grid lines.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()

## Finally, we run a two-way repeated-measures ANOVA on both visual variable and object count to see what it shows us

In [None]:
# Repeated-measures ANOVA on the full table with two within-subject
# factors, VV and OC; ParticipantID identifies the subject.
aovrm2way = pg.rm_anova(
    data=data,
    dv='visualSearchTime',
    within=['VV', 'OC'],
    subject='ParticipantID',
)
aovrm2way

In [None]:
# Pairwise parametric comparisons for the two within-subject factors VV and OC,
# with 'holm' p-value adjustment and Cohen's effect size.
posthoc = pg.pairwise_tests(
    data=data,
    dv='visualSearchTime',
    within=['VV', 'OC'],
    subject='ParticipantID',
    parametric=True,
    padjust='holm',
    effsize='cohen',
)
posthoc

# We didn't get much from that but if we create a bar chart with everything in it, it looks pretty cool

In [None]:
stats = summarizeDF(data, ['VV','OC'], 'visualSearchTime')
stats['OC'] = stats['OC'].astype('str')

# The error-bar lengths are passed as columns so that Plotly Express splits
# them per trace together with the bars. color='OC' creates one trace per OC
# level, each holding one bar per VV level; giving every trace the full array
# through update_traces() makes each trace read it from index 0, so most bars
# would show the interval of a different VV/OC combination.
fig = px.bar(
    stats.assign(
        ciUpper=stats['ci95_hi'] - stats['Mean'],
        ciLower=stats['Mean'] - stats['ci95_lo'],
    ),
    x='VV',
    y='Mean',
    color='OC',
    barmode='group',
    error_y='ciUpper',
    error_y_minus='ciLower',
    color_discrete_sequence=nice_color_palette,
    category_orders={"OC": ["Low", "Medium", "High"]},
)
# Transparent plot background with light horizontal grid lines.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

fig.show()

#### When we check the bar chart, we see an interaction between object count and visual variable: object count affects visual search time only for the visual variable Both. We can see the same effect in the post-hoc test, where only Both has all p's < 0.05, while the others do not differ significantly (all p's > 0.2).