In [1]:
import numpy as np
import astropy
import statistics
from scipy import stats as scistats
from scipy.stats import wilcoxon
from scipy.stats import binomtest
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
from scipy.stats import shapiro

from astropy import stats
from statistics import multimode

In [2]:
correct = np.load('correct.npy') #Counts of correct answers by question
experts_correct = np.load('experts_correct.npy')
nonexperts_correct = np.load('nonexperts_correct.npy')
astromus_correct = np.load('astromus_correct.npy')
mus_correct = np.load('mus_correct.npy')
astro_correct = np.load('astro_correct.npy')
nonexperts_4_correct = np.load('nonexperts_4_correct.npy')
blv_correct = np.load('blv_correct.npy')
nonblv_correct = np.load('nonblv_correct.npy')

In [3]:
grades = np.load('grades.npy') #Counts of correct answers by participant
grades_experts = np.load('grades_experts.npy')
grades_nonexperts = np.load('grades_nonexperts.npy')
grades_astromus = np.load('grades_astromus.npy')
grades_mus = np.load('grades_mus.npy')
grades_astro = np.load('grades_astro.npy')
grades_nonexperts_4 = np.load('grades_nonexperts_4.npy')
grades_nonblv = np.load('grades_nonblv.npy')
grades_nonblv = np.load('grades_nonblv.npy')

In [4]:
groups = ["Global", "Experienced", "Non-experienced", "Astro", "Mus", "Astromus", "Nothing","BLV", "Non-BLV-2"]
counts = [correct, experts_correct, nonexperts_correct, astro_correct, mus_correct, astromus_correct, nonexperts_4_correct]
participants = np.load('participants.npy')

In [5]:
questions = len(correct)
questions

10

In [6]:
null_value = 0.23

# Statistics

In [7]:
#mean, median, mode
def calculations(counts, participants, questions):
    print("Participants:", participants)
    print(counts)
    mean = np.round(np.mean(counts/participants), 4) #overall average success rate
    median = np.round(np.median(counts/participants), 4) #median 
    mode = multimode(np.round(counts, 2))
    return mean, median, mode

In [8]:
null_value

0.23

In [9]:
# Wilcoxon test. Calculates if the observed overall average success rate
#is statistically significantly higher than chance.

def wilcoxon_test(counts, participants, null_value):
    observed_proportions = [x/participants for x in counts]
    # Compute differences from the null value
    differences = [x - null_value for x in observed_proportions]
    # Perform the Wilcoxon signed-rank test
    # alternative='greater' tests whether the median is significantly greater than the null
    test_statistic, p_value = wilcoxon(differences, alternative='greater')
    # Output results
    print("Success rates:", observed_proportions)
    print("Test statistic (W):", test_statistic)
    print("p-value (one-sided):", p_value)
    if p_value <= 0.05:
        print("Conclusion: Statistically significant")
    else:
        print("Conclusion: Not statistically significant")
    return test_statistic, p_value

In [10]:
# Alternative Binomial test
def binom_test(counts, participants, questions, null_value, less_or_greater):
    questions = questions-2
    test = binomtest(np.sum(counts), participants*questions, p=null_value, alternative=less_or_greater)
    if test.pvalue<0.05:
        print("Statistically significant")
    else:
        print("Not statistically significant")
    return test

In [11]:
# Alternative T-test
def t_test(counts, questions, null_value):  # Perform a one-sample t-test
    sample_mean = np.mean(counts)
    n = len(counts)
    sample_std = np.std(counts, ddof=1)  # Use sample standard deviation (ddof=1 for unbiased estimate)
    standard_error = sample_std / np.sqrt(n)
    test_statistic = (sample_mean - null_value) / standard_error
    degrees_freedom = n - 1
    p_value = scistats.t.sf(np.abs(test_statistic), df=degrees_freedom) * 2  # Two-tailed p-value
    if p_value <= 0.05:
        result = "Reject the null hypothesis. There is statistically significant evidence that the mean is different from the random choice value."
    else:
        result = "Fail to reject the null hypothesis. There is not enough statistically significant evidence that the mean is different from the random choice value."

    return test_statistic, p_value

# 1- Are the overall average success rates observed significant vs chance?

In [12]:
calculations(correct, participants[0], questions)

Participants: 44
[ 6  3 17 12  3 18 11 12 27  7]


(0.2636, 0.2614, [3, 12])

In [13]:
t_test(correct, questions, null_value)

(4.783865363972287, 0.0009958851934449115)

### Conclusion: All the overall average success rates observed are significantly higher than the random choice probability

In [14]:
calculations(experts_correct, participants[1], questions)

Participants: 15
[ 2  2  7  8  0  7  6  7 13  1]


(0.3533, 0.4333, [7])

In [15]:
t_test(experts_correct, questions, null_value)

(4.006795927129995, 0.0030787104869550042)

In [16]:
calculations(nonexperts_correct, participants[2], questions)

Participants: 15
[2 1 3 1 2 6 2 4 7 3]


(0.2067, 0.1667, [2])

In [17]:
t_test(nonexperts_correct, questions, null_value)

(4.4821869662029945, 0.0015276029627714581)

In [18]:
calculations(astro_correct, participants[5], questions)

Participants: 3
[1 1 1 2 0 1 1 1 3 0]


(0.3667, 0.3333, [1])

In [19]:
t_test(astro_correct, questions, null_value)

(3.1420707655385764, 0.011888322681261849)

In [20]:
calculations(mus_correct, participants[4], questions)

Participants: 4
[0 0 2 2 0 2 2 2 4 0]


(0.35, 0.5, [2])

In [21]:
t_test(mus_correct, questions, null_value)

(2.7408495211450363, 0.022815837613538514)

In [22]:
calculations(astromus_correct, participants[3], questions)

Participants: 3
[1 1 2 2 0 2 1 2 3 0]


(0.4667, 0.5, [2])

In [23]:
t_test(astromus_correct, questions, null_value)

(3.829723973641666, 0.004029426317185767)

In [24]:
calculations(nonexperts_4_correct, participants[6], questions)

Participants: 4
[0 1 2 1 0 3 0 2 1 1]


(0.275, 0.25, [1])

In [25]:
t_test(nonexperts_4_correct, questions, null_value)

(2.7665944668166005, 0.02187497923095555)

# 2- Are the differences observed in the means statistically significant?

In [38]:
f_stat, p_valor = scistats.f_oneway(experts_correct, nonexperts_correct)

print("Estadístico F:", f_stat)
print("Valor p:", p_valor)

Estadístico F: 2.4066298342541432
Valor p: 0.13822660817594473


In [26]:
# Kruskal-Wallis test: kruskal_p_value < 0.05 => reject the null hypothesis => statistical significance
H_statistic, kruskal_p_value = scistats.kruskal(experts_correct, nonexperts_correct)
print(H_statistic, kruskal_p_value)

1.3214837712519236 0.25032593367991124


### Conclusion: The differences in the means between experts and non experts are not statistically significant.

In [27]:
# T-test
stat, p_value = ttest_ind(experts_correct, nonexperts_correct, equal_var=False)
print(stat, p_value)

1.5513316325834858 0.14424100930246586


# 3- Are they meaningful?

In [28]:
# Cohen's d for Effect Size (T-test). Meaninful: Small effect size d=0.2, medium d=0.5, large d=0.8.

# Calculate means and standard deviations
mean1 = np.mean(experts_correct/participants[1])
mean2 = np.mean(nonexperts_correct/participants[2])
std1 = np.std(experts_correct, ddof=1)
std2 = np.std(nonexperts_correct, ddof=1)

# Calculate Cohen's d
cohen_d = (mean1 - mean2) / np.sqrt((std1**2 + std2**2) / 2)
print("Cohen's d:", cohen_d)

Cohen's d: 0.04625177314803205


# 4- Subgroup analysis

In [29]:
# Test normality for each group. p-value<0.05 =>not normally distributed
stat1, p1 = shapiro(astromus_correct)
stat2, p2 = shapiro(mus_correct)
stat3, p3 = shapiro(astro_correct)
stat4, p4 = shapiro(nonexperts_4_correct)

print(f"astromus normality: p={p1}")
print(f"mus normality: p={p2}")
print(f"astro normality: p={p3}")
print(f"nonexperts normality: p={p4}")

astromus normality: p=0.2449112981557846
mus normality: p=0.015408955514431
astro normality: p=0.025551754981279373
nonexperts normality: p=0.15201178193092346


In [30]:
# Kruskal-Wallis test: kruskal_p_value < 0.05 => reject the null hypothesis => statistical significance
H_statistic, kruskal_p_value = scistats.kruskal(astromus_correct, mus_correct, astro_correct, nonexperts_4_correct)
print(H_statistic, kruskal_p_value)

0.877958307786656 0.8307434530628528


In [31]:
#alternative test: Chi square
stat, p_value = friedmanchisquare(astromus_correct, mus_correct, astro_correct, nonexperts_4_correct)
print(stat, p_value)

1.707692307692285 0.6352249758328063


### Post-hoc analysis

In [32]:
# Dunn test with Bonferroni correction (6 comparisons => alpha = 0.05/6 = 0.0083)
dunn_result = sp.posthoc_dunn([astromus_correct, mus_correct, astro_correct, nonexperts_4_correct], p_adjust='bonferroni')

# Label the groups
dunn_result.index = ['astromus', 'mus', 'astro', 'nothing']
dunn_result.columns = ['astromus', 'mus', 'astro', 'nothing']

print(dunn_result)

          astromus  mus  astro  nothing
astromus       1.0  1.0    1.0      1.0
mus            1.0  1.0    1.0      1.0
astro          1.0  1.0    1.0      1.0
nothing        1.0  1.0    1.0      1.0


## Conclusion: The differences in the overall average success rates are not statistically significant

# BLV Analysis

In [33]:
calculations(blv_correct, participants[7], questions)

Participants: 2
[0 0 0 2 0 0 1 0 1 0]


(0.2, 0.0, [0])

In [34]:
t_test(blv_correct, questions, null_value)

(0.7688539286732973, 0.4616787021805435)

### Conclusion: The BLV average success rate difference with the random choice probability is not statistically significant.

In [35]:
calculations(nonblv_correct, participants[8], questions)

Participants: 2
[0 0 1 1 0 1 0 1 1 1]


(0.3, 0.5, [1])

In [36]:
t_test(nonblv_correct, questions, null_value)

(2.2657780120744397, 0.0497049486305836)

In [37]:
# Kruskal-Wallis test: kruskal_p_value < 0.05 => reject the null hypothesis => statistical significance
H_statistic, kruskal_p_value = scistats.kruskal(blv_correct, nonblv_correct)
print(H_statistic, kruskal_p_value)

1.0666666666666702 0.30169958247834416


### Conclusion: The differences between BLV and non BLV average success rates are not statistically significant.