In [6]:
import scipy.stats as stats
import pingouin as pg
import pandas as pd

In [7]:
# Load the worksheet holding app sizes, one column per Google Play category.
df_excel = pd.read_excel(
    "./WelchTest_GooglePlay_dataset.xlsx",
    sheet_name="ANOVA_CATEGORIES_SIZE",
)

In [8]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df_excel.describe()

Unnamed: 0,COMMUNICATION,FAMILY,GAME,PRODUCTIVITY
count,53.0,463.0,267.0,58.0
mean,15.067925,29.967063,44.339326,16.006897
std,15.222096,26.109084,27.57262,18.509295
min,1.0,1.2,1.3,1.2
25%,4.3,7.9,20.5,3.425
50%,8.3,22.0,40.0,7.15
75%,20.0,48.0,65.0,22.0
max,66.0,100.0,100.0,76.0


In [9]:
# Pull out one Series per category column of the loaded sheet.
communicationSize, familySize, gameSize, productivitySize = (
    df_excel[column]
    for column in ("COMMUNICATION", "FAMILY", "GAME", "PRODUCTIVITY")
)

In [10]:
# Remove missing entries from every group so each Series holds only observed values.
communicationSize, familySize, gameSize, productivitySize = (
    group.dropna()
    for group in (communicationSize, familySize, gameSize, productivitySize)
)

In [11]:
# Classical one-way ANOVA across the four category groups (SciPy implementation).
f_value, p_value = stats.f_oneway(
    communicationSize,
    familySize,
    gameSize,
    productivitySize,
)

# Report the test statistic and its p-value.
print("ANOVA results:")
print("F-value:", f_value)
print("p-value:", p_value)

ANOVA results:
F-value: 36.74202391905343
p-value: 2.601597843539059e-22


### Excel ANOVA test: p-value = 2,60159784353495E-22

In [12]:
# Bartlett's test for homogeneity of variance across the four groups.
stats.bartlett(
    communicationSize,
    familySize,
    gameSize,
    productivitySize,
)

BartlettResult(statistic=33.50920936317704, pvalue=2.5148631539966e-07)

### Excel test for varianshomogenitet (Bartlett): p-værdi = 2,5148631539966E-07

In [13]:
# Reshape the wide sheet (one column per category) to long form: one row per
# value, keyed by (original row label, category name).
# The category columns hold different numbers of observations, so the wide
# frame contains NaN padding. Drop it explicitly instead of relying on the
# implicit NaN removal of the legacy stack() implementation, which the newer
# pandas stack implementation no longer performs.
df = df_excel.stack().dropna().reset_index()

In [14]:
# Give the three columns produced by the reshape descriptive names.
df = df.set_axis(['idx', 'categories', 'values'], axis="columns")

In [15]:
# The original row label is not needed for the group comparisons; discard it.
df = df.drop(columns='idx')

In [16]:
# One-way ANOVA on the long-format frame via pingouin; detailed=True requests
# the full table (SS, DF, MS) rather than the compact summary.
anova = pg.anova(
    dv='values',
    between='categories',
    data=df,
    detailed=True,
)
anova

Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,categories,72264.761799,3,24088.253933,36.742024,2.601598e-22,0.116367
1,Within,548741.369997,837,655.604982,,,


In [17]:
# Welch's ANOVA on the same long-format frame (pingouin implementation).
Wanova = pg.welch_anova(
    data=df,
    dv='values',
    between='categories',
)
Wanova

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,np2
0,categories,3,167.054611,51.570539,1.2031800000000002e-23,0.116367


# Hvor ligger forskellen?

In [18]:
# Welch's (unequal-variance) two-sample t-test: COMMUNICATION vs FAMILY.
t_value, p_value = stats.ttest_ind(
    communicationSize,
    familySize,
    equal_var=False,
)

print("\nWelch's t-test between communicationSize and familySize:")
print("t-value:", t_value)
print("p-value:", p_value)


Welch's t-test between communicationSize and familySize:
t-value: -6.163065616970236
p-value: 1.879007247817127e-08


In [19]:
# Welch's (unequal-variance) two-sample t-test: COMMUNICATION vs GAME.
t_value, p_value = stats.ttest_ind(
    communicationSize,
    gameSize,
    equal_var=False,
)

print("\nWelch's t-test between communicationSize and gameSize:")
print("t-value:", t_value)
print("p-value:", p_value)


Welch's t-test between communicationSize and gameSize:
t-value: -10.894212159293176
p-value: 4.519961029814106e-20


### Eneste par (COMMUNICATION vs. PRODUCTIVITY) hvor middelværdierne ikke er signifikant forskellige. Altså kan H0 ikke forkastes

In [20]:
# Welch's (unequal-variance) two-sample t-test: COMMUNICATION vs PRODUCTIVITY.
t_value, p_value = stats.ttest_ind(
    communicationSize,
    productivitySize,
    equal_var=False,
)

print("\nWelch's t-test between communicationSize and productivitySize:")
print("t-value:", t_value)
print("p-value:", p_value)


Welch's t-test between communicationSize and productivitySize:
t-value: -0.29287553961715473
p-value: 0.770180068541946


In [21]:
# Welch's (unequal-variance) two-sample t-test: FAMILY vs GAME.
t_value, p_value = stats.ttest_ind(
    familySize,
    gameSize,
    equal_var=False,
)

print("\nWelch's t-test between familySize and gameSize:")
print("t-value:", t_value)
print("p-value:", p_value)


Welch's t-test between familySize and gameSize:
t-value: -6.915101468626785
p-value: 1.3489066117066136e-11


In [22]:
# Welch's (unequal-variance) two-sample t-test: FAMILY vs PRODUCTIVITY.
t_value, p_value = stats.ttest_ind(
    familySize,
    productivitySize,
    equal_var=False,
)

print("\nWelch's t-test between familySize and productivitySize:")
print("t-value:", t_value)
print("p-value:", p_value)


Welch's t-test between familySize and productivitySize:
t-value: 5.1391169421935485
p-value: 1.6392155447564927e-06


In [23]:
# Welch's (unequal-variance) two-sample t-test: GAME vs PRODUCTIVITY.
t_value, p_value = stats.ttest_ind(
    gameSize,
    productivitySize,
    equal_var=False,
)

print("\nWelch's t-test between gameSize and productivitySize:")
print("t-value:", t_value)
print("p-value:", p_value)


Welch's t-test between gameSize and productivitySize:
t-value: 9.575828378704504
p-value: 1.8574016221550509e-16


# Tukey Test

In [32]:
# Post-hoc comparison of every pair of categories with Tukey's HSD.
# NOTE(review): Tukey's HSD assumes equal group variances, which Bartlett's
# test on these same groups rejected; pingouin's pairwise_gameshowell may be
# the more appropriate post-hoc test. Confirm before relying on these p-values.
posthoc = pg.pairwise_tukey(
    data=df,
    dv='values',
    between='categories',
)

In [33]:
# Display the full table of pairwise Tukey HSD results.
posthoc

Unnamed: 0,A,B,mean(A),mean(B),diff,se,T,p-tukey,hedges
0,COMMUNICATION,FAMILY,15.067925,29.967063,-14.899138,3.712937,-4.012764,0.0003799924,-0.589852
1,COMMUNICATION,GAME,15.067925,44.339326,-29.271401,3.850371,-7.602229,6.988854e-13,-1.12498
2,COMMUNICATION,PRODUCTIVITY,15.067925,16.006897,-0.938972,4.865536,-0.192984,0.9974521,-0.054787
3,FAMILY,GAME,29.967063,44.339326,-14.372263,1.967598,-7.30447,4.123701e-12,-0.538677
4,FAMILY,PRODUCTIVITY,29.967063,16.006897,13.960166,3.566445,3.914309,0.0005669232,0.549123
5,GAME,PRODUCTIVITY,44.339326,16.006897,28.332429,3.70931,7.638193,5.914158e-13,1.078795


In [31]:
# Keep only the pairs whose Tukey-adjusted p-value is below the 5% level,
# then show the group labels, group means and adjusted p-value for each.
significant_comparisons = posthoc.loc[posthoc['p-tukey'].lt(0.05)]

print("Significant Comparisons:")
print(significant_comparisons.loc[:, ['A', 'B', 'mean(A)', 'mean(B)', 'p-tukey']])

Significant Comparisons:
               A             B    mean(A)    mean(B)       p-tukey
0  COMMUNICATION        FAMILY  15.067925  29.967063  3.799924e-04
1  COMMUNICATION          GAME  15.067925  44.339326  6.988854e-13
3         FAMILY          GAME  29.967063  44.339326  4.123701e-12
4         FAMILY  PRODUCTIVITY  29.967063  16.006897  5.669232e-04
5           GAME  PRODUCTIVITY  44.339326  16.006897  5.914158e-13
