In [None]:
#1
'''
Assumptions;
i.Independence of observations
ii.Normality
iii.Homogeneity of variances
'''

In [None]:
#2
'''
Three types of ANOVAs:
- One way ANOVA : This is used when there is one categorical independent variable  and one continuous dependent
    variable. 
- Two way ANOVA : This is used when there are two categorical independent variables and one continuous dependent
    variable. 
- Repeated measures ANOVA: his is used when there is one categorical independent variable and one continuous
    dependent variable, but the dependent variable is measured multiple times within each level of the 
    independent variable.
'''

In [None]:
#3
'''
The partitioning of variance in ANOVA refers to the process of dividing the total variance of a dependent
variable into different components that are associated with different sources of variation. These components
include the between-group variance, within-group variance, and total variance.
'''

In [1]:
#4
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Create the data
group1 = [4, 6, 8, 7, 5]
group2 = [9, 11, 13, 10, 12]
group3 = [15, 17, 19, 16, 18]
data = group1 + group2 + group3
groups = ['Group 1']*5 + ['Group 2']*5 + ['Group 3']*5

# Fit the model
model = ols('data ~ C(groups)', data={'data': data, 'groups': groups}).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Extract the sums of squares
SST = anova_table['sum_sq']['C(groups)']
SSE = anova_table['sum_sq']['Residual']
SSR = SST - SSE

print("SST:", SST)
print("SSE:", SSE)
print("SSR:", SSR)


SST: 303.3333333333332
SSE: 30.0
SSR: 273.3333333333332


In [None]:
#5
import statsmodels.api as sm
from statsmodels.formula.api import ols

# create the ANOVA model
model = ols('dependent_var ~ factor_1 + factor_2 + factor_1:factor_2', data=data_frame).fit()

# calculate the main effects
main_effects = sm.stats.anova_lm(model, typ=1)['sum_sq'][:-1]

# calculate the interaction effect
interaction_effect = sm.stats.anova_lm(model, typ=1)['sum_sq'][-1]

In [None]:
#6
'''
If we obtained an F-statistic of 5.23 and a p-value of 0.02 from a one-way ANOVA,
It is likely that at least one of the groups has a different mean from the others.

The F-statistic of 5.23 indicates the ratio of variance between the group means to variance within the groups. 
A larger F-statistic indicates greater differences between the group means relative to the variability within 
the groups. 

The p-value of 0.02 indicates the probability of observing such an F-statistic or more extreme under the 
assumption that the null hypothesis is true. Since the p-value is less than the significance level of 0.05, 
we can reject the null hypothesis and conclude that there is a significant difference between the group means.
'''

In [None]:
#7
'''
methods of handling missing data:
-pairwise deletion
-mean substitution
-maximum likelihood estimation
-multiple imputation
'''

In [None]:
#8
'''
Post-hoc tests are used to make pairwise comparisons between groups after a significant difference
has been found in an ANOVA. 
'''

In [4]:
#9
import scipy.stats as stats
import numpy as np
# Define the data
diet_a = np.random.randint(2,12,50)
diet_b = np.random.randint(3,19,50)
diet_c = np.random.randint(2,8,50)

# Perform the one-way ANOVA
f_statistic, p_value = stats.f_oneway(diet_a, diet_b, diet_c)

# Print the results
print("F-statistic: ", f_statistic)
print("P-value: ", p_value)


F-statistic:  63.36004955545838
P-value:  1.4314708003491508e-20


In [5]:
#10
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Create a dataframe with the data
data = {'Program': ['A']*10 + ['B']*10 + ['C']*10,
        'Experience': ['Novice']*15 + ['Experienced']*15,
        'Time': [10, 12, 9, 11, 10, 13, 8, 11, 10, 12,
                 15, 16, 14, 17, 15, 18, 13, 16, 15, 17,
                 20, 22, 21, 23, 22, 24, 19, 22, 21, 23]}
df = pd.DataFrame(data)

# Fit the ANOVA model
model = ols('Time ~ Program + Experience + Program:Experience', data=df).fit()
table = sm.stats.anova_lm(model, typ=2)

# Print the ANOVA table
print(table)


                       sum_sq    df         F    PR(>F)
Program              0.205444   2.0  0.044145  0.835222
Experience                NaN   1.0       NaN       NaN
Program:Experience   5.603333   2.0  1.204022  0.282578
Residual            60.500000  26.0       NaN       NaN


  F /= J


In [9]:
#11
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, f_oneway, tukey_hsd

# Generate some example data
np.random.seed(123)
control_scores = np.random.normal(75, 10, size=100)
experimental_scores = np.random.normal(80, 10, size=100)

# Conduct two-sample t-test
t_stat, p_val = ttest_ind(control_scores, experimental_scores)
print("t-statistic: ", t_stat)
print("p-value: ", p_val)

# Conduct post-hoc test (Tukey HSD)
scores = np.concatenate([control_scores, experimental_scores])
groups = np.concatenate([np.zeros(100), np.ones(100)])
tukey_results = tukey_hsd(scores, groups)
print(tukey_results)


t-statistic:  -3.031617200418805
p-value:  0.002757729976398418
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     77.038     0.000    75.536    78.539
 (1 - 0)    -77.038     0.000   -78.539   -75.536



In [10]:
#12
import pandas as pd
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# create a dataframe with sales data
data = {'store': ['A']*30 + ['B']*30 + ['C']*30,
        'sales': [10, 12, 11, 14, 13, 15, 16, 17, 12, 14, 11, 13, 14, 16, 18, 19, 20, 22, 17, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                  15, 16, 14, 18, 17, 16, 20, 22, 21, 23, 24, 25, 26, 27, 28, 29, 30, 32, 34, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]}

df = pd.DataFrame(data)

# conduct one-way ANOVA
f_stat, p_value = f_oneway(df[df['store']=='A']['sales'], df[df['store']=='B']['sales'], df[df['store']=='C']['sales'])

print('One-way ANOVA results:')
print('F-statistic:', f_stat)
print('p-value:', p_value)

# conduct post-hoc test (Tukey's HSD)
posthoc = pairwise_tukeyhsd(df['sales'], df['store'], alpha=0.05)

print('Post-hoc test results:')
print(posthoc)


ValueError: All arrays must be of the same length