In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
# Raw observations in wide layout: one column per treatment, ten readings each.
data = {
    'Treatment-1': [67, 42, 67, 56, 62, 64, 59, 72, 71, 60],
    'Treatment-2': [50, 52, 43, 67, 67, 59, 67, 64, 63, 65],
    'Treatment-3': [48, 49, 50, 55, 56, 61, 61, 60, 59, 64],
    'Treatment-4': [47, 67, 54, 67, 68, 65, 65, 56, 60, 65],
}

# Dict keys become the column labels; rows get the default 0..9 integer index.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Treatment-1,Treatment-2,Treatment-3,Treatment-4
0,67,50,48,47
1,42,52,49,67
2,67,43,50,54
3,56,67,55,67
4,62,67,56,68
5,64,59,61,65
6,59,67,61,65
7,72,64,60,56
8,71,63,59,60
9,60,65,64,65


In [3]:
# Reshape from wide to long: each original column name goes into 'Treatment'
# and the corresponding cell value into 'Values', one row per observation.
data_melt = pd.melt(df, var_name='Treatment', value_name='Values')
data_melt

Unnamed: 0,Treatment,Values
0,Treatment-1,67
1,Treatment-1,42
2,Treatment-1,67
3,Treatment-1,56
4,Treatment-1,62
5,Treatment-1,64
6,Treatment-1,59
7,Treatment-1,72
8,Treatment-1,71
9,Treatment-1,60


In [4]:
# Formula: Values is the response; C(...) tells the formula parser to treat
# Treatment as a categorical factor.
treatment_formula = 'Values ~C(Treatment)'

# Fit the model on the long-format data, then build the ANOVA table for that
# fit using anova_lm's default settings.
model = ols(treatment_formula, data=data_melt).fit()
anova_table = sm.stats.anova_lm(model)

print(anova_table)

                df  sum_sq    mean_sq         F   PR(>F)
C(Treatment)   3.0   196.5  65.500000  1.144327  0.34436
Residual      36.0  2060.6  57.238889       NaN      NaN


In [5]:
# R-squared of the fitted model, reported as a percentage rounded to two decimals.
r_square = model.rsquared
r_square_pct = round(r_square*100, 2)
print('R-Square:', r_square_pct)

R-Square: 8.71


In [None]:
# Illustrative ANOVA table (made-up numbers, not the output of the model fitted above):
#                df     sum_sq   mean_sq         F    PR(>F)
# Treatment     3.0  120.00000  40.00000  3.000000     0.043
# Residual     36.0  480.00000  13.33333       NaN       NaN

In [None]:
# Guide to the columns of the ANOVA table
#
# df (degrees of freedom)
# - The number of independent pieces of information used to estimate a parameter.
# - In a one-way ANOVA there are two kinds of degrees of freedom:
#   - Between groups (treatment): the number of treatment groups minus 1.
#   - Within groups (error/residual): the total number of observations minus the number of treatment groups.
#
# sum_sq (sum of squares)
# - Measures variability in the data, split into two components:
#   - Between groups: the variability explained by differences between the treatment groups.
#   - Within groups: the variability that is not explained by the treatment groups (due to random error or individual differences).
#
# mean_sq (mean square)
# - Calculated by dividing sum_sq by the corresponding df.
# - It represents the average variability per degree of freedom.
# - There is one mean_sq for between groups and one for within groups.
#
# F (F-statistic)
# - A test statistic: the ratio of the variability between groups to the variability within groups (mean_sq between / mean_sq within).
# - A larger F-statistic suggests that the differences between groups are more likely to be significant.
#
# PR(>F) (p-value)
# - The probability of observing the calculated F-statistic (or a more extreme value) if there were no real differences between the treatment groups.
# - A small p-value (typically less than 0.05) indicates that the differences between groups are statistically significant, suggesting that the treatment has an effect.
# In Summary:

# The ANOVA table provides the key components for testing the hypothesis that there are no differences between the means of the treatment groups. By examining the F-statistic and its associated p-value, you can determine if there's enough evidence to reject the null hypothesis and conclude that at least one treatment group is significantly different from the others.

# Example:

# Let's say your anova_table looks like this:


#                df     sum_sq   mean_sq         F    PR(>F)
# Treatment     3.0  120.00000  40.00000  3.000000     0.043
# Residual     36.0  480.00000  13.33333       NaN       NaN
#
# Treatment (between groups):
# - df = 3 (there are 4 treatment groups, so df = 4 - 1 = 3)
# - sum_sq = 120 (variability explained by differences between treatments)
# - mean_sq = 40 (average variability between treatments: 120 / 3)
# Residual (within groups / error):
# - df = 36 (total observations - number of groups = 40 - 4)
# - sum_sq = 480 (variability not explained by treatments)
# - mean_sq = 13.33 (average variability within treatments: 480 / 36)
# Overall:
# - F = 3 (ratio of between-group to within-group variability: 40 / 13.33)
# - PR(>F) = 0.043 approximately (p-value for F = 3 with 3 and 36 degrees of freedom)