# Analysis of Variance (ANOVA)
- 

## Load Dataset

In [1]:
import numpy as np

In [2]:
# 10 x 5 table of measurements; after transposing, each row of brands_T is
# treated as one distribution (5 distributions of 10 observations each).
brands = np.array(
    [
        [194, 189, 185, 183, 195],
        [184, 204, 183, 193, 197],
        [189, 190, 186, 184, 194],
        [189, 190, 183, 186, 202],
        [188, 189, 179, 194, 200],
        [186, 207, 191, 199, 211],
        [195, 203, 188, 196, 203],
        [186, 193, 196, 188, 206],
        [183, 181, 189, 193, 202],
        [188, 206, 194, 196, 195],
    ]
)
brands_T = np.transpose(brands)
# Show (number of distributions, observations per distribution).
print(*brands_T.shape)

5 10


## Calculate the Separate Sample Mean

- Mean of each distribution 
- A single row of scalars makes up a single distribution

In [3]:
def separate_sample_mean(distributions):
    """Return the mean of every distribution as a new list.

    Parameters
    ----------
    distributions : sequence of sequences (or 2-D array)
        One entry per distribution; each entry holds that distribution's
        observations.

    Returns
    -------
    list
        ``np.mean`` of each entry, in the same order as the input.
    """
    # Build a fresh list on every call. The previous version appended to a
    # module-level list, so re-running the cell (or calling the function
    # twice) returned the means of every earlier call as well.
    return [np.mean(distribution) for distribution in distributions]

def grand_mean(ssm, decimals=1):
    """Return the mean of the per-distribution means.

    Parameters
    ----------
    ssm : sequence of float
        The separate sample means, one per distribution.
    decimals : int or None, optional
        Number of decimal places to round the result to. Defaults to 1,
        which matches the original behaviour. Pass ``None`` to get the
        unrounded mean, which avoids feeding rounding error into later
        sum-of-squares calculations.

    Returns
    -------
    float
        The (optionally rounded) mean of ``ssm``.
    """
    # NOTE: the mean of the group means equals the mean of all observations
    # only when every distribution has the same number of observations.
    overall = np.mean(ssm)
    if decimals is None:
        return overall
    return round(overall, decimals)

In [4]:
# One mean per row of brands_T, i.e. one mean per distribution.
ssm = separate_sample_mean(brands_T)
ssm

[188.2, 195.2, 187.4, 191.2, 200.5]

In [5]:
# Grand mean: the mean of the per-distribution means (grand_mean rounds it to 1 decimal).
gm = grand_mean(ssm)
gm

192.5

---

## Calculate the Sum of Squares Between Distributions

- When looking at multiple distributions, what's the spread between them?
- Analogy : 3 people - LA, Simmons, and Me along with the size of our estate (the latter to get a mean; mean of our estate (avg size/spread of our estate) is represented by name_mean); Here, we will calculate : 
    - What's the spread between LA and Simmons?
    - What's the spread between LA and Me?
    - What's the spread between Simmons and Me?
- To calculate :
    - Mean of each distribution (ed) : la_mean, simmons_mean, my_mean **$\rightarrow$ 3 different scalars**
    - Grand mean (gm) : np.mean(ed) = (la_mean + simmons_mean + my_mean) / 3 **$\rightarrow$ 1 scalar**
    - Squared deviations from gm (sgm) : (ed - gm)^2 = (la_mean - gm)^2, (simmons_mean - gm)^2, (my_mean - gm)^2 **$\rightarrow$ 3 different scalars**
    - #observations in each distribution (n)
    - ss_between (ssb) = sum(n * sgm) = n * (la_mean - gm)^2 + n * (simmons_mean - gm)^2 + n * (my_mean - gm)^2 **$\rightarrow$ 1 scalar**

In [6]:
def calc_ss_between(n, x_i, x_g):
    """Return the sum of squares between distributions.

    Computes ``sum(n * (x_i - x_g) ** 2)``.

    Parameters
    ----------
    n : int or array-like
        Number of observations per distribution. A scalar is applied to
        every distribution; an array-like is multiplied element-wise with
        the squared deviations.
    x_i : array-like of float
        Mean of each distribution.
    x_g : float
        Grand mean.

    Returns
    -------
    numpy.float64
        The between-distribution sum of squares.
    """
    # Coerce to an array first: a plain list minus a plain Python float
    # raises TypeError, so the subtraction previously only worked when
    # x_g happened to be a numpy scalar.
    group_means = np.asarray(x_i, dtype=float)
    squared_deviations = np.square(group_means - x_g)
    return np.sum(np.multiply(n, squared_deviations))

In [7]:
# Number of observations per distribution is the row length of brands_T;
# read it from the data instead of hard-coding 10.
s_s_b = calc_ss_between(brands_T.shape[1], ssm, gm)
s_s_b

1174.8000000000002

---

## Calculate the Sum of Squares Within a Single Distribution for Each Distribution

- Also called the Sum of Squares Error
- When looking at each distribution alone, what's the spread within it?
- The larger the spread within each distribution $\Rightarrow$ the larger the sum of squares within (ss_within)

In [8]:
def calc_ss_within(x, x_mean):
    """Return the sum of squares within distributions (sum of squares error).

    NOTE: the parameter names are the reverse of what they hold. They are
    kept unchanged so existing positional and keyword calls keep working.

    Parameters
    ----------
    x : array-like of float
        Mean of each distribution, one value per row of ``x_mean``.
    x_mean : array-like, 2-D
        The observations, one distribution per row.

    Returns
    -------
    numpy.float64
        Sum over all observations of the squared deviation from the mean
        of the distribution the observation belongs to.
    """
    # np.asarray lets nested lists be passed in; `.T` previously required
    # an ndarray and raised AttributeError on a list of lists.
    observations = np.asarray(x_mean, dtype=float)
    group_means = np.asarray(x, dtype=float)
    # Transposing puts one distribution per column, so subtracting the 1-D
    # vector of means broadcasts each mean down its own column.
    deviations = np.subtract(observations.T, group_means)
    return np.sum(np.square(deviations))

In [9]:
# Argument order: the per-distribution means go first, the data second.
# s_s_m holds the within-distribution (error) sum of squares.
s_s_m = calc_ss_within(ssm, brands_T)
s_s_m

1661.7

---
## Calculate the Mean Sum of Squares

- MSB = Mean Sum of Squares Between groups = ss_between / df_factor
- MSE = Error Mean of Squares = ss_within / df_error

- Degrees of freedom (DF) 
    - Factor : m (groups) -  1 $\Rightarrow$ 5 - 1
    - Error : n (total data points collected) - m $\Rightarrow$ 50 - 5
    - Total : n - 1

In [10]:
def calc_df(n, m):
    """Return the ANOVA degrees of freedom as ``[factor, error, total]``.

    Parameters
    ----------
    n : int
        Total number of data points across all groups.
    m : int
        Number of groups.

    Returns
    -------
    list of int
        ``[m - 1, n - m, n - 1]``.
    """
    factor = m - 1
    error = n - m
    total = n - 1
    # Return a fresh list. The previous version appended to a module-level
    # list, so every re-run of the cell made the result three items longer.
    return [factor, error, total]

def calc_ms(x, x_mean, brands, df):
    """Return the mean squares as ``[ms_between, ms_within]``.

    Parameters
    ----------
    x : array-like of float
        Mean of each distribution.
    x_mean : float
        Grand mean.
    brands : 2-D array-like
        The observations, one distribution per row.
    df : sequence
        Degrees of freedom; ``df[0]`` is the factor (between) value and
        ``df[1]`` is the error (within) value.

    Returns
    -------
    list
        ``[ss_between / df[0], ss_within / df[1]]``.
    """
    # Observations per distribution come from the `brands` argument. The
    # previous version read the notebook-level `brands_T` here, silently
    # ignoring the data that was actually passed in.
    n_per_distribution = len(brands[0])

    ms_between = calc_ss_between(n_per_distribution, x, x_mean) / df[0]
    ms_within = calc_ss_within(x, brands) / df[1]

    # Return a fresh list. Appending to a module-level list duplicated the
    # values every time the cell was re-run.
    return [ms_between, ms_within]

In [16]:
# m: number of distributions (rows of brands_T); n: total number of observations.
m = brands_T.shape[0]
n = m * brands_T.shape[1]
df = calc_df(n, m)
# ms[0] is the mean square between, ms[1] the mean square within.
ms = calc_ms(ssm, gm, brands_T, df)
ms

[293.70000000000005, 36.92666666666667]

## Calculate SS Total

In [12]:
def calc_ss_total(ssb, sse):
    """Return the total sum of squares.

    Parameters
    ----------
    ssb : number
        Sum of squares between distributions.
    sse : number
        Sum of squares within distributions (error).

    Returns
    -------
    number
        ``ssb + sse``.
    """
    total = ssb + sse
    return total

In [13]:
# SS total = SS between (s_s_b) + SS within (s_s_m, the result of calc_ss_within).
calc_ss_total(s_s_b, s_s_m)

2836.5

## Calculate F

In [14]:
def calc_f(msb, mse):
    """Return the F ratio.

    Parameters
    ----------
    msb : number
        Mean square between groups.
    mse : number
        Mean square error (within groups).

    Returns
    -------
    number
        ``msb / mse``.
    """
    f_ratio = msb / mse
    return f_ratio

In [15]:
# The first two entries of ms are the mean square between and the mean square within.
msb, mse = ms[0], ms[1]
calc_f(msb, mse)

7.953601733164832