# ANOVA – Super Simple Introduction

- [Tutorial](https://www.spss-tutorials.com/anova-what-is-it/)

- 💡**Statistically Significant to me is we can use sample size from the total population size to be an accurate representation of the data**, where the sample size < total population size

- See why with tutorial link when discussing effect size
---

# Find
- [x] mean of each group
- [x] grand mean
- [x] variance of each group
- [ ] grand variance
- [x] plot mean of each group

---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind



# Import Data

1. Organize Data

In [2]:
def load_data(dataset):

    original_data = pd.read_csv(dataset)
    copy_of_data = original_data.copy()
    return copy_of_data

In [4]:
path_to_dataset = "/Users/brinkley97/Documents/development/lab-information_sciences_institute/datasets/spss_dataset/One-Way ANOVA Calculation.csv"
# path_to_dataset = ""
data = load_data(path_to_dataset)

In [5]:
data

Unnamed: 0,Group,Score,Group Mean,Mean,GM - M,Squares Between,Score - GM,Squares Within,Count Groups,Count Observations
0,1,90,99.2,101.7,-2.5,6.25,-9.2,84.64,3,30
1,1,87,99.2,101.7,-2.5,6.25,-12.2,148.84,3,30
2,1,93,99.2,101.7,-2.5,6.25,-6.2,38.44,3,30
3,1,115,99.2,101.7,-2.5,6.25,15.8,249.64,3,30
4,1,97,99.2,101.7,-2.5,6.25,-2.2,4.84,3,30
5,1,85,99.2,101.7,-2.5,6.25,-14.2,201.64,3,30
6,1,102,99.2,101.7,-2.5,6.25,2.8,7.84,3,30
7,1,110,99.2,101.7,-2.5,6.25,10.8,116.64,3,30
8,1,111,99.2,101.7,-2.5,6.25,11.8,139.24,3,30
9,1,102,99.2,101.7,-2.5,6.25,2.8,7.84,3,30


## Organizing Data

- [x] Drop unneeded columns
- [x] Create new columns with groups

In [6]:
drop_columns = ["Group Mean", "Mean", "GM - M", "Squares Between", "Score - GM", "Squares Within", "Count Groups", "Count Observations"]
group_and_scores_only = data.drop(columns=drop_columns)

In [7]:
group_and_scores_only

Unnamed: 0,Group,Score
0,1,90
1,1,87
2,1,93
3,1,115
4,1,97
5,1,85
6,1,102
7,1,110
8,1,111
9,1,102


## Remove Group Column and Add Various Groups As Separate Columns

In [8]:
def value_to_col(data, new_cols):
    
    '''
    Get unique groups
    '''
    
    unique_values = data["Group"].value_counts()
    # print("unique_values : ", unique_values)
    
    unique_values_keys = unique_values.keys()
    # print("\nunique_values_keys : ", unique_values_keys)
    
    '''
    Drop "Group" column
    '''
    
    update_group_col = data.pop("Group")
    # print(remove_group_col)
    
    '''
    Set new cols
    '''
    new_col_1 = new_cols[2]
    new_col_2 = new_cols[1]
    new_col_3 = new_cols[0]
    
    '''
    Reconstruct with new cols
    '''
    
    data[new_col_1] = (update_group_col == unique_values_keys[0]) * data["Score"]
    data[new_col_2] = (update_group_col == unique_values_keys[1]) * data["Score"]
    data[new_col_3] = (update_group_col == unique_values_keys[2]) * data["Score"]
    
    # print(data)
    
    '''
    Replace 0 and 1
    '''
    data.loc[10:, ["1"]] = np.nan
    data.loc[0:9, ["2"]] = np.nan
    data.loc[20:, ["2"]] = np.nan
    data.loc[0:19, ["3"]] = np.nan
    
    '''
    Truncate specific columns
    '''
    new_index = range(1, 11)
    truncate_col_1 = data["1"].truncate(before=0, after=9)
    reformat_col_1 = np.array(truncate_col_1)
    update_truncated_col_1 = pd.DataFrame(reformat_col_1, columns=["1"], index=new_index)
    # print(update_truncated_col_1)
    
    truncate_col_2 = data["2"].truncate(before=10, after=19)
    reformat_col_2 = np.array(truncate_col_2)
    update_truncated_col_2 = pd.DataFrame(reformat_col_2, columns=["2"], index=new_index)
    # print("\n", update_truncated_col_2)

    truncate_col_3 = data["3"].truncate(before=20, after=29)
    reformat_col_3 = np.array(truncate_col_3)
    update_truncated_col_3 = pd.DataFrame(reformat_col_3, columns=["3"], index=new_index)
    # print("\n", update_truncated_col_3)
    
    '''
    Set index
    '''
    # truncate_col_1.set_index(
    
    formatted_data = pd.concat([update_truncated_col_1, update_truncated_col_2, update_truncated_col_3], axis=1)
  
    return formatted_data

In [9]:
groups = ["1", "2", "3"]

In [10]:
data = value_to_col(group_and_scores_only, groups)

In [11]:
data

Unnamed: 0,1,2,3
1,0.0,135.0,0.0
2,0.0,125.0,0.0
3,0.0,107.0,0.0
4,0.0,96.0,0.0
5,0.0,114.0,0.0
6,0.0,125.0,0.0
7,0.0,94.0,0.0
8,0.0,123.0,0.0
9,0.0,111.0,0.0
10,0.0,96.0,0.0


In [12]:
data.shape[0], data.shape[1]

(10, 3)

In [13]:
mean = []
grand_mean = []
variance = []
grand_variance = []
sum_of_squares = []
degrees_of_freedom = []
mean_square = []

def calc_statistical_data(data):
    
    '''
    Descriptives Data
        Mean
        Grand mean
        Variance
        Grand variance
    '''
    mean.append(data.mean())
    grand_mean.append(np.mean(mean))
    variance.append(round(data.var(), 1))
    # grand_variance.append(round((np.sum(variance)/3), 1))
    
    '''
    Sum of squares between groups
    '''
    gm_mean_diff = np.subtract(grand_mean, mean)
    gm_mean_diff_per_group_squared = np.square(gm_mean_diff)
    sum_of_squares_between = round(np.sum(np.multiply(data.shape[0], gm_mean_diff_per_group_squared)), 3)
    sum_of_squares.append(sum_of_squares_between)
    

    '''
    Sum of squares within groups
    '''
    datapoint_mean_diff = np.subtract(data, mean)
    datapoint_mean_diff_per_group_squared = np.square(datapoint_mean_diff)
    sum_of_squares_within = round(np.sum(np.sum(datapoint_mean_diff_per_group_squared)), 3)
    sum_of_squares.append(sum_of_squares_within)
    
    
    '''
    Total sum of squares
    '''
    sum_of_squares_total = round((sum_of_squares_between + sum_of_squares_within), 3)
    sum_of_squares.append(sum_of_squares_total)
    
    '''
    Degrees of freedom
    '''
    degrees_of_freedom_between = data.shape[1] - 1
    degrees_of_freedom.append(degrees_of_freedom_between)
    degrees_of_freedom_within = (data.shape[0] * data.shape[1]) - data.shape[1]
    degrees_of_freedom.append(degrees_of_freedom_within)
    degrees_of_freedom_total = degrees_of_freedom_between + degrees_of_freedom_within
    degrees_of_freedom.append(degrees_of_freedom_total)
    
    '''
    Mean square
    '''
    mean_square_between = round((sum_of_squares_between / degrees_of_freedom_between), 3)
    mean_square.append(mean_square_between)
    mean_square_within = round((sum_of_squares_within / degrees_of_freedom_within), 3)
    mean_square.append(mean_square_within)

    
    '''
    F
    '''
    f_score = round((mean_square_between / mean_square_within), 3)
    
    
    '''
    Significance
    '''
#     total = 0
#     for ms in mean_square:
#         if any(ms >= mean_square):
#             total += 1
    
#     significance_p_value = total
    significance_p_value = np.sum(np.random.f(degrees_of_freedom_between, degrees_of_freedom_within, size=30))
    # significance_p_value = ttest_ind(data[0], data[1])
    
    '''
    Effect size
    '''
    effect_size_eta_squared = round(sum_of_squares_between / (sum_of_squares_between + sum_of_squares_within), 2)
    
    return mean, grand_mean, variance, grand_variance, sum_of_squares, degrees_of_freedom, mean_square, f_score, significance_p_value, effect_size_eta_squared

In [14]:
statistical_data = calc_statistical_data(data)

ValueError: Unable to coerce list of <class 'pandas.core.series.Series'> to Series/DataFrame

In [15]:
statistical_data

NameError: name 'statistical_data' is not defined

In [16]:
statistical_data[0]

NameError: name 'statistical_data' is not defined

In [17]:
mean_of_data = np.array(statistical_data[0]).T

NameError: name 'statistical_data' is not defined

In [None]:
mean = pd.DataFrame(mean_of_data, columns=["Mean"], index=["School 1", "School 2", "School 3"])

In [None]:
mean

## pandas Plot vs matplotlib Plot

- mean.plot.barh() works well but can't add a title to the plot() function
- mean.plot(title="Mean IQ Scores by School (n = 10 per School").barh() throws an error so use matplotlib to add a title

In [None]:
# mean.plot(title="Mean IQ Scores by School (n = 10 per School").barh(color="Red")

In [None]:
mean.plot.barh(color="Red")

In [None]:
# plot_mean = plt.barh(mean, ["12.2"])

In [None]:
mean.plot.barh(color="Red")

In [None]:
statistical_data[2]

In [None]:
variance_of_data = np.array(statistical_data[2]).T

In [None]:
variance_of_data

In [None]:
variance = pd.DataFrame(variance_of_data, columns=["Variance"], index=["School 1", "School 2", "School 3"])

In [None]:
variance

In [None]:
variance.hist()

In [None]:
data

## Need to put scores on the y-axis

In [None]:
data.hist(column=["1", "2", "3"], orientation="horizontal")

In [None]:
data.plot.hist()

In [None]:
data

In [None]:
data