In [38]:
from scipy.stats import shapiro, ttest_ind, f_oneway
from tabulate import tabulate
import pandas as pd

In [39]:
# Read the dataset; the cells below use its "Batch" and "Weight" columns.
df = pd.read_csv("Datasets/euroweight.csv")

# Parallel lists: conditions[k] is a boolean row mask over df and labels[k]
# is the display name for that mask. Entry 0 selects every row; entries
# 1..8 select the rows whose "Batch" value equals that number.
conditions = [pd.Series([True] * len(df), index=df.index)]
labels = ["All batches"]
for batch_no in range(1, 8 + 1):
    conditions.append(df["Batch"] == batch_no)
    labels.append(f"Batch {batch_no}")

In [40]:
# Normality check: run scipy's Shapiro-Wilk test on the "Weight" values of
# every subset described by labels/conditions and print one table row each.
confidence = 0.95
alpha = 1 - confidence  # threshold each p-value is compared against

table = []

for label, mask in zip(labels, conditions):
    # Select the weights of the rows picked out by this boolean mask.
    data = df.loc[mask, "Weight"]
    statistic, p_value = shapiro(data)

    # A p-value above alpha is reported as "Yes", anything else as "No".
    verdict = "Yes" if p_value > alpha else "No"
    table.append([label, statistic, p_value, verdict])

print(tabulate(table, headers=["", "Statistic", "P-value", "Is normally distributed"], tablefmt="fancy_grid"))

╒═════════════╤═════════════╤═════════════╤═══════════════════════════╕
│             │   Statistic │     P-value │ Is normally distributed   │
╞═════════════╪═════════════╪═════════════╪═══════════════════════════╡
│ All batches │    0.975473 │ 5.02328e-18 │ No                        │
├─────────────┼─────────────┼─────────────┼───────────────────────────┤
│ Batch 1     │    0.995507 │ 0.683002    │ Yes                       │
├─────────────┼─────────────┼─────────────┼───────────────────────────┤
│ Batch 2     │    0.9909   │ 0.121877    │ Yes                       │
├─────────────┼─────────────┼─────────────┼───────────────────────────┤
│ Batch 3     │    0.863432 │ 4.08944e-14 │ No                        │
├─────────────┼─────────────┼─────────────┼───────────────────────────┤
│ Batch 4     │    0.995505 │ 0.682659    │ Yes                       │
├─────────────┼─────────────┼─────────────┼───────────────────────────┤
│ Batch 5     │    0.991034 │ 0.128993    │ Yes                 

In [46]:
# Per-batch summary: mean and standard deviation of "Weight" for entries
# 1..8 of labels/conditions (entry 0 is skipped here).
table = []
for idx in range(1, 8 + 1):
    weights = df.loc[conditions[idx], "Weight"]
    table.append([labels[idx], weights.mean(), weights.std()])

print(tabulate(table, headers=["", "Mean", "Std deviation"], tablefmt="fancy_grid"))

# Build one "Weight" series per distinct value present in the "Batch" column.
batch_column = df["Batch"]
groups = [df.loc[batch_column == batch_id, "Weight"] for batch_id in batch_column.unique()]

# One-way ANOVA across the per-batch groups
f_stat, p_value = f_oneway(*groups)

print(f"Statistic = {f_stat}")
print(f"P-value     = {p_value}")

alpha = 0.05
# Print "Yes" when the p-value exceeds alpha, otherwise "No".
print("Is equal mean: ", end="")
print("Yes" if p_value > alpha else "No")

╒═════════╤═════════╤═════════════════╕
│         │    Mean │   Std deviation │
╞═════════╪═════════╪═════════════════╡
│ Batch 1 │ 7.51966 │       0.0343613 │
├─────────┼─────────┼─────────────────┤
│ Batch 2 │ 7.52317 │       0.0354856 │
├─────────┼─────────┼─────────────────┤
│ Batch 3 │ 7.50954 │       0.0370408 │
├─────────┼─────────┼─────────────────┤
│ Batch 4 │ 7.5311  │       0.0294165 │
├─────────┼─────────┼─────────────────┤
│ Batch 5 │ 7.5314  │       0.0296252 │
├─────────┼─────────┼─────────────────┤
│ Batch 6 │ 7.51524 │       0.033424  │
├─────────┼─────────┼─────────────────┤
│ Batch 7 │ 7.52302 │       0.0329985 │
├─────────┼─────────┼─────────────────┤
│ Batch 8 │ 7.51674 │       0.0363746 │
╘═════════╧═════════╧═════════════════╛
Statistic = 12.67221788627366
P-value     = 5.361761521220631e-16
Is equal mean: No
